In [1]:
from google.colab import drive
import pandas as pd

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
file_path = '/content/gdrive/MyDrive/Tesi Magistrale/CDA/tweets_cda.csv'
tweets_cda = pd.read_csv(file_path)

#Calculating frequencies

**Document frequency of single words**

The aim of this program is calculating the document frequency of each word present in the corpus, that is the ratio between the number of tweets in which a word appears and the number of total tweets.

The main function "get_word_frequencies(df)" has a dictionary as output, which has each word present in the corpus as keys and the corresponding frequency scores as values and is sorted in a descending order.

In [3]:
import pandas as pd

In [6]:
def strip_symbols_from_word(word):
  cleaned_word = ''.join([letter for letter in word if letter.isalpha()])
  return cleaned_word

def strip_symbols_from_list_of_words(list_of_words):
  list_of_words_no_symbols = []
  for w in list_of_words:
    cleaned_word = strip_symbols_from_word(word = w)
    list_of_words_no_symbols.append(cleaned_word)
  return list_of_words_no_symbols

def get_word_frequencies(df):
  frequency_counts_dict = {}
  for index, row in df.iterrows():
   tweet = row["text"]
   tweet_words_list = strip_symbols_from_list_of_words(list_of_words = tweet.lower().split())
   set_tweets_words = set(tweet_words_list)
   for word in set_tweets_words:
    if word in frequency_counts_dict:
      frequency_counts_dict[word] += 1
    else:
      frequency_counts_dict[word] = 1
  frequency_scores_dict ={}
  for key, value in frequency_counts_dict.items():
    newvalue = value/len(df)
    frequency_scores_dict[key] = newvalue

  frequency_scores_dict = dict(sorted(frequency_scores_dict.items(), key=lambda item: item[1], reverse=True))
  return frequency_scores_dict

In [None]:
get_word_frequencies(df = tweets_cda)

{'of': 0.9387755102040817,
 'thrones': 0.8204081632653061,
 'game': 0.8204081632653061,
 'the': 0.5836734693877551,
 'and': 0.43945578231292515,
 'a': 0.4,
 'is': 0.38503401360544215,
 'to': 0.3687074829931973,
 'racist': 0.29523809523809524,
 'in': 0.2829931972789116,
 'sexist': 0.2653061224489796,
 'i': 0.25578231292517006,
 'that': 0.23945578231292516,
 '': 0.2108843537414966,
 'it': 0.18231292517006803,
 'about': 0.17959183673469387,
 'this': 0.17414965986394557,
 'are': 0.1727891156462585,
 'for': 0.17142857142857143,
 'but': 0.17006802721088435,
 'on': 0.1673469387755102,
 'not': 0.14285714285714285,
 'just': 0.13333333333333333,
 'be': 0.13197278911564625,
 'you': 0.12244897959183673,
 'with': 0.12108843537414966,
 'racism': 0.11700680272108843,
 'like': 0.11428571428571428,
 'show': 0.11428571428571428,
 'people': 0.11292517006802721,
 'misogyny': 0.11020408163265306,
 'was': 0.10884353741496598,
 'all': 0.10748299319727891,
 'so': 0.10612244897959183,
 'have': 0.10340136054421

In [None]:
#tests

def get_test_df():
  test_data = {
    'text': [
        "Game of Thrones is shit!",
        "I am in love with the show",
        "The show is sexist.",
        "I think the show is not sexist"
        ]
    }
  test_df = pd.DataFrame(test_data)
  return test_df

test_df = get_test_df()

def test_strip_punctuation_from_word():
  word = strip_symbols_from_word(word = "racist121@$")
  print(word)
  assert word == "racist"
  print("test passed")

test_strip_punctuation_from_word()

def test_strip_symbols_from_list_of_words():
  list_of_words = strip_symbols_from_list_of_words(list_of_words = ["Hi!", "ok?", "well.", "#hello", "by3"])
  assert list_of_words == ["Hi", "ok", "well", "hello", "by"]
  print("test passed")

test_strip_symbols_from_list_of_words()


def test_get_word_frequencies():
  frequency_scores_dict = get_word_frequencies(df = test_df)
  print(frequency_scores_dict)
  assert frequency_scores_dict == {
      'is': 0.75,
      'the': 0.75,
      'show': 0.75,
      'i': 0.5,
      'sexist': 0.5,
      'game': 0.25,
      'of': 0.25,
      'thrones': 0.25,
      'shit': 0.25,
      'am': 0.25,
      'in': 0.25,
      'love': 0.25,
      'with': 0.25,
      'think': 0.25,
      'not': 0.25
      }
  print("test passed")

test_get_word_frequencies()

racist
test passed
test passed
{'is': 0.75, 'show': 0.75, 'the': 0.75, 'i': 0.5, 'sexist': 0.5, 'of': 0.25, 'thrones': 0.25, 'shit': 0.25, 'game': 0.25, 'with': 0.25, 'in': 0.25, 'am': 0.25, 'love': 0.25, 'not': 0.25, 'think': 0.25}
test passed


**Document frequency of clusters of words**

The aim of this program is calculating the document frequency of each cluster of words present in the corpus, that is the ratio between the number of tweets in which a cluster appears and the number of total tweets.

The main function "get_clusters_frequency(df)" has a dictionary as output, which has each cluster present in the corpus as keys and the corresponding frequency scores as values and is sorted in a descending order.

In [8]:
def get_clusters_of_a_tweet(tweet, cluster_len):
  tweet_words_list = tweet.lower().split()
  tweet_clusters_list = []
  for i in range(len(tweet_words_list)-cluster_len+1):
    cluster = tweet_words_list[i:i+cluster_len]
    cluster = strip_symbols_from_list_of_words(list_of_words = cluster)
    cluster = ' '.join(cluster)
    tweet_clusters_list.append(cluster)
  return tweet_clusters_list

def get_all_clusters_list(df):
  all_clusters_list =[]
  all_clusters_list_flat = []
  for index, row in df.iterrows():
    clusters_of_single_tweet = get_clusters_of_a_tweet(tweet = row['text'], cluster_len = 3)
    all_clusters_list.append(clusters_of_single_tweet)

  for i in all_clusters_list:
    for cluster in i:
      all_clusters_list_flat.append(cluster)
  return all_clusters_list_flat

def get_clusters_frequencies(df):
  frequency_counts_dict = {}
  all_clusters_list = get_all_clusters_list(df = df)
  set_clusters = set(all_clusters_list)
  for cluster in all_clusters_list:
    if cluster in frequency_counts_dict:
      frequency_counts_dict[cluster] += 1
    else:
      frequency_counts_dict[cluster] = 1
  frequency_scores_dict ={}
  for key, value in frequency_counts_dict.items():
    doc_frequency = value/len(df)
    frequency_scores_dict[key] = doc_frequency

  frequency_scores_dict = dict(sorted(frequency_scores_dict.items(), key=lambda item: item[1], reverse=True))
  return frequency_scores_dict

In [10]:
get_clusters_frequencies(df = tweets_cda)

{'game of thrones': 0.8244897959183674,
 'of thrones is': 0.12108843537414966,
 'of game of': 0.09115646258503401,
 '  ': 0.050340136054421766,
 'the game of': 0.04081632653061224,
 'in game of': 0.036734693877551024,
 'of thrones and': 0.031292517006802724,
 'of thrones has': 0.02857142857142857,
 'about game of': 0.02857142857142857,
 'thrones is racist': 0.02857142857142857,
 'the middle ages': 0.02312925170068027,
 'like game of': 0.021768707482993196,
 'on game of': 0.02040816326530612,
 'of thrones but': 0.02040816326530612,
 'watch game of': 0.02040816326530612,
 'as game of': 0.02040816326530612,
 'of thrones would': 0.02040816326530612,
 'is game of': 0.01904761904761905,
 'ugame of thronesu': 0.01904761904761905,
 'as sexist as': 0.01904761904761905,
 'of thrones finale': 0.017687074829931974,
 'middle ages werenut': 0.017687074829931974,
 'ages werenut as': 0.017687074829931974,
 'werenut as sexist': 0.017687074829931974,
 'sexist as game': 0.017687074829931974,
 'thrones wo

In [None]:
#tests

def get_test_df():
  test_data = {'text':["I love Game of Thrones, but the final season is disappointing",
      "I hate Game of Thrones"]}
  test_df = pd.DataFrame(test_data)
  return test_df

test_df = get_test_df()

def test_get_clusters():
  clusters = get_clusters_of_a_tweet(tweet = "I love Game of Thrones, but the final season is disappointing", cluster_len = 3)
  assert clusters == ["i love game", "love game of", "game of thrones", "of thrones but", "thrones but the", "but the final", "the final season", "final season is", "season is disappointing"]
  print("test passed")

test_get_clusters()

def test_get_all_clusters_list():
  all_clusters_list = get_all_clusters_list(df = test_df)
  assert all_clusters_list == [
      'i love game', 'love game of', 'game of thrones', 'of thrones but',
      'thrones but the', 'but the final', 'the final season', 'final season is',
      'season is disappointing', 'i hate game', 'hate game of', 'game of thrones']
  print("test passed")

test_get_all_clusters_list()

def test_get_clusters_frequencies():
  frequencies_dict = get_clusters_frequencies(df = test_df)
  assert frequencies_dict == {'i love game': 0.5,
                             'love game of':0.5,
                              'game of thrones': 1,
                              'of thrones but':0.5,
                              'thrones but the':0.5,
                              'but the final':0.5,
                              'the final season':0.5,
                              'final season is':0.5,
                              'season is disappointing':0.5,
                              'i hate game':0.5,
                              'hate game of': 0.5}
  print("test passed")

test_get_clusters_frequencies()

test passed
test passed
test passed


#Keyness

In [33]:
df_positive = tweets_cda[tweets_cda['label'] == '+']
df_negative = tweets_cda[tweets_cda['label'] == '-']

df_positive = df_positive.reset_index(drop=True)
df_negative = df_negative.reset_index(drop=True)

In [34]:
get_word_frequencies(df = df_positive)

{'of': 0.9777777777777777,
 'thrones': 0.8888888888888888,
 'game': 0.8666666666666667,
 'the': 0.6888888888888889,
 'a': 0.5333333333333333,
 'and': 0.5333333333333333,
 'is': 0.4444444444444444,
 'in': 0.4222222222222222,
 'to': 0.37777777777777777,
 'that': 0.3333333333333333,
 'show': 0.3333333333333333,
 '': 0.3111111111111111,
 'i': 0.3111111111111111,
 'are': 0.28888888888888886,
 'sexist': 0.28888888888888886,
 'but': 0.26666666666666666,
 'for': 0.2,
 'this': 0.2,
 'its': 0.17777777777777778,
 'it': 0.17777777777777778,
 'not': 0.17777777777777778,
 'itus': 0.17777777777777778,
 'because': 0.15555555555555556,
 'women': 0.15555555555555556,
 'all': 0.15555555555555556,
 'about': 0.15555555555555556,
 'misogynistic': 0.15555555555555556,
 'racist': 0.15555555555555556,
 'was': 0.15555555555555556,
 'misogyny': 0.13333333333333333,
 'people': 0.13333333333333333,
 'characters': 0.13333333333333333,
 'got': 0.13333333333333333,
 'on': 0.13333333333333333,
 'just': 0.1333333333333

In [35]:
get_word_frequencies(df = df_negative)

{'of': 0.9387755102040817,
 'game': 0.8367346938775511,
 'thrones': 0.8326530612244898,
 'the': 0.5510204081632653,
 'is': 0.42448979591836733,
 'and': 0.42448979591836733,
 'to': 0.3673469387755102,
 'sexist': 0.3142857142857143,
 'a': 0.30612244897959184,
 'in': 0.27755102040816326,
 'racist': 0.2612244897959184,
 'this': 0.22857142857142856,
 'it': 0.22040816326530613,
 'i': 0.20816326530612245,
 'that': 0.20408163265306123,
 'but': 0.19183673469387755,
 '': 0.17551020408163265,
 'just': 0.17142857142857143,
 'for': 0.1673469387755102,
 'on': 0.16326530612244897,
 'not': 0.1510204081632653,
 'so': 0.14285714285714285,
 'season': 0.14285714285714285,
 'are': 0.13877551020408163,
 'show': 0.1346938775510204,
 'has': 0.1306122448979592,
 'misogynistic': 0.12653061224489795,
 'with': 0.12653061224489795,
 'how': 0.11836734693877551,
 'be': 0.11836734693877551,
 'all': 0.11428571428571428,
 'misogyny': 0.11020408163265306,
 'women': 0.11020408163265306,
 'about': 0.10612244897959183,
 'a

#Concordances

This program aims at printing every concordance line of a chosen term (called search term) in the corpus. Concordance lines show the context of a word each time it appears in a corpus. The context can be composed of a variable number of words present at the left and at the right of the search term.

The main function "get_concordances(df)" has as output a dictionary which has the tweet indexes as keys and the concordance lines as values.

In [None]:
def extract_context_from_tweet(tweet, word, n_words_left, n_words_right):
  contexts = []
  tweet_words = strip_symbols_from_list_of_words(list_of_words = tweet.lower().split())
  word_idx = [i for i, w in enumerate(tweet_words) if w == word]
  for idx in word_idx:
    start_idx = max(0, idx - n_words_left)
    end_idx = min(len(tweet_words), idx + n_words_right + 1)
    context =  ' '.join(tweet.split()[start_idx:end_idx])
    contexts.append(context)
  return contexts

def get_concordances(df, word, n_words_left, n_words_right):
  concordances_dict = {}
  for index, row in df.iterrows():
   contexts = extract_context_from_tweet(
       tweet = row['text'],
       word = word,
       n_words_left = n_words_left,
       n_words_right = n_words_right)
   if contexts:
      concordances_dict[index] = contexts
      for context in contexts:
        print(f"tweet id {index}: {context}")
  return concordances_dict

In [None]:
concordances_dict = get_concordances(df = tweets_cda, word = "sexist", n_words_left = 5, n_words_right = 5)

tweet id 9: titties than peen but ?? sexist ? hate it. bye bye
tweet id 12: Sexist tropes are ruining Game of
tweet id 16: @isabelsdieppa How is this sexist? https://t.co/OCvjdLuRFp It's shallow, sure. But
tweet id 17: profile of Miriam Toews; the sexist treatment of the US women's
tweet id 18: profile of Miriam Toews; the sexist treatment of the US women's
tweet id 21: Here's her take on how sexist tropes are ruining #GOT's female
tweet id 33: me get through the racist, sexist America that the mango moron
tweet id 34: Sexist tropes are ruining Game of
tweet id 37: in Game of Thrones is sexist apparently so yeah let them
tweet id 43: @Rus Yusupov is a creepy sexist monster, and the resulting internal
tweet id 51: of her hate is incredibly sexist.
tweet id 52: but so far it’s so sexist and the dialogue is super
tweet id 69: of thrones to be really sexist but did not anticipate how
tweet id 80: the sexist shit women have to deal
tweet id 81: thought game of thrones was sexist just becaus

In [None]:
concordances_dict = get_concordances(df = tweets_cda, word = "racist", n_words_left = 5, n_words_right = 5)

tweet id 2: of Thrones so much? Shit racist asl
tweet id 4: Gettin mad racist vibes from Game of Thrones
tweet id 7: Are the people of Winterfell racist? https://t.co/64OVpbjCMG
tweet id 8: hometown being a bit fucking racist” #gameofThrones @OzzyManReviews Ozzy Man Reviews:
tweet id 11: @RealJamesWoods Oooh! Now the racist word is being used against
tweet id 13: in Game of Thrones are racist af.
tweet id 14: - Pretty bad, also pretty racist Thanks for tuning in!
tweet id 15: “White Walkers” hbo is so racist
tweet id 20: likes this show is secretly racist https://t.co/cUMYDSEELK
tweet id 22: My dad is racist (no, I am not proud
tweet id 26: the local supermarkets. Typical xenophobic, racist, right wing, banter.
tweet id 27: is watching a lot of racist Game of Thrones...
tweet id 30: the deaths of two openly racist army men is a great
tweet id 33: helps me get through the racist, sexist America that the mango
tweet id 38: what he thought of his racist coworkers
tweet id 42: before and a

In [None]:
#tests
test_data = {'text': ["Game of Thrones is racist as shit, how can somebody like it?",
           "Game of Thrones is the best show I have ever watched!",
           "This series is so racist it disgusts me, I don't know why black people still watch it. It is so racist.",
           "I don't think the series is racist, I just think writers got lazy."]}

test_df = pd.DataFrame(test_data)

def test_extract_context_from_tweet():
  context = extract_context_from_tweet(tweet = "This series is so racist it disgusts me, I don't know why black people still watch it. It is so racist.",
                                       word = "racist", n_words_left = 2,
                                       n_words_right = 2)
  assert context == ["is so racist it disgusts",
                     "is so racist."]
  print("test passed")

test_extract_context_from_tweet()

def test_get_concordances():
  concordances_dict = get_concordances(df= test_df,
                                       word = "racist",
                                       n_words_left = 2,
                                       n_words_right = 2)
  assert concordances_dict == {
      0: ["Thrones is racist as shit,"],
      2: ["is so racist it disgusts", "is so racist."],
      3: ["series is racist, I just"]}
  print("test passed")

test_get_concordances()

test passed
tweet id 0: Thrones is racist as shit,
tweet id 2: is so racist it disgusts
tweet id 2: is so racist.
tweet id 3: series is racist, I just
test passed


#Collocates

The aim of this program is to derive the collocates of a selected search term each time it appears in the corpus. A collocate of a search term is a word that appears frequently near the search term itself. "Near" refers to a span of a certain number of words to the left and the right of the search term (context). Specifically, what the program does is to count how many times each word present in the context word list, extracted with the function "extract_context_from_tweet(tweet, word, n_words_left, n_words_right)", appears in the list itself.

The main function "get_collocates(df, word, n_words_left, n_words_right)" has a dictionary as ooutput, which has the collocates as keys and their respective frequency counts as values, sorted in a descending order.

In [None]:
def extract_context_word_lists_from_tweet(tweet, word, n_words_left, n_words_right):
  context_word_lists = []
  tweet_words = strip_symbols_from_list_of_words(list_of_words = tweet.lower().split())
  word_idx = [i for i, w in enumerate(tweet_words) if w == word]
  for idx in word_idx:
    start_idx = max(0, idx - n_words_left)
    end_idx = min(len(tweet_words), idx + n_words_right + 1)
    context_word_list = tweet_words[start_idx:end_idx]
    context_word_lists.append(context_word_list)
  return context_word_lists

def get_collocates(df, word, n_words_left, n_words_right):
  frequency_counts_dict = {}
  for index, row in df.iterrows():
    context_word_lists = extract_context_word_lists_from_tweet(
        tweet = row['text'],
        word = word,
        n_words_left = n_words_left,
        n_words_right = n_words_right)

    if context_word_lists:
      for word_list in context_word_lists:
        for w in word_list:
          if w != word:
            if w in frequency_counts_dict:
              frequency_counts_dict[w] += 1
            else:
              frequency_counts_dict[w] = 1

  frequency_counts_sorted = dict(sorted(frequency_counts_dict.items(), key=lambda item: item[1], reverse=True))
  return frequency_counts_sorted

In [None]:
get_collocates(df = tweets_cda, word = 'sexist', n_words_left = 5, n_words_right = 5)

{'of': 154,
 'the': 90,
 'thrones': 77,
 'game': 70,
 'is': 46,
 'and': 44,
 'as': 34,
 'a': 30,
 'on': 18,
 'tropes': 17,
 '': 16,
 'racist': 16,
 'in': 16,
 'i': 16,
 'would': 15,
 'ages': 15,
 'so': 14,
 'middle': 14,
 'was': 13,
 'werenut': 13,
 'are': 12,
 'treatment': 12,
 'to': 12,
 'but': 11,
 'it': 11,
 'how': 11,
 'its': 11,
 'that': 11,
 'laziest': 11,
 'for': 10,
 'mother': 10,
 'this': 9,
 'by': 9,
 'episodes': 9,
 'show': 9,
 'finale': 9,
 'women': 8,
 'were': 8,
 'crazy': 8,
 'gameofthrones': 8,
 'has': 7,
 'trope': 7,
 'lazy': 7,
 'you': 7,
 'about': 7,
 'season': 7,
 'thronesu': 7,
 'daenerys': 7,
 'ending': 7,
 'take': 6,
 'not': 6,
 'if': 6,
 'being': 6,
 'been': 6,
 'like': 6,
 'exgirlfrienduthe': 6,
 'aoc': 6,
 'than': 5,
 'ruining': 5,
 'be': 5,
 'have': 5,
 'because': 5,
 'an': 5,
 'they': 5,
 'misogynistic': 5,
 'rely': 5,
 'stereotypes': 5,
 'crazed': 5,
 'powerhungry': 5,
 'rip': 5,
 'ugame': 5,
 'us': 4,
 'really': 4,
 'more': 4,
 'got': 4,
 'now': 4,
 'or': 

In [None]:
get_collocates(df = tweets_cda, word = 'racist', n_words_left = 5, n_words_right = 5)

{'of': 116,
 'is': 77,
 'thrones': 74,
 'game': 64,
 'and': 57,
 'a': 55,
 'the': 54,
 'are': 24,
 'in': 21,
 'for': 19,
 'that': 19,
 'to': 19,
 '': 17,
 'i': 16,
 'sexist': 16,
 'its': 16,
 'show': 15,
 'not': 15,
 'this': 14,
 'it': 13,
 'people': 12,
 'white': 12,
 'they': 12,
 'with': 11,
 'being': 10,
 'as': 9,
 'their': 9,
 'but': 9,
 'how': 9,
 'misogynistic': 9,
 'winterfell': 8,
 'like': 8,
 'you': 8,
 'be': 8,
 'about': 8,
 'or': 8,
 'so': 7,
 'got': 7,
 'out': 7,
 'has': 7,
 'was': 7,
 'her': 6,
 'if': 6,
 'just': 6,
 'me': 5,
 'america': 5,
 'what': 5,
 'other': 5,
 'all': 5,
 'thing': 5,
 'im': 5,
 'most': 5,
 'were': 5,
 'ium': 5,
 'gameofthrones': 4,
 'now': 4,
 'right': 4,
 'called': 4,
 'will': 4,
 'we': 4,
 'tropes': 4,
 'care': 4,
 'on': 4,
 'donut': 4,
 'girls': 4,
 'black': 4,
 'more': 4,
 'some': 4,
 'because': 4,
 'at': 4,
 'stark': 4,
 'porn': 4,
 'pushing': 4,
 'agenda': 4,
 'billionaire': 4,
 'saving': 4,
 'your': 4,
 'writers': 4,
 'still': 4,
 'misogynist':

In [None]:
#tests

def get_test_df():
  test_data = {'text': ["Game of thrones is racist as shit, how can somebody like it? It is so racist.",
           "Game of Thrones is the best show I have ever watched!",
           "This series is so racist it disgusts me, I don't know why black people still watch it.",
           "I don't think the series is racist, I just think writers got lazy.",
            "Game of Thrones is shit and racist.",
            "Game of Thrones is shit, racist and shit again."]}

  test_df = pd.DataFrame(test_data)
  return test_df
test_df = get_test_df()

def test_extract_context_words_from_tweet():
  context = extract_context_word_lists_from_tweet(tweet = "Game of thrones is racist as shit, how can somebody like it? It is so racist.",
                                       word = "racist",
                                       n_words_left = 2,
                                       n_words_right = 2)

  assert context == [['thrones', 'is', 'racist', 'as', 'shit'], ["is", "so", "racist"]]
  print("test passed")

test_extract_context_words_from_tweet()


def test_get_collocates():
  collocates = get_collocates(df = test_df, word = "racist", n_words_left = 2, n_words_right = 2)
  print(collocates)
  assert collocates == {'is': 5,
                        'shit': 4,
                        'so': 2,
                        'and': 2,
                        'thrones': 1,
                        'as': 1,
                        'it': 1,
                        'disgusts': 1,
                        'series': 1,
                        'i': 1,
                        'just': 1}
  print("test passed")
test_get_collocates()

test passed
{'is': 5, 'shit': 4, 'so': 2, 'and': 2, 'thrones': 1, 'as': 1, 'it': 1, 'disgusts': 1, 'series': 1, 'i': 1, 'just': 1}
test passed


#Concordances of search term when it collocates with a certain word

The aim of this program is to derive and print each concordance line in which a selected search term appears near a collocate.

The function "extract_context(tweet, word, n_words_left, n_words_right, collocate)" extracts the context of a search term, but returns it only if the collocate is present in the context.

The main function "get_all_concordances(df, word, collocate, n_words_left, n_words_right)" returns a list of all the concordance lines in which the selected search term appears near a selected collocate, and prints all the concordance lines and the corresponding indexes of the tweets.

In [None]:
def extract_context_from_tweet_with_collocate(tweet, word, n_words_left, n_words_right, collocate):
  contexts = []
  tweet_words = strip_symbols_from_list_of_words(list_of_words = tweet.lower().split())
  word_idx = [i for i, w in enumerate(tweet_words) if w == word]
  for idx in word_idx:
    start_idx = max(0, idx - n_words_left)
    end_idx = min(len(tweet_words), idx + n_words_right + 1)
    context_word_list = tweet_words[start_idx:end_idx]
    if collocate in context_word_list:
      context =  ' '.join(tweet.split()[start_idx:end_idx])
      contexts.append(context)
  return contexts

def get_all_concordances(df, word, collocate, n_words_left, n_words_right):
  all_concordances = []
  for index, row in df.iterrows():
    contexts = extract_context_from_tweet_with_collocate(
        tweet = row['text'],
        word = word,
        n_words_left = n_words_left,
        n_words_right = n_words_right,
        collocate = collocate)
    if contexts:
      for context in contexts:
        print(f"tweet id: {index}: {context}")
        all_concordances.append(context)
  return all_concordances

In [45]:
all_concordances = get_all_concordances(df = tweets_cda, word = "racist", collocate = "black", n_words_left = 5, n_words_right = 5)

tweet id: 143: Winterfell Is Racist And Arya Grows Up: Black
tweet id: 154: shows with black people are racist. (Forgive him. He's 10% hotep.)
tweet id: 216: black characters. It<U+2019>s the most racist show in existence next to
tweet id: 334: a black person's death as racist. Probably because "white writers", right?


In [None]:
#tests

def get_test_df():
  test_data = {'text': ["Game of Thrones is racist as shit, how can somebody like it? It is simply racist as shit.",
           "Game of Thrones is the best show I have ever watched!",
           "This series is so racist it disgusts me, I don't know why black people still watch it.",
           "I don't think the series is racist, I just think writers got lazy.",
            "Game of Thrones is shit and racist."]}

  test_df = pd.DataFrame(test_data)
  return test_df

test_df = get_test_df()

def test_extract_context():
  context = extract_context_from_tweet_with_collocate(tweet = "Game of Thrones is racist as shit, how can somebody like it? It is simply racist as shit.",
                                                                   word = 'racist',
                                                                   n_words_left = 2,
                                                                   n_words_right = 2,
                                                                   collocate = "shit")
  assert context == ["Thrones is racist as shit,",
                     "is simply racist as shit."]
  print("test passed")
test_extract_context()

def test_get_all_concordances():
  all_concordances = get_all_concordances(df = test_df,
                                          word = "racist",
                                          collocate = "shit",
                                          n_words_left = 2,
                                          n_words_right = 2)

  assert all_concordances == ['Thrones is racist as shit,', 'is simply racist as shit.', 'shit and racist.']
  print('test passed')
test_get_all_concordances()

test passed
tweet id: 0: Thrones is racist as shit,
tweet id: 0: is simply racist as shit.
tweet id: 4: shit and racist.
test passed
