# What's in this notebook?

In this notebook, I've included everything that the code in the blog post requires to run successfully. I did so to reduce the amount of code in the final blog post. This notebook pulls from the analysis1.ipynb and analysis2.ipynb notebooks, which were too large to run efficiently in the final blog post.

## Set-up

In [4]:
%run functions.ipynb
%matplotlib inline

In [5]:
import tweepy
import configparser
import os
import json
import GetOldTweets3 as got
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import string
import random
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import csv
import math

from collections import Counter

In [6]:
# Monthly tweet dumps for January through April 2020, loaded in calendar order.
jan_tweets, feb_tweets, mar_tweets, apr_tweets = [
    load_tweets(path)
    for path in (
        'data/1/tweets_2020-01-01_to_2020-02-01.json',
        'data/2/tweets_2020-02-01_to_2020-03-01.json',
        'data/3/tweets_2020-03-01_to_2020-04-01.json',
        'data/4/tweets_2020-04-01_to_2020-05-01.json',
    )
]

# Whole-period files: the full query set, two individual accounts, and the
# separate "racist_tweets" collection.
all_time, trump_tweets, pompeo_tweets, racist_tweets = [
    load_tweets(path)
    for path in (
        'data/all_time/tweets_2020-01-01_to_2020-05-01.json',
        'data/all_time/realdonaldtrump_2020-01-01_to_2020-05-01.json',
        'data/all_time/secpompeo_2020-01-01_to_2020-05-01.json',
        'data/all_time/racist_tweets_2020-01-01_to_2020-05-01.json',
    )
]

In [7]:
def _read_json(path):
    """Parse the JSON file at ``path`` and close the file handle afterwards.

    The previous ``json.load(open(path))`` form never closed the handle.
    """
    with open(path) as handle:
        return json.load(handle)


# Article index files: one per month (1-4) plus the whole-period index.
corpus1 = _read_json('data/corpus_index1.json')
corpus2 = _read_json('data/corpus_index2.json')
corpus3 = _read_json('data/corpus_index3.json')
corpus4 = _read_json('data/corpus_index4.json')
corp_all = _read_json('data/corpus_index_all.json')

## Tweet Cleaning

In [8]:
# Flatten every tweet in the whole-period set into a single token list.
all_t_tokens = []
for tweet in all_time:
    text = tweet['text'].replace('&amp;', '&')   # decode the escaped ampersand first
    for _quote in ('”', '\'', '’', '“'):          # then drop straight and curly quote marks
        text = text.replace(_quote, '')
    toks = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    all_t_tokens += toks

# NOTE(review): n-grams are built over the concatenated stream, so presumably
# some of them span two consecutive tweets — confirm this is intended.
all_t_bigrams = get_ngram_tokens(all_t_tokens, 2)
all_t_trigrams = get_ngram_tokens(all_t_tokens, 3)

# Frequency tables for single words, bigrams and trigrams.
all_t_word_dist = Counter(all_t_tokens)
all_t_bigram_dist = Counter(all_t_bigrams)
all_t_trigram_dist = Counter(all_t_trigrams)

In [9]:
# Search terms used to collect the data; they are removed together with the
# English stopwords so they do not dominate the frequency tables.
# NOTE(review): the multi-word entries (e.g. 'corona virus') can only match if
# tokenize() ever yields tokens containing a space — confirm.
queries = ['asianamerican', 'asian', 'american',
           'racism', 'racist', 'xenophobia', 'racism', 'racist', 'xenophobia',
           'coronavirus', 'corona virus', 'covid19', 'covid 19', 'pandemic', 'virus', "chinese virus", "china virus",
           'coronavirus', 'covid19', 'pandemic', 'chinavirus', 'chinesevirus']
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership instead of scanning a list

# The original aliased all_t_tokens and called .remove() once per hit, which is
# quadratic in the number of tokens. A single filtering pass gives the same
# result; the slice assignment keeps all_t_tokens updated in place exactly as
# the aliasing did, so later cells see the same list contents.
all_t_tokens[:] = [tok for tok in all_t_tokens if tok not in _removal_set]

# Links are dropped only from the stripped copy, not from all_t_tokens.
stripped_tweets_tokens = [tok for tok in all_t_tokens if not tok.startswith('https')]
stripped_tweets_wfreq = Counter(stripped_tweets_tokens)

## Tweet Bi/Trigrams

**January**

In [10]:
# Flatten every January tweet into a single token list.
jan_t_tokens = []
for tweet in jan_tweets:
    text = tweet['text'].replace('&amp;', '&')   # decode the escaped ampersand first
    for _quote in ('”', '\'', '’', '“'):          # then drop straight and curly quote marks
        text = text.replace(_quote, '')
    toks = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    jan_t_tokens += toks

# NOTE(review): n-grams are built over the concatenated stream, so presumably
# some of them span two consecutive tweets — confirm this is intended.
jan_t_bigrams = get_ngram_tokens(jan_t_tokens, 2)
jan_t_trigrams = get_ngram_tokens(jan_t_tokens, 3)

# Frequency tables for single words, bigrams and trigrams.
jan_t_word_dist = Counter(jan_t_tokens)
jan_t_bigram_dist = Counter(jan_t_bigrams)
jan_t_trigram_dist = Counter(jan_t_trigrams)

**February**

In [11]:
# Flatten every February tweet into a single token list.
feb_t_tokens = []
for tweet in feb_tweets:
    text = tweet['text'].replace('&amp;', '&')   # decode the escaped ampersand first
    for _quote in ('”', '\'', '’', '“'):          # then drop straight and curly quote marks
        text = text.replace(_quote, '')
    toks = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    feb_t_tokens += toks

# NOTE(review): n-grams are built over the concatenated stream, so presumably
# some of them span two consecutive tweets — confirm this is intended.
feb_t_bigrams = get_ngram_tokens(feb_t_tokens, 2)
feb_t_trigrams = get_ngram_tokens(feb_t_tokens, 3)

# Frequency tables for single words, bigrams and trigrams.
feb_t_word_dist = Counter(feb_t_tokens)
feb_t_bigram_dist = Counter(feb_t_bigrams)
feb_t_trigram_dist = Counter(feb_t_trigrams)

In [12]:
# Twenty most frequent February bigrams as an (n-gram, count) table plus a plain list.
feb_top_20_bigrams = feb_t_bigram_dist.most_common(20)
feb_bigram_df = pd.DataFrame(feb_top_20_bigrams, columns=['Bigram', 'Freq'])
feb_bigram_list = feb_bigram_df['Bigram'].tolist()

# Same for trigrams.
feb_top_20_trigrams = feb_t_trigram_dist.most_common(20)
feb_trigram_df = pd.DataFrame(feb_top_20_trigrams, columns=['Trigram', 'Freq'])
feb_trigram_list = feb_trigram_df['Trigram'].tolist()

In [13]:
# Side-by-side table of the top February bigrams and trigrams, indexed by rank 1-20.
rank = list(range(1, 21))
feb_bitrigram = (
    pd.DataFrame({'Rank': rank})
    .assign(Bigram=feb_bigram_list, Trigram=feb_trigram_list)
    .set_index('Rank')
)

#### March

In [14]:
# Flatten every March tweet into a single token list.
mar_t_tokens = []
for tweet in mar_tweets:
    text = tweet['text'].replace('&amp;', '&')   # decode the escaped ampersand first
    for _quote in ('”', '\'', '’', '“'):          # then drop straight and curly quote marks
        text = text.replace(_quote, '')
    toks = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    mar_t_tokens += toks

# NOTE(review): n-grams are built over the concatenated stream, so presumably
# some of them span two consecutive tweets — confirm this is intended.
mar_t_bigrams = get_ngram_tokens(mar_t_tokens, 2)
mar_t_trigrams = get_ngram_tokens(mar_t_tokens, 3)

# Frequency tables for single words, bigrams and trigrams.
mar_t_word_dist = Counter(mar_t_tokens)
mar_t_bigram_dist = Counter(mar_t_bigrams)
mar_t_trigram_dist = Counter(mar_t_trigrams)

In [15]:
# Twenty most frequent March bigrams as an (n-gram, count) table plus a plain list.
mar_top_20_bigrams = mar_t_bigram_dist.most_common(20)
mar_bigram_df = pd.DataFrame(mar_top_20_bigrams, columns=['Bigram', 'Freq'])
mar_bigram_list = mar_bigram_df['Bigram'].tolist()

# Same for trigrams.
mar_top_20_trigrams = mar_t_trigram_dist.most_common(20)
mar_trigram_df = pd.DataFrame(mar_top_20_trigrams, columns=['Trigram', 'Freq'])
mar_trigram_list = mar_trigram_df['Trigram'].tolist()

In [16]:
# Side-by-side table of the top March bigrams and trigrams, indexed by rank 1-20.
rank = list(range(1, 21))
mar_bitrigram = (
    pd.DataFrame({'Rank': rank})
    .assign(Bigram=mar_bigram_list, Trigram=mar_trigram_list)
    .set_index('Rank')
)

#### April

In [17]:
# Flatten every April tweet into a single token list.
apr_t_tokens = []
for tweet in apr_tweets:
    text = tweet['text'].replace('&amp;', '&')   # decode the escaped ampersand first
    for _quote in ('”', '\'', '’', '“'):          # then drop straight and curly quote marks
        text = text.replace(_quote, '')
    toks = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    apr_t_tokens += toks

# NOTE(review): n-grams are built over the concatenated stream, so presumably
# some of them span two consecutive tweets — confirm this is intended.
apr_t_bigrams = get_ngram_tokens(apr_t_tokens, 2)
apr_t_trigrams = get_ngram_tokens(apr_t_tokens, 3)

# Frequency tables for single words, bigrams and trigrams.
apr_t_word_dist = Counter(apr_t_tokens)
apr_t_bigram_dist = Counter(apr_t_bigrams)
apr_t_trigram_dist = Counter(apr_t_trigrams)

In [18]:
# Twenty most frequent April bigrams as an (n-gram, count) table plus a plain list.
apr_top_20_bigrams = apr_t_bigram_dist.most_common(20)
apr_bigram_df = pd.DataFrame(apr_top_20_bigrams, columns=['Bigram', 'Freq'])
apr_bigram_list = apr_bigram_df['Bigram'].tolist()

# Same for trigrams.
apr_top_20_trigrams = apr_t_trigram_dist.most_common(20)
apr_trigram_df = pd.DataFrame(apr_top_20_trigrams, columns=['Trigram', 'Freq'])
apr_trigram_list = apr_trigram_df['Trigram'].tolist()

In [19]:
# Side-by-side table of the top April bigrams and trigrams, indexed by rank 1-20.
rank = list(range(1, 21))
apr_bitrigram = (
    pd.DataFrame({'Rank': rank})
    .assign(Bigram=apr_bigram_list, Trigram=apr_trigram_list)
    .set_index('Rank')
)

## Article distribution over time

In [20]:
# Word / bigram / trigram frequencies over the January articles (corpus1).
# Each index entry is also annotated in place with its text, tokens and counts.
jan_word_dist = Counter()
jan_bigram_dist = Counter()
jan_trigram_dist = Counter()

for article in corpus1:
    filename = article['Filename']
    # Read through a context manager so the handle is closed for every
    # article; the bare open(...).read() left one open file per article.
    with open('data/text1/{}'.format(filename)) as article_file:
        text = article_file.read()
    article['text'] = text

    jan_tokens = tokenize(text, lowercase=True, strip_chars=string.punctuation)

    article['tokens'] = jan_tokens
    article['token_cnt'] = len(jan_tokens)       # running-word count
    article['type_cnt'] = len(set(jan_tokens))   # distinct-word count

    # n-grams are computed per article, so none spans two articles.
    jan_bigrams = get_ngram_tokens(jan_tokens, 2)
    jan_trigrams = get_ngram_tokens(jan_tokens, 3)

    jan_word_dist.update(jan_tokens)
    jan_bigram_dist.update(jan_bigrams)
    jan_trigram_dist.update(jan_trigrams)

In [21]:
# Word / bigram / trigram frequencies over the February articles (corpus2).
# Each index entry is also annotated in place with its text, tokens and counts;
# feb_all_tokens additionally collects the tokens of all February articles.
feb_word_dist = Counter()
feb_bigram_dist = Counter()
feb_trigram_dist = Counter()
feb_all_tokens = []
for article in corpus2:
    filename = article['Filename']
    # Read through a context manager so the handle is closed for every
    # article; the bare open(...).read() left one open file per article.
    with open('data/text2/{}'.format(filename)) as article_file:
        text = article_file.read()
    article['text'] = text

    feb_tokens = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    feb_all_tokens.extend(feb_tokens)

    article['tokens'] = feb_tokens
    article['token_cnt'] = len(feb_tokens)       # running-word count
    article['type_cnt'] = len(set(feb_tokens))   # distinct-word count

    # n-grams are computed per article, so none spans two articles.
    feb_bigrams = get_ngram_tokens(feb_tokens, 2)
    feb_trigrams = get_ngram_tokens(feb_tokens, 3)

    feb_word_dist.update(feb_tokens)
    feb_bigram_dist.update(feb_bigrams)
    feb_trigram_dist.update(feb_trigrams)

In [22]:
# Word / bigram / trigram frequencies over the March articles (corpus3).
# Each index entry is also annotated in place with its text, tokens and counts.
mar_word_dist = Counter()
mar_bigram_dist = Counter()
mar_trigram_dist = Counter()

for article in corpus3:
    filename = article['Filename']
    # Read through a context manager so the handle is closed for every
    # article; the bare open(...).read() left one open file per article.
    with open('data/text3/{}'.format(filename)) as article_file:
        text = article_file.read()
    article['text'] = text

    mar_tokens = tokenize(text, lowercase=True, strip_chars=string.punctuation)

    article['tokens'] = mar_tokens
    article['token_cnt'] = len(mar_tokens)       # running-word count
    article['type_cnt'] = len(set(mar_tokens))   # distinct-word count

    # n-grams are computed per article, so none spans two articles.
    mar_bigrams = get_ngram_tokens(mar_tokens, 2)
    mar_trigrams = get_ngram_tokens(mar_tokens, 3)

    mar_word_dist.update(mar_tokens)
    mar_bigram_dist.update(mar_bigrams)
    mar_trigram_dist.update(mar_trigrams)

In [23]:
# Word / bigram / trigram frequencies over the April articles (corpus4).
# Each index entry is also annotated in place with its text, tokens and counts.
apr_word_dist = Counter()
apr_bigram_dist = Counter()
apr_trigram_dist = Counter()

for article in corpus4:
    filename = article['Filename']
    # Read through a context manager so the handle is closed for every
    # article; the bare open(...).read() left one open file per article.
    with open('data/text4/{}'.format(filename)) as article_file:
        text = article_file.read()
    article['text'] = text

    apr_tokens = tokenize(text, lowercase=True, strip_chars=string.punctuation)

    article['tokens'] = apr_tokens
    article['token_cnt'] = len(apr_tokens)       # running-word count
    article['type_cnt'] = len(set(apr_tokens))   # distinct-word count

    # n-grams are computed per article, so none spans two articles.
    apr_bigrams = get_ngram_tokens(apr_tokens, 2)
    apr_trigrams = get_ngram_tokens(apr_tokens, 3)

    apr_word_dist.update(apr_tokens)
    apr_bigram_dist.update(apr_bigrams)
    apr_trigram_dist.update(apr_trigrams)

## Recent tweets

In [24]:
# "Recent" period = March + April tweets combined.
# DictListUpdate comes from functions.ipynb; presumably it merges the two
# tweet lists — TODO confirm whether it de-duplicates or mutates its inputs.
recent_tweets = DictListUpdate(mar_tweets,apr_tweets)

In [25]:
# Flatten the March-April tweets into a single token list.
recent_tweets_tokens = []
for tweet in recent_tweets:
    text = tweet['text'].replace('&amp;', '&')   # decode the escaped ampersand first
    for _quote in ('”', '\'', '’', '“'):          # then drop straight and curly quote marks
        text = text.replace(_quote, '')
    toks = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    recent_tweets_tokens += toks

In [26]:
# Remove English stopwords and the search-query terms from the recent tokens.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership instead of scanning a list

# The original aliased recent_tweets_tokens and called .remove() once per hit,
# which is quadratic in the number of tokens. A single filtering pass gives the
# same result; the slice assignment keeps recent_tweets_tokens updated in place
# exactly as the aliasing did.
recent_tweets_tokens[:] = [tok for tok in recent_tweets_tokens if tok not in _removal_set]

# Links are dropped only from the stripped copy.
stripped_recenttweets_tokens = [tok for tok in recent_tweets_tokens if not tok.startswith('https')]
stripped_recenttweets_wfreq = Counter(stripped_recenttweets_tokens)

## Old tweets

In [27]:
# "Old" period = January + February tweets combined.
# DictListUpdate comes from functions.ipynb; presumably it merges the two
# tweet lists — TODO confirm whether it de-duplicates or mutates its inputs.
old_tweets = DictListUpdate(jan_tweets,feb_tweets)

In [28]:
# Flatten the January-February tweets into a single token list.
old_tweets_tokens = []
for tweet in old_tweets:
    text = tweet['text'].replace('&amp;', '&')   # decode the escaped ampersand first
    for _quote in ('”', '\'', '’', '“'):          # then drop straight and curly quote marks
        text = text.replace(_quote, '')
    toks = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    old_tweets_tokens += toks

In [29]:
# Remove English stopwords and the search-query terms from the old tokens.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership instead of scanning a list

# The original aliased old_tweets_tokens and called .remove() once per hit,
# which is quadratic in the number of tokens. A single filtering pass gives the
# same result; the slice assignment keeps old_tweets_tokens updated in place
# exactly as the aliasing did.
old_tweets_tokens[:] = [tok for tok in old_tweets_tokens if tok not in _removal_set]

# Links are dropped only from the stripped copy.
stripped_oldtweets_tokens = [tok for tok in old_tweets_tokens if not tok.startswith('https')]
stripped_oldtweets_wfreq = Counter(stripped_oldtweets_tokens)

## Keyness Analysis

In [30]:
# Corpus sizes (token counts after stripping) and the 30 most frequent words
# of each period, as inputs for the keyness comparison.
old_size, recent_size = len(stripped_oldtweets_tokens), len(stripped_recenttweets_tokens)
top_old, top_recent = [
    freq.most_common(30)
    for freq in (stripped_oldtweets_wfreq, stripped_recenttweets_wfreq)
]

## Trump and Pompeo's Tweets Visualization

In [31]:
# Tweets per day for the whole-period set. The key is the first 10 characters
# of each 'date' string (presumably YYYY-MM-DD — confirm against the data).
d = Counter()
for _tw in all_time:
    d[_tw['date'][:10]] += 1

dftweets_raw = pd.DataFrame.from_dict(d, orient='index').reset_index()
dftweets_cleaned = dftweets_raw.rename(columns={"index": "date", 0: "count"})
dftweets = dftweets_cleaned.sort_values('date')

In [32]:
# Articles per 'Date' value across the whole-period article index.
d = Counter()
for _art in corp_all:
    d[_art['Date']] += 1

dflexis_raw = pd.DataFrame.from_dict(d, orient='index').reset_index()
dflexis_cleaned = dflexis_raw.rename(columns={"index": "date", 0: "count"})
dflexis = dflexis_cleaned.sort_values('date')

In [33]:
# Tweets per day in trump_tweets. The key is the first 10 characters of each
# 'date' string (presumably YYYY-MM-DD — confirm against the data).
d = Counter()
for _tw in trump_tweets:
    d[_tw['date'][:10]] += 1

dftrump_raw = pd.DataFrame.from_dict(d, orient='index').reset_index()
dftrump_cleaned = dftrump_raw.rename(columns={"index": "date", 0: "count"})
dftrump = dftrump_cleaned.sort_values('date')

In [34]:
# Tweets per day in pompeo_tweets. The key is the first 10 characters of each
# 'date' string (presumably YYYY-MM-DD — confirm against the data).
d = Counter()
for _tw in pompeo_tweets:
    d[_tw['date'][:10]] += 1

dfpompeo_raw = pd.DataFrame.from_dict(d, orient='index').reset_index()
dfpompeo_cleaned = dfpompeo_raw.rename(columns={"index": "date", 0: "count"})
dfpompeo = dfpompeo_cleaned.sort_values('date')

In [35]:
# Tweets per day in racist_tweets. The key is the first 10 characters of each
# 'date' string (presumably YYYY-MM-DD — confirm against the data).
d = Counter()
for _tw in racist_tweets:
    d[_tw['date'][:10]] += 1

dfracist_tweets_raw = pd.DataFrame.from_dict(d, orient='index').reset_index()
dfracist_tweets_cleaned = dfracist_tweets_raw.rename(columns={"index": "date", 0: "count"})
dfracist_tweets = dfracist_tweets_cleaned.sort_values('date')

## Silent Tweets vs. Discussion Creators

In [36]:
# "Silent" tweets: those whose 'replies' count is exactly zero.
silent_tweets = [tweet for tweet in all_time if tweet['replies'] == 0]

In [37]:
# Tokens of the silent tweets with no character stripping (strip_chars='')
# and no quote/ampersand clean-up; tokenize is still asked to lowercase.
raw_silent_tokens = [
    tok
    for tweet in silent_tweets
    for tok in tokenize(tweet['text'], lowercase=True, strip_chars='')
]

In [38]:
# Flatten the silent tweets into a single cleaned token list.
silent_tokens = []
for tweet in silent_tweets:
    text = tweet['text'].replace('&amp;', '&')   # decode the escaped ampersand first
    for _quote in ('”', '\'', '’', '“'):          # then drop straight and curly quote marks
        text = text.replace(_quote, '')
    toks = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    silent_tokens += toks

In [39]:
# Remove English stopwords and the search-query terms from the silent-tweet tokens.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership instead of scanning a list

# The original aliased silent_tokens and called .remove() once per hit, which
# is quadratic in the number of tokens. A single filtering pass gives the same
# result; the slice assignment keeps silent_tokens updated in place exactly as
# the aliasing did.
silent_tokens[:] = [tok for tok in silent_tokens if tok not in _removal_set]

# Links are dropped only from the stripped copy.
stripped_silent_tokens = [tok for tok in silent_tokens if not tok.startswith('https')]
stripped_silent_wfreq = Counter(stripped_silent_tokens)

In [40]:
# "Discussion creators": tweets that received at least one reply.
discussion_creators = [tweet for tweet in all_time if tweet['replies'] > 0]

In [41]:
# Tokens of the replied-to tweets with no character stripping (strip_chars='')
# and no quote/ampersand clean-up; tokenize is still asked to lowercase.
raw_discussion_tokens = [
    tok
    for tweet in discussion_creators
    for tok in tokenize(tweet['text'], lowercase=True, strip_chars='')
]

In [42]:
# Flatten the replied-to tweets into a single cleaned token list.
discussion_tokens = []
for tweet in discussion_creators:
    text = tweet['text'].replace('&amp;', '&')   # decode the escaped ampersand first
    for _quote in ('”', '\'', '’', '“'):          # then drop straight and curly quote marks
        text = text.replace(_quote, '')
    toks = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    discussion_tokens += toks

In [43]:
# Remove English stopwords and the search-query terms from the replied-to tweet tokens.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership instead of scanning a list

# The original aliased discussion_tokens and called .remove() once per hit,
# which is quadratic in the number of tokens. A single filtering pass gives the
# same result; the slice assignment keeps discussion_tokens updated in place
# exactly as the aliasing did.
discussion_tokens[:] = [tok for tok in discussion_tokens if tok not in _removal_set]

# Links are dropped only from the stripped copy.
stripped_discussion_tokens = [tok for tok in discussion_tokens if not tok.startswith('https')]
stripped_discussion_wfreq = Counter(stripped_discussion_tokens)

## Shared vocab and word freq. visualization

In [44]:
# Word / bigram / trigram frequencies over every article in the whole-period
# index; all_tokens collects the tokens of all articles in index order.
all_word_dist = Counter()
all_bigram_dist = Counter()
all_trigram_dist = Counter()

all_tokens = []
for article in corp_all:
    filename = article['Filename']
    # Read through a context manager so the handle is closed for every
    # article; the bare open(...).read() left one open file per article.
    with open('data/text_all/{}'.format(filename)) as article_file:
        text = article_file.read()
    tokens = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    all_tokens.extend(tokens)

    article['token_cnt'] = len(tokens)       # running-word count
    article['type_cnt'] = len(set(tokens))   # distinct-word count

    # n-grams are computed per article, so none spans two articles.
    bigrams = get_ngram_tokens(tokens, 2)
    trigrams = get_ngram_tokens(tokens, 3)

    all_word_dist.update(tokens)
    all_bigram_dist.update(bigrams)
    all_trigram_dist.update(trigrams)

In [45]:
# Chunk 1 of 10 of the article tokens; each chunk is 145449 tokens wide.
s_all_tokens1 = all_tokens[0 * 145449:1 * 145449]

In [46]:
# Chunk 2 of 10 of the article tokens; each chunk is 145449 tokens wide.
s_all_tokens2 = all_tokens[1 * 145449:2 * 145449]

In [47]:
# Chunk 3 of 10 of the article tokens; each chunk is 145449 tokens wide.
s_all_tokens3 = all_tokens[2 * 145449:3 * 145449]

In [48]:
# Chunk 4 of 10 of the article tokens; each chunk is 145449 tokens wide.
s_all_tokens4 = all_tokens[3 * 145449:4 * 145449]

In [49]:
# Chunk 5 of 10 of the article tokens; each chunk is 145449 tokens wide.
s_all_tokens5 = all_tokens[4 * 145449:5 * 145449]

In [50]:
# Chunk 6 of 10 of the article tokens; each chunk is 145449 tokens wide.
s_all_tokens6 = all_tokens[5 * 145449:6 * 145449]

In [51]:
# Chunk 7 of 10 of the article tokens; each chunk is 145449 tokens wide.
s_all_tokens7 = all_tokens[6 * 145449:7 * 145449]

In [52]:
# Chunk 8 of 10 of the article tokens; each chunk is 145449 tokens wide.
s_all_tokens8 = all_tokens[7 * 145449:8 * 145449]

In [53]:
# Chunk 9 of 10 of the article tokens; each chunk is 145449 tokens wide.
s_all_tokens9 = all_tokens[8 * 145449:9 * 145449]

In [54]:
# Final chunk: everything from token 9 * 145449 to the end of the list.
s_all_tokens10 = all_tokens[9 * 145449:]

In [55]:
# Search terms to strip alongside the stopwords. Several entries are repeated;
# the repeats are kept so the list contents stay exactly as before.
queries = [
    'asianamerican', 'asian', 'american',
    'racism', 'racist', 'xenophobia', 'racism', 'racist', 'xenophobia',
    'coronavirus', 'corona virus', 'covid19', 'covid 19', 'pandemic', 'virus', "chinese virus", "china virus",
    'coronavirus', 'covid19', 'pandemic', 'chinavirus', 'chinesevirus',
]

In [56]:
# Strip English stopwords and the search-query terms from article chunk 1.
# A filtered copy is built instead of calling .remove() on the list being
# iterated: each removal shifts the remaining items left, so the loop skipped
# the token after every removal and left stopwords behind.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership per token
s_all_tokens1 = [article_word for article_word in s_all_tokens1
                 if article_word not in _removal_set]

In [57]:
# Strip English stopwords and the search-query terms from article chunk 2.
# A filtered copy is built instead of calling .remove() on the list being
# iterated: each removal shifts the remaining items left, so the loop skipped
# the token after every removal and left stopwords behind.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership per token
s_all_tokens2 = [article_word for article_word in s_all_tokens2
                 if article_word not in _removal_set]

In [58]:
# Strip English stopwords and the search-query terms from article chunk 3.
# A filtered copy is built instead of calling .remove() on the list being
# iterated: each removal shifts the remaining items left, so the loop skipped
# the token after every removal and left stopwords behind.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership per token
s_all_tokens3 = [article_word for article_word in s_all_tokens3
                 if article_word not in _removal_set]

In [59]:
# Strip English stopwords and the search-query terms from article chunk 4.
# A filtered copy is built instead of calling .remove() on the list being
# iterated: each removal shifts the remaining items left, so the loop skipped
# the token after every removal and left stopwords behind.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership per token
s_all_tokens4 = [article_word for article_word in s_all_tokens4
                 if article_word not in _removal_set]

In [60]:
# Strip English stopwords and the search-query terms from article chunk 5.
# A filtered copy is built instead of calling .remove() on the list being
# iterated: each removal shifts the remaining items left, so the loop skipped
# the token after every removal and left stopwords behind.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership per token
s_all_tokens5 = [article_word for article_word in s_all_tokens5
                 if article_word not in _removal_set]

In [61]:
# Strip English stopwords and the search-query terms from article chunk 6.
# A filtered copy is built instead of calling .remove() on the list being
# iterated: each removal shifts the remaining items left, so the loop skipped
# the token after every removal and left stopwords behind.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership per token
s_all_tokens6 = [article_word for article_word in s_all_tokens6
                 if article_word not in _removal_set]

In [62]:
# Strip English stopwords and the search-query terms from article chunk 7.
# A filtered copy is built instead of calling .remove() on the list being
# iterated: each removal shifts the remaining items left, so the loop skipped
# the token after every removal and left stopwords behind.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership per token
s_all_tokens7 = [article_word for article_word in s_all_tokens7
                 if article_word not in _removal_set]

In [63]:
# Strip English stopwords and the search-query terms from article chunk 8.
# A filtered copy is built instead of calling .remove() on the list being
# iterated: each removal shifts the remaining items left, so the loop skipped
# the token after every removal and left stopwords behind.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership per token
s_all_tokens8 = [article_word for article_word in s_all_tokens8
                 if article_word not in _removal_set]

In [64]:
# Strip English stopwords and the search-query terms from article chunk 9.
# A filtered copy is built instead of calling .remove() on the list being
# iterated: each removal shifts the remaining items left, so the loop skipped
# the token after every removal and left stopwords behind.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership per token
s_all_tokens9 = [article_word for article_word in s_all_tokens9
                 if article_word not in _removal_set]

In [65]:
# Strip English stopwords and the search-query terms from article chunk 10.
# A filtered copy is built instead of calling .remove() on the list being
# iterated: each removal shifts the remaining items left, so the loop skipped
# the token after every removal and left stopwords behind.
words_to_remove = stopwords.words('english') + queries
_removal_set = set(words_to_remove)  # constant-time membership per token
s_all_tokens10 = [article_word for article_word in s_all_tokens10
                  if article_word not in _removal_set]

In [66]:
# Stitch the ten processed chunks back together in their original order.
s_all_tokens = []
for _chunk in (s_all_tokens1, s_all_tokens2, s_all_tokens3, s_all_tokens4, s_all_tokens5,
               s_all_tokens6, s_all_tokens7, s_all_tokens8, s_all_tokens9, s_all_tokens10):
    s_all_tokens.extend(_chunk)

In [67]:
# Word frequencies of the stripped article tokens.
s_all_tokens_dist = Counter()
s_all_tokens_dist.update(s_all_tokens)

In [68]:
# Words found in both tweets and articles, as (word, tweet count, article count).
# A word is skipped when its article count is missing or zero.
all_shared_items = []
for _word, _tweet_count in stripped_tweets_wfreq.items():
    _article_count = s_all_tokens_dist.get(_word)
    if _article_count:
        all_shared_items.append((_word, _tweet_count, _article_count))

In [69]:
# Total token counts of each stripped corpus, used as percentage denominators.
all_tweet_num_tokens, all_article_num_tokens = [
    sum(dist.values()) for dist in (stripped_tweets_wfreq, s_all_tokens_dist)
]

In [70]:
# Order shared words by combined tweet + article count, largest first (in place).
all_shared_items.sort(key=lambda shared: shared[1] + shared[2], reverse=True)

# Table of shared words with raw counts and each count as a percentage of its
# corpus' total tokens.
cdf = pd.DataFrame(
    all_shared_items,
    columns=['word', 'Number of occurrences in Tweets', 'Number of occurrences in articles'],
).assign(
    Tweets_percent=lambda frame: frame['Number of occurrences in Tweets'] / all_tweet_num_tokens * 100,
    Articles_percent=lambda frame: frame['Number of occurrences in articles'] / all_article_num_tokens * 100,
)

## "Community" and "Health" Collocation

In [71]:
# Collocates of 'community' in the article tokens, with win=[5,5].
# NOTE(review): this uses all_tokens (not the stopword-stripped s_all_tokens),
# while the tweet counterpart uses stripped tokens — confirm this is intended.
article_community_colls = Counter(collocates(all_tokens, 'community', win=[5, 5]))

In [72]:
# Collocates of 'community' in the stripped tweet tokens, with win=[5,5].
tweet_community_colls = Counter(collocates(stripped_tweets_tokens, 'community', win=[5, 5]))

In [73]:
# Collocates of 'health' in the article tokens, with win=[5,5].
# NOTE(review): this uses all_tokens (not the stopword-stripped s_all_tokens),
# while the tweet counterpart uses stripped tokens — confirm this is intended.
article_health_colls = Counter(collocates(all_tokens, 'health', win=[5, 5]))

In [74]:
# Collocates of 'health' in the stripped tweet tokens, with win=[5,5].
tweet_health_colls = Counter(collocates(stripped_tweets_tokens, 'health', win=[5, 5]))

## Sentiment Analyses

In [75]:
# Single VADER analyzer instance (nltk.sentiment.vader) reused for both the
# tweet and the article scoring below.
# NOTE(review): presumably needs the NLTK 'vader_lexicon' resource installed — confirm.
sid = SentimentIntensityAnalyzer()

**Tweets**


In [76]:
# Score every tweet and keep its text and date next to the polarity scores.
tweet_sid_scores = []
for tweet in all_time:
    scores = sid.polarity_scores(tweet['text'])
    scores.update(text=tweet['text'], date=tweet['date'])
    tweet_sid_scores.append(scores)

In [77]:
# Group the tweets' compound scores by day (first 10 characters of 'date').
by_monthdate = {}
for tweet in tweet_sid_scores:
    ymd = tweet['date'][:10]
    # setdefault creates the day's list on first sight. The previous bare
    # `except:` did the same job but would also have swallowed any unrelated
    # error raised inside the try block.
    by_monthdate.setdefault(ymd, []).append(tweet['compound'])

In [78]:
# One record per day holding the mean compound score of that day's tweets.
data_tweets = []
for _day, _compounds in by_monthdate.items():
    data_tweets.append({'date': _day, 'avg_sent': sum(_compounds) / len(_compounds)})

**Articles**

In [79]:
# Score every article and keep its text and date next to the polarity scores.
# Each index entry in corp_all also gets its full text stored under 'text'.
article_sid_scores = []

for article in corp_all:
    filename = article['Filename']
    # Read through a context manager so the handle is closed for every
    # article; the bare open(...).read() left one open file per article.
    with open('data/text_all/{}'.format(filename)) as article_file:
        text = article_file.read()
    article['text'] = text
    scores = sid.polarity_scores(text)
    scores['text'] = text
    scores['date'] = article['Date']
    article_sid_scores.append(scores)

In [80]:
# Group the articles' compound scores by their 'date' value.
a_by_monthdate = {}
for article in article_sid_scores:
    ymd = article['date']
    # setdefault creates the date's list on first sight. The previous bare
    # `except:` did the same job but would also have swallowed any unrelated
    # error raised inside the try block.
    a_by_monthdate.setdefault(ymd, []).append(article['compound'])

In [81]:
# One record per date holding the mean compound score of that date's articles.
data_articles = []
for _day, _compounds in a_by_monthdate.items():
    data_articles.append({'date': _day, 'avg_sent': sum(_compounds) / len(_compounds)})

In [82]:
# Daily average sentiment as DataFrames: t for tweets, a for articles.
t, a = pd.DataFrame(data_tweets), pd.DataFrame(data_articles)

## Racist Tweets

In [83]:
# Tokens of the racist_tweets set with original casing (lowercase=False),
# no character stripping (strip_chars='') and no quote/ampersand clean-up.
raw_racist_tokens = [
    tok
    for tweet in racist_tweets
    for tok in tokenize(tweet['text'], lowercase=False, strip_chars='')
]

In [84]:
# Flatten every tweet in the racist_tweets set into a single token list.
racist_tokens = []
for tweet in racist_tweets:
    text = tweet['text'].replace('&amp;', '&')   # decode the escaped ampersand first
    for _quote in ('”', '\'', '’', '“'):          # then drop straight and curly quote marks
        text = text.replace(_quote, '')
    toks = tokenize(text, lowercase=True, strip_chars=string.punctuation)
    racist_tokens += toks

# NOTE(review): n-grams are built over the concatenated stream, so presumably
# some of them span two consecutive tweets — confirm this is intended.
racist_bigrams = get_ngram_tokens(racist_tokens, 2)
racist_trigrams = get_ngram_tokens(racist_tokens, 3)

# Frequency tables for single words, bigrams and trigrams.
racist_word_dist = Counter(racist_tokens)
racist_bigram_dist = Counter(racist_bigrams)
racist_trigram_dist = Counter(racist_trigrams)

In [85]:
# Search terms used to collect the racist_tweets set; removed together with the
# English stopwords so they do not dominate the frequency tables.
# NOTE(review): the multi-word entries (e.g. "kung flu") can only match if
# tokenize() ever yields tokens containing a space — confirm.
racist_queries = ["ching chong",'ching','chong', 'chink', 'chingchong', "kung flu",'kung','fu', "kung fu flu", "ching chong virus",'coronavirus', 'corona virus', 'covid19', 'covid 19']
words_to_remove = stopwords.words('english') + racist_queries
_removal_set = set(words_to_remove)  # constant-time membership instead of scanning a list

# The original aliased racist_tokens and called .remove() once per hit, which
# is quadratic in the number of tokens. A single filtering pass gives the same
# result; the slice assignment keeps racist_tokens updated in place exactly as
# the aliasing did, because later code reads racist_tokens after this step.
racist_tokens[:] = [tok for tok in racist_tokens if tok not in _removal_set]

# Links are dropped only from the stripped copy, not from racist_tokens.
s_racist_tweets_tokens = [tok for tok in racist_tokens if not tok.startswith('https')]
racist_tweets_wfreq = Counter(s_racist_tweets_tokens)

In [86]:
# Bigrams and trigrams over the stripped racist-tweet tokens, then their counts.
s_racist_bigrams = get_ngram_tokens(s_racist_tweets_tokens, 2)
s_racist_trigrams = get_ngram_tokens(s_racist_tweets_tokens, 3)

s_racist_bigrams_dist = Counter()
s_racist_bigrams_dist.update(s_racist_bigrams)
s_racist_trigrams_dist = Counter()
s_racist_trigrams_dist.update(s_racist_trigrams)

In [87]:
# Collocates of 'funny' in racist_tokens, with win=[5,5].
tweet_funny_colls = Counter(collocates(racist_tokens, 'funny', win=[5, 5]))