The goal of this script is to create a few charts and visualizations analyzing our tweets

In [16]:
import numpy as np
import pandas as pd
import os
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

In [5]:
DATA_DIR = "../../data/"

In [6]:
tweets = pd.read_csv(DATA_DIR + "tweet_scrape_2020_11_25.csv")

### Preprocessing text

Use Porter stemmer + lemmatization: https://www.tutorialspoint.com/python_data_science/python_stemming_and_lemmatization.htm

Stemming: 

In [19]:
porter_stemmer = PorterStemmer()

In [24]:
tweets["stemmed_text"] = [porter_stemmer.stem(text) for text in tweets["text"]]

Lemmatization

In [25]:
wordnet_lemmatizer = WordNetLemmatizer()

In [27]:
tweets["stemmed_lemmatized_text"] = [wordnet_lemmatizer.lemmatize(text) for text in tweets["stemmed_text"]]

In [28]:
tweets["stemmed_lemmatized_text"][0]

'as a teacher i urge people to read this thread'

### Let's separate our tweets by depressed/anxious vs. not, and see if we can do some preliminary analysis seeing if there's a difference in the tweets

In [73]:
depressed_count = 0

In [74]:
depression_words_list = ["depression", "dejected", "desperate", "sadness", "anxiety", "unhappy", "gloomy", "hopeless", "worry", "woeful", "upset", "sorry", "misery"]

In [75]:
depression_words_list = [porter_stemmer.stem(text) for text in depression_words_list]

In [76]:
depression_words_list = [wordnet_lemmatizer.lemmatize(text) for text in depression_words_list]

In [77]:
depression_words_list

['depress',
 'deject',
 'desper',
 'sad',
 'anxieti',
 'unhappi',
 'gloomi',
 'hopeless',
 'worri',
 'woeful',
 'upset',
 'sorri',
 'miseri']

In [78]:
bool_arr_has_depressed_word = []

In [79]:
bool_word_depressed = False

In [80]:
for tweet in tweets["stemmed_lemmatized_text"]:
    for word in depression_words_list:
        if word in tweet:
            depressed_count = depressed_count + 1
            bool_arr_has_depressed_word.append(1)
            bool_word_depressed=True
            break
    if bool_word_depressed:
        bool_word_depressed=False
        continue
    bool_arr_has_depressed_word.append(0)

In [81]:
sum(bool_arr_has_depressed_word)

154

In [82]:
tweets["is_depressed"] = bool_arr_has_depressed_word

In [86]:
tweets[tweets["is_depressed"]==1][0:50]

Unnamed: 0,created_at,text,tweet_id,user_screen_name,user_name,user_id,user_followers_count,user_following_count,user_statuses_count,user_likes_given_count,...,tweet_retweet_count,tweet_favorite_count,tweet_reply_count,tweet_hashtags,tweet_urls,tweet_media,tokenized_text,stemmed_text,stemmed_lemmatized_text,is_depressed
25,2020-11-25 11:33:19+00:00,PMC hasn't announced policy\nfor the MDCAT asp...,1331561600520769541,Haidershaw1,Haider,985076786974212097,14,787,180,198,...,0,0,0,[],['https://twitter.com/i/web/status/13315616005...,,"[PMC, hasn't, announced, policy\nfor, the, MDC...",pmc hasn't announced policy\nfor the mdcat asp...,pmc hasn't announced policy\nfor the mdcat asp...,1
152,2020-11-25 11:34:08+00:00,IF SADIQ KHAN IS REMOVED FROM OFFICE MOST OF L...,1331561806800834562,jjcwow,Liverbird #,69732035,7597,7419,340657,205535,...,0,0,0,[],['https://twitter.com/i/web/status/13315618068...,,"[IF, SADIQ, KHAN, IS, REMOVED, FROM, OFFICE, M...",if sadiq khan is removed from office most of l...,if sadiq khan is removed from office most of l...,1
194,2020-11-25 11:34:23+00:00,This makes me really sad for what’s to come.,1331561871061684225,windsorjlj,windsorjlj,1283289798,1153,1283,5732,22037,...,0,0,0,[],[],,"[This, makes, me, really, sad, for, what’s, to...",this makes me really sad for what’s to come.,this makes me really sad for what’s to come.,1
230,2020-11-25 11:34:38+00:00,#DelayMDCAT2020 we are so many students reques...,1331561933133148160,KhanSah78768248,Khan Sahib,1319641201654587393,2,2,785,5,...,0,0,0,['DelayMDCAT2020'],['https://twitter.com/i/web/status/13315619331...,,"[#DelayMDCAT2020, we, are, so, many, students,...",#delaymdcat2020 we are so many students reques...,#delaymdcat2020 we are so many students reques...,1
267,2020-11-25 11:34:54+00:00,"Be on an island in the middle of the ocean, ha...",1331562000720162817,JoshBoeke,Josh Boeke,767027802,1081,518,79570,33647,...,0,0,0,[],['https://twitter.com/i/web/status/13315620007...,,"[Be, on, an, island, in, the, middle, of, the,...","be on an island in the middle of the ocean, ha...","be on an island in the middle of the ocean, ha...",1
284,2020-11-25 11:35:00+00:00,I'm really struggling myself today with lockdo...,1331562026435551232,lisaathey,Lisa,19775751,208,496,2808,2888,...,0,0,0,[],['https://twitter.com/i/web/status/13315620264...,,"[I'm, really, struggling, myself, today, with,...",i'm really struggling myself today with lockdo...,i'm really struggling myself today with lockdo...,1
290,2020-11-25 11:35:03+00:00,"Me, so happy that Peter won #GBBO but wonderin...",1331562037206523905,moniquevictoria,Monique Victoria,45181305,2132,875,6892,2892,...,0,0,0,['GBBO'],['https://twitter.com/i/web/status/13315620372...,,"[Me,, so, happy, that, Peter, won, #GBBO, but,...","me, so happy that peter won #gbbo but wonderin...","me, so happy that peter won #gbbo but wonderin...",1
468,2020-11-25 11:36:30+00:00,Fuck #Cormann. Fuck #ScottyFomMarketing fuck h...,1331562402186293250,MyArgue,ArgueMyAss,966340681856724992,1336,3166,7617,2381,...,0,0,0,"['Cormann', 'ScottyFomMarketing', 'LNPCrimeFam...",['https://twitter.com/i/web/status/13315624021...,,"[Fuck, #Cormann., Fuck, #ScottyFomMarketing, f...",fuck #cormann. fuck #scottyfommarketing fuck h...,fuck #cormann. fuck #scottyfommarketing fuck h...,1
499,2020-11-25 11:36:42+00:00,@Asad_Umar @Shafqat_Mahmood kindly think about it,1331562450987196418,AwaisQureshi786,Muhammad Awais Qureshi.,2892019512,34,80,343,97,...,0,0,0,[],[],,"[@Asad_Umar, @Shafqat_Mahmood, kindly, think, ...",@asad_umar @shafqat_mahmood kindly think about it,@asad_umar @shafqat_mahmood kindly think about it,1
510,2020-11-25 11:36:47+00:00,So I optimistically thought I would avoid a lo...,1331562473493815297,skbarrone,Sarah Barron,200457310,660,409,12605,12586,...,0,0,0,[],['https://twitter.com/i/web/status/13315624734...,,"[So, I, optimistically, thought, I, would, avo...",so i optimistically thought i would avoid a lo...,so i optimistically thought i would avoid a lo...,1


In [87]:
depressive_wc = WordCloud(width = 512,height = 512, collocations=False, colormap="Blues").generate(tweets[tweets["is_depressed"]==1])
plt.figure(figsize = (10, 8), facecolor = 'k')
plt.imshow(depressive_wc)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()

NameError: name 'WordCloud' is not defined

Tokenize text

In [10]:
tweets["tokenized_text"] = [tweet.split(sep=' ') for tweet in tweets["text"]]

Remove stopwords

False

In [None]:
num