# Finding the most frequent words in reddit posts for depression and anxiety 

In [119]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [120]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('tagsets')

[nltk_data] Downloading package punkt to /Users/miumiu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/miumiu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/miumiu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to /Users/miumiu/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

### For anxiety

In [121]:
data_anx = pd.read_csv("anxiety_reddit.csv")

In [122]:
# I will only leave the post and author in the data

data_anx = data_anx[["post", "author"]]

In [123]:
# Lengths of posts

data_anx["length"] = list(map(lambda x: len(str(x).split()), data_anx["post"]))

In [124]:
# All posts together

reviews_anx = [""]
corpus_anx=[]
for review in data_anx["post"]:
    reviews_anx.append(review)
    corpus_anx.append(nltk.sent_tokenize(review))

In [125]:
# Flattening the list

corpus_anx=[sent for sublist in corpus_anx for sent in sublist]

In [126]:
# How many sentences in corpus

len(corpus_anx)

93251

In [127]:
# Cleaning the sentences

for i in range(len(corpus_anx)):
    corpus_anx[i] = corpus_anx[i].lower()
    corpus_anx[i] = re.sub(r'\W+',' ',corpus_anx[i]) # Replace everything non-alpahnumeric by ' '
    corpus_anx[i] = re.sub(r'\s+',' ',corpus_anx[i]) # Replace one or more whitespaces by  ' '
    corpus_anx[i] = re.sub(r'\d+',' ',corpus_anx[i]) # Replace one or more digits by  ' '
    corpus_anx[i] = re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)'," ", corpus_anx[i]) # Replace e-mails by ' '
    # Replace urls by ''
    corpus_anx[i] = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', ' ' , corpus_anx[i]) 
    # Replace html tags by ''
    corpus_anx[i] = BeautifulSoup(corpus_anx[i], 'html.parser').get_text().strip()

In [128]:
# Just checking one example

print(corpus_anx[45])

i also want to make an appointment to begin therapy as soon as i get some extra money


In [129]:
# To get words and their frequences

wordfreq_anx = {}
for sentence in corpus_anx:
    words_anx = sentence.split()
    #tokens = nltk.word_tokenize(sentence) # To get the words, it can be also done with sentence.split()
    for word in words_anx:
        if ( word not in wordfreq_anx.keys() ): ## first time appearnce in the sentence
            wordfreq_anx[word] = 1 # We initialize the corresponding counter
        else: ## if the world is already existed in the dictionalry 
            wordfreq_anx[word] += 1 # We increase the corresponding counter

In [130]:
# How many words

len(list(wordfreq_anx.keys()))

22436

In [131]:
wordfreq_anx

{'does': 1775,
 'anyone': 3481,
 'else': 2245,
 'like': 10548,
 'taking': 1223,
 'long': 1505,
 'walks': 29,
 'while': 1461,
 'it': 28523,
 'snows': 2,
 'everything': 1758,
 'is': 14269,
 'quieter': 8,
 'meditation': 151,
 'making': 1006,
 'me': 18369,
 'anxious': 2795,
 'i': 114292,
 'do': 7585,
 'a': 34146,
 'couple': 679,
 'sessions': 66,
 'day': 3208,
 'on': 9762,
 'headspace': 14,
 'just': 10932,
 'opening': 55,
 'the': 36039,
 'app': 47,
 'gets': 546,
 'my': 33907,
 'heart': 1201,
 'racing': 266,
 'silence': 40,
 'can': 8690,
 't': 18113,
 'run': 274,
 'from': 4580,
 'worries': 127,
 'or': 8724,
 'drown': 22,
 'them': 2875,
 'out': 6761,
 's': 11133,
 'torture': 21,
 'and': 50459,
 'mind': 1241,
 'how': 5141,
 'until': 1078,
 'easier': 155,
 'm': 16534,
 'about': 8844,
 'two': 1121,
 'days': 1556,
 'in': 17588,
 'right': 1744,
 'now': 4851,
 'most': 1311,
 'dreaded': 11,
 'part': 711,
 'of': 24290,
 'rant': 130,
 'anxiety': 14202,
 'meds': 607,
 've': 7322,
 'been': 6650,
 'self'

In [132]:
# Checking stopwords

from nltk.corpus import stopwords

stop_words = list(stopwords.words('english')) 
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [134]:
# Cleaning stopwords

for i in range(len(stop_words)):
    stop_words[i] = re.sub(r"\s*'\s*\w*","",stop_words[i])

#stop_words = [word for word in list(np.unique(stop_words)) if len(word) > 1]
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'you',
 'you',
 'you',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more'

In [135]:
# Leaving stopwords out of the corpus

corpus_anx = [(wordfreq_anx[key],key) for key in list(wordfreq_anx.keys()) if key not in stop_words]

In [136]:
# Sorting

corpus_anx.sort(reverse = True)

In [139]:
# Making a dictionary with words and frequencies

words_anx = dict ((y, x) for x, y in corpus_anx)

In [140]:
words_anx

{'anxiety': 14202,
 'like': 10548,
 'feel': 8310,
 'get': 6952,
 'know': 6070,
 'time': 5333,
 'really': 5247,
 'even': 4266,
 'going': 4190,
 'want': 4134,
 'go': 3802,
 'people': 3754,
 'think': 3601,
 'would': 3570,
 'help': 3513,
 'anyone': 3481,
 'work': 3464,
 'life': 3432,
 'something': 3322,
 'one': 3317,
 'day': 3208,
 'things': 3154,
 'panic': 3148,
 'much': 2990,
 'feeling': 2853,
 'anxious': 2795,
 'back': 2755,
 'years': 2522,
 'bad': 2459,
 'always': 2432,
 'never': 2431,
 'job': 2368,
 'also': 2341,
 'got': 2305,
 'else': 2245,
 'need': 2201,
 'could': 2176,
 'started': 2169,
 'still': 2147,
 'make': 2137,
 'getting': 2090,
 'last': 2044,
 'since': 2034,
 'way': 2003,
 'anything': 2001,
 'take': 1988,
 'first': 1978,
 'every': 1955,
 'attack': 1922,
 'good': 1860,
 'friends': 1855,
 'someone': 1849,
 'lot': 1836,
 'school': 1823,
 'year': 1770,
 'everything': 1758,
 'see': 1749,
 'right': 1744,
 'felt': 1688,
 'well': 1639,
 'new': 1624,
 'sleep': 1611,
 'better': 1569,


In [141]:
# To check if word in in the words list and to get the frequency

x = input("enter the word: ")
if x in words_anx.keys():
    print(words_anx[x])

enter the word: feel
8310


### For depression

In [20]:
data_dep = pd.read_csv("depression_reddit.csv")

In [21]:
# I will only leave the post and author in the data

data_dep = data_dep[["post", "author"]]

In [22]:
# Lengths of posts

data_dep["length"] = list(map(lambda x: len(str(x).split()), data_dep["post"]))

In [23]:
# All posts together

reviews_dep = [""]
corpus_dep=[]
for review in data_dep["post"]:
    reviews_dep.append(review)
    corpus_dep.append(nltk.sent_tokenize(review))

In [24]:
# Flattening the list

corpus_dep=[sent for sublist in corpus_dep for sent in sublist]

In [149]:
len(corpus_dep)

35269

In [25]:
# Cleaning the sentences

for i in range(len(corpus_dep)):
    corpus_dep[i] = corpus_dep[i].lower()
    corpus_dep[i] = re.sub(r'\W+',' ',corpus_dep[i]) # Replace everything non-alpahnumeric by ' '
    corpus_dep[i] = re.sub(r'\s+',' ',corpus_dep[i]) # Replace one or more whitespaces by  ' '
    corpus_dep[i] = re.sub(r'\d+',' ',corpus_dep[i]) # Replace one or more digits by  ' '
    corpus_dep[i] = re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)'," ", corpus_dep[i]) # Replace e-mails by ' '
    # Replace urls by ''
    corpus_dep[i] = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', ' ' , corpus_dep[i]) 
    # Replace html tags by ''
    corpus_dep[i] = BeautifulSoup(corpus_dep[i], 'html.parser').get_text().strip()

In [26]:
# Checking an example

print(corpus_dep[5])

i really dont know what to do


In [27]:
# To get words and their frequences

wordfreq_dep = {}
for sentence in corpus_dep:
    words_dep = sentence.split()
    #tokens = nltk.word_tokenize(sentence) # To get the words, it can be also done with sentence.split()
    for word in words_dep:
        if ( word not in wordfreq_dep.keys() ): ## first time appearnce in the sentence
            wordfreq_dep[word] = 1 # We initialize the corresponding counter
        else: ## if the world is already existed in the dictionalry 
            wordfreq_dep[word] += 1 # We increase the corresponding counter

In [28]:
wordfreq_dep

{'school': 8134,
 'makes': 3887,
 'me': 64235,
 'suicidal': 2459,
 'please': 1901,
 'help': 10091,
 'im': 5185,
 'a': 98000,
 'year': 6923,
 'old': 2940,
 'girl': 2347,
 'from': 13140,
 'england': 36,
 'ive': 1078,
 'attempted': 272,
 'suicide': 2591,
 'times': 2965,
 'in': 53936,
 'the': 103264,
 'space': 443,
 'of': 72448,
 'almost': 2787,
 'years': 10040,
 'because': 19126,
 'my': 100622,
 'head': 2376,
 'is': 42567,
 'supposed': 1010,
 'to': 170002,
 'students': 166,
 'with': 36990,
 'anxiety': 4353,
 'and': 148421,
 'let': 2752,
 'us': 1524,
 'sit': 960,
 'outside': 1110,
 'her': 14295,
 'office': 311,
 'but': 50223,
 'she': 17768,
 'just': 43835,
 'tells': 485,
 'go': 11277,
 'away': 4746,
 'i': 378880,
 'told': 4385,
 'was': 33614,
 'going': 10580,
 'kill': 2408,
 'myself': 18906,
 'stop': 3198,
 'being': 9622,
 'dramatic': 132,
 'feeling': 7635,
 'really': 15920,
 'right': 5424,
 'now': 15738,
 'want': 20395,
 'tell': 4258,
 'mum': 363,
 'scared': 2133,
 'hurt': 1703,
 'll': 56

In [49]:
# Leaving stopwords out of the corpus

corpus_dep = [(wordfreq_dep[key],key) for key in list(wordfreq_dep.keys()) if key not in stop_words]

In [50]:
# Sorting

corpus_dep.sort(reverse = True)

In [56]:
# Making a dictionary with words and frequencies

words_dep = dict ((y, x) for x, y in corpus_dep)

In [57]:
words_dep

{'like': 34204,
 'feel': 31648,
 'know': 22727,
 'want': 20395,
 'life': 19988,
 'get': 19425,
 'time': 16753,
 'even': 16196,
 'really': 15920,
 'depression': 15113,
 'people': 15092,
 'one': 12534,
 'friends': 12072,
 'would': 12023,
 'go': 11277,
 'think': 10805,
 'never': 10708,
 'going': 10580,
 'help': 10091,
 'years': 10040,
 'things': 10003,
 'day': 9732,
 'much': 9632,
 'anything': 8884,
 'back': 8336,
 'school': 8134,
 'work': 8016,
 'good': 7929,
 'something': 7894,
 'anyone': 7837,
 'depressed': 7636,
 'feeling': 7635,
 'always': 7604,
 'make': 7500,
 'still': 7471,
 'everything': 7157,
 'way': 7081,
 'could': 7072,
 'better': 7045,
 'year': 6923,
 'someone': 6898,
 'got': 6886,
 'nothing': 6697,
 'talk': 6548,
 'every': 6536,
 'see': 6428,
 'need': 6350,
 'anymore': 6236,
 'job': 6152,
 'happy': 6151,
 'since': 6024,
 'ever': 5758,
 'bad': 5715,
 'family': 5690,
 'last': 5688,
 'love': 5569,
 'right': 5424,
 'getting': 5360,
 'hate': 5281,
 'im': 5185,
 'felt': 5088,
 'try

In [147]:
# To check if word in in the words list and to get the frequency

x = input("enter the word: ")
if x in words_dep.keys():
    print(words_dep[x])

enter the word: worthwhile
62
