In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import re 
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import ngrams
import spacy
%matplotlib inline

# Shared lemmatizer instance; clean_text() calls wordnet.lemmatize() on every token.
wordnet = WordNetLemmatizer()
sp = spacy.load('en_core_web_sm')
# Copy the default stop-word set instead of aliasing it: this notebook later
# adds its own entries, and mutating sp.Defaults.stop_words directly would
# change spaCy's shared defaults for everything else in the kernel.
all_stopwords = set(sp.Defaults.stop_words)

In [None]:
def clean_text(text,stopwords):
    """Reduce one raw text to a space-separated string of lower-case lemmas.

    Every character outside a-z / A-Z is replaced by a blank, the result is
    lower-cased and split into tokens, each token is lemmatized with the
    module-level ``wordnet`` lemmatizer, and lemmas found in ``stopwords``
    are dropped.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. the float NaN pandas uses for a
        missing CSV cell) is treated as an empty document.
    stopwords : container of str
        Lemmas to discard; only membership tests (``in``) are performed.

    Returns
    -------
    str
        Surviving lemmas joined by single spaces; ``''`` when nothing survives.
    """
    # re.sub raises TypeError on NaN/None, so bail out before touching the regex.
    if not isinstance(text, str):
        return ''
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    # split() without an argument collapses runs of blanks, so no empty tokens
    # are produced and the joined result carries no redundant spaces.
    lemmas = (wordnet.lemmatize(token) for token in letters_only.split())
    # Filtering happens after lemmatization, so stopwords must list lemma forms.
    return ' '.join(lemma for lemma in lemmas if lemma not in stopwords)

def combine_data(list1, list2):
    """Join two equally ordered string sequences element by element.

    Elements are paired by position, so pandas Series work regardless of their
    index labels (``series[i]`` would look up the *label* ``i`` and fail or
    misalign on a filtered or re-indexed frame).

    Parameters
    ----------
    list1, list2 : sized iterables of str
        ``list2`` must be at least as long as ``list1``; surplus elements of
        ``list2`` are ignored.

    Returns
    -------
    list of str
        ``list1[k] + ' ' + list2[k]`` for every position ``k`` of ``list1``.

    Raises
    ------
    IndexError
        If ``list2`` is shorter than ``list1``.
    """
    # zip() would silently truncate to the shorter input; keep failing loudly
    # when list2 runs out before list1 does.
    if len(list2) < len(list1):
        raise IndexError('list2 is shorter than list1')
    return [first + ' ' + second for first, second in zip(list1, list2)]

def generate_frequency(text_list):
    """Count single-word occurrences across a collection of texts.

    Parameters
    ----------
    text_list : iterable of str
        Texts whose words are separated by single blanks.

    Returns
    -------
    FreqDist
        Maps each non-empty word to the number of times it appears.
    """
    counts = FreqDist()
    for document in text_list:
        for token in document.split(' '):
            # Consecutive blanks make split(' ') emit '' entries; skip them.
            if token != '':
                counts[token] += 1
    return counts

In [None]:
df = pd.read_csv('../Dataset/RedditSuicideData.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
new_words = ["http", "www", "co", "u", "com", "t", "s", "m",
             "ve", "dy", "ll", 'n', 'r', 'b', "wa", "y", "don", "ha"]
for words in new_words:
    all_stopwords.add(words)

In [None]:
df['text'].fillna(value=' ',inplace=True)

In [None]:
df['title'] = df['title'].apply(clean_text, stopwords = all_stopwords)
df['text'] = df['text'].apply(clean_text, stopwords = all_stopwords)

In [None]:
df['cleaned'] = combine_data(df['title'],df['text'])

In [None]:
df.to_csv('./cleanedRedditSuicide.csv', index=False)

In [None]:
fdist = generate_frequency(df['cleaned'])
top_words = fdist.most_common(n=150)

In [None]:
word_dict = {}
for i in range(len(top_words)):
    word_dict[top_words[i][0]] = top_words[i][1]

In [None]:
wordcloud = WordCloud(background_color="white",max_font_size=30).generate_from_frequencies(word_dict)

plt.figure(figsize = (14, 8)) 
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
df = pd.read_csv('../Dataset/TwitterSuicideData.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df['cleaned'] = df['text'].apply(clean_text,stopwords = all_stopwords)

In [None]:
df.to_csv('./cleanedTwitterSuicide.csv', index=False)

In [None]:
fdist = generate_frequency(df['cleaned'])
top_words = fdist.most_common(n=150)

In [None]:
word_dict = {}
for i in range(len(top_words)):
    word_dict[top_words[i][0]] = top_words[i][1]

In [None]:
wordcloud = WordCloud(background_color="white",max_font_size=40).generate_from_frequencies(word_dict)

plt.figure(figsize = (14, 8)) 
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
df = pd.read_csv('../Dataset/NoSuicideData.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df['text'].fillna(value=' ',inplace=True)

In [None]:
df['title'] = df['title'].apply(clean_text,stopwords = all_stopwords)
df['text'] = df['text'].apply(clean_text,stopwords = all_stopwords)

In [None]:
df['cleaned'] = combine_data(df['title'],df['text'])

In [None]:
df.to_csv('./cleanedRedditNonSuicide.csv', index=False)

In [None]:
fdist = generate_frequency(df['cleaned'])
top_words = fdist.most_common(n=150)

In [None]:
word_dict = {}
for i in range(len(top_words)):
    word_dict[top_words[i][0]] = top_words[i][1]

In [None]:
wordcloud = WordCloud(background_color="white",max_font_size=30).generate_from_frequencies(word_dict)

plt.figure(figsize = (14, 8)) 
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()