In [None]:
import numpy as np
import pandas as pd
import pyspark as ps
import matplotlib.pyplot as plt
import csv

import unicodedata

from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from gensim import corpora, models

import pyLDAvis.gensim

In [None]:
# Load the tweets from a JSON Lines file (one JSON record per line) into a DataFrame.
df = pd.read_json('reduced_tweets.json', lines=True)

## Necessary data processing:

Different languages show different patterns, but no difference between mental-illness-related tweets and unrelated tweets can be found.

In [None]:
# This might be a dumb idea in retrospect: we need to keep accented characters (è, ä, ...) for stop-word removal to work effectively.
# df['main'] = df['main'].astype(str).str.lower().\
#                     apply(lambda tweet: unicodedata.normalize('NFD', tweet).\
#                     encode('ascii', 'ignore').decode('utf-8'))
# df['main'] = df['main'].str.replace(r'[^\w\s]', '')
# df['main'].head()

Why we should use the tweet tokenizer (`TweetTokenizer`) instead of a plain whitespace split:

In [None]:
# Build the NLTK tweet tokenizer (reused below for the whole data set) and
# try it on a sample message containing repeated punctuation and emoticons.
tknzr = TweetTokenizer()
tknzr.tokenize('mir gahts so so lala lol!! merd:X :D')

In [None]:
# Plain whitespace splitting of a similar message, for comparison with the tokenizer output.
'mir gahts so so lala lol!! :D'.split()

In [None]:
# Tokenize the text of every tweet (column 'main') with the tweet tokenizer.
tokenized = df['main'].map(tknzr.tokenize)

In [None]:
#tokenized.to_pickle('tokenized_strings.pkl')
#save time and read from this pkl

In [None]:
# Attach the token lists to the frame; the processing steps below overwrite this column in place.
# (Earlier whitespace-split variant, kept for reference:)
# df['split'] = df['main'].str.split()
df['tokenized'] = tokenized

In [None]:
def remove_stops(language, cleaned, frame):
    """Remove stop words, in place, from the token lists of one language.

    Only rows whose ``cleaned['lang']`` equals the language code are touched.
    The code is the first two letters of ``language``, except that 'german'
    maps to 'de'.

    Parameters
    ----------
    language : str
        Language name passed to ``stopwords.words`` (e.g. 'english').
    cleaned : pandas.DataFrame
        Frame with a 'lang' column and the token column ``frame``; it is
        modified in place.
    frame : str
        Name of the column holding one list of tokens per row.

    Returns
    -------
    None
    """
    lang = 'de' if language == 'german' else language[:2]

    # A set gives constant-time membership tests; the raw list was scanned
    # once per token.
    stop_set = set(stopwords.words(language))

    # Compute the row selection once and reuse it on both sides of the assignment.
    in_language = cleaned['lang'] == lang

    # The NLTK stop-word lists are lower-case while the tweets are not
    # lower-cased at this stage, so compare on the lower-cased token to also
    # catch e.g. a sentence-initial "The".
    cleaned.loc[in_language, frame] = cleaned.loc[in_language, frame].apply(
        lambda tweet: [word for word in tweet if word.lower() not in stop_set])

In [None]:
# Strip stop words from the token lists, one supported language at a time.
for language in ('english', 'french', 'german'):
    remove_stops(language, df, 'tokenized')

In [None]:
# Preview the first rows after stop-word removal.
df.head()

In [None]:
def stem_words(language, cleaned, frame):
    """Stem, in place, every token of the rows written in ``language``.

    Rows are selected by comparing ``cleaned['lang']`` with the language code:
    the first two letters of ``language``, or 'de' when it is 'german'.

    Parameters
    ----------
    language : str
        Language name passed to ``SnowballStemmer``.
    cleaned : pandas.DataFrame
        Frame with a 'lang' column and the token column ``frame``; modified in place.
    frame : str
        Name of the column holding one list of tokens per row.
    """
    lang_code = 'de' if language == 'german' else language[:2]
    snowball = SnowballStemmer(language)

    def _stem_tokens(tokens):
        # Stem each token of a single tweet.
        return [snowball.stem(token) for token in tokens]

    in_language = cleaned['lang'] == lang_code
    cleaned.loc[in_language, frame] = cleaned.loc[in_language, frame].apply(_stem_tokens)

In [None]:
# Stem the token lists, one supported language at a time.
for language in ('english', 'french', 'german'):
    stem_words(language, df, 'tokenized')

In [None]:
# Persist the processed token lists so later sessions can reload them instead of recomputing.
df.tokenized.to_pickle('tokenized_stemmed.pkl')

In [None]:
# Load the keyword dictionary; the cells below read its 'english', 'french',
# 'german' and 'swiss_german' columns.
DICT_PATH = "dictionary.csv"
dictionaries = pd.read_csv(DICT_PATH)
dictionaries.head()

In [None]:
# One Series of expressions per language, with missing cells removed.
# German and Swiss German entries are pooled into a single German dictionary.
en_dict = dictionaries['english'].dropna()
fr_dict = dictionaries['french'].dropna()
de_dict = pd.concat(
    [dictionaries[column].dropna() for column in ('german', 'swiss_german')]
)

In [None]:
def dict_cleaning(lang):
    """Lower-case and ASCII-fold the module-level dictionary ``<lang>_dict``.

    Every entry is converted to ``str``, lower-cased, NFD-normalised and
    encoded to ASCII with unencodable characters dropped, which strips
    diacritics.

    The previous version bound the result to a local variable only, so the
    global dictionary was never changed. The cleaned Series is now written
    back to the global name and also returned.

    NOTE(review): the equivalent normalisation of the tweets is commented out
    earlier in this notebook, so tweets keep their accents while the dictionary
    loses them — confirm this is the intended matching behaviour.

    Parameters
    ----------
    lang : str
        Prefix of the global variable to clean, e.g. 'en' for ``en_dict``.

    Returns
    -------
    pandas.Series
        The cleaned dictionary (also stored back in ``<lang>_dict``).

    Raises
    ------
    KeyError
        If no global named ``<lang>_dict`` exists.
    """
    name = lang + '_dict'
    # Plain namespace lookup instead of eval(): same result, no code evaluation.
    cleaned = globals()[name].astype(str).str.lower().apply(
        lambda expression: unicodedata.normalize('NFD', expression)
        .encode('ascii', 'ignore').decode('utf-8'))
    globals()[name] = cleaned
    return cleaned

In [None]:
# Normalise each language dictionary in turn.
for lang_code in ('en', 'fr', 'de'):
    dict_cleaning(lang_code)

In [None]:
# Tokenize: split every dictionary expression on whitespace into a list of words.
en_dict, fr_dict, de_dict = [
    expressions.str.split() for expressions in (en_dict, fr_dict, de_dict)
]

In [None]:
def dict_remove_stops(language):
    """Remove stop words from the module-level dictionary of ``language``.

    The dictionary is the global ``<code>_dict``, where the code is the first
    two letters of ``language`` ('de' for 'german'). Each entry is expected to
    be a list of words.

    The previous version bound the result to a local variable only, so the
    global dictionary was never changed. The filtered Series is now written
    back to the global name and also returned.

    Expressions left with no words are dropped: once joined back into a string
    they would be empty, and an empty string is a substring of every tweet.

    Parameters
    ----------
    language : str
        Language name passed to ``stopwords.words`` (e.g. 'english').

    Returns
    -------
    pandas.Series
        The filtered dictionary (also stored back in ``<code>_dict``).

    Raises
    ------
    KeyError
        If no global named ``<code>_dict`` exists.
    """
    lang = 'de' if language == 'german' else language[:2]
    name = lang + '_dict'

    # Set membership is constant-time; the stop-word lists are lower-case, so
    # compare on the lower-cased word.
    stop_set = set(stopwords.words(language))

    # Plain namespace lookup instead of eval(): same result, no code evaluation.
    filtered = globals()[name].apply(
        lambda expression: [word for word in expression if word.lower() not in stop_set])
    filtered = filtered[filtered.map(len) > 0]

    globals()[name] = filtered
    return filtered

In [None]:
# Remove stop words from each language dictionary.
for language in ('english', 'french', 'german'):
    dict_remove_stops(language)

In [None]:
def dict_stem_words(language):
    """Stem every word of the module-level dictionary of ``language``.

    The dictionary is the global ``<code>_dict``, where the code is the first
    two letters of ``language`` ('de' for 'german'). Each entry is expected to
    be a list of words.

    The previous version bound the result to a local variable only, so the
    global dictionary was never changed. The stemmed Series is now written
    back to the global name and also returned.

    Parameters
    ----------
    language : str
        Language name passed to ``SnowballStemmer``.

    Returns
    -------
    pandas.Series
        The stemmed dictionary (also stored back in ``<code>_dict``).

    Raises
    ------
    KeyError
        If no global named ``<code>_dict`` exists.
    """
    lang = 'de' if language == 'german' else language[:2]
    name = lang + '_dict'
    stemmer = SnowballStemmer(language)

    # Plain namespace lookup instead of eval(): same result, no code evaluation.
    stemmed = globals()[name].apply(
        lambda expression: [stemmer.stem(word) for word in expression])

    globals()[name] = stemmed
    return stemmed

In [None]:
# Stem the words of each language dictionary.
for language in ('english', 'french', 'german'):
    dict_stem_words(language)

In [None]:
# Join the token lists back into single space-separated strings, both for the
# tweets (new column 'merged') and for every dictionary expression.
df['merged'] = df['tokenized'].map(' '.join)
de_dict = de_dict.map(' '.join)
en_dict = en_dict.map(' '.join)
fr_dict = fr_dict.map(' '.join)

In [None]:
def getMatching(tweet_list):
    """Unfinished placeholder for a tweet/dictionary matching helper.

    The original cell held only the signature line, with a trailing comma and
    neither a colon nor a body, so it could not be parsed. It is now a valid
    stub that fails loudly if it is ever called.

    NOTE(review): the intended behaviour is not visible in this notebook;
    ``check_dict`` further down looks like it took over this role — confirm,
    then either implement this function or delete the cell.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError('getMatching was never implemented')

In [None]:
# Preview the processed token lists.
df['tokenized'].head()

In [None]:
def check_dict(tweet, dict_):
    """Return True if at least one entry of ``dict_`` occurs in ``tweet``.

    The check is ``entry in tweet``, so its meaning depends on the type of
    ``tweet``: a substring search when it is a string, exact element
    membership when it is a list of tokens.

    Empty entries are skipped, because ``'' in some_string`` is always True and
    a single empty entry would flag every string tweet as a match.

    Parameters
    ----------
    tweet : str or list of str
        The tweet, as one string or as a list of tokens.
    dict_ : iterable of str
        The dictionary expressions to look for.

    Returns
    -------
    bool
        True when any non-empty entry is found, otherwise False.
    """
    # any() stops at the first hit instead of collecting every match.
    return any(entry in tweet for entry in dict_ if entry)

In [None]:
# Keep the English tweets whose merged (space-joined) token string contains
# at least one English dictionary entry.
english_filtered = (
    df.loc[df.lang == 'en']
      .loc[lambda tweets: tweets['merged'].map(lambda text: check_dict(text, en_dict))]
)

In [None]:
# Show five randomly drawn matched English tweets (no seed is passed, so the sample changes on each run).
english_filtered.main.sample(5)

In [None]:
# Number of non-null values per column among the matched English tweets.
english_filtered.count()

In [None]:
# Keep the French tweets whose merged (space-joined) token string contains at
# least one French dictionary entry.
# The match runs on 'merged', as in the English cell: the dictionary entries
# are space-joined strings at this point, so testing them against the token
# list in 'tokenized' could never match a multi-word expression.
french_filtered = (
    df.loc[df.lang == 'fr']
      .loc[lambda tweets: tweets['merged'].map(lambda text: check_dict(text, fr_dict))]
)
french_filtered.head()

In [None]:
# Number of matched French tweets.
len(french_filtered)

In [None]:
# Keep the German tweets whose merged (space-joined) token string contains at
# least one German dictionary entry.
# The match runs on 'merged', as in the English cell: the dictionary entries
# are space-joined strings at this point, so testing them against the token
# list in 'tokenized' could never match a multi-word expression.
german_filtered = (
    df.loc[df.lang == 'de']
      .loc[lambda tweets: tweets['merged'].map(lambda text: check_dict(text, de_dict))]
)
german_filtered.head()

In [None]:
# Number of matched German tweets.
len(german_filtered)

In [None]:
#english_filtered.to_csv('english.csv')

In [None]:
#english_filtered.sample(n=1000).to_csv('english_1000.csv')

In [None]:
#french_filtered.sample(n=1000).to_csv('french_1000.csv')

In [None]:
# TODO: move this import up to the imports cell at the top of the notebook.
# Quick check of the punctuation-stripping regex on a sample string: every
# character that is neither a word character nor whitespace is removed.
import re
s = "string. With. Punctuation? ! àéè. : ; "
s = re.sub(r'[^\w\s]','',s)
s

In [None]:
# Strip punctuation from every token of the matched English tweets.
# Tokens made only of punctuation (e.g. '!!') become empty strings; drop them
# so that '' does not reach the LDA vocabulary as a token.
english_filtered_2 = english_filtered.tokenized.map(
    lambda tokens: [
        stripped
        for stripped in (re.sub(r'[^\w\s]', '', token) for token in tokens)
        if stripped
    ]
)

## LDA starts here

In [None]:
# Build the gensim token <-> id mapping from the cleaned English token lists.
dictionary = corpora.Dictionary(english_filtered_2)

In [None]:
# Prune the vocabulary in place: drop tokens found in fewer than 3 documents
# or in more than 50% of all documents.
dictionary.filter_extremes(no_below=3, no_above=.5)

In [None]:
# Show a summary of the pruned vocabulary.
print(dictionary)

In [None]:
# Plain Python list of token lists, one per matched English tweet.
texts = english_filtered_2.tolist()

In [None]:
# Convert every tweet to its bag-of-words form using the pruned vocabulary;
# a tweet whose tokens were all pruned yields an empty list.
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
# Train a 3-topic LDA model on the English corpus (takes about 5 minutes on Leo's PC).
# random_state fixes the model's random initialisation so that re-runs are
# comparable; results can still vary slightly because the work is spread over
# several worker processes.
ldamodel = models.LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=3,
    workers=3,
    iterations=100,
    passes=5,
    random_state=42,
)

In [None]:
# Display the most significant words of each topic.
ldamodel.show_topics()

In [None]:
#ldamodel =  models.LdaModel.load('lda.model') #retrive lda model

In [None]:
# For every non-empty bag-of-words document, record the id of its
# highest-weighted topic. Empty documents are skipped, so this list can be
# shorter than `corpus`.
sent_to_cluster = [
    max(ldamodel[bow], key=lambda topic: topic[1])[0]
    for bow in corpus
    if bow
]

In [None]:
#ldamodel.save('lda.model') #save lda model

In [None]:
# Prepare the pyLDAvis visualisation data for the English model.
vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

In [None]:
# Render the interactive topic visualisation inline.
pyLDAvis.display(vis_data)

## Same for French:

In [None]:
# Strip punctuation from every token of the matched French tweets.
# Tokens made only of punctuation become empty strings; drop them so that ''
# does not reach the LDA vocabulary as a token.
french_filtered_2 = french_filtered.tokenized.map(
    lambda tokens: [
        stripped
        for stripped in (re.sub(r'[^\w\s]', '', token) for token in tokens)
        if stripped
    ]
)

In [None]:
# Preview the cleaned French token lists.
french_filtered_2.head()

In [None]:
# Vocabulary for the French tweets, pruned in place: drop tokens found in
# fewer than 5 documents or in more than 30% of all documents.
fr_dictionary = corpora.Dictionary(french_filtered_2)
fr_dictionary.filter_extremes(no_below=5, no_above=.3)
fr_texts = french_filtered_2.tolist()
fr_corpus = [fr_dictionary.doc2bow(text) for text in fr_texts]

# Train a 10-topic LDA model. random_state fixes the model's random
# initialisation so that re-runs are comparable; results can still vary
# slightly because the work is spread over several worker processes.
ldamodel_2 = models.LdaMulticore(
    fr_corpus,
    id2word=fr_dictionary,
    num_topics=10,
    workers=3,
    iterations=100,
    passes=5,
    random_state=42,
)

# NOTE: this rebinds `vis_data`, which the English section also assigns.
vis_data = pyLDAvis.gensim.prepare(ldamodel_2, fr_corpus, fr_dictionary)

pyLDAvis.display(vis_data)

## Same for German:

In [None]:
# Strip punctuation from every token of the matched German tweets.
# Tokens made only of punctuation become empty strings; drop them so that ''
# does not reach the LDA vocabulary as a token.
german_filtered_2 = german_filtered.tokenized.map(
    lambda tokens: [
        stripped
        for stripped in (re.sub(r'[^\w\s]', '', token) for token in tokens)
        if stripped
    ]
)

In [None]:
# Vocabulary for the German tweets, pruned in place: drop tokens found in
# fewer than 3 documents or in more than 50% of all documents.
ger_dictionary = corpora.Dictionary(german_filtered_2)
ger_dictionary.filter_extremes(no_below=3, no_above=.5)
ger_texts = german_filtered_2.tolist()
ger_corpus = [ger_dictionary.doc2bow(text) for text in ger_texts]

# Train a 5-topic LDA model. random_state fixes the model's random
# initialisation so that re-runs are comparable; results can still vary
# slightly because the work is spread over several worker processes.
# NOTE: `ldamodel_2` and `vis_data` are also assigned by the French section,
# so running this cell replaces the French model and visualisation data.
ldamodel_2 = models.LdaMulticore(
    ger_corpus,
    id2word=ger_dictionary,
    num_topics=5,
    workers=3,
    iterations=100,
    passes=5,
    random_state=42,
)

vis_data = pyLDAvis.gensim.prepare(ldamodel_2, ger_corpus, ger_dictionary)

pyLDAvis.display(vis_data)