In [None]:
#import of libraries
import pickle
import pandas as pd
import numpy as np
import statistics
from itertools import islice
from sklearn.model_selection import KFold

from helper.data_loading import *
from helper.preprocessing import *
from helper.topic_model_helper import *

# Plotting tools
import matplotlib.pyplot as plt

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
#%matplotlib inline

#pd.set_option('display.max_colwidth', -1)  # or 199

# Load the GermEval data and keep only the rows whose `label_1` is "OTHER";
# the index is reset so the positional indices from KFold line up with it.
print("###### load data ######")
df_org = get_all_germEval_data()
df_org = df_org.loc[df_org.label_1 == "OTHER"].reset_index(drop=True)

# Build the 5 folds once. KFold is used with its defaults (no shuffling), so the
# folds are deterministic for a given row order.
print("###### prepare splits for 5-fold CV ######")
kf = KFold(n_splits=5)

train_splits, test_splits = [], []
for fold_train_idx, fold_test_idx in kf.split(df_org):
    # Work on copies so that pooling cannot write back into df_org.
    fold_train = df_org.iloc[fold_train_idx].copy()
    fold_test = df_org.iloc[fold_test_idx].copy()

    # Each side of the fold is pooled separately (train first, then test).
    train_splits.append(create_poooling_data(fold_train))
    test_splits.append(create_poooling_data(fold_test))


In [None]:
# Find the right number of topics
#
# NOTE: the whole experiment below is wrapped in a triple-quoted string, so it is
# NOT executed -- this cell only evaluates a string literal. Remove the quotes to
# re-run the sweep.
# When enabled, the code iterates over the topic counts in `different_k`; for each
# k it goes through the 5 folds in `train_splits` / `test_splits`, prepares and
# filters the data, trains a model with `runLDA` and appends the values returned
# by `calculate_npmi` for "germEval_train" and "germEval_test" to `npmi_score`.
# NOTE(review): the (commented-out) dump inside writes
# 'output/npmi_score_gensim.pkl', while the plotting cell reads
# 'output/topic_model/npmi_score_gensim.pkl' -- confirm the intended path.
# NOTE(review): `runLDA` is defined further down in this notebook; presumably one
# of the helper star-imports also provides it -- verify before enabling this cell.
'''
print("###### start 5-fold CV for different k (GENSIM)######")

npmi_score = {'train':[],'test':[]}
different_k = [2,3,4,5,6,7,8,9,10,11,12,13,14,15,20,25,30,35] #range(2, 30, 2)

for k in different_k:
    npmi_score_cv = {'train':[],'test':[]}
    print("### Number of topics: {} ###".format(k))
    for i in range(0,5):
                
        data_train=prepare_data(train_splits[i], preparePoolingData=True)
        data_test=prepare_data(test_splits[i], preparePoolingData=True)
        
        data_train = filter_extremes(data_train.token,no_below=5,no_above=0.1,min_no_token=5)
        data_test = filter_extremes(data_test.token,no_below=5,no_above=0.1,min_no_token=5)

        write_new_train_test_data(data_train, data_test)     

        dict_train, corpus_train, dict_train_fil, corpus_train_fil = create_dict(data_train, None, None, None)
        
        model = runLDA(dict_train, corpus_train, k,random_state=100)    
        topics = get_LDA_topics(model, k, 10)      

        ##Evaluate
        #npmi
        npmi_score_cv["train"].append(calculate_npmi(topics,"germEval_train",10))
        npmi_score_cv["test"].append(calculate_npmi(topics,"germEval_test",10))
        
        print("fold: ",i)
        print("\tNPMI (train)\tNPMI (test)")
        print("LDA:\t{}\t\t{}".format(npmi_score_cv["train"][-1],npmi_score_cv["test"][-1]))
       
    npmi_score['train'].append(npmi_score_cv["train"])
    npmi_score['test'].append(npmi_score_cv["test"])
    
    #with open('output/npmi_score_gensim.pkl','wb') as f:
    #    pickle.dump(npmi_score, f)

    print("\nFinal Score:")
    print("\tNPMI (train)\tNPMI (test)")
    print("LDA:\t{}\t\t{}".format(npmi_score["train"][-1],npmi_score["test"][-1]))

print("::::FINAL::::")
print("train: ",npmi_score["train"])
print("test: ",npmi_score["test"])
'''

In [None]:
# Load the stored NPMI scores. The code below indexes the object with "train" and
# "test" and takes mean/stdev of every entry, i.e. it expects one list of
# per-fold scores per evaluated topic count.
# SECURITY: pickle.load can execute arbitrary code while unpickling -- only load
# files that were produced by this project.
with open('output/topic_model/npmi_score_gensim.pkl','rb') as f:
    npmi_score_gensim = pickle.load(f)

# Topic counts (k) on the x-axis; must line up one-to-one with the stored scores.
x = [2,3,4,5,6,7,8,9,10,11,12,13,14,15,20,25,30,35] #range(2, 30, 2)

# Fail early with a readable message instead of an opaque matplotlib shape error.
for split_name in ("train", "test"):
    if len(npmi_score_gensim[split_name]) != len(x):
        raise ValueError(
            "npmi_score_gensim[{!r}] holds {} entries, but {} topic counts are listed in x".format(
                split_name, len(npmi_score_gensim[split_name]), len(x)
            )
        )

# Mean and sample standard deviation over the folds, per topic count.
# (statistics.stdev needs at least two scores per entry.)
train_score_mean = np.array([statistics.mean(fold_scores) for fold_scores in npmi_score_gensim["train"]])
train_score_stdev = np.array([statistics.stdev(fold_scores) for fold_scores in npmi_score_gensim["train"]])

test_score_mean = np.array([statistics.mean(fold_scores) for fold_scores in npmi_score_gensim["test"]])
test_score_stdev = np.array([statistics.stdev(fold_scores) for fold_scores in npmi_score_gensim["test"]])

#plot npmi score: mean line plus a +/- one standard deviation band per split
fig, ax = plt.subplots(figsize=(15, 8))

ax.plot(x, test_score_mean, lw=2, label='Test data', color='blue')
ax.fill_between(x, test_score_mean+test_score_stdev, test_score_mean-test_score_stdev, facecolor='blue', alpha=0.5)
ax.plot(x, train_score_mean, lw=2, label='Train data', color='orange')
ax.fill_between(x, train_score_mean+train_score_stdev, train_score_mean-train_score_stdev, facecolor='orange', alpha=0.5)

#ax.set_title(r'NPMI score for different k')
ax.legend(loc='upper left')
ax.set_xlabel('Number of topics (k)')
ax.set_ylabel('NPMI score')
ax.xaxis.grid(color='gray', linestyle='dashed')

# Turn the y-grid on explicitly. A bare plt.grid() *toggles* visibility on both
# axes, which switched the dashed x-grid configured just above back off.
ax.yaxis.grid(True)

fig.savefig("output/figures/topic_model_npmi_score_different_k.svg", format="svg")

## Run final Topic Model

In [None]:
def runLDA(dict_train, corpus_train, k, random_state=100):
    """Train a gensim LDA model with this notebook's fixed hyper-parameters.

    Args:
        dict_train: forwarded to ``LdaModel`` as ``id2word``.
        corpus_train: forwarded to ``LdaModel`` as ``corpus``.
        k: number of topics (``num_topics``).
        random_state: seed forwarded to gensim; defaults to 100.

    Returns:
        The constructed ``gensim.models.ldamodel.LdaModel`` instance.

    NOTE(review): if one of the helper star-imports also exports ``runLDA``,
    this definition shadows it from this cell onwards -- confirm that is intended.
    """
    lda_options = dict(
        corpus=corpus_train,
        id2word=dict_train,
        num_topics=k,
        random_state=random_state,
        update_every=0,
        passes=25,            # epochs
        iterations=2000,      # how many iterations the VB is allowed in the E-step/inference without convergence
        chunksize=10000,
        eval_every=None,
        alpha='auto',         # alternative that was tried: 'asymmetric'
        per_word_topics=True,
    )
    lda_class = gensim.models.ldamodel.LdaModel
    return lda_class(**lda_options)
    

# Pool and preprocess all tweets of df_org (on a copy, df_org itself is reused later).
df_all = prepare_data(create_poooling_data(df_org.copy()), preparePoolingData=True)

# Same filter settings as in the k-sweep; the filtered documents are written out
# as both the "train" and the "test" data.
df_all_filtered = filter_extremes(df_all.token, no_below=5, no_above=0.1, min_no_token=5)
write_new_train_test_data(df_all_filtered, df_all_filtered)

# Only the first two return values are used below.
dict_all, corpus_all, dict_train_fil, corpus_train_fil = create_dict(
    df_all_filtered, None, None, None
)

# Final model: 12 topics, seed 100; fetch the topic word lists (arguments 12 and 10).
model = runLDA(dict_all, corpus_all, 12, random_state=100)
topics = get_LDA_topics(model, 12, 10)

# One line of space-separated words per topic.
for topic in topics:
    print(" ".join(topic))

# Result recorded from an earlier run, obtained with
#   calculate_npmi(topics, "germEval_train", "10") and model.log_perplexity(corpus_all):
#        NPMI:  Log-Perplexity
#   LDA: 0.14   -7.7027527852570765

# Persisting the model is disabled: the statements are kept inside a string literal.
'''
model.save("output/topic_model/model_lda")
dict_all.save("output/topic_model/dict")
'''

## Visualize topics

In [None]:
# Rebuild the pooled, preprocessed and filtered documents from df_org.
df_all = prepare_data(create_poooling_data(df_org.copy()), preparePoolingData=True)
df_all_filtered = filter_extremes(df_all.token, no_below=5, no_above=0.1, min_no_token=5)

# Restore model and dictionary from disk, then convert every filtered document
# to its bag-of-words representation.
# NOTE(review): these files are only written by the save statements that are
# disabled in the training cell -- make sure they exist and match the model
# you want to inspect.
model = gensim.models.ldamodel.LdaModel.load("output/topic_model/model_lda")
dictionary = gensim.corpora.Dictionary.load("output/topic_model/dict")
corpus = list(map(dictionary.doc2bow, df_all_filtered))

In [None]:
# Visualize the topic model with pyLDAvis; the bare `vis` at the end makes the
# notebook render the prepared visualization as this cell's output.
# In the view, set lambda=0.6 when selecting the top-n words -- this improves the
# coherence of the word lists for human readers.
# Source: https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf

pyLDAvis.enable_notebook()
# sort_topics=False is passed explicitly; presumably this keeps the model's own
# topic numbering in the view -- confirm against the pyLDAvis documentation.
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary, sort_topics=False)#'mmds','tsne', mds='tsne'
# Optional export of the visualization to a standalone HTML file (disabled).
#pyLDAvis.save_html(vis, 'output/topic_model/graphic.html')
vis

In [None]:
# Top-10 words per topic under relevance ranking with lambda=0.6, one topic per
# row. The rows are hard-coded (presumably transcribed from the pyLDAvis view),
# so they must be updated by hand whenever the model changes.
# Each row is split on whitespace into a list of words.
top_10_words_relevance = [
    topic_row.split()
    for topic_row in (
        'venezuela usa brexit sozialismus russland sozialistische maduro einfach sanktionen parlament',
        'aksynode16 altkatholisch synode fakt flüchtling antrag menschen neu müssen ring',
        'frankfurt blockupy heute solidarität erdogan türkei hbf nonazis morgen kundgebung',
        'fcsp schon mehr deutschland ndr gibt wissen bild welt bus',
        'sky moin zusammen schönen tag besser verstanden freitag gruppe spielt',
        'ard deutschland zdf seehofer m18 merkel tatort mal maaßen israel',
        'frau mann immer mal arte leider gesagt folgen dank afd',
        'nsu hamburg spd hamburgpride rassismus chemnitz koeln probleme pride etc',
        'spd cdu csu grüne fdp hartz hambibleibt hambacherforst merkel toll',
        'aachen nazi demo naziwatchac ac1811 antirepac berlin passt repression antifanrw',
        'deutschland afd noafd wer sicher immer macht warum angst land',
        'deutschland antisemitismus jude israel holocaust judenhass muslime auschwitz vernichtung antisemitische',
    )
]


## Statistics of data

In [None]:
# Pool the tweets again, this time also requesting the pooling statistics
# (getStatsBack=True returns three extra values next to the pooled frame).
(df_all, hashtag_user, hashtag_user_filtered, single_tw) = create_poooling_data(
    df_org.copy(), getStatsBack=True
)
df_all = prepare_data(df_all, preparePoolingData=True)
df_all_filtered = filter_extremes(df_all.token, no_below=5, no_above=0.1, min_no_token=5)

# Restore the stored model and dictionary and build the bag-of-words corpus of
# the filtered documents.
model = gensim.models.ldamodel.LdaModel.load("output/topic_model/model_lda")
dictionary = gensim.corpora.Dictionary.load("output/topic_model/dict")
corpus = list(map(dictionary.doc2bow, df_all_filtered))

In [None]:
# Preprocess the individual (un-pooled) tweets and map them to bag-of-words
# vectors using the loaded dictionary.
df_org_prepared = prepare_data(df_org.copy())
other_corpus = [dictionary.doc2bow(tokens) for tokens in df_org_prepared.token]

# One row of topic weights per tweet; argmax over axis 1 picks the topic with the
# highest weight for each tweet.
tweet_topic_distribution = get_topic_distribution(model, other_corpus)
infereces_assigned_topic = np.argmax(tweet_topic_distribution, axis=1)

# Count the tweets per topic index. np.bincount with minlength keeps one slot for
# EVERY topic; np.unique(..., return_counts=True) silently drops topics that were
# never assigned, which shifts all later counts to the wrong topic index.
n_topics = np.shape(tweet_topic_distribution)[1]
unique = np.arange(n_topics)
counts = np.bincount(infereces_assigned_topic, minlength=n_topics)

# Hashtag statistics ordered by their value, largest first.
mosted_common_hashtags = dict(
    sorted(hashtag_user.items(), key=lambda item: item[1], reverse=True)
)

topics = get_LDA_topics(model, 12, 10)
# Share of tweets per topic, relative to the number of rows in df_org, rounded to
# two decimals.
topic_distribution_germEval = [round(count / df_org.shape[0], 2) for count in counts]

In [None]:
# Number of hashtags listed below. The heading is derived from this value so the
# label and the list length cannot drift apart (the heading used to say
# "top 10" while only 5 entries were printed).
# NOTE(review): set this to 10 if ten hashtags were actually intended.
n_top_hashtags = 5

# --- corpus / pooling statistics ---
print("Total tweets:\t\t\t\t",df_org.shape[0])
print("Total hashtags:\t\t\t\t",len(hashtag_user))
print("Hashtags appearing at least 2 times:\t",len(hashtag_user_filtered)-single_tw)
print("Tweets with no hashtag:\t\t\t",single_tw)

print("\ntop {} hashtags:\n".format(n_top_hashtags),list(islice(mosted_common_hashtags.items(), n_top_hashtags)))

print("\nTotal documents after pooling:\t\t",df_all.shape[0])
print("Total documents filtered (train data):\t",len(df_all_filtered),"\n filter: no_below=5, no_above=0.1, min_no_token=5")

# --- two example pooled documents, looked up by their "key" column ---
print("\n######\n\nExample of aggregated documents:\n #nato")
print(df_all.loc[df_all["key"]=="#nato",["text"]].values[0][0])

print("\n #lindner")
print(df_all.loc[df_all["key"]=="#lindner",["text"]].values[0][0])

# --- final topic model ---
print("\n######\n\nFinal topic model:\t\t\t 12 topics")
print("Topic distribution for germEval:\t",topic_distribution_germEval)

print("\nTop 10 words:")
for i,topic in enumerate(topics):
    # Round before converting: int() truncates, and e.g. 0.29*100 evaluates to
    # 28.999999999999996, which used to be printed as 28%.
    topic_size_percent = int(round(topic_distribution_germEval[i]*100))
    print(str(i+1)+"{ "+" ".join(topic), "} Topic size: ",str(topic_size_percent)+"%")

print("\nApply relevance ranking to topic for more readability:")
print(" formula: lambda*p(w|t) + (1-lambda)*p(w|t)/p(w) with lambda: 0.6\n")
for i,topic in enumerate(top_10_words_relevance):
    print(str(i+1)+"{ "+" ".join(topic), "}")



In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bare expression: the notebook displays the list of per-topic shares as this cell's output.
topic_distribution_germEval

In [None]:
# NOTE(review): this is a bare annotation statement (target `fig.autolayout`,
# annotation `True`). Nothing is assigned and no matplotlib setting changes.
# It resembles the matplotlibrc entry "figure.autolayout : True"; if automatic
# tight layout was intended, the working form would be
# plt.rcParams["figure.autolayout"] = True. Otherwise this line can be removed.
# The statement still evaluates the name `fig`, so it fails on a kernel where no
# figure has been created yet.
fig.autolayout : True

In [None]:
# Bar chart of the share of tweets assigned to each topic.
fig = plt.figure()
# The axes rectangle [0, 0, 1, 1] makes the plot area fill the entire figure canvas.
ax = fig.add_axes([0,0,1,1])

# One tick label per entry ("1", "2", ...), derived from the data instead of a
# hard-coded list of 12 labels, so the chart still works if the number of topics
# changes.
topic_labels = [str(topic_no) for topic_no in range(1, len(topic_distribution_germEval) + 1)]
ax.bar(topic_labels, topic_distribution_germEval)
ax.set_xlabel('Topic')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Write the bar chart to disk as SVG. bbox_inches="tight" lets matplotlib fit the
# saved area to the drawn content -- presumably needed here because the axes fill
# the whole canvas and the labels would otherwise be cut off.
fig.savefig("output/figures/topic_distribution_germEval.svg", format="svg", bbox_inches = "tight")