In [None]:
# TOPICS - 33
# abstention, accountability, agricultural policy, civil protection,
# corruption, culture, debt, democracy, economy, education, elections,
# employment, energy, entrepreneurship, environment, external affairs,
# health, human rights, housing, infrastructure, justice, labor, media,
# migration, national security, pandemic, pensioners, privatization,
# public sector, social state, transparency, tourism, other


In [3]:
import pandas as pd
import json

president_json_file = '/content/drive/MyDrive/project_data/preprocessed_data_no_stemming/Copy of hitler_preprocessed2.json'

with open(president_json_file, 'r') as f:
  json_str = f.read()

data = json.loads(json_str)
df = pd.DataFrame(columns = ['Text'])
number_of_transcripts = len(data['Items'])
for i in range(number_of_transcripts):
  #store in csv as a different record for each transcript
  df.loc[len(df)] = [data['Items'][i]['transcript']]

data_text = df[['Text']]
data_text['index'] = data_text.index
documents = data_text

### Data Preprocessing

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [30]:
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [None]:
processed_docs = documents['Text'].map(preprocess)

In [31]:
for doc in processed_docs:
  print(doc)

45


### BOW

In [8]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [9]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abandon
1 absenc
2 accept
3 accompani
4 accord
5 add
6 administr
7 advanc
8 advantag
9 alli
10 america


In [10]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[43]

In [13]:
# Example
bow_doc_43 = bow_corpus[43]

for i in range(len(bow_doc_43)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_43[i][0],
                                                     dictionary[bow_doc_43[i][0]],
                                                     bow_doc_43[i][1]))

Word 1 ("alli") appears 1 time.
Word 2 ("answer") appears 1 time.
Word 4 ("asia") appears 4 time.
Word 5 ("attempt") appears 1 time.
Word 6 ("away") appears 2 time.
Word 8 ("better") appears 1 time.
Word 9 ("command") appears 3 time.
Word 10 ("communism") appears 2 time.
Word 11 ("communiti") appears 1 time.
Word 13 ("courag") appears 1 time.
Word 14 ("decid") appears 1 time.
Word 15 ("dedic") appears 1 time.
Word 22 ("event") appears 3 time.
Word 23 ("expect") appears 1 time.
Word 25 ("fail") appears 1 time.
Word 29 ("germani") appears 1 time.
Word 30 ("globe") appears 7 time.
Word 32 ("heavi") appears 1 time.
Word 33 ("hour") appears 2 time.
Word 34 ("independ") appears 2 time.
Word 38 ("involv") appears 6 time.
Word 39 ("ladi") appears 1 time.
Word 40 ("latin") appears 1 time.
Word 43 ("longer") appears 1 time.
Word 44 ("lose") appears 2 time.
Word 47 ("missil") appears 1 time.
Word 48 ("natur") appears 4 time.
Word 50 ("nuclear") appears 2 time.
Word 54 ("prevent") appears 3 time.


### TF-IDF

In [14]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [15]:
corpus_tfidf = tfidf[bow_corpus]

### Running LDA with BOW model

In [17]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=32, id2word=dictionary, passes=10, workers=2)

In [32]:
for idx, topic in lda_model.print_topics(10): # all topics returned ordered by significance
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 18 
Words: 0.073*"treati" + 0.050*"abroad" + 0.049*"nuclear" + 0.044*"allow" + 0.035*"vice" + 0.035*"suggest" + 0.032*"disarma" + 0.029*"question" + 0.028*"number" + 0.028*"subject"
Topic: 25 
Words: 0.079*"univers" + 0.030*"honor" + 0.025*"learn" + 0.023*"student" + 0.018*"talent" + 0.017*"institut" + 0.017*"write" + 0.016*"contribut" + 0.016*"person" + 0.015*"women"
Topic: 1 
Words: 0.133*"berlin" + 0.033*"germani" + 0.031*"threat" + 0.030*"disarma" + 0.029*"soviet" + 0.028*"citi" + 0.024*"crisi" + 0.021*"negoti" + 0.019*"longer" + 0.018*"propos"
Topic: 29 
Words: 0.004*"berlin" + 0.004*"nuclear" + 0.004*"senat" + 0.004*"independ" + 0.004*"latin" + 0.004*"question" + 0.004*"growth" + 0.004*"major" + 0.004*"billion" + 0.004*"improv"
Topic: 31 
Words: 0.038*"citi" + 0.038*"decad" + 0.025*"intend" + 0.019*"nuclear" + 0.019*"month" + 0.019*"week" + 0.019*"expect" + 0.019*"growth" + 0.019*"choos" + 0.019*"ignor"
Topic: 2 
Words: 0.060*"fight" + 0.043*"involv" + 0.038*"command" + 0.

### Performance evaluation for LDA with BOW

In [23]:
for index, score in sorted(lda_model[bow_corpus[40]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 20)))


Score: 0.7672661542892456	 
Topic: 0.014*"nuclear" + 0.012*"growth" + 0.011*"allianc" + 0.011*"billion" + 0.011*"economi" + 0.010*"cooper" + 0.010*"improv" + 0.010*"independ" + 0.010*"month" + 0.010*"alli" + 0.009*"social" + 0.009*"capac" + 0.009*"product" + 0.009*"soviet" + 0.009*"step" + 0.008*"propos" + 0.008*"latin" + 0.008*"expand" + 0.008*"hop" + 0.008*"measur"

Score: 0.20850178599357605	 
Topic: 0.123*"nuclear" + 0.084*"treati" + 0.052*"arm" + 0.051*"soviet" + 0.035*"step" + 0.027*"agreement" + 0.026*"risk" + 0.020*"small" + 0.019*"disarma" + 0.017*"conduct" + 0.016*"debat" + 0.015*"destruct" + 0.013*"hazard" + 0.013*"threat" + 0.013*"mankind" + 0.013*"berlin" + 0.013*"produc" + 0.012*"prevent" + 0.012*"spread" + 0.011*"destroy"

Score: 0.018662666901946068	 
Topic: 0.074*"senat" + 0.049*"question" + 0.028*"vice" + 0.024*"vote" + 0.018*"suggest" + 0.015*"soviet" + 0.015*"latin" + 0.014*"move" + 0.013*"parti" + 0.013*"ahead" + 0.012*"major" + 0.012*"growth" + 0.012*"feder" + 0.

### Performance evaluation on generated text

In [None]:
output_text_file = '/content/drive/MyDrive/gpt2_outputs/gen_hitler.text'
with open(output_text_file, 'r') as f:
  output_text = f.read()

# USING BOW
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Topic:")
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))