In [None]:
import pandas as pd
import numpy as np
import re
import pprint
from collections import defaultdict

import string
from string import punctuation

import os

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
init_notebook_mode(connected=True)

import gensim
from gensim import corpora, models, similarities

import nltk
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Location of the transcript CSV. The default is the original local file; set
# the ALLDEBATES_CSV environment variable to run the notebook on another
# machine without editing this cell.
DEBATES_CSV = os.environ.get('ALLDEBATES_CSV', r'C:\Users\Luca Nannini\Desktop/AllDebates.csv')

# Index the rows by the "TURN STARTS" column so that later cells can slice
# segments with .loc[start:end].
df = pd.read_csv(DEBATES_CSV).set_index("TURN STARTS")

# The first 131 rows are used as the first debate.
first_debate = df.iloc[:131]
# first debate text: one list entry per row of the TEXT column
debate = list(first_debate.TEXT)

# First Debate Overall LDA

In [None]:
# Lower-case every document and split it on whitespace into raw tokens.
debtok = [document.lower().split() for document in debate]

In [None]:
# strip digits from every token; tokens that end up empty are dropped
debate = [[tok for tok in (re.sub(r'\d+', '', x) for x in y) if tok] for y in debtok]
# remove words of one or two characters (together with any non-word
# characters directly in front of them)
debate = [[tok for tok in (re.sub(r'\W*\b\w{1,2}\b', '', x) for x in y) if tok] for y in debate]
# remove punctuation -- the pattern is a raw string so the backslashes reach
# the regex engine instead of being parsed as invalid Python string escapes;
# the set of characters removed is unchanged
debate = [[tok for tok in (re.sub(r"[\.\,\…\!\?\:\;\-\—\_\=\*\@\#\$\"\''\``]", '', x) for x in y) if tok]
          for y in debate]

# remove common words: NLTK's English stop words plus corpus-specific filler
stoplist = stopwords.words('english')
stoplist = ['people','want','know','believe','you','really','lot','tell','young','stopandfrisk','that','get','got','much','many','put','kind','thanks','thank','think','well','nobody','take','taken','taking','going','go','things','maybe','something','yes','way','would','could','actually','almost','see','seen','sean','called','thing','let','done','went','say','whether','said','look','one','like','also','good','new','ever','little','cannot','everything','lester','even','hannity'] + list(stoplist)
# set copy for constant-time membership tests; `stoplist` itself stays a list
stop_lookup = set(stoplist)

cleaned_debate = [
    [word for word in document if word not in stop_lookup]
    for document in debate
]

In [None]:
# Map every token to an integer id and encode each document as a bag of words.
dictionary = corpora.Dictionary(cleaned_debate)
corpus = [dictionary.doc2bow(text) for text in cleaned_debate]

total_topics = 5
# LDA training is stochastic; a fixed random_state makes the topics
# reproducible across kernel restarts.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=total_topics, passes=50,
                      per_word_topics=True, random_state=23)

In [None]:
# Show 10 words for every topic. The topic count is read from the model itself
# rather than from the global `total_topics`, which later cells rebind.
lda.show_topics(num_topics=lda.num_topics, num_words=10)

### First Debate Semantic Heatmap

In [None]:
from collections import OrderedDict

# For each topic id, the 25 (word, weight) pairs returned by show_topic, in order.
# Iterate over lda.num_topics instead of the global `total_topics`: later cells
# rebind `total_topics`, so re-running this cell afterwards would silently
# drop topics.
data_lda = {i: OrderedDict(lda.show_topic(i, 25)) for i in range(lda.num_topics)}

In [None]:
# One column per topic id, one row per word; a word missing from a topic's
# dictionary becomes NaN.
topic_frame = pd.DataFrame(data_lda)
print(topic_frame.shape)
# Replace the missing weights with 0 and transpose.
df_lda = topic_frame.fillna(0).T
print(df_lda.shape)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

g=sns.clustermap(df_lda.corr(), center=0, cmap="RdBu", metric='cosine', linewidths=1, figsize=(10, 12))
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
plt.show()
#plt.setp(ax_heatmap.get_yticklabels(), rotation=0)  # For y axis

### First Debate pyLDAvis

In [None]:
import pyLDAvis.gensim

# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()
# Prepare the visualisation data for the overall model, with t-SNE as the
# scaling method.
panel = pyLDAvis.gensim.prepare(topic_model=lda, corpus=corpus, dictionary=dictionary, mds='TSNE')
panel

### First Debate Topic WordCloud

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.XKCD_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# Request every topic of the model explicitly instead of relying on the
# default topic count of show_topics.
topics = lda.show_topics(num_topics=lda.num_topics, formatted=False)

# One subplot per topic (previously hard-coded to 5); squeeze=False keeps
# `axes` an array even when there is a single topic.
fig, axes = plt.subplots(len(topics), figsize=(10,10), sharex=True, sharey=True, squeeze=False)

for i, ax in enumerate(axes.flatten()):
    # The topic colour is bound as a default argument when the lambda is
    # created, so the colour function no longer depends on the global loop
    # variable `i` at call time.
    cloud = WordCloud(background_color='white',
                      width=2000,
                      height=1400,
                      max_words=15,
                      colormap='tab10',
                      color_func=lambda *args, _color=cols[i % len(cols)], **kwargs: _color,
                      prefer_horizontal=1.0)
    # topics[i] is (topic_id, [(word, weight), ...]); use the weights as frequencies.
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=200)
    # Draw on the subplot directly instead of going through plt.gca().
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
    ax.axis('off')


fig.subplots_adjust(wspace=0, hspace=0)
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

### FastText Debate Word Embeddings 

In [None]:
from gensim.models import FastText
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

# Train FastText embeddings on the cleaned debate tokens.
model = FastText(sentences=cleaned_debate, size=100, window=50, min_count=10, workers=6)
# print the model's summary representation
print(model)
# collect the model vocabulary as a list of words
words = list(model.wv.vocab.keys())

In [None]:
def tsne_plot(model):
    """Project the model's word vectors to 2-D with t-SNE and plot them.

    Every word in ``model.wv.vocab`` is drawn as one scatter point annotated
    with the word itself.

    Parameters
    ----------
    model
        A trained gensim embedding model exposing ``model.wv.vocab`` and
        ``model.wv[word]``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    labels = list(model.wv.vocab)
    # Look the vectors up through `model.wv`: indexing the model object itself
    # (`model[word]`) is deprecated in gensim 3.x and removed in gensim 4.
    tokens = [model.wv[word] for word in labels]

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per word, as before, so each point keeps its own colour
    # from the default colour cycle.
    for (x, y), label in zip(new_values, labels):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
tsne_plot(model)

***

# LDA analysis of each segment

- Achieving Prosperity: job creation, bringing back American manufacturers that moved abroad, tax policy.
- Candidates’ private scandals: the release of Trump’s tax returns, Clinton’s e-mail scandal.
- America’s Direction: healing race relations, police bias, Trump’s questioning of the legitimacy of Obama’s birth certificate.
- Securing America: cyber attacks on national institutions, ISIS, homegrown terrorist attacks, the Iraq War, nuclear weapons policy.
- Mutual Acceptance & Election Outcome: Trump’s opinion of Clinton as a public figure, acceptance of the election outcome.
 ***

In [None]:
# TEXT of the rows whose index label lies between '09:04:52' and '09:30:11'
# (label slicing with .loc includes both ends).
Deb_I = first_debate.loc['09:04:52':'09:30:11', 'TEXT'].tolist()

In [None]:
# NOTE: this cell overwrites Deb_I (strings -> token lists); re-run the cell
# that builds Deb_I before re-running this one.
# lower-case each document and split it on whitespace
Deb_I = [document.lower().split() for document in Deb_I]

# strip digits from every token; tokens that end up empty are dropped
Deb_I = [[tok for tok in (re.sub(r'\d+', '', x) for x in y) if tok] for y in Deb_I]
# remove words of one or two characters (together with any non-word
# characters directly in front of them)
Deb_I = [[tok for tok in (re.sub(r'\W*\b\w{1,2}\b', '', x) for x in y) if tok] for y in Deb_I]
# remove punctuation -- raw string so the backslashes reach the regex engine
# instead of being parsed as invalid Python string escapes; same character set
Deb_I = [[tok for tok in (re.sub(r"[\.\,\…\!\?\:\;\-\—\_\=\*\@\#\$\"\''\``]", '', x) for x in y) if tok]
         for y in Deb_I]
# remove common words: NLTK's English stop words plus corpus-specific filler
stoplist = stopwords.words('english')
stoplist = ['know','believe','you','really','lot','that','get','got','much','many','put','kind','thanks','thank','think','well','take','taken','going','go','things','maybe','something','yes','way','would','could','actually','almost','see','sean','called','thing','let','done','went','say','whether','said','look','one','like','also','good','new','ever','little','cannot','everything','lester','even','hannity'] + list(stoplist)
# set copy for constant-time membership tests; `stoplist` itself stays a list
stop_lookup = set(stoplist)
Deb_I = [
    [word for word in document if word not in stop_lookup]
    for document in Deb_I
]

In [None]:
# Map every token to an integer id and encode each document as a bag of words.
dictionaryI = corpora.Dictionary(Deb_I)
corpusI = [dictionaryI.doc2bow(text) for text in Deb_I]

total_topics = 3
# LDA training is stochastic; a fixed random_state makes the topics reproducible.
lda_I = models.LdaModel(corpusI, id2word=dictionaryI, num_topics=total_topics, random_state=23)

In [None]:
# Show 10 words for every topic; the topic count comes from the model, not
# from the global `total_topics`, which other cells rebind.
lda_I.show_topics(num_topics=lda_I.num_topics, num_words=10)

***

In [None]:
# TEXT of the rows whose index label lies between '09:31:38' and '09:41:42'
# (label slicing with .loc includes both ends).
Deb_II = first_debate.loc['09:31:38':'09:41:42', 'TEXT'].tolist()

In [None]:
# NOTE: this cell overwrites Deb_II (strings -> token lists); re-run the cell
# that builds Deb_II before re-running this one.
# lower-case each document and split it on whitespace
Deb_II = [document.lower().split() for document in Deb_II]
# strip digits from every token; tokens that end up empty are dropped
Deb_II = [[tok for tok in (re.sub(r'\d+', '', x) for x in y) if tok] for y in Deb_II]
# remove words of one or two characters (together with any non-word
# characters directly in front of them)
Deb_II = [[tok for tok in (re.sub(r'\W*\b\w{1,2}\b', '', x) for x in y) if tok] for y in Deb_II]
# remove punctuation -- raw string so the backslashes reach the regex engine
# instead of being parsed as invalid Python string escapes; same character set
Deb_II = [[tok for tok in (re.sub(r"[\.\,\…\!\?\:\;\-\—\_\=\*\@\#\$\"\''\``]", '', x) for x in y) if tok]
          for y in Deb_II]
# remove common words: NLTK's English stop words plus corpus-specific filler
stoplist = stopwords.words('english')
stoplist = ['know','believe','you','really','lot','that','get','got','much','many','put','kind','thanks','thank','think','well','take','taken','going','go','things','maybe','something','yes','way','would','could','actually','almost','see','sean','called','thing','let','done','went','say','whether','said','look','one','like','also','good','new','ever','little','cannot','everything','lester','even','hannity'] + list(stoplist)
# set copy for constant-time membership tests; `stoplist` itself stays a list
stop_lookup = set(stoplist)
Deb_II = [
    [word for word in document if word not in stop_lookup]
    for document in Deb_II
]

In [None]:
# Map every token to an integer id and encode each document as a bag of words.
dictionaryII = corpora.Dictionary(Deb_II)
corpusII = [dictionaryII.doc2bow(text) for text in Deb_II]

total_topics = 3
# LDA training is stochastic; a fixed random_state makes the topics reproducible.
lda_II = models.LdaModel(corpusII, id2word=dictionaryII, num_topics=total_topics, random_state=23)


# NOTE(review): this cell has been seen to fail with
# "IndexError: index 391 is out of bounds for axis 1 with size 386".
# The known workaround is to shut down and restart the kernel, then re-run.
# Presumably caused by stale state from out-of-order execution -- TODO confirm.

In [None]:
# Show 10 words for every topic; the topic count comes from the model, not
# from the global `total_topics`, which other cells rebind.
lda_II.show_topics(num_topics=lda_II.num_topics, num_words=10)

***

In [None]:
# TEXT of the rows whose index label lies between '09:44:06' and '10:04:10'
# (label slicing with .loc includes both ends).
Deb_III = first_debate.loc['09:44:06':'10:04:10', 'TEXT'].tolist()

In [None]:
# NOTE: this cell overwrites Deb_III (strings -> token lists); re-run the cell
# that builds Deb_III before re-running this one.
# lower-case each document and split it on whitespace
Deb_III = [document.lower().split() for document in Deb_III]
# strip digits from every token; tokens that end up empty are dropped
Deb_III = [[tok for tok in (re.sub(r'\d+', '', x) for x in y) if tok] for y in Deb_III]
# remove words of one or two characters (together with any non-word
# characters directly in front of them)
Deb_III = [[tok for tok in (re.sub(r'\W*\b\w{1,2}\b', '', x) for x in y) if tok] for y in Deb_III]
# remove punctuation -- raw string so the backslashes reach the regex engine
# instead of being parsed as invalid Python string escapes; same character set
Deb_III = [[tok for tok in (re.sub(r"[\.\,\…\!\?\:\;\-\—\_\=\*\@\#\$\"\''\``]", '', x) for x in y) if tok]
           for y in Deb_III]
# remove common words: NLTK's English stop words plus corpus-specific filler
stoplist = stopwords.words('english')
stoplist = ['know','believe','you','really','lot','that','get','got','much','many','put','kind','thanks','thank','think','well','take','taken','going','go','things','maybe','something','yes','way','would','could','actually','almost','see','sean','called','thing','let','done','went','say','whether','said','look','one','like','also','good','new','ever','little','cannot','everything','lester','even','hannity'] + list(stoplist)
# set copy for constant-time membership tests; `stoplist` itself stays a list
stop_lookup = set(stoplist)
Deb_III = [
    [word for word in document if word not in stop_lookup]
    for document in Deb_III
]

In [None]:
# Map every token to an integer id and encode each document as a bag of words.
dictionaryIII = corpora.Dictionary(Deb_III)
corpusIII = [dictionaryIII.doc2bow(text) for text in Deb_III]

total_topics = 3
# LDA training is stochastic; a fixed random_state makes the topics reproducible.
lda_III = models.LdaModel(corpusIII, id2word=dictionaryIII, num_topics=total_topics, random_state=23)

In [None]:
# Show 10 words for every topic; the topic count comes from the model, not
# from the global `total_topics`, which other cells rebind.
lda_III.show_topics(num_topics=lda_III.num_topics, num_words=10)

*** 
<br>

In [None]:
# TEXT of the rows whose index label lies between '10:06:26' and '10:31:51'
# (label slicing with .loc includes both ends).
Deb_IV = first_debate.loc['10:06:26':'10:31:51', 'TEXT'].tolist()

In [None]:
# NOTE: this cell overwrites Deb_IV (strings -> token lists); re-run the cell
# that builds Deb_IV before re-running this one.
# lower-case each document and split it on whitespace
Deb_IV = [document.lower().split() for document in Deb_IV]
# strip digits from every token; tokens that end up empty are dropped
Deb_IV = [[tok for tok in (re.sub(r'\d+', '', x) for x in y) if tok] for y in Deb_IV]
# remove words of one or two characters (together with any non-word
# characters directly in front of them)
Deb_IV = [[tok for tok in (re.sub(r'\W*\b\w{1,2}\b', '', x) for x in y) if tok] for y in Deb_IV]
# remove punctuation -- raw string so the backslashes reach the regex engine
# instead of being parsed as invalid Python string escapes; same character set
Deb_IV = [[tok for tok in (re.sub(r"[\.\,\…\!\?\:\;\-\—\_\=\*\@\#\$\"\''\``]", '', x) for x in y) if tok]
          for y in Deb_IV]
# remove common words: NLTK's English stop words plus corpus-specific filler
stoplist = stopwords.words('english')
stoplist = ['know','believe','you','really','lot','that','get','got','much','many','put','kind','thanks','thank','think','well','take','taken','going','go','things','maybe','something','yes','way','would','could','actually','almost','see','sean','called','thing','let','done','went','say','whether','said','look','one','like','also','good','new','ever','little','cannot','everything','lester','even','hannity'] + list(stoplist)
# set copy for constant-time membership tests; `stoplist` itself stays a list
stop_lookup = set(stoplist)
Deb_IV = [
    [word for word in document if word not in stop_lookup]
    for document in Deb_IV
]

In [None]:
# Map every token to an integer id and encode each document as a bag of words.
dictionaryIV = corpora.Dictionary(Deb_IV)
corpusIV = [dictionaryIV.doc2bow(text) for text in Deb_IV]

total_topics = 3
# LDA training is stochastic; a fixed random_state makes the topics reproducible.
lda_IV = models.LdaModel(corpusIV, id2word=dictionaryIV, num_topics=total_topics, random_state=23)

In [None]:
# Show 10 words for every topic; the topic count comes from the model, not
# from the global `total_topics`, which other cells rebind.
lda_IV.show_topics(num_topics=lda_IV.num_topics, num_words=10)

*** 
<br>

In [None]:
# TEXT of the rows whose index label lies between '10:33:04' and '10:37:43'
# (label slicing with .loc includes both ends).
Deb_V = first_debate.loc['10:33:04':'10:37:43', 'TEXT'].tolist()

In [None]:
# NOTE: this cell overwrites Deb_V (strings -> token lists); re-run the cell
# that builds Deb_V before re-running this one.
# lower-case each document and split it on whitespace
Deb_V = [document.lower().split() for document in Deb_V]
# strip digits from every token; tokens that end up empty are dropped
Deb_V = [[tok for tok in (re.sub(r'\d+', '', x) for x in y) if tok] for y in Deb_V]
# remove words of one or two characters (together with any non-word
# characters directly in front of them)
Deb_V = [[tok for tok in (re.sub(r'\W*\b\w{1,2}\b', '', x) for x in y) if tok] for y in Deb_V]
# remove punctuation -- raw string so the backslashes reach the regex engine
# instead of being parsed as invalid Python string escapes; same character set
Deb_V = [[tok for tok in (re.sub(r"[\.\,\…\!\?\:\;\-\—\_\=\*\@\#\$\"\''\``]", '', x) for x in y) if tok]
         for y in Deb_V]
# remove common words: NLTK's English stop words plus corpus-specific filler
stoplist = stopwords.words('english')
stoplist = ['know','believe','you','really','lot','that','get','got','much','many','put','kind','thanks','thank','think','well','take','taken','going','go','things','maybe','something','yes','way','would','could','actually','almost','see','sean','called','thing','let','done','went','say','whether','said','look','one','like','also','good','new','ever','little','cannot','everything','lester','even','hannity'] + list(stoplist)
# set copy for constant-time membership tests; `stoplist` itself stays a list
stop_lookup = set(stoplist)
Deb_V = [
    [word for word in document if word not in stop_lookup]
    for document in Deb_V
]

In [None]:
# Map every token to an integer id and encode each document as a bag of words.
dictionaryV = corpora.Dictionary(Deb_V)
corpusV = [dictionaryV.doc2bow(text) for text in Deb_V]

total_topics = 2
# LDA training is stochastic; a fixed random_state makes the topics reproducible.
lda_V = models.LdaModel(corpusV, id2word=dictionaryV, num_topics=total_topics, random_state=23)

In [None]:
# Show 10 words for every topic; the topic count comes from the model, not
# from the global `total_topics`, which other cells rebind.
lda_V.show_topics(num_topics=lda_V.num_topics, num_words=10)