# Packages Required

In [None]:
import pandas as pd
import numpy as np
import re
import pprint
from collections import defaultdict

import string
from string import punctuation

import os

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
init_notebook_mode(connected=True)

import gensim
from gensim import corpora, models, similarities

import nltk
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Time conversion of Tweets  

In [None]:
# Read the cleaned tweet export, then keep only the rows whose 'RT' flag is False.
# The last line displays the filtered frame as the cell output.
tweets = pd.read_csv(r'C:\Users\Luca Nannini\Desktop/TweetsClean.csv')
rt_flag = tweets['RT']
tweet = tweets[rt_flag == False]
tweet

In [None]:
#import datetime
from datetime import datetime
from datetime import timedelta

# Parse each 'created_at' string (ISO-like, with a literal trailing 'Z') into a naive datetime.
clean_timestamp =  tweet['created_at'].apply(lambda x: datetime.strptime(x,'%Y-%m-%dT%H:%M:%SZ'))
# Offset, in hours, applied to every parsed timestamp.
# NOTE(review): -16 h is not a real UTC -> US Eastern offset (that would be -4 h or -5 h).
# It looks like -4 h minus a further 12 h, which turns evening times into the 09:xx-10:xx
# labels used by the segment slices further down -- confirm before changing this value.
offset_hours = -16
# Shift every timestamp by the offset above.
local_timestamp = clean_timestamp + timedelta(hours=offset_hours)  

In [None]:
# Render each shifted timestamp as a time-of-day string.
# NOTE(review): '%X' is the locale's time representation; the segment slices below assume
# it yields 'HH:MM:SS' (e.g. '09:04:53'), which presumably holds for the C/English locale -- confirm.
final_timestamp = local_timestamp.dt.strftime('%X')

In [None]:
# Pair each tweet's text with its time-of-day string and index the frame by TIME,
# so that rows can later be selected with TIME label slices.
TIME = final_timestamp
TEXT = tweet['text']
new_tweet = pd.DataFrame({'TIME': TIME, 'TEXT': TEXT}).set_index('TIME')

In [None]:
new_tweet

# Overall tweet volume & tweets LDA 

### Tweet Volume graph

N.B. To plot the tweet volume, TIME must be an ordinary column, so the next cell moves it out of the index with `reset_index`.
Do NOT run that cell if you intend to slice `new_tweet` into segments, because the segment cells select rows by their TIME index labels.
Before the segment analysis section below, RUN the entire time conversion section AGAIN to restore the original `new_tweet` dataframe indexed by TIME.

In [None]:
# Move TIME out of the index and back into a regular column (mutates new_tweet in place);
# the histogram cell that follows reads new_tweet['TIME'].
new_tweet.reset_index(inplace= True)

In [None]:
# Histogram of tweet volume over the whole debate.
# The TIME strings are parsed back into datetimes so they can serve as the x axis.
tweets_ratio = pd.to_datetime(new_tweet['TIME'], format='%X')

trace = go.Histogram(x=tweets_ratio, marker={'color': 'lightblue'}, opacity=0.75)

layout = go.Layout(
    title='Overall Debate',
    height=450,
    width=1200,
    xaxis={'title': 'Time Segment for each bar = 29sec'},
    yaxis={'title': 'Tweet Volume'},
    bargap=0.2,
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

### Tweets LDA 

In [None]:
# Tokenise every tweet and discard tokens that occur only once in the whole corpus.
tweets = list(new_tweet.TEXT)

# Lower-case each tweet and split it on whitespace.
twtok = [document.lower().split() for document in tweets]

# Corpus-wide token counts.
frequency = defaultdict(int)
for text in twtok:
    for token in text:
        frequency[token] += 1

# Keep a token only if it was counted more than once.
debtok = [[token for token in text if frequency[token] > 1] for text in twtok]

In [None]:
# Token-level clean-up. Each (pattern, replacement) pair is applied to every token in the
# order listed; a token that ends up empty after a step is dropped from its document.
_cleanup_steps = [
    (r'\b\w{1,2}\b', ''),        # standalone runs of one or two word characters
    (r'\d+', ''),                # digit runs
    (r'(@[A-Za-z0-9]+)', ''),    # @mentions
    (r'[\.\,\'\"\!\?\:\;\-\_\/\=\(\)\|\*\&\@\#\$\"]+', ''),  # listed punctuation characters
    (r'[^\x00-\x7F]+', ''),      # non-ASCII characters
    (r'(.)\1{2,}', r'\1'),       # a character repeated 3+ times collapses to one
]

tweets = debtok
for _pattern, _replacement in _cleanup_steps:
    tweets = [
        [_tok for _tok in (re.sub(_pattern, _replacement, _raw) for _raw in _doc) if _tok]
        for _doc in tweets
    ]

In [None]:
# Debate-specific stop words, extended with NLTK's English stop-word list.
# The NLTK list is fetched here directly: the `stoplist` variable is only assigned in a
# later cell, so referencing it at this point raises NameError on a top-to-bottom run.
stoplist1 = [
    'hillary','clinton','trump','donald','debatenight','debate','people','want','know',
    'believe','you','really','lot','tell','young','stopandfrisk','that','get','got','much',
    'many','put','kind','thanks','thank','think','well','nobody','take','taken','taking',
    'going','go','things','maybe','something','yes','way','would','could','actually',
    'almost','see','seen','sean','called','thing','let','done','went','say','whether',
    'said','look','one','like','also','good','new','ever','little','cannot','everything',
    'lester','even','hannity',
] + list(stopwords.words('english'))

In [None]:
# Drop stop words from every cleaned document, then lemmatise what remains.
lmtzr = WordNetLemmatizer()
stoplist = stopwords.words('english')

# Build the lookup once: membership tests against a set are O(1), whereas scanning the
# stoplist1 list for every token is linear in the list length.
_stop_lookup = set(stoplist1)

# NOTE(review): filtering happens before lemmatisation, so an inflected form whose lemma
# is a stop word (but which is not itself listed) is kept -- confirm this is intended.
cleaned_tweets = [
    [lmtzr.lemmatize(word) for word in document if word not in _stop_lookup]
    for document in tweets
]

In [None]:
cleaned_tweets

In [None]:
# Map every distinct token to an integer id, then encode each document as a bag of words.
dictionary = corpora.Dictionary(cleaned_tweets)
corpus = list(map(dictionary.doc2bow, cleaned_tweets))

In [None]:
# Fit an LDA model with five topics on the whole-debate corpus.
total_topics = 5
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=total_topics)

In [None]:
# Display every topic with its ten highest-weighted words.
lda.show_topics(num_topics=total_topics, num_words=10)

### Tweets Semantic Heatmap

In [None]:
from collections import OrderedDict

# For each topic id, an ordered word -> weight mapping of its top 25 words.
data_lda = {}
for topic_id in range(total_topics):
    data_lda[topic_id] = OrderedDict(lda.show_topic(topic_id, 25))

In [None]:
# Build a frame from the per-topic mappings, print its shape, then zero-fill the gaps
# (words absent from a topic), transpose, and print the shape again.
df_lda = pd.DataFrame(data_lda)
print(df_lda.shape)
zero_filled = df_lda.fillna(0)
df_lda = zero_filled.T
print(df_lda.shape)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Clustered heatmap of the column-wise correlation matrix of df_lda.
lda_corr = df_lda.corr()
g = sns.clustermap(
    lda_corr,
    center=0,
    cmap="RdBu",
    metric='cosine',
    linewidths=1,
    figsize=(10, 12),
)
# Keep the row tick labels horizontal.
y_tick_labels = g.ax_heatmap.yaxis.get_majorticklabels()
plt.setp(y_tick_labels, rotation=0)
plt.show()

### Tweets pyLDAvis

In [None]:
import pyLDAvis.gensim

# Turn on inline notebook rendering for pyLDAvis.
pyLDAvis.enable_notebook()
# Prepare the interactive view from the overall LDA model, its corpus and its dictionary;
# mds='TSNE' is passed as the scaling method for the topic layout.
panel = pyLDAvis.gensim.prepare(lda, corpus, dictionary, mds='TSNE')
# Display the panel as the cell output.
panel

### Tweets Debate Topic WordCloud

***

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

# One word cloud per LDA topic, each drawn in a single colour taken from the XKCD palette.
cols = list(mcolors.XKCD_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'

topics = lda.show_topics(formatted=False)
# Size the grid from the topics actually returned instead of a hard-coded 5, so indexing
# `topics` cannot run past the end when the model has fewer topics.
n_topics = len(topics)

# The colour callback reads this holder, which the loop updates explicitly; this replaces
# the previous reliance on a global loop variable that happened to be named `i`.
active_color = {'value': None}

cloud = WordCloud(background_color='white',
                  width=2000,
                  height=1400,
                  max_words=15,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: active_color['value'],
                  prefer_horizontal=1.0)

# squeeze=False always yields a 2-D array of axes, so flatten() also works for one topic.
fig, axes = plt.subplots(n_topics, figsize=(10,10), sharex=True, sharey=True, squeeze=False)

for i, ax in enumerate(axes.flatten()):
    active_color['value'] = cols[i]
    topic_words = dict(topics[i][1])  # word -> weight for this topic
    cloud.generate_from_frequencies(topic_words, max_font_size=200)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
    ax.axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

### FastText Debate Word Embeddings 

In [None]:
from gensim.models import FastText
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

# Train a FastText model on the cleaned whole-debate documents; min_count=500 is passed
# as the minimum-count threshold.
# NOTE(review): `size=` and `model.wv.vocab` are gensim 3.x names (gensim 4 uses
# `vector_size` and `wv.key_to_index`) -- confirm the installed gensim version.
model = FastText(cleaned_tweets, size=100, window=50, min_count=500, workers=6)
# print a summary of the trained model
print(model)
# collect the model's vocabulary as a list of words
words = list(model.wv.vocab)

In [None]:
def tsne_plot(model):
    """Plot a 2-D t-SNE projection of every word vector held by *model*.

    Each vocabulary word is drawn as one scatter point annotated with the word itself.

    Args:
        model: a trained gensim model exposing its keyed vectors as ``model.wv``.

    Raises:
        ValueError: if the model's vocabulary is empty.
    """
    keyed_vectors = model.wv
    # gensim 4 exposes the vocabulary as `key_to_index`; gensim 3 as `vocab`.
    if hasattr(keyed_vectors, 'key_to_index'):
        vocabulary = keyed_vectors.key_to_index
    else:
        vocabulary = keyed_vectors.vocab

    labels = list(vocabulary)
    if not labels:
        raise ValueError("tsne_plot: the model vocabulary is empty, nothing to project")

    # Look vectors up through `wv`; indexing the model object directly is deprecated.
    tokens = [keyed_vectors[word] for word in labels]

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per word, so every point takes the next colour of the cycle.
    for label, (x_pos, y_pos) in zip(labels, new_values):
        plt.scatter(x_pos, y_pos)
        plt.annotate(label,
                     xy=(x_pos, y_pos),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
tsne_plot(model)

<br>

## Debate & Tweets Segment Time Frames

For the tweet segments, each segment is extended by 5 seconds after the end of the final sentence spoken by a candidate before the next segment begins.

- Achieving Prosperity: jobs creation, bringing back expatriated American manufacturers, tax policy.
- Candidates private scandals: Trump’s tax return release, Clinton’s e-mails scandal.
- America’s Direction: healing race relations, police bias, Trump’s questioning Obama’s birth certificate legitimacy.
- Securing America: national institutions cyber attacks, ISIS, homegrown terroristic attacks, Iraq War, nuclear weapons policy.
- Mutual Acceptance & Election Outcome: Trump’s opinion on Clinton’s public figure, acceptance of election outcome.
 ***

N.B. Before running this section, RUN the ENTIRE time conversion section AGAIN to obtain the original "new_tweet" dataframe indexed by TIME; otherwise the following cells will output lists of strings with no tokens.

### I° Segment, "Achieving Prosperity"

In [None]:
# "Achieving Prosperity" segment: select the rows by TIME index label between the two
# bounds, then turn TIME back into a regular column.
TW_I = new_tweet.loc['09:04:53':'09:31:23'].reset_index()

In [None]:
# Tweet-volume histogram for segment I; TIME strings are parsed into datetimes for the x axis.
TW_I_TIME = TW_I['TIME']
tweets_ratio_I = pd.to_datetime(TW_I_TIME, format='%X')

trace = go.Histogram(x=tweets_ratio_I, marker={'color': 'lightblue'}, opacity=0.75)

layout = go.Layout(
    title='I. Achieving Prosperity',
    height=450,
    width=1200,
    xaxis={'title': 'Time Segment for each bar = 14sec'},
    yaxis={'title': 'Tweet Volume'},
    bargap=0.2,
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

In [None]:
# Tokenise the segment I tweets and discard tokens that occur only once in the segment.
tweets1 = list(TW_I.TEXT)

# Lower-case each tweet and split it on whitespace.
twtok1 = [document.lower().split() for document in tweets1]

# Segment-wide token counts.
frequency = defaultdict(int)
for text in twtok1:
    for token in text:
        frequency[token] += 1

# Keep a token only if it was counted more than once.
debtok = [[token for token in text if frequency[token] > 1] for text in twtok1]

In [None]:
# Token-level clean-up for segment I. Each (pattern, replacement) pair is applied to every
# token in the order listed; a token that ends up empty after a step is dropped.
_cleanup_steps = [
    (r'\b\w{1,2}\b', ''),        # standalone runs of one or two word characters
    (r'\d+', ''),                # digit runs
    (r'(@[A-Za-z0-9]+)', ''),    # @mentions
    # '&' is included here so this segment strips the same punctuation set as the
    # whole-debate clean-up and segments II-IV (it was missing from this copy).
    (r'[\.\,\'\"\!\?\:\;\-\_\/\=\(\)\|\*\&\@\#\$\"]+', ''),
    (r'[^\x00-\x7F]+', ''),      # non-ASCII characters
    (r'(.)\1{2,}', r'\1'),       # a character repeated 3+ times collapses to one
]

tweets = debtok
for _pattern, _replacement in _cleanup_steps:
    tweets = [
        [_tok for _tok in (re.sub(_pattern, _replacement, _raw) for _raw in _doc) if _tok]
        for _doc in tweets
    ]

# Drop stop words (custom + English list in stoplist1), then lemmatise what remains.
lmtzr = WordNetLemmatizer()
stoplist = stopwords.words('english')
cleaned_tweets1 = [
    [lmtzr.lemmatize(word) for word in document if word not in stoplist1]
    for document in tweets
]

cleaned_tweets1

In [None]:
# Token -> id dictionary and bag-of-words corpus for segment I.
dictionary1 = corpora.Dictionary(cleaned_tweets1)
corpus1 = list(map(dictionary1.doc2bow, cleaned_tweets1))

In [None]:
# Fit a three-topic LDA model on segment I and display its topics.
# The model is bound to `lda1` (matching lda2..lda5 in the other segments) so it no longer
# overwrites `lda`, the whole-debate model that the heatmap, pyLDAvis and word-cloud cells read.
# Training and display share one cell, as in the other segments.
total_topics = 3
lda1 = models.LdaModel(corpus1, id2word=dictionary1, num_topics=total_topics)

lda1.show_topics(total_topics,10)

<br>

### II° Segment, "Candidate Figure Issues"

In [None]:
# "Candidate Figure Issues" segment: select the rows by TIME index label between the two
# bounds, then turn TIME back into a regular column.
TW_II = new_tweet.loc['09:31:38':'09:43:41'].reset_index()

In [None]:
# Tweet-volume histogram for segment II; TIME strings are parsed into datetimes for the x axis.
TW_II_TIME = TW_II['TIME']
tweets_ratio_II = pd.to_datetime(TW_II_TIME, format='%X')

trace = go.Histogram(x=tweets_ratio_II, marker={'color': 'lightblue'}, opacity=0.75)

layout = go.Layout(
    title='II. Candidate Figure Issues',
    height=450,
    width=1200,
    xaxis={'title': 'Time Segment for each bar = 9sec'},
    yaxis={'title': 'Tweet Volume'},
    bargap=0.2,
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

In [None]:
# Segment II preprocessing: tokenise, drop one-off tokens, clean, remove stop words, lemmatise.
tweets2 = list(TW_II.TEXT)

# Lower-case each tweet and split it on whitespace.
twtok2 = [document.lower().split() for document in tweets2]

# Segment-wide token counts.
frequency = defaultdict(int)
for text in twtok2:
    for token in text:
        frequency[token] += 1

# Keep a token only if it was counted more than once.
debtok2 = [[token for token in text if frequency[token] > 1] for text in twtok2]

# Each (pattern, replacement) pair is applied to every token in the order listed;
# a token that ends up empty after a step is dropped.
_cleanup_steps = [
    (r'\b\w{1,2}\b', ''),        # standalone runs of one or two word characters
    (r'\d+', ''),                # digit runs
    (r'(@[A-Za-z0-9]+)', ''),    # @mentions
    (r'[\.\,\'\"\!\?\:\;\_\-\/\=\(\)\|\*\@\&\#\$\"]+', ''),  # listed punctuation characters
    (r'[^\x00-\x7F]+', ''),      # non-ASCII characters
    (r'(.)\1{2,}', r'\1'),       # a character repeated 3+ times collapses to one
]

tweets = debtok2
for _pattern, _replacement in _cleanup_steps:
    tweets = [
        [_tok for _tok in (re.sub(_pattern, _replacement, _raw) for _raw in _doc) if _tok]
        for _doc in tweets
    ]

# Drop stop words listed in stoplist1, then lemmatise what remains.
lmtzr = WordNetLemmatizer()
cleaned_tweets2 = [
    [lmtzr.lemmatize(word) for word in document if word not in stoplist1]
    for document in tweets
]

cleaned_tweets2

In [None]:
# Token -> id dictionary and bag-of-words corpus for segment II.
dictionary2 = corpora.Dictionary(cleaned_tweets2)
corpus2 = list(map(dictionary2.doc2bow, cleaned_tweets2))

In [None]:
# Fit a three-topic LDA model on segment II, then display each topic's ten top words.
total_topics = 3
lda2 = models.LdaModel(corpus=corpus2, id2word=dictionary2, num_topics=total_topics)

lda2.show_topics(num_topics=total_topics, num_words=10)

### III° Segment, "America's Direction"

In [None]:
# "America's Direction" segment: select the rows by TIME index label between the two
# bounds, then turn TIME back into a regular column.
TW_III = new_tweet.loc['09:44:06':'10:06:14'].reset_index()

In [None]:
# Tweet-volume histogram for segment III; TIME strings are parsed into datetimes for the x axis.
TW_III_TIME = TW_III['TIME']
tweets_ratio_III = pd.to_datetime(TW_III_TIME, format='%X')

trace = go.Histogram(x=tweets_ratio_III, marker={'color': 'lightblue'}, opacity=0.75)

layout = go.Layout(
    title="III. America's Direction",
    height=450,
    width=1200,
    xaxis={'title': 'Time Segment for each bar = 14sec'},
    yaxis={'title': 'Tweet Volume'},
    bargap=0.2,
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

In [None]:
# Segment III preprocessing: tokenise, drop one-off tokens, clean, remove stop words, lemmatise.
tweets3 = list(TW_III.TEXT)

# Lower-case each tweet and split it on whitespace.
twtok3 = [document.lower().split() for document in tweets3]

# Segment-wide token counts.
frequency = defaultdict(int)
for text in twtok3:
    for token in text:
        frequency[token] += 1

# Keep a token only if it was counted more than once. The result is named `debtok3`
# (it was `debtok2`, a copy-paste leftover that overwrote segment II's token list).
debtok3 = [[token for token in text if frequency[token] > 1] for text in twtok3]

# Each (pattern, replacement) pair is applied to every token in the order listed;
# a token that ends up empty after a step is dropped.
_cleanup_steps = [
    (r'\b\w{1,2}\b', ''),        # standalone runs of one or two word characters
    (r'\d+', ''),                # digit runs
    (r'(@[A-Za-z0-9]+)', ''),    # @mentions
    (r'[\.\,\'\"\!\?\:\;\-\_\/\=\(\)\|\*\&\@\#\$\"]+', ''),  # listed punctuation characters
    (r'[^\x00-\x7F]+', ''),      # non-ASCII characters
    (r'(.)\1{2,}', r'\1'),       # a character repeated 3+ times collapses to one
]

tweets = debtok3
for _pattern, _replacement in _cleanup_steps:
    tweets = [
        [_tok for _tok in (re.sub(_pattern, _replacement, _raw) for _raw in _doc) if _tok]
        for _doc in tweets
    ]

# Drop stop words listed in stoplist1, then lemmatise what remains.
lmtzr = WordNetLemmatizer()
cleaned_tweets3 = [
    [lmtzr.lemmatize(word) for word in document if word not in stoplist1]
    for document in tweets
]

cleaned_tweets3

In [None]:
# Token -> id dictionary and bag-of-words corpus for segment III.
dictionary3 = corpora.Dictionary(cleaned_tweets3)
corpus3 = list(map(dictionary3.doc2bow, cleaned_tweets3))

In [None]:
# Fit a three-topic LDA model on segment III, then display each topic's ten top words.
total_topics = 3
lda3 = models.LdaModel(corpus=corpus3, id2word=dictionary3, num_topics=total_topics)

lda3.show_topics(num_topics=total_topics, num_words=10)

### IV° Segment, "Securing America"

In [None]:
# "Securing America" segment: select the rows by TIME index label between the two
# bounds, then turn TIME back into a regular column.
TW_IV = new_tweet.loc['10:06:26':'10:33:00'].reset_index()

In [None]:
# Tweet-volume histogram for segment IV; TIME strings are parsed into datetimes for the x axis.
TW_IV_TIME = TW_IV['TIME']
tweets_ratio_IV = pd.to_datetime(TW_IV_TIME, format='%X')

trace = go.Histogram(x=tweets_ratio_IV, marker={'color': 'lightblue'}, opacity=0.75)

layout = go.Layout(
    title='IV. Securing America',
    height=450,
    width=1200,
    xaxis={'title': 'Time Segment for each bar = 14sec'},
    yaxis={'title': 'Tweet Volume'},
    bargap=0.2,
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

In [None]:
# Segment IV preprocessing: tokenise, drop one-off tokens, clean, remove stop words, lemmatise.
tweets4 = list(TW_IV.TEXT)

# Lower-case each tweet and split it on whitespace.
twtok4 = [document.lower().split() for document in tweets4]

# Segment-wide token counts.
frequency = defaultdict(int)
for text in twtok4:
    for token in text:
        frequency[token] += 1

# Keep a token only if it was counted more than once.
debtok4 = [[token for token in text if frequency[token] > 1] for text in twtok4]

# Each (pattern, replacement) pair is applied to every token in the order listed;
# a token that ends up empty after a step is dropped.
_cleanup_steps = [
    (r'\b\w{1,2}\b', ''),        # standalone runs of one or two word characters
    (r'\d+', ''),                # digit runs
    (r'(@[A-Za-z0-9]+)', ''),    # @mentions
    (r'[\.\,\'\"\!\?\:\;\-\_\/\=\(\)\|\*\&\@\#\$\"]+', ''),  # listed punctuation characters
    (r'[^\x00-\x7F]+', ''),      # non-ASCII characters
    (r'(.)\1{2,}', r'\1'),       # a character repeated 3+ times collapses to one
]

tweets = debtok4
for _pattern, _replacement in _cleanup_steps:
    tweets = [
        [_tok for _tok in (re.sub(_pattern, _replacement, _raw) for _raw in _doc) if _tok]
        for _doc in tweets
    ]

# Drop stop words listed in stoplist1, then lemmatise what remains.
lmtzr = WordNetLemmatizer()
cleaned_tweets4 = [
    [lmtzr.lemmatize(word) for word in document if word not in stoplist1]
    for document in tweets
]

cleaned_tweets4

In [None]:
# Token -> id dictionary and bag-of-words corpus for segment IV.
dictionary4 = corpora.Dictionary(cleaned_tweets4)
corpus4 = list(map(dictionary4.doc2bow, cleaned_tweets4))

In [None]:
# Fit a three-topic LDA model on segment IV, then display each topic's ten top words.
total_topics = 3
lda4 = models.LdaModel(corpus=corpus4, id2word=dictionary4, num_topics=total_topics)

lda4.show_topics(num_topics=total_topics, num_words=10)

### V° Segment, "Mutual & Election Acceptance"

In [None]:
# "Mutual and Election Acceptance" segment: select the rows by TIME index label between
# the two bounds, then turn TIME back into a regular column.
TW_V = new_tweet.loc['10:33:04':'10:38:56'].reset_index()

In [None]:
# Tweet-volume histogram for segment V; TIME strings are parsed into datetimes for the x axis.
TW_V_TIME = TW_V['TIME']
tweets_ratio_V = pd.to_datetime(TW_V_TIME, format='%X')

trace = go.Histogram(x=tweets_ratio_V, marker={'color': 'lightblue'}, opacity=0.75)

layout = go.Layout(
    title='V. Mutual and Election Acceptance',
    height=450,
    width=1200,
    xaxis={'title': 'Time Segment for each bar = 9sec'},
    yaxis={'title': 'Tweet Volume'},
    bargap=0.2,
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

In [None]:
# Segment V preprocessing: tokenise, drop one-off tokens, clean, remove stop words, lemmatise.
tweets5 = list(TW_V.TEXT)

# Lower-case each tweet and split it on whitespace.
twtok5 = [document.lower().split() for document in tweets5]

# Segment-wide token counts.
frequency = defaultdict(int)
for text in twtok5:
    for token in text:
        frequency[token] += 1

# Keep a token only if it was counted more than once.
debtok5 = [[token for token in text if frequency[token] > 1] for text in twtok5]

# Each (pattern, replacement) pair is applied to every token in the order listed;
# a token that ends up empty after a step is dropped.
_cleanup_steps = [
    (r'\b\w{1,2}\b', ''),        # standalone runs of one or two word characters
    (r'\d+', ''),                # digit runs
    (r'(@[A-Za-z0-9]+)', ''),    # @mentions
    # '&' is included here so this segment strips the same punctuation set as the
    # whole-debate clean-up and segments II-IV (it was missing from this copy).
    (r'[\.\,\'\"\!\?\:\;\-\_\/\=\(\)\|\*\&\@\#\$\"]+', ''),
    (r'[^\x00-\x7F]+', ''),      # non-ASCII characters
    (r'(.)\1{2,}', r'\1'),       # a character repeated 3+ times collapses to one
]

tweets = debtok5
for _pattern, _replacement in _cleanup_steps:
    tweets = [
        [_tok for _tok in (re.sub(_pattern, _replacement, _raw) for _raw in _doc) if _tok]
        for _doc in tweets
    ]

# Filter with stoplist1 (custom debate words + English stop words), as every other
# segment does; this cell previously used the plain English `stoplist`, so the custom
# stop words were left in segment V only.
lmtzr = WordNetLemmatizer()
cleaned_tweets5 = [
    [lmtzr.lemmatize(word) for word in document if word not in stoplist1]
    for document in tweets
]

cleaned_tweets5

In [None]:
# Token -> id dictionary and bag-of-words corpus for segment V.
dictionary5 = corpora.Dictionary(cleaned_tweets5)
corpus5 = list(map(dictionary5.doc2bow, cleaned_tweets5))

In [None]:
# Fit a two-topic LDA model on segment V, then display each topic's ten top words.
total_topics = 2
lda5 = models.LdaModel(corpus=corpus5, id2word=dictionary5, num_topics=total_topics)

lda5.show_topics(num_topics=total_topics, num_words=10)