In [1]:
import pandas as pd
import numpy as np
# LDA, tSNE
from sklearn.manifold import TSNE
from gensim.models.ldamodel import LdaModel
# NLTK
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
output_notebook()



In [2]:
#data to 

In [3]:
with open('movie_summaries_bottom_250.txt', 'r') as myfile:
    summary=myfile.readlines()
with open('movie_titles_bottom_250.txt', 'r') as myfile:
    title=myfile.readlines()

In [4]:
import string
def strip_proppers(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent) if word.islower()]
    return "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in tokens]).strip()
preprocess = [strip_proppers(doc) for doc in summary]


In [5]:
df = pd.DataFrame(columns=['Title','Year','Summary'])
se = pd.Series(preprocess)
ti = pd.Series(title)
df['Summary'] = se.values
df['Title'] = ti.values

In [6]:
# Removing numerals:
df['Summary_tokens'] = df.Summary.map(lambda x: re.sub(r'\d+', '', x))
# Lower case:
df['Summary_tokens'] = df.Summary.map(lambda x: x.lower())
print(df['Summary_tokens'][0][:500])
df.to_csv('Combine_bottom.csv', encoding= 'utf-8')

unlocks the never-before-seen secret world inside your smartphone within the messaging app is a bustling city where all your favorite emojis live hoping to be selected by the phone\ user this world each emoji has only one facial expression except for an exuberant emoji who was born without a filter and is bursting with multiple expressions to become normal like the other emojis enlists the help of his handy best friend and the notorious code breaker emoji they embark on an epic app-venture throu


In [7]:
df['Summary_tokens'] = df.Summary_tokens.map(lambda x: RegexpTokenizer(r'\w+').tokenize(x))
print(df['Summary_tokens'][0][:25])

['unlocks', 'the', 'never', 'before', 'seen', 'secret', 'world', 'inside', 'your', 'smartphone', 'within', 'the', 'messaging', 'app', 'is', 'a', 'bustling', 'city', 'where', 'all', 'your', 'favorite', 'emojis', 'live', 'hoping']


In [8]:
snowball = SnowballStemmer("english")  
df['Summary_tokens'] = df.Summary_tokens.map(lambda x: [snowball.stem(token) for token in x])
print(df['Summary_tokens'][0][:25])

['unlock', 'the', 'never', 'befor', 'seen', 'secret', 'world', 'insid', 'your', 'smartphon', 'within', 'the', 'messag', 'app', 'is', 'a', 'bustl', 'citi', 'where', 'all', 'your', 'favorit', 'emoji', 'live', 'hope']


In [9]:
stop_en = stopwords.words('english')
df['Summary_tokens'] = df.Summary_tokens.map(lambda x: [t for t in x if t not in stop_en]) 
print(df['Summary_tokens'][0][:25])

['unlock', 'never', 'befor', 'seen', 'secret', 'world', 'insid', 'smartphon', 'within', 'messag', 'app', 'bustl', 'citi', 'favorit', 'emoji', 'live', 'hope', 'select', 'phone', 'user', 'world', 'emoji', 'onli', 'one', 'facial']


In [10]:
df['Summary_tokens'] = df.Summary_tokens.map(lambda x: [t for t in x if len(t) > 1])
print(df['Summary_tokens'][0][:25])

['unlock', 'never', 'befor', 'seen', 'secret', 'world', 'insid', 'smartphon', 'within', 'messag', 'app', 'bustl', 'citi', 'favorit', 'emoji', 'live', 'hope', 'select', 'phone', 'user', 'world', 'emoji', 'onli', 'one', 'facial']


In [11]:
from gensim import corpora, models
np.random.seed(2017)
texts = df['Summary_tokens'].values
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = models.ldamodel.LdaModel(corpus, id2word=dictionary, 
                                    num_topics=8, passes=5, minimum_probability=0)

In [12]:
hm = np.array([[y for (x,y) in ldamodel[corpus[i]]] for i in range(len(corpus))])


In [15]:
tsne = TSNE(random_state=2017, perplexity=30)
tsne_embedding = tsne.fit_transform(hm)
tsne_embedding = pd.DataFrame(tsne_embedding, columns=['x','y'])
tsne_embedding['hue'] = hm.argmax(axis=1)
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider


In [18]:
source = ColumnDataSource(
        data=dict(
            x = tsne_embedding.x,
            y = tsne_embedding.y,
            colors = [all_palettes['Set1'][8][i] for i in tsne_embedding.hue],
            title = df.Title,

            alpha = [0.9] * tsne_embedding.shape[0],
            size = [7] * tsne_embedding.shape[0]
        )
    )
hover_tsne = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Title:</span>
            <span style="font-size: 12px">@title</span>

        </div>
    </div>
    """)
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(plot_width=1000, plot_height=500, tools=tools_tsne, title='Papers')
plot_tsne.circle('x', 'y', size='size', fill_color='colors', 
                 alpha='alpha', line_alpha=0, line_width=0.01, source=source, name="df")

callback = CustomJS(args=dict(source=source), code="""
    var data = source.data;
    var f = cb_obj.value
    x = data['x']
    y = data['y']
    colors = data['colors']
    alpha = data['alpha']
    title = data['Title']
    size = data['size']

    source.trigger('change');
""")



layout = column(plot_tsne)

In [19]:
show(layout)


In [None]:
#https://www.kaggle.com/yohanb/lda-visualized-using-t-sne-and-bokeh