In [1]:
import pickle
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn import metrics 
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
# print (pd.__name__, pd.__version__)

In [2]:
# Load the table of movie lines from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load "movies.p"
# when it comes from a trusted source.
text = pd.read_pickle("movies.p")

In [3]:
# (rows, columns) of the loaded table.
text.shape

(304354, 9)

In [5]:
# Preview the first rows to see which columns are available.
text.head()

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy


In [6]:
def grams_df(df, numgrams):
    """Explode every line of text into one row per n-gram.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'words' (the text to tokenize), 'line_id'
        and 'gender_from'.
    numgrams : int
        Size of the n-grams (2 for bigrams, 3 for trigrams, ...).

    Returns
    -------
    pandas.DataFrame
        Columns 'gender_from', 'grams' (tuples of tokens) and 'line_id',
        one row per n-gram, in input order, with a fresh 0..N-1 index.
        Lines with fewer than `numgrams` tokens contribute no rows; an
        empty input yields an empty frame that still has all three columns.
    """
    genders, grams, line_ids = [], [], []

    # Collect into plain lists and build the frame once: calling pd.concat
    # inside the loop re-copies the accumulated frame for every input row.
    for words, line_id, gender in zip(df['words'], df['line_id'], df['gender_from']):
        tokens = nltk.word_tokenize(words)
        line_grams = list(ngrams(tokens, numgrams))

        grams.extend(line_grams)
        # Repeat the line's metadata once per n-gram it produced.
        line_ids.extend([line_id] * len(line_grams))
        genders.extend([gender] * len(line_grams))

    return pd.DataFrame(
        {'gender_from': genders, 'grams': grams, 'line_id': line_ids},
        columns=['gender_from', 'grams', 'line_id'],
    )


In [7]:
# Five-row sample used to try out grams_df quickly.
test = text.iloc[:5]

In [8]:
# Keep the result under its own name: assigning it to `grams_df` would replace
# the function with a DataFrame and break every later grams_df(...) call.
trigrams_test = grams_df(test, 3)
trigrams_test

Unnamed: 0,gender_from,grams,line_id
0,f,"(we, make, quick)",L194
1,f,"(make, quick, roxanne)",L194
2,f,"(quick, roxanne, korrine)",L194
3,f,"(roxanne, korrine, andrew)",L194
4,f,"(korrine, andrew, barrett)",L194
5,f,"(andrew, barrett, incredibly)",L194
6,f,"(barrett, incredibly, horrendous)",L194
7,f,"(incredibly, horrendous, public)",L194
8,f,"(horrendous, public, break)",L194
9,f,"(public, break, quad)",L194


In [7]:
# Bigrams (n = 2) for the first 10,000 rows of `text`.
# NOTE(review): `bigrams` is reassigned further down from a 50,000-row slice,
# so this result is superseded there.
bigrams = grams_df(text[:10000],2)



In [4]:
# Split the lines by the speaker's gender label; rows whose gender_from is
# neither 'f' nor 'm' end up in neither frame.
is_female = text['gender_from'].eq('f')
is_male = text['gender_from'].eq('m')
women = text.loc[is_female]
men = text.loc[is_male]

In [5]:
# (rows, columns) of the lines with gender_from == 'f'.
women.shape

(80720, 9)

In [6]:
# (rows, columns) of the lines with gender_from == 'm'.
men.shape

(188200, 9)

In [14]:
# Women's lines -> bigram/trigram counts -> TF-IDF weights.
count_vect = CountVectorizer(ngram_range=(2, 3))
w_words_counts = count_vect.fit_transform(women['words'])

# Fit the IDF weights on the women's counts, then re-weight those same counts.
tfidf_transformer = TfidfTransformer().fit(w_words_counts)
w_words_tfidf = tfidf_transformer.transform(w_words_counts)

In [19]:
# Men's lines -> bigram/trigram counts -> TF-IDF weights, with a separate
# vectorizer so the men's vocabulary is learned independently.
count_vect2 = CountVectorizer(ngram_range=(2, 3))
m_words_counts = count_vect2.fit_transform(men['words'])

# Note: this rebinds `tfidf_transformer`, so from here on the name refers to
# the transformer fitted on the men's counts.
tfidf_transformer = TfidfTransformer().fit(m_words_counts)
m_words_tfidf = tfidf_transformer.transform(m_words_counts)

In [9]:
# count_vect.vocabulary_

In [13]:
# words_tfidf[1,:].toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [20]:
# Fit one 20-component NMF model per gender on the TF-IDF n-gram matrices
# (nmf: women's lines, nmf2: men's lines), using identical settings and a
# fixed random_state for both.
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 in favour
# of alpha_W / alpha_H and later removed -- confirm against the installed version
# before re-running.
nmf = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(w_words_tfidf)
nmf2 = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(m_words_tfidf)

In [24]:
def display_topics(model, feature_names, no_top_words):
    """Print every topic of a fitted model with its highest-weighted features.

    model: object exposing `components_`, iterated one row of weights per topic.
    feature_names: sequence indexed by the positions within a topic row.
    no_top_words: number of top features to list for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading positions.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_features = [feature_names[pos] for pos in ranked]
        print("Topic {}:".format(topic_idx))
        print(" ".join(top_features))
        
# source: https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [30]:
# Number of topic rows in the women's model (should equal n_components).
len(nmf.components_)

20

In [33]:
no_top_words = 10

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement get_feature_names_out() when available and fall back otherwise,
# so the cell runs on both old and new versions.
if hasattr(count_vect, 'get_feature_names_out'):
    w_feature_names = count_vect.get_feature_names_out()
else:
    w_feature_names = count_vect.get_feature_names()

# Top n-grams per topic for the women's model.
# NOTE(review): the output recorded for this cell lists alphabetically adjacent
# n-grams, unlike the men's topics -- possibly all-zero topic rows; confirm that
# nmf.components_ is not degenerate.
display_topics(nmf, w_feature_names, no_top_words)

Topic 0:
jacuzzi get jail true dominican russian israeli james charles charles jail satisfy my dolly mr jake forget door nobody jake everything it like me
Topic 1:
it worth shot him white girl it would almost him way three it yet folks it would rely chuck hello click your keep get promote it would nice
Topic 2:
her hand she her heart would jesse jackson friend baby friend aware it judgement run judgement reflect her her supper her himbry it program exit
Topic 3:
juice free him way three jump car go julia wolf murder him loan you julia he him jump put it behind my jump back seat dialect you
Topic 4:
kane he evil karimojo bell give illegitimate child know karen sisco tail karen bureau iii get idiot break local diamond bottom karen it pleasure devote lloyd would
Topic 5:
jeffrey slow jennifer repeat we fake it survive dr weir starck jello mold factory we earn jell me mom jennifer love fact you serve faith science true
Topic 6:
joe sound jog your memory good let get joel live good chart co

In [21]:
no_top_words = 10

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement get_feature_names_out() when available and fall back otherwise,
# so the cell runs on both old and new versions.
if hasattr(count_vect2, 'get_feature_names_out'):
    m_feature_names = count_vect2.get_feature_names_out()
else:
    m_feature_names = count_vect2.get_feature_names()

# Top n-grams per topic for the men's model.
display_topics(nmf2, m_feature_names, no_top_words)

Topic 0:
you know you know it know it you know you you know him know him you know me you know he know me well you know
Topic 1:
you mean you mean you mean you know you mean you mean it mean it you mean we see you mean you mean like mean like
Topic 2:
thank you thank you sir you sir thank you much you much yes thank you yes thank you thank you thank you thank you thank
Topic 3:
you go think you go think you know you go you go it go you you go you you you go well you go go it
Topic 4:
you think think you you think you you think it think it you think he think he make you think you think we make you
Topic 5:
you say you say you say you you say it say it think you say know you say make you say whatever you say would you say
Topic 6:
yes sir yes sir it sir it sir yes sir yes sir you sir yes sir you yes sir we sir we yes sir they
Topic 7:
you want you want me want me you want know want know you want go want go you want it want it it you want
Topic 8:
you talk know you talk hell you talk hell 

In [13]:
# Bigram table built from the first 50,000 rows of `text` only.
bigram_sample = text.iloc[:50000]
bigrams = grams_df(bigram_sample, 2)



In [14]:
# Ten most frequent bigrams overall (value_counts sorts by descending count).
bigram_counts = bigrams['grams'].value_counts()
bigram_counts.head(10)

(i, know)       1553
(you, know)     1347
(i, think)      1234
(i, want)        967
(i, get)         910
(you, get)       836
(you, want)      730
(gon, na)        625
(you, i)         602
(you, think)     596
Name: grams, dtype: int64

In [20]:
# Drop the rows whose speaker gender is unknown ('?').
known_gender = bigrams['gender_from'].ne('?')
bigrams = bigrams.loc[known_gender]

# Non-null counts of the remaining columns per (gender, n-gram) pair.
grouped = bigrams.groupby(['gender_from', 'grams']).count()


def _top10(counts):
    """Return the ten largest counts in descending order."""
    return counts.sort_values(ascending=False).head(10)


# Ten most frequent n-grams within each gender (index level 0).
g = grouped['line_id'].groupby(level=0, group_keys=False).apply(_top10)
g

gender_from  grams       
f            (i, know)       514
             (you, know)     400
             (i, think)      391
             (i, want)       291
             (i, get)        256
             (you, i)        205
             (know, i)       199
             (you, want)     199
             (you, get)      183
             (i, go)         175
m            (i, know)       920
             (you, know)     838
             (i, think)      752
             (i, want)       602
             (you, get)      590
             (i, get)        582
             (you, want)     480
             (gon, na)       438
             (you, think)    378
             (i, go)         376
Name: line_id, dtype: int64

In [21]:
# Attach each line's genre to its n-grams via an inner join on line_id.
genre_by_line = text[['genre', 'line_id']]
bigrams_genre = bigrams.merge(genre_by_line, how='inner', on='line_id')

In [22]:
# Preview the merged frame to check that the genre column was attached.
bigrams_genre.head()

Unnamed: 0,gender_from,grams,line_id,genre
0,f,"(we, make)",L194,comedy
1,f,"(make, quick)",L194,comedy
2,f,"(quick, roxanne)",L194,comedy
3,f,"(roxanne, korrine)",L194,comedy
4,f,"(korrine, andrew)",L194,comedy


In [23]:
# Non-null counts per (genre, gender, n-gram) combination.
strata = ['genre', 'gender_from']
grouped2 = bigrams_genre.groupby(strata + ['grams']).count()


def _top5(counts):
    """Return the five largest counts in descending order."""
    return counts.sort_values(ascending=False).head(5)


# Five most frequent n-grams within each (genre, gender) pair, flattened
# back into ordinary columns for display.
g2 = grouped2['line_id'].groupby(strata, group_keys=False).apply(_top5)
g2.reset_index()

Unnamed: 0,genre,gender_from,grams,line_id
0,action,f,"(you, know)",80
1,action,f,"(i, think)",71
2,action,f,"(i, know)",64
3,action,f,"(you, get)",42
4,action,f,"(mr, peel)",36
5,action,m,"(you, know)",197
6,action,m,"(i, know)",185
7,action,m,"(gon, na)",153
8,action,m,"(i, want)",144
9,action,m,"(i, get)",140
