In [14]:
import pickle
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn import metrics 
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
# print (pd.__name__, pd.__version__)

In [3]:
# Load the pre-processed movie-dialogue frame.
# NOTE: pd.read_pickle unpickles the file, and unpickling can execute arbitrary
# code - only load "movies.p" if it comes from a trusted source.
text = pd.read_pickle("movies.p")

In [4]:
text.shape

(304354, 9)

In [5]:
text.head()

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy


In [6]:
def grams_df(df, numgrams):
    """Explode each line of dialogue into one row per n-gram.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``words`` (the text to tokenize),
        ``line_id`` and ``gender_from``; any other column is ignored.
    numgrams : int
        Size of the n-grams (2 for bigrams, 3 for trigrams, ...).

    Returns
    -------
    pandas.DataFrame
        Columns ``gender_from``, ``grams`` (a tuple of ``numgrams`` tokens)
        and ``line_id``; one row per n-gram, in input order, with a unique
        0..N-1 index. Lines with fewer than ``numgrams`` tokens contribute
        no rows, and an empty input yields an empty frame that still has
        all three columns.
    """
    # Collect plain Python lists and build the frame once at the end:
    # concatenating a new DataFrame per row copies everything accumulated so
    # far on every iteration, which is quadratic in the number of lines.
    grams, line_ids, genders = [], [], []

    for words, line_id, gender in zip(df['words'], df['line_id'], df['gender_from']):
        line_grams = list(ngrams(nltk.word_tokenize(words), numgrams))
        grams.extend(line_grams)
        # repeat the per-line metadata once for every n-gram of that line
        line_ids.extend([line_id] * len(line_grams))
        genders.extend([gender] * len(line_grams))

    return pd.DataFrame(
        {'gender_from': genders, 'grams': grams, 'line_id': line_ids},
        columns=['gender_from', 'grams', 'line_id'],
    )


In [7]:
# Tiny sample (first five lines) for trying out the n-gram helper
test = text.iloc[:5]

In [8]:
# Keep the result under its own name: assigning it to `grams_df` would replace
# the function with a DataFrame and break every later grams_df(...) call.
trigrams_test = grams_df(test, 3)
trigrams_test

Unnamed: 0,gender_from,grams,line_id
0,f,"(we, make, quick)",L194
1,f,"(make, quick, roxanne)",L194
2,f,"(quick, roxanne, korrine)",L194
3,f,"(roxanne, korrine, andrew)",L194
4,f,"(korrine, andrew, barrett)",L194
5,f,"(andrew, barrett, incredibly)",L194
6,f,"(barrett, incredibly, horrendous)",L194
7,f,"(incredibly, horrendous, public)",L194
8,f,"(horrendous, public, break)",L194
9,f,"(public, break, quad)",L194


In [7]:
# Bigrams for the first 10,000 lines of dialogue
bigrams = grams_df(text.iloc[:10000], 2)



In [20]:
# Pull the n-gram column out on its own and peek at its first ten entries
bigrams_series = bigrams['grams']
bigrams_series.iloc[:10]

0                  (we, make)
1               (make, quick)
2            (quick, roxanne)
3          (roxanne, korrine)
4           (korrine, andrew)
5           (andrew, barrett)
6       (barrett, incredibly)
7    (incredibly, horrendous)
8        (horrendous, public)
9             (public, break)
Name: grams, dtype: object

In [21]:
# Split the dialogue by the speaker's gender; rows whose gender_from is
# neither 'f' nor 'm' end up in neither frame.
women = text.loc[text['gender_from'].eq('f')]
men = text.loc[text['gender_from'].eq('m')]

In [24]:
women.shape

(80720, 9)

In [23]:
men.shape

(188200, 9)

In [25]:
# Women's lines: unigram + bigram counts, then TF-IDF weighting
count_vect = CountVectorizer(ngram_range=(1, 2))
w_words_counts = count_vect.fit_transform(women['words'])
tfidf_transformer = TfidfTransformer()
w_words_tfidf = tfidf_transformer.fit_transform(w_words_counts)

In [41]:
# Men's lines: unigram + bigram counts, then TF-IDF weighting.
# This rebinds `count_vect` and `tfidf_transformer`, so from here on
# `count_vect` holds the vocabulary learned from men.words.
count_vect = CountVectorizer(ngram_range=(1, 2))
m_words_counts = count_vect.fit_transform(men['words'])
tfidf_transformer = TfidfTransformer()
m_words_tfidf = tfidf_transformer.fit_transform(m_words_counts)

In [26]:
# count_vect.vocabulary_

{'we': 192255,
 'make': 105092,
 'quick': 141345,
 'roxanne': 149095,
 'korrine': 93881,
 'andrew': 5642,
 'barrett': 12381,
 'incredibly': 83059,
 'horrendous': 79973,
 'public': 140281,
 'break': 18715,
 'quad': 141068,
 'we make': 192885,
 'make quick': 105447,
 'quick roxanne': 141386,
 'roxanne korrine': 149096,
 'korrine andrew': 93882,
 'andrew barrett': 5643,
 'barrett incredibly': 12385,
 'incredibly horrendous': 83067,
 'horrendous public': 79974,
 'public break': 140284,
 'break quad': 18814,
 'well': 194193,
 'think': 178322,
 'start': 166351,
 'pronunciation': 139757,
 'okay': 126318,
 'you': 201752,
 'well think': 194809,
 'think we': 179130,
 'we start': 193254,
 'start pronunciation': 166545,
 'pronunciation okay': 139758,
 'okay you': 126621,
 'hacking': 69606,
 'gagging': 60072,
 'spit': 165266,
 'part': 130883,
 'please': 135865,
 'hacking gagging': 69607,
 'gagging spit': 60073,
 'spit part': 165274,
 'part please': 131001,
 'bout': 17973,
 'try': 184327,
 'french':

In [13]:
# words_tfidf[1,:].toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [42]:
# Women's model, currently disabled:
#nmf = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(w_words_tfidf)

# 20-topic NMF on the men's TF-IDF matrix
nmf2 = NMF(
    n_components=20,
    random_state=1,
    alpha=0.1,
    l1_ratio=0.5,
    init='nndsvd',
)
nmf2 = nmf2.fit(m_words_tfidf)  # fit() returns the estimator itself

In [38]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    For each row of ``model.components_`` this prints a ``Topic <n>:`` header
    followed by one line with the ``no_top_words`` features that carry the
    largest weights, highest first, separated by single spaces.

    ``feature_names`` is indexed by column position of ``components_``.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic {:d}:".format(topic_idx))
        # argsort is ascending: reverse it, then keep the leading entries
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in top_indices]
        print(" ".join(top_terms))

# source: https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [40]:
no_top_words = 10

# Fit the women's topic model in this cell: the only other assignment of `nmf`
# in the notebook is commented out, so a fresh kernel would raise NameError here.
nmf = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(w_words_tfidf)

# `count_vect` is refit on men.words before this cell runs, so its feature
# names do not line up with the columns of w_words_tfidf. Rebuild the women's
# vocabulary with the same settings used to produce w_words_counts.
w_count_vect = CountVectorizer(ngram_range=(1, 2)).fit(women.words)

display_topics(nmf, w_count_vect.get_feature_names(), no_top_words)

Topic 0:
you you know you you thank you thank mean you mean see you go talk
Topic 1:
yes yes you yes it oh yes yes yes yes sir sir yes know yes he say yes
Topic 2:
it it you it it know it you it think it get it make it go like it
Topic 3:
know you know know you know it know know want know know he he know know him they
Topic 4:
yeah oh yeah yeah you yeah it yeah yeah yeah well yeah know yeah guess yeah sure yeah right
Topic 5:
he he say him know he he go his think he he want he get he he
Topic 6:
oh god oh my oh yeah oh god oh you my god my oh yes oh oh
Topic 7:
go you go we go let it go let go go you go back go home want go
Topic 8:
me tell tell me you tell me you let me tell you let it me excuse me
Topic 9:
think you think think you think it think he think we really they she think they
Topic 10:
okay you okay it okay okay okay okay you okay it oh okay okay we everything okay he okay
Topic 11:
right you right it right right you right right he right right it yeah right oh right she righ

In [43]:
no_top_words = 10
# Uses whatever `count_vect` is bound to at this point; it must be the
# vectorizer fit on men.words for the labels to match nmf2.
men_feature_names = count_vect.get_feature_names()
display_topics(nmf2, men_feature_names, no_top_words)

Topic 0:
you you know you you thank you thank talk see you go like you talk
Topic 1:
yes yes sir sir yes it yes you yes yes oh yes yes he yes know yes we
Topic 2:
it it you get it you it like it it know it think it take make
Topic 3:
yeah oh yeah yeah you yeah yeah yeah it yeah right yeah sure yeah well yeah know yeah we
Topic 4:
know you know know you know it know know know he we know want know she they
Topic 5:
go you go we go let let go it go go you go home go back home
Topic 6:
right you right it right right you right right yeah right he right right it right back right we
Topic 7:
he he say him his he get know he he go he want think he he know
Topic 8:
me tell tell me you tell me you she let me tell you let it me
Topic 9:
we we go we get talk we know think we we need we talk need him
Topic 10:
think you think think you think it think we think he really she think they think she
Topic 11:
oh oh yeah god oh you oh my oh yes my god oh god my oh shit
Topic 12:
say you say he say she say

In [30]:
"hey {}".format("hi")

'hey hi'

In [34]:
" ".join("hi")

'h i'

In [13]:
# Rebuild the bigram table on a larger sample: the first 50,000 lines
bigrams = grams_df(text.iloc[:50000], 2)



In [14]:
# Ten most frequent bigrams overall
bigrams['grams'].value_counts().iloc[:10]

(i, know)       1553
(you, know)     1347
(i, think)      1234
(i, want)        967
(i, get)         910
(you, get)       836
(you, want)      730
(gon, na)        625
(you, i)         602
(you, think)     596
Name: grams, dtype: int64

In [20]:
# drop n-grams whose speaker gender is the '?' placeholder
known_gender = bigrams['gender_from'] != '?'
bigrams = bigrams[known_gender]

# count rows per (gender, n-gram), then keep each gender's ten largest counts
grouped = bigrams.groupby(['gender_from', 'grams']).count()
line_counts = grouped['line_id']
g = line_counts.groupby(level=0, group_keys=False).apply(
    lambda counts: counts.sort_values(ascending=False).head(10)
)
g

gender_from  grams       
f            (i, know)       514
             (you, know)     400
             (i, think)      391
             (i, want)       291
             (i, get)        256
             (you, i)        205
             (know, i)       199
             (you, want)     199
             (you, get)      183
             (i, go)         175
m            (i, know)       920
             (you, know)     838
             (i, think)      752
             (i, want)       602
             (you, get)      590
             (i, get)        582
             (you, want)     480
             (gon, na)       438
             (you, think)    378
             (i, go)         376
Name: line_id, dtype: int64

In [21]:
# Attach each line's genre to its bigrams, matching rows on line_id
genre_lookup = text[['genre', 'line_id']]
bigrams_genre = bigrams.merge(genre_lookup, on='line_id')

In [22]:
bigrams_genre.head()

Unnamed: 0,gender_from,grams,line_id,genre
0,f,"(we, make)",L194,comedy
1,f,"(make, quick)",L194,comedy
2,f,"(quick, roxanne)",L194,comedy
3,f,"(roxanne, korrine)",L194,comedy
4,f,"(korrine, andrew)",L194,comedy


In [23]:
# count rows per (genre, gender, n-gram), then keep the five largest counts
# within every (genre, gender) pair
grouped2 = bigrams_genre.groupby(['genre', 'gender_from', 'grams']).count()
genre_line_counts = grouped2['line_id']
g2 = genre_line_counts.groupby(['genre', 'gender_from'], group_keys=False).apply(
    lambda counts: counts.sort_values(ascending=False).head(5)
)
g2.reset_index()

Unnamed: 0,genre,gender_from,grams,line_id
0,action,f,"(you, know)",80
1,action,f,"(i, think)",71
2,action,f,"(i, know)",64
3,action,f,"(you, get)",42
4,action,f,"(mr, peel)",36
5,action,m,"(you, know)",197
6,action,m,"(i, know)",185
7,action,m,"(gon, na)",153
8,action,m,"(i, want)",144
9,action,m,"(i, get)",140


In [24]:
# Wide table: one row per gender, one column per bigram, cells = occurrence count
gram_counts = bigrams.groupby(['gender_from', 'grams']).size()
gram_counts.unstack(level='grams')

grams,"(-, back)","(-, crawford)","(-, forget)","(-, i)","(-, mesa)","(-, way)","(0, 200)","(0, i)","(00, he)","(00, insurance)",...,"(zuul, refers)","(zuul, roylance)","(, catch)","(, you)","(-, your)","(, i)","(, it)","(, run)","(£, 150)","(­, quarter)"
gender_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f,,,,,,,,1.0,,,...,1.0,1.0,,,1.0,,,,,1.0
m,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,...,,,1.0,1.0,,2.0,1.0,1.0,1.0,
