In [19]:
import pickle
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn import metrics 
from sklearn.decomposition import NMF, LatentDirichletAllocation
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [2]:
# print (pd.__name__, pd.__version__)

In [3]:
text = pd.read_pickle("movies.p")

In [4]:
text.shape

(304354, 9)

In [5]:
text.head()

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy


### Common n-grams

In [6]:
def grams_df(df, numgrams):

    grams_df = pd.DataFrame({'grams': [], 'line_id': []})

    for index, row in df.iterrows():
        #print(index)
        #create list of bigrams for each text
        text = row['words']    
        token = nltk.word_tokenize(text)
        grams_list = list(ngrams(token, numgrams))
        #grams_list = [gram for gram in grams]

        #create dict of ngrams
        line = row['line_id']
        gen = row['gender_from']
        d = {'grams': grams_list, 'line_id': np.repeat(line, len(grams_list)), 'gender_from': np.repeat(gen, len(grams_list))}
        grams_df = pd.concat([grams_df, pd.DataFrame(d)], axis = 0)
        
    return grams_df

In [7]:
test = text[:5]

In [8]:
grams_df = grams_df(test,3)
grams_df

Unnamed: 0,gender_from,grams,line_id
0,f,"(we, make, quick)",L194
1,f,"(make, quick, roxanne)",L194
2,f,"(quick, roxanne, korrine)",L194
3,f,"(roxanne, korrine, andrew)",L194
4,f,"(korrine, andrew, barrett)",L194
5,f,"(andrew, barrett, incredibly)",L194
6,f,"(barrett, incredibly, horrendous)",L194
7,f,"(incredibly, horrendous, public)",L194
8,f,"(horrendous, public, break)",L194
9,f,"(public, break, quad)",L194


In [7]:
bigrams = grams_df(text[:10000],2)



In [13]:
bigrams = grams_df(text[:50000],2)



In [14]:
bigrams['grams'].value_counts()[:10]

(i, know)       1553
(you, know)     1347
(i, think)      1234
(i, want)        967
(i, get)         910
(you, get)       836
(you, want)      730
(gon, na)        625
(you, i)         602
(you, think)     596
Name: grams, dtype: int64

In [20]:
# remove unknown gender
bigrams = bigrams[bigrams['gender_from'] != '?']

# get counts
grouped = bigrams.groupby(['gender_from', 'grams']).count()
g = grouped['line_id'].groupby(level=0, group_keys=False).apply(lambda x: x.sort_values(ascending=False).head(10))
g

gender_from  grams       
f            (i, know)       514
             (you, know)     400
             (i, think)      391
             (i, want)       291
             (i, get)        256
             (you, i)        205
             (know, i)       199
             (you, want)     199
             (you, get)      183
             (i, go)         175
m            (i, know)       920
             (you, know)     838
             (i, think)      752
             (i, want)       602
             (you, get)      590
             (i, get)        582
             (you, want)     480
             (gon, na)       438
             (you, think)    378
             (i, go)         376
Name: line_id, dtype: int64

In [21]:
bigrams_genre = pd.merge(bigrams, text[['genre', 'line_id']], on = 'line_id')

In [22]:
bigrams_genre.head()

Unnamed: 0,gender_from,grams,line_id,genre
0,f,"(we, make)",L194,comedy
1,f,"(make, quick)",L194,comedy
2,f,"(quick, roxanne)",L194,comedy
3,f,"(roxanne, korrine)",L194,comedy
4,f,"(korrine, andrew)",L194,comedy


In [23]:
grouped2 = bigrams_genre.groupby(['genre','gender_from', 'grams']).count()
g2 = grouped2['line_id'].groupby(['genre', 'gender_from'], group_keys=False).apply(lambda x: x.sort_values(ascending=False).head(5))
g2.reset_index()

Unnamed: 0,genre,gender_from,grams,line_id
0,action,f,"(you, know)",80
1,action,f,"(i, think)",71
2,action,f,"(i, know)",64
3,action,f,"(you, get)",42
4,action,f,"(mr, peel)",36
5,action,m,"(you, know)",197
6,action,m,"(i, know)",185
7,action,m,"(gon, na)",153
8,action,m,"(i, want)",144
9,action,m,"(i, get)",140


### Topic Modeling 

In [6]:
# get two separate texts 
women = text[text['gender_from'] == 'f']
men = text[text['gender_from'] == 'm']

In [7]:
women.shape

(80720, 9)

In [8]:
men.shape

(188200, 9)

In [22]:
stopWords = set(stopwords.words('english'))

In [28]:
# source: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# trying the genism version 

# convert to list
data = women.words.values.tolist()

# tokenize 
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence)))  

data_words = list(sent_to_words(data))

print(data_words[:1])

[['we', 'make', 'quick', 'roxanne', 'korrine', 'andrew', 'barrett', 'incredibly', 'horrendous', 'public', 'break', 'quad']]


In [29]:
# Build/train the bigram and trigram models
bigram = gensim.models.Phrases(data_words, common_terms=stopWords, min_count=5, threshold=50) 
trigram = gensim.models.Phrases(bigram[data_words], threshold=50)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [62]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stopWords] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

data_words = make_bigrams(data_words)
data_words = make_trigrams(data_words)

In [32]:
# create dictionary
id2word = corpora.Dictionary(data_words)

# create corpus
texts = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# print(corpus[:1])

In [23]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [32]:
lda_model.show_topics()[0]

(0, '0.152*"your" + 0.058*"take" + 0.033*"need" + 0.027*"away" + 0.025*"keep" + 0.024*"gonna" + 0.023*"okay" + 0.023*"girl" + 0.020*"place" + 0.020*"god"')

In [25]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.417475379274


In [63]:
def prep_for_model(series):
    
    # convert to list
    data = series.values.tolist()

    # tokenize 
    def sent_to_words(sentences):
        for sentence in sentences:
            yield(gensim.utils.simple_preprocess(str(sentence)))  

    data_words = list(sent_to_words(data))    
    
    # Build/train the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, common_terms=stopWords, min_count=5, threshold=50) 
    trigram = gensim.models.Phrases(bigram[data_words], threshold=50)  

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    
    data_words = remove_stopwords(data_words)
    data_words = make_bigrams(data_words)
    data_words = make_trigrams(data_words)
    
    
    # create dictionary
    id2word = corpora.Dictionary(data_words)

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    
    return data_words, id2word, corpus

    

In [56]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        print (num_topics)
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=id2word, num_topics=num_topics, 
                                                    random_state=100, update_every=1, chunksize=100, passes=10,
                                                    alpha='auto', per_word_topics=True)
        model_list.append(lda_model)
        coherencemodel = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [85]:
# data_words, id2word, corpus = prep_for_model(women.words)
# run models and get coherence values
# model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
                                                        #texts=data_words, start=3, limit=13, step=3)


In [87]:
# run topic modeling for both gender scripts 
# this takes a while :)

topics = []
coherence = []

for gen in [women.words, men.words]:
    
    data_words, id2word, corpus = prep_for_model(gen)
    model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
                                                        texts=data_words, start=3, limit=16, step=3)
    winner = np.argmax(coherence_values)
    topics.append(model_list[winner].show_topics())
    coherence.append(max(coherence_values))
    
    



3
6
9
12
3
6
9
12


In [89]:
topics[0]

[(0,
  '0.116*"know" + 0.043*"tell" + 0.043*"one" + 0.027*"thing" + 0.023*"love" + 0.021*"need" + 0.020*"find" + 0.017*"happen" + 0.015*"gonna" + 0.015*"okay"'),
 (1,
  '0.061*"think" + 0.050*"say" + 0.047*"see" + 0.035*"right" + 0.031*"back" + 0.031*"look" + 0.029*"time" + 0.028*"make" + 0.020*"nothing" + 0.017*"please"'),
 (2,
  '0.068*"like" + 0.032*"take" + 0.022*"kill" + 0.022*"never" + 0.015*"even" + 0.015*"life" + 0.015*"away" + 0.014*"maybe" + 0.014*"man" + 0.014*"much"'),
 (3,
  '0.092*"oh" + 0.031*"something" + 0.028*"give" + 0.022*"help" + 0.020*"leave" + 0.020*"work" + 0.016*"girl" + 0.014*"god" + 0.014*"old" + 0.013*"stay"'),
 (4,
  '0.088*"get" + 0.043*"come" + 0.039*"well" + 0.032*"yes" + 0.029*"good" + 0.029*"could" + 0.018*"way" + 0.015*"call" + 0.013*"long" + 0.013*"day"'),
 (5,
  '0.102*"go" + 0.069*"want" + 0.033*"would" + 0.030*"let" + 0.027*"little" + 0.026*"mean" + 0.020*"really" + 0.018*"talk" + 0.018*"try" + 0.016*"ever"')]

In [38]:
# coherence_values w 5,10,15,20 and without removing stop words

[0.42153758853695117,
 0.43137364789792498,
 0.35664280992332309,
 0.33498604156610046]

In [45]:
# coherence_values w 4,8,12,16,20

[0.43744648500154187,
 0.43870775627161263,
 0.37864758999155984,
 0.33683501106085695,
 0.33498604156610046]

In [10]:
# trying the sklearn version

texts = [men.words, women.words]

In [37]:
ngram_range = [(1,1), (2,3)] # bag of words, bigrams and trigrams
max_features = [1000]
no_topics = [8, 10, 15]
no_top_words = [5, 10]

In [38]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ,".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
# inspiration source: https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [43]:
# add nmf later                       

def topic_model(texts):

    for a in ngram_range:
        print (a)
        for x in max_features:
            print (x)
            for y in no_topics: 
                print (y)
                for text in texts: 

                    #transform
                    count_vect = CountVectorizer(ngram_range = a, max_features = x, stop_words = 'english') # using bigrams and trigrams
                    word_counts = count_vect.fit_transform(text, )
                    tfidf_transformer = TfidfTransformer()
                    words_tfidf = tfidf_transformer.fit_transform(word_counts)

                    lda = LatentDirichletAllocation(n_topics=y, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(word_counts) 
                        
                    for z in no_top_words: 
                        print (z)
                        display_topics(lda, count_vect.get_feature_names(), z)


In [44]:
topic_model(texts)

(1, 1)
1000
8
5
Topic 0:
like ,think ,people ,listen ,live
Topic 1:
say ,man ,year ,hear ,long
Topic 2:
look ,talk ,gonna ,maybe ,try
Topic 3:
let ,mean ,way ,time ,leave
Topic 4:
know ,come ,thing ,need ,day
Topic 5:
want ,tell ,make ,work ,okay
Topic 6:
right ,yeah ,oh ,sorry ,hell
Topic 7:
good ,yes ,little ,mr ,really
10
Topic 0:
like ,think ,people ,listen ,live ,play ,woman ,turn ,suppose ,son
Topic 1:
say ,man ,year ,hear ,long ,wait ,believe ,shit ,new ,dead
Topic 2:
look ,talk ,gonna ,maybe ,try ,fuck ,love ,money ,away ,feel
Topic 3:
let ,mean ,way ,time ,leave ,sir ,kid ,home ,course ,real
Topic 4:
know ,come ,thing ,need ,day ,ask ,help ,night ,use ,place
Topic 5:
want ,tell ,make ,work ,okay ,sure ,kill ,guy ,hey ,boy
Topic 6:
right ,yeah ,oh ,sorry ,hell ,bad ,big ,friend ,run ,uh
Topic 7:
good ,yes ,little ,mr ,really ,happen ,life ,lot ,make ,old
5
Topic 0:
come ,kill ,long ,big ,kid
Topic 1:
right ,look ,happen ,sorry ,like
Topic 2:
oh ,make ,talk ,love ,leave
Topic 3:

In [30]:
# transform
count_vect = CountVectorizer(ngram_range = (2,3), max_features = 1000, stop_words = 'english') # using bigrams and trigrams
w_words_counts = count_vect.fit_transform(women.words, )
tfidf_transformer = TfidfTransformer()
w_words_tfidf = tfidf_transformer.fit_transform(w_words_counts)

In [46]:
# transform
#count_vect2 = CountVectorizer(ngram_range = (2,3)) # using bigrams and trigrams
count_vect2 = CountVectorizer(ngram_range = (2,3), max_features = 1000, stop_words = 'english') # using bigrams and trigrams
m_words_counts = count_vect2.fit_transform(men.words, )
tfidf_transformer = TfidfTransformer()
m_words_tfidf = tfidf_transformer.fit_transform(m_words_counts)

In [20]:
nmf = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(w_words_tfidf)
nmf2 = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(m_words_tfidf)

In [10]:
# Run LDA for women
lda = LatentDirichletAllocation(n_topics=10, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(w_words_counts) 

In [17]:
# LDA for men
lda2 = LatentDirichletAllocation(n_topics=10, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(m_words_counts) 

In [21]:
no_top_words = 10
display_topics(lda, count_vect.get_feature_names(), no_top_words)

Topic 0:
know know ,want talk ,sound like ,good night ,think maybe ,want hear ,know right ,big deal ,think really ,little bit
Topic 1:
look like ,think know ,know happen ,come home ,good morning ,yeah know ,want make ,say thing ,mr moraes ,say think
Topic 2:
uh huh ,think like ,like like ,oh know ,love love ,say know ,know talk ,tell know ,think say ,thing know
Topic 3:
long time ,new york ,know like ,make sure ,run away ,want tell ,think good ,know thing ,good idea ,really like
Topic 4:
ask question ,think want ,mr lombard ,okay okay ,good time ,really want ,thousand dollar ,know make ,want say ,walk away
Topic 5:
oh god ,know think ,year ago ,know mean ,oh yes ,wait minute ,high school ,know tell ,really know ,say good
Topic 6:
want know ,know say ,know want ,need help ,want want ,thing like ,make feel ,say want ,know love ,tell truth
Topic 7:
like know ,year old ,really think ,oh come ,think make ,old man ,say say ,think gonna ,lot people ,good good
Topic 8:
feel like ,oh yeah ,want

In [22]:
display_topics(lda2, count_vect.get_feature_names(), no_top_words)

Topic 0:
look great ,want run ,want drive ,young girl ,want say ,know try ,good night ,thing life ,right away ,say way
Topic 1:
look sorry ,think wait ,oh yeah ,tell time ,mother die ,yeah know ,want talk ,happen know ,say love ,say think
Topic 2:
open door ,start look ,yes course ,yeah tell ,good good ,know know ,good guy ,long ago ,second chance ,want right
Topic 3:
know look ,know man ,know understand ,know help ,run away ,look pretty ,thing know ,tell good ,make way ,answer question
Topic 4:
dr lecter ,use tell ,like woman ,make think ,remember say ,think love ,make friend ,know friend ,middle night ,like come
Topic 5:
yes dear ,oh wow ,think help ,think father ,know true ,scar death ,cause know ,tomorrow morning ,oh look ,nice place
Topic 6:
want hurt ,yes think ,law school ,like real ,know time ,want believe ,read book ,gonna tell ,remember time ,think stop
Topic 7:
oh hi ,know suppose ,ask know ,make sure ,thing happen ,think little ,kill kill ,good morning ,believe want ,want e

In [19]:
display_topics(nmf, count_vect.get_feature_names(), 5)

NameError: name 'nmf' is not defined

In [21]:
no_top_words = 10
display_topics(nmf2, count_vect2.get_feature_names(), no_top_words)

Topic 0:
you know you know it know it you know you you know him know him you know me you know he know me well you know
Topic 1:
you mean you mean you mean you know you mean you mean it mean it you mean we see you mean you mean like mean like
Topic 2:
thank you thank you sir you sir thank you much you much yes thank you yes thank you thank you thank you thank you thank
Topic 3:
you go think you go think you know you go you go it go you you go you you you go well you go go it
Topic 4:
you think think you you think you you think it think it you think he think he make you think you think we make you
Topic 5:
you say you say you say you you say it say it think you say know you say make you say whatever you say would you say
Topic 6:
yes sir yes sir it sir it sir yes sir yes sir you sir yes sir you yes sir we sir we yes sir they
Topic 7:
you want you want me want me you want know want know you want go want go you want it want it it you want
Topic 8:
you talk know you talk hell you talk hell 

In [9]:
# count_vect.vocabulary_

In [13]:
# words_tfidf[1,:].toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])