In [1]:
import pickle
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn import metrics 
from sklearn.decomposition import NMF, LatentDirichletAllocation
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
# print (pd.__name__, pd.__version__)

In [75]:
# Load the movie-lines table from a pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it
# comes from a trusted source.
text = pd.read_pickle("../data/movies.p")

In [77]:
text.shape

304354

In [8]:
text.head()

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy


### Common n-grams

In [None]:
def grams_df(df, numgrams):
    """Build a long-format table with one row per n-gram found in ``df['words']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'words', 'line_id' and 'gender_from'.
    numgrams : int
        Size of the n-grams to extract (2 for bigrams, 3 for trigrams, ...).

    Returns
    -------
    pandas.DataFrame
        Columns 'grams' (tuple of tokens), 'line_id' and 'gender_from', one row
        per n-gram, in the order the lines appear in ``df``. The index is a
        fresh 0..n-1 range. An empty input yields an empty frame with the same
        three columns.
    """
    grams, line_ids, genders = [], [], []

    # Accumulate plain Python lists and build the frame once at the end;
    # concatenating a DataFrame per input row is quadratic in the number of rows.
    for words, line_id, gender in zip(df['words'], df['line_id'], df['gender_from']):
        tokens = nltk.word_tokenize(words)
        line_grams = list(ngrams(tokens, numgrams))

        grams.extend(line_grams)
        # repeat the line metadata once per n-gram extracted from that line
        line_ids.extend([line_id] * len(line_grams))
        genders.extend([gender] * len(line_grams))

    return pd.DataFrame(
        {'grams': grams, 'line_id': line_ids, 'gender_from': genders},
        columns=['grams', 'line_id', 'gender_from'],
    )

In [None]:
test = text[:5]

In [None]:
# Bind the result to its own name: assigning it to `grams_df` would replace the
# function with a DataFrame and break every later `grams_df(...)` call.
trigrams_test = grams_df(test, 3)
trigrams_test

In [None]:
bigrams = grams_df(text[:10000],2)

In [None]:
bigrams = grams_df(text[:50000],2)

In [None]:
bigrams['grams'].value_counts()[:10]

In [None]:
# drop the rows whose `gender_from` is the unknown marker '?'
known_gender = bigrams['gender_from'] != '?'
bigrams = bigrams[known_gender]

# count rows per (gender, bigram), then keep the ten largest counts per gender
grouped = bigrams.groupby(['gender_from', 'grams']).count()
line_counts = grouped['line_id']
g = line_counts.groupby(level=0, group_keys=False).apply(
    lambda counts: counts.sort_values(ascending=False).head(10)
)
g

In [None]:
bigrams_genre = pd.merge(bigrams, text[['genre', 'line_id']], on = 'line_id')

In [None]:
bigrams_genre.head()

In [None]:
# count rows per (genre, gender, bigram), then keep the five largest counts
# inside every (genre, gender) pair
genre_gender = ['genre', 'gender_from']
grouped2 = bigrams_genre.groupby(genre_gender + ['grams']).count()
g2 = (
    grouped2['line_id']
    .groupby(genre_gender, group_keys=False)
    .apply(lambda counts: counts.sort_values(ascending=False).head(5))
)
g2.reset_index()

### Topic Modeling 

In [27]:
# Path to the MALLET launcher, kept relative to this notebook rather than an
# absolute per-user location.
mallet_path = "../mallet-2.0.8/bin/mallet"

In [56]:
# remove terms with low frequency
# Step 1: collect every token of every line into one flat list.
# The loop variable must not be called `text`: that would rebind the DataFrame to
# the last line's string and break every later cell that uses `text`.
total = []
for line in text['words']:
    total.extend(nltk.word_tokenize(line))

In [80]:
# most_common() yields (word, count) pairs, so membership has to be tested against
# the words themselves; a set keeps each lookup O(1) instead of scanning a list
# once per token.
# NOTE(review): the cutoff is 90% of the number of rows in `text`, not of the
# vocabulary size - confirm this is the intended threshold.
high_freq = nltk.FreqDist(total).most_common(round(.9*text.shape[0]))
high_freq_vocab = {word for word, _count in high_freq}
high_freq_words = [word for word in total if word in high_freq_vocab]
len(high_freq_words)

KeyboardInterrupt: 

In [13]:
# split the lines into two frames on the `gender_from` column
is_female = text['gender_from'] == 'f'
is_male = text['gender_from'] == 'm'

women = text[is_female]
men = text[is_male]

# tag each frame with a human-readable label
women.name = 'women'
men.name = 'men'

In [14]:
women.shape

(80720, 9)

In [15]:
men.shape

(188200, 9)

In [16]:
stopWords = set(stopwords.words('english'))

In [None]:
# source: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# trying the gensim version

In [17]:
def add_ngrams(texts):
    """Remove stop words from tokenised documents and merge detected phrases.

    Parameters
    ----------
    texts : list of list of str
        Tokenised documents; the phrase models are trained on them as given.

    Returns
    -------
    list
        One token sequence per input document, with stop words (the notebook-level
        `stopWords` set) removed and the bigram, then trigram, phrase models applied.
    """
    # Train the phrase models on the unfiltered tokens; stop words are passed as
    # `common_terms` to the bigram model.
    bigram = gensim.models.Phrases(texts, common_terms=stopWords, min_count=5, threshold=50)
    trigram = gensim.models.Phrases(bigram[texts], threshold=15)

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    filtered = [[word for word in simple_preprocess(str(doc)) if word not in stopWords] for doc in texts]

    # One bigram pass feeding the trigram pass. Running the bigram model over every
    # document first and then again inside the trigram step doubled the work without
    # a separate purpose.
    return [trigram_mod[bigram_mod[doc]] for doc in filtered]

In [18]:
def prep_for_model(series):
    """Convert a Series of text lines into the inputs used for gensim topic models.

    Returns a 3-tuple ``(data_words, id2word, corpus)``: the processed token lists
    (output of `add_ngrams`), the gensim Dictionary built from them, and the
    bag-of-words representation of every document.
    """
    # tokenise every line (each value is cast to str first)
    tokenised = [
        gensim.utils.simple_preprocess(str(sentence))
        for sentence in series.values.tolist()
    ]

    # stop-word removal and phrase merging
    data_words = add_ngrams(tokenised)

    # token <-> id mapping
    id2word = corpora.Dictionary(data_words)

    # per-document term frequencies
    corpus = list(map(id2word.doc2bow, data_words))

    return data_words, id2word, corpus

In [21]:
# just testing
data_words, id2word, corpus = prep_for_model(women.words)
# make sure grams are working: count the distinct tokens that contain "_"
phrase_tokens = {word for sent in data_words for word in sent if "_" in word}
len(phrase_tokens)



396

In [22]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=id2word, num_topics=6, 
                                                    random_state=100, update_every=1, chunksize=100, passes=10,
                                                    alpha='auto', per_word_topics=True)

In [24]:
#returns gamma, probability of topics for row 
test = lda_model.inference(corpus)

In [33]:
#one per row
len(test[0])

80720

In [37]:
test[0][800]

array([ 22.99334526,  27.18163872,  23.92817688,  18.76008797,
        20.13619232,  21.81669617], dtype=float32)

In [33]:
# doesn't like this
#lda_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=8, id2word=id2word)


In [21]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Fit one LDA model per candidate topic count and score each with c_v coherence.

    Parameters
    ----------
    dictionary : gensim Dictionary
        Token/id mapping; used both to fit the models and to score them.
    corpus : list
        Bag-of-words documents the models are fitted on.
    texts : list of list of str
        Tokenised documents handed to CoherenceModel.
    limit : int
        Exclusive upper bound for the number of topics.
    start : int, default 2
        First number of topics to try.
    step : int, default 3
        Increment between candidate topic counts.

    Returns
    -------
    model_list : list
        The fitted LdaModel objects, one per value in ``range(start, limit, step)``.
    coherence_values : list
        Coherence score of the model at the same position in ``model_list``.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        print (num_topics)
        # Fit with the `dictionary` argument; reading the notebook-level `id2word`
        # here only worked while callers happened to pass that same object.
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                                                    random_state=100, update_every=1, chunksize=100, passes=10,
                                                    alpha='auto', per_word_topics=True)
        model_list.append(lda_model)
        coherencemodel = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# run models and get coherence values
# model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
                                                        #texts=data_words, start=3, limit=13, step=3)

In [37]:
# run topic modeling for both gender scripts 
# this takes a while :)

common_genres = ["action", "drama", "comedy", "crime"]

# Both lists are filled in loop order: the four genres for women first, then the
# four genres for men.
topics = []
coherence = []

# Pair each frame with an explicit label instead of reading an ad-hoc `.name`
# attribute off the DataFrame, which is not carried over when a frame is
# re-created (filtered, copied, reloaded).
for label, gen in [('women', women), ('men', men)]:
    print (label)
    for x in common_genres:
        df = gen[gen.genre == x]
        group = df.words

        data_words, id2word, corpus = prep_for_model(group)
        model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
                                                            texts=data_words, start=4, limit=13, step=2)
        # keep the topics and the score of the best-scoring candidate
        winner = np.argmax(coherence_values)
        topics.append(model_list[winner].show_topics())
        coherence.append(max(coherence_values))

women




4
6
8
10
12
4
6
8
10
12
4
6
8
10
12
4
6
8
10
12
men
4
6
8
10
12
4
6
8
10
12
4
6
8
10
12
4
6
8
10
12


In [38]:
coherence

[0.46977957700179357,
 0.34450925461689547,
 0.36559425716214738,
 0.42960439997554917,
 0.31859532648701405,
 0.42040706275787021,
 0.31829103522739638,
 0.35003504796796642]

In [None]:
# create features
#
# Plan (pseudocode only, not runnable yet):
#
# for each gender:
#     for genre in common_genres:
#         subset the data to that genre
#         model = d[genre][gender]
#         gammas = model.inference(subset.words)[0]
#         concatenate the gamma values onto the main dataframe
#
#     for the remaining genres (those not in common_genres):
#         model = d[overall][gender]
#         do the same thing




In [81]:
text.genre.unique()

array(['comedy', 'adventure', 'action', 'crime', 'fantasy', 'sci-fi',
       'drama', 'biography', 'horror', 'animation', 'thriller', 'mystery',
       'short', 'family', 'romance', 'documentary', 'film-noir'], dtype=object)

In [None]:
# run topic modeling for both gender scripts 
# this takes a while :)

# Both lists are filled in loop order: women first, then men.
topics = []
coherence = []

for gen in [women.words, men.words]:

    data_words, id2word, corpus = prep_for_model(gen)
    # compute_coherence_values has no MALLET-path parameter: a leading positional
    # `mallet_path` would bind to `dictionary` and clash with the `dictionary=`
    # keyword, raising a TypeError.
    model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
                                                        texts=data_words, start=8, limit=11, step=2)
    # keep the topics and the score of the best-scoring candidate
    winner = np.argmax(coherence_values)
    topics.append(model_list[winner].show_topics())
    coherence.append(max(coherence_values))
    
    

In [None]:
topics[0]

In [None]:
topics[1]

In [None]:
coherence

In [None]:
# coherence_values w 5,10,15,20 and w/o removing stop words (10 does best)

In [None]:
# coherence_values w 4,8,12,16,20 and w/o removing stop words (8 does best)

In [None]:
# trying the sklearn version

texts = [men.words, women.words]

In [None]:
ngram_range = [(1,1), (2,3)] # bag of words, bigrams and trigrams
max_features = [1000]
no_topics = [8, 10, 15]
no_top_words = [5, 10]

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable of per-topic weight arrays.
    feature_names : sequence
        Feature labels, indexable by the positions used in ``components_``.
    no_top_words : int
        How many features to print per topic.

    For each topic a header line ``Topic <idx>:`` is printed, followed by one line
    with the features joined by ", ", heaviest first.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to get the heaviest features first
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print ("Topic %d:" % (topic_idx))
        print (", ".join([feature_names[i] for i in top_indices]))
        
# inspiration source: https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [None]:
# only lda                       

def topic_model(texts):
    """Fit and print scikit-learn LDA topics for every parameter combination.

    Iterates over the notebook-level settings `ngram_range`, `max_features`,
    `no_topics` and `no_top_words` (each must be iterable) and, for every
    collection in `texts`, fits a count-based LDA model and prints its topics
    via `display_topics`.

    Parameters
    ----------
    texts : iterable
        Collections of raw documents; each one is vectorised and modelled separately.
    """
    for a in ngram_range:
        print (a)
        for x in max_features:
            print (x)
            for y in no_topics:
                print (y)
                for docs in texts:

                    # LDA is fitted on the raw n-gram counts, so no TF-IDF matrix is
                    # computed here (it was never used by this function).
                    count_vect = CountVectorizer(ngram_range = a, max_features = x, stop_words = 'english')
                    word_counts = count_vect.fit_transform(docs)

                    # `n_components` replaces the `n_topics` keyword, which newer
                    # scikit-learn releases no longer accept.
                    lda = LatentDirichletAllocation(n_components=y, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(word_counts)

                    # Look the vocabulary up once per model; prefer the newer accessor
                    # when the installed scikit-learn provides it.
                    if hasattr(count_vect, 'get_feature_names_out'):
                        feature_names = count_vect.get_feature_names_out()
                    else:
                        feature_names = count_vect.get_feature_names()

                    for z in no_top_words:
                        print (z)
                        display_topics(lda, feature_names, z)


In [None]:
topic_model(texts)

In [None]:
# transform
count_vect = CountVectorizer(ngram_range = (2,3), max_features = 1000, stop_words = 'english') # using bigrams and trigrams
w_words_counts = count_vect.fit_transform(women.words, )
tfidf_transformer = TfidfTransformer()
w_words_tfidf = tfidf_transformer.fit_transform(w_words_counts)

In [None]:
# transform
#count_vect2 = CountVectorizer(ngram_range = (2,3)) # using bigrams and trigrams
count_vect2 = CountVectorizer(ngram_range = (2,3), max_features = 1000, stop_words = 'english') # using bigrams and trigrams
m_words_counts = count_vect2.fit_transform(men.words, )
tfidf_transformer = TfidfTransformer()
m_words_tfidf = tfidf_transformer.fit_transform(m_words_counts)

In [None]:
nmf = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(w_words_tfidf)
nmf2 = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(m_words_tfidf)

In [None]:
# Run LDA for women
# `n_components` replaces the `n_topics` keyword, which newer scikit-learn
# releases no longer accept.
lda = LatentDirichletAllocation(n_components=10, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(w_words_counts) 

In [None]:
# LDA for men
# `n_components` replaces the `n_topics` keyword, which newer scikit-learn
# releases no longer accept.
lda2 = LatentDirichletAllocation(n_components=10, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(m_words_counts) 

In [None]:
no_top_words = 10
display_topics(lda, count_vect.get_feature_names(), no_top_words)

In [None]:
# lda2 was fitted on the counts produced by count_vect2, so the topic columns
# must be labelled with that vectorizer's vocabulary (not count_vect's).
display_topics(lda2, count_vect2.get_feature_names(), no_top_words)

In [None]:
no_top_words = 10
display_topics(nmf2, count_vect2.get_feature_names(), no_top_words)

In [None]:
# count_vect.vocabulary_

In [None]:
# words_tfidf[1,:].toarray()