In [1]:
import pickle
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn import metrics 
from sklearn.decomposition import NMF, LatentDirichletAllocation
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
# print (pd.__name__, pd.__version__)

In [59]:
text = pd.read_pickle("../data/movies_lines_train.p")
text_all = pd.read_pickle("../data/movies.p")

In [61]:
text_all.shape

(304354, 9)

In [60]:
text_all.head()

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy


### Common n-grams

In [None]:
def grams_df(df, numgrams):

    grams_df = pd.DataFrame({'grams': [], 'line_id': []})

    for index, row in df.iterrows():
        #print(index)
        #create list of bigrams for each text
        text = row['words']    
        token = nltk.word_tokenize(text)
        grams_list = list(ngrams(token, numgrams))
        #grams_list = [gram for gram in grams]

        #create dict of ngrams
        line = row['line_id']
        gen = row['gender_from']
        d = {'grams': grams_list, 'line_id': np.repeat(line, len(grams_list)), 'gender_from': np.repeat(gen, len(grams_list))}
        grams_df = pd.concat([grams_df, pd.DataFrame(d)], axis = 0)
        
    return grams_df

In [None]:
test = text[:5]

In [None]:
grams_df = grams_df(test,3)
grams_df

In [None]:
bigrams = grams_df(text[:10000],2)

In [None]:
bigrams = grams_df(text[:50000],2)

In [None]:
bigrams['grams'].value_counts()[:10]

In [None]:
# remove unknown gender
bigrams = bigrams[bigrams['gender_from'] != '?']

# get counts
grouped = bigrams.groupby(['gender_from', 'grams']).count()
g = grouped['line_id'].groupby(level=0, group_keys=False).apply(lambda x: x.sort_values(ascending=False).head(10))
g

In [None]:
bigrams_genre = pd.merge(bigrams, text[['genre', 'line_id']], on = 'line_id')

In [None]:
bigrams_genre.head()

In [None]:
grouped2 = bigrams_genre.groupby(['genre','gender_from', 'grams']).count()
g2 = grouped2['line_id'].groupby(['genre', 'gender_from'], group_keys=False).apply(lambda x: x.sort_values(ascending=False).head(5))
g2.reset_index()

### Topic Modeling 

In [27]:
#mallet_path = '/Users/jasmindial/Desktop/mallet-2.0.8/bin/mallet'
mallet_path = "../mallet-2.0.8/bin/mallet"

In [11]:
# remove terms with low frequency

# get all the words
total = []
for index, row in text.iterrows():
    Text = row['words']    
    token = nltk.word_tokenize(Text)
    total.extend(token) 

In [None]:
high_freq = nltk.FreqDist(total).most_common(round(.9*text.shape[0]))
high_freq = [word for (word, count) in high_freq]

In [24]:
def clean_words(x):

    tags = ['IN', 'CD', 'MD']
    text = nltk.word_tokenize(x)
    tags = nltk.pos_tag(text)
    
    # remove non-meaningful POS & very low-frequency
    words = [word for (word, tag) in tags if tag not in tags if word in high_freq]
    
    return words

text['words'] = text['words'].apply(clean_words)

In [26]:
# get two separate texts 
women = text[text['gender_from'] == 'f']
women.name = 'women'
men = text[text['gender_from'] == 'm']
men.name = 'men'

In [27]:
women.shape

(51762, 9)

In [28]:
men.shape

(125707, 9)

In [29]:
stopWords = set(stopwords.words('english'))

In [None]:
# source: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# trying the genism version 

In [30]:
def add_ngrams(texts):
    
    bigram = gensim.models.Phrases(texts, common_terms=stopWords, min_count=5, threshold=25) 
    trigram = gensim.models.Phrases(bigram[texts], threshold=25)  

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stopWords] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    
    return texts

In [31]:
def prep_for_model(series):
    
    # convert to list
    data = series.values.tolist()

    # tokenize 
    def sent_to_words(sentences):
        for sentence in sentences:
            yield(gensim.utils.simple_preprocess(str(sentence)))  

    data_words = list(sent_to_words(data)) 
    data_words = add_ngrams(data_words)
    
    # create dictionary
    id2word = corpora.Dictionary(data_words)

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data_words]
    
    return data_words, id2word, corpus

In [32]:
# just testing 
data_words, id2word, corpus = prep_for_model(women.words)
# make sure grams are working
len(set([word for sent in data_words for word in sent if "_" in word]))



223

In [33]:
# doesn't like this
#lda_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=8, id2word=id2word)

In [41]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        #print (num_topics)
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=id2word, num_topics=num_topics, 
                                                    random_state=100, update_every=1, chunksize=100, passes=10,
                                                    alpha='auto', per_word_topics=True)
        #lda_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(lda_model)
        coherencemodel = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# run models and get coherence values
# model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
                                                        #texts=data_words, start=3, limit=13, step=3)

In [42]:
# run topic modeling for both gender scripts 
# this takes a while :)

common_genres = ["action", "drama", "comedy", "crime"]
#dictionary of models
d = {genre:{} for genre in common_genres}

topics = []
coherence = []

for gen in [women, men]:
    print (gen.name)
    for x in common_genres:
        df = gen[gen.genre == x]
        print(df.shape)
        group = df.words
    
        data_words, id2word, corpus = prep_for_model(group)
        model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
                                                            texts=data_words, start=4, limit=13, step=2)
        winner = np.argmax(coherence_values)
        d[x][gen.name] = model_list[winner]
        topics.append(model_list[winner].show_topics())
        coherence.append(max(coherence_values))

women
(7808, 9)




(11905, 9)
(14644, 9)
(6612, 9)
men
(29431, 9)
(28995, 9)
(28992, 9)
(15973, 9)


In [57]:
# run topic modeling for both overall gender scripts 
# add the results to the dictionary

d['overall'] = {}
topics2 = []
coherence2 = []

for gen in [women, men]:
    print (gen.name)
    words = gen.words
    data_words, id2word, corpus = prep_for_model(gen)
    model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
                                                        texts=data_words, start=4, limit=13, step=2)
    winner = np.argmax(coherence_values)
    d['overall'][gen.name] = model_list[winner]
    topics2.append(model_list[winner].show_topics())
    coherence2.append(max(coherence_values))
    
    

women




men


KeyboardInterrupt: 

In [62]:
d

{'action': {'men': <gensim.models.ldamodel.LdaModel at 0x12374a940>,
  'women': <gensim.models.ldamodel.LdaModel at 0x125a3bdd8>},
 'comedy': {'men': <gensim.models.ldamodel.LdaModel at 0x12374e4e0>,
  'women': <gensim.models.ldamodel.LdaModel at 0x126d657b8>},
 'crime': {'men': <gensim.models.ldamodel.LdaModel at 0x102169dd8>,
  'women': <gensim.models.ldamodel.LdaModel at 0x12385c940>},
 'drama': {'men': <gensim.models.ldamodel.LdaModel at 0x112341470>,
  'women': <gensim.models.ldamodel.LdaModel at 0x1235f5278>},
 'overall': {'women': <gensim.models.ldamodel.LdaModel at 0x10d813828>}}

In [103]:
# create features for classification
# run this on the whole set of movies 

def topic_features(df, d, common_genres):

    # create empty features to be filled
    df = pd.concat([df,pd.DataFrame(columns=["WT1", "WT2", "WT3", "MT1", "MT2", "MT3"])])
    df_mf = df[df.gender_from != '?']


    for gender in ["women", "men"]: 

        # for lines from common genres, get probabilities based on genre-and-gender-specific model
        for genre in common_genres:

            #subset the data
            genre_df = df[df.genre == genre]
            genre_df.reset_index(inplace = True)

            #select appropriate model 
            model = d[genre][gender]

            #split data into chunks
            indices = [x for x in range(0, len(genre_df), 6611)]
            indices.append(len(genre_df))
            splits = [genre_df.index[indices[i]:indices[i+1]] for i in range(len(indices)-1)]
            subsets = [genre_df.iloc[split] for split in splits]

            for subset in subsets: 
                data_words, id2word, corpus = prep_for_model(subset.words)
                try: 
                    gammas = model.inference(corpus)[0]
                except IndexError: 
                    print ("index error!")
                else: 
                    print ("ok!")
                    #select only top 3 topics
                    trunc_gammas = [x[:3] for x in gammas]

                #add gammas to the dataframe
                if gender == "women":
                    df.loc[subset['index'], ['WT1', 'WT2', "WT3"]] = trunc_gammas
                if gender == "men":
                    df.loc[subset['index'], ['MT1', 'MT2', "MT3"]] = trunc_gammas
                    
            #print (genre)
            return df
        """            
        # for lines from uncommon genres, use probabilities based on gender-specific model

        uncommon_genres = text_topics[text_topics.genre not in common_genres]

        #choose model 
        model = d['overall'][gender]
        gammas = model.inference(uncommon_genres)
        if gender == "women":
            text_topics.loc[text_topics.genre not in common_genres, ['WT1', 'WT2', "WT3"]] = gammas
        if gender == "men":
            text_topics.loc[text_topics.genre not in common_genres, ['MT1', 'MT2', "MT3"]] = gammas
        """



In [104]:
topic_features(text_all, d, common_genres)



ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
action


In [216]:
#just making features that don't make sense

text_topics = pd.concat([text,pd.DataFrame(columns=["WT1", "WT2", "WT3", "MT1", "MT2", "MT3"])])
text_topics = text_topics[text_topics.gender_to != '?']


for gender in [women.name, men.name]: 
    #choose a model 
    model = d["action"][gender]
    
    #get gamma, probability of topics for each row
    #indices = [0, 51762, 103524, 155286, len(text_topics)]
    indices = [x for x in range(0, len(text_topics), 5000)]
    indices.append(len(text_topics))
    splits = [text_topics.index[indices[i]:indices[i+1]] for i in range(len(indices)-1)]
    subsets = [text_topics.iloc[split] for split in splits]
    for subset in subsets: 
        data_words, id2word, corpus = prep_for_model(subset.words)
        try: 
            gammas = model.inference(corpus)[0]
        except IndexError: 
            print ("index!")

        else: 
            print ("ok!")
            trunc_gammas = [x[:3] for x in gammas]


        if gender == "women":
            text_topics.loc[subset.index, ['WT1', 'WT2', "WT3"]] = trunc_gammas
        if gender == "men":
            text_topics.loc[subset.index, ['MT1', 'MT2', "MT3"]] = trunc_gammas



ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!
ok!


In [218]:
pickle.dump(text_topics, open('../data/movies_lines_train_topics.p', 'wb'))


In [154]:
text_topics.shape
[s[indices[i]:indices[i+1]] for i in xrange(len(indices)-1)]

(202394, 15)

In [156]:
splits = [0, 51762, 103524, 155286, len(text_topics)]

In [166]:
testing = [text_topics.index[splits[i]:splits[i+1]] for i in range(len(splits)-1)]

In [168]:
text_topics.iloc[testing[1]]

Unnamed: 0,MT1,MT2,MT3,WT1,WT2,WT3,char_id_from,char_id_to,gender_from,gender_to,genre,line_id,movie_id,movie_year,words
51762,,,,,,,u8830,u8843,m,f,crime,L645156,m599,1987,okay
51763,,,,,,,u8830,u8843,m,f,crime,L645157,m599,1987,you i hear you guy handle fairchild food merge...
51764,,,,,,,u8830,u8843,m,f,crime,L645158,m599,1987,come buddy you want get me disbar would you
51765,,,,,,,u8830,u8843,m,f,crime,L645159,m599,1987,listen it one college buddy talk another
51766,,,,,,,u8830,u8843,m,f,crime,L645160,m599,1987,yeah right
51767,,,,,,,u8830,u8843,m,f,crime,L645161,m599,1987,relax roger everybody it you know you know
51768,,,,,,,u8830,u8843,m,f,crime,L645162,m599,1987,i it moi
51769,,,,,,,u8830,u8843,m,f,crime,L645163,m599,1987,money you ever dream roger thing one get hurt ...
51770,,,,,,,u8830,u8843,m,f,crime,L645164,m599,1987,much i walk you
51771,,,,,,,u8830,u8843,m,f,crime,L645253,m599,1987,you put dime roger


In [151]:
type(corpus)

list

In [147]:
#need to split the data into chunks, too long right now!! 

data_words, id2word, corpus = prep_for_model(subset.words[:100])




In [188]:
test_gammas = model.inference(corpus)[0]

In [196]:
trunc_gammas = [x[:3] for x in test_gammas]

In [199]:
len(test_gammas)

7808

In [None]:
topics[0]

In [None]:
topics[1]

In [None]:
coherence

In [None]:
# coherence_values w 5,10,15,20 and w/o removing stop words (10 does best)

In [None]:
# coherence_values w 4,8,12,16,20 and w/o removing stop words (8 does best)

In [None]:
# trying the sklearn version

texts = [men.words, women.words]

In [None]:
ngram_range = [(1,1), (2,3)] # bag of words, bigrams and trigrams
max_features = [1000]
no_topics = [8, 10, 15]
no_top_words = [5, 10]

In [None]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ,".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
# inspiration source: https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [None]:
# only lda                       

def topic_model(texts):

    for a in ngram_range:
        print (a)
        for x in max_features:
            print (x)
            for y in no_topics: 
                print (y)
                for text in texts: 

                    #transform
                    count_vect = CountVectorizer(ngram_range = a, max_features = x, stop_words = 'english') # using bigrams and trigrams
                    word_counts = count_vect.fit_transform(text, )
                    tfidf_transformer = TfidfTransformer()
                    words_tfidf = tfidf_transformer.fit_transform(word_counts)

                    lda = LatentDirichletAllocation(n_topics=y, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(word_counts) 
                        
                    for z in no_top_words: 
                        print (z)
                        display_topics(lda, count_vect.get_feature_names(), z)


In [None]:
topic_model(texts)

In [None]:
# transform
count_vect = CountVectorizer(ngram_range = (2,3), max_features = 1000, stop_words = 'english') # using bigrams and trigrams
w_words_counts = count_vect.fit_transform(women.words, )
tfidf_transformer = TfidfTransformer()
w_words_tfidf = tfidf_transformer.fit_transform(w_words_counts)

In [None]:
# transform
#count_vect2 = CountVectorizer(ngram_range = (2,3)) # using bigrams and trigrams
count_vect2 = CountVectorizer(ngram_range = (2,3), max_features = 1000, stop_words = 'english') # using bigrams and trigrams
m_words_counts = count_vect2.fit_transform(men.words, )
tfidf_transformer = TfidfTransformer()
m_words_tfidf = tfidf_transformer.fit_transform(m_words_counts)

In [None]:
nmf = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(w_words_tfidf)
nmf2 = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(m_words_tfidf)

In [None]:
# Run LDA for women
lda = LatentDirichletAllocation(n_topics=10, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(w_words_counts) 

In [None]:
# LDA for men
lda2 = LatentDirichletAllocation(n_topics=10, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(m_words_counts) 

In [None]:
no_top_words = 10
display_topics(lda, count_vect.get_feature_names(), no_top_words)

In [None]:
display_topics(lda2, count_vect.get_feature_names(), no_top_words)

In [None]:
no_top_words = 10
display_topics(nmf2, count_vect2.get_feature_names(), no_top_words)

In [None]:
# count_vect.vocabulary_

In [None]:
# words_tfidf[1,:].toarray()