In [16]:
import pickle
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn import metrics 

In [2]:
# print (pd.__name__, pd.__version__)

pandas 0.22.0


In [2]:
text = pd.read_pickle("movies.p")

In [3]:
text.shape

(304354, 9)

In [4]:
text.head()

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy


In [6]:
def grams_df(df, numgrams):

    grams_df = pd.DataFrame({'grams': [], 'line_id': []})

    for index, row in df.iterrows():
        #print(index)
        #create list of bigrams for each text
        text = row['words']    
        token = nltk.word_tokenize(text)
        grams_list = list(ngrams(token, numgrams))
        #grams_list = [gram for gram in grams]

        #create dict of ngrams
        line = row['line_id']
        gen = row['gender_from']
        d = {'grams': grams_list, 'line_id': np.repeat(line, len(grams_list)), 'gender_from': np.repeat(gen, len(grams_list))}
        grams_df = pd.concat([grams_df, pd.DataFrame(d)], axis = 0)
        
    return grams_df


In [7]:
test = text[:5]

In [8]:
grams_df = grams_df(test,3)
grams_df

Unnamed: 0,gender_from,grams,line_id
0,f,"(we, make, quick)",L194
1,f,"(make, quick, roxanne)",L194
2,f,"(quick, roxanne, korrine)",L194
3,f,"(roxanne, korrine, andrew)",L194
4,f,"(korrine, andrew, barrett)",L194
5,f,"(andrew, barrett, incredibly)",L194
6,f,"(barrett, incredibly, horrendous)",L194
7,f,"(incredibly, horrendous, public)",L194
8,f,"(horrendous, public, break)",L194
9,f,"(public, break, quad)",L194


In [7]:
bigrams = grams_df(text[:10000],2)



In [20]:
bigrams_series = bigrams.grams
bigrams_series[:10]

0                  (we, make)
1               (make, quick)
2            (quick, roxanne)
3          (roxanne, korrine)
4           (korrine, andrew)
5           (andrew, barrett)
6       (barrett, incredibly)
7    (incredibly, horrendous)
8        (horrendous, public)
9             (public, break)
Name: grams, dtype: object

In [27]:
# transform
count_vect = CountVectorizer() # using bag of words
words_counts = count_vect.fit_transform(text.words)
tfidf_transformer = TfidfTransformer()
words_tfidf = tfidf_transformer.fit_transform(counts)

In [28]:
count_vect.vocabulary_

{'we': 38952,
 'make': 21747,
 'quick': 28724,
 'roxanne': 30599,
 'korrine': 19956,
 'andrew': 1903,
 'barrett': 3334,
 'incredibly': 18018,
 'horrendous': 17139,
 'public': 28432,
 'break': 4851,
 'quad': 28645,
 'well': 39060,
 'think': 35804,
 'start': 33945,
 'pronunciation': 28234,
 'okay': 25229,
 'you': 40110,
 'hacking': 15882,
 'gagging': 14393,
 'spit': 33591,
 'part': 26234,
 'please': 27303,
 'bout': 4709,
 'try': 36836,
 'french': 14065,
 'cuisine': 8762,
 'saturday': 31069,
 'night': 24504,
 'ask': 2528,
 'me': 22420,
 'cute': 8870,
 'your': 40125,
 'name': 24034,
 'forget': 13810,
 'it': 18786,
 'my': 23951,
 'fault': 12941,
 'proper': 28248,
 'introduction': 18586,
 'cameron': 5658,
 'thing': 35797,
 'mercy': 22648,
 'particularly': 26246,
 'hideous': 16717,
 'breed': 4885,
 'loser': 21255,
 'sister': 32568,
 'date': 9127,
 'she': 31966,
 'seems': 31582,
 'like': 20841,
 'could': 8277,
 'get': 14781,
 'easy': 11323,
 'enough': 11940,
 'unsolved': 37737,
 'mystery': 239

In [30]:
words_tfidf[0,:].toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [13]:
bigrams = grams_df(text[:50000],2)



In [14]:
bigrams['grams'].value_counts()[:10]

(i, know)       1553
(you, know)     1347
(i, think)      1234
(i, want)        967
(i, get)         910
(you, get)       836
(you, want)      730
(gon, na)        625
(you, i)         602
(you, think)     596
Name: grams, dtype: int64

In [20]:
# remove unknown gender
bigrams = bigrams[bigrams['gender_from'] != '?']

# get counts
grouped = bigrams.groupby(['gender_from', 'grams']).count()
g = grouped['line_id'].groupby(level=0, group_keys=False).apply(lambda x: x.sort_values(ascending=False).head(10))
g

gender_from  grams       
f            (i, know)       514
             (you, know)     400
             (i, think)      391
             (i, want)       291
             (i, get)        256
             (you, i)        205
             (know, i)       199
             (you, want)     199
             (you, get)      183
             (i, go)         175
m            (i, know)       920
             (you, know)     838
             (i, think)      752
             (i, want)       602
             (you, get)      590
             (i, get)        582
             (you, want)     480
             (gon, na)       438
             (you, think)    378
             (i, go)         376
Name: line_id, dtype: int64

In [21]:
bigrams_genre = pd.merge(bigrams, text[['genre', 'line_id']], on = 'line_id')

In [22]:
bigrams_genre.head()

Unnamed: 0,gender_from,grams,line_id,genre
0,f,"(we, make)",L194,comedy
1,f,"(make, quick)",L194,comedy
2,f,"(quick, roxanne)",L194,comedy
3,f,"(roxanne, korrine)",L194,comedy
4,f,"(korrine, andrew)",L194,comedy


In [23]:
grouped2 = bigrams_genre.groupby(['genre','gender_from', 'grams']).count()
g2 = grouped2['line_id'].groupby(['genre', 'gender_from'], group_keys=False).apply(lambda x: x.sort_values(ascending=False).head(5))
g2.reset_index()

Unnamed: 0,genre,gender_from,grams,line_id
0,action,f,"(you, know)",80
1,action,f,"(i, think)",71
2,action,f,"(i, know)",64
3,action,f,"(you, get)",42
4,action,f,"(mr, peel)",36
5,action,m,"(you, know)",197
6,action,m,"(i, know)",185
7,action,m,"(gon, na)",153
8,action,m,"(i, want)",144
9,action,m,"(i, get)",140


In [24]:
bigrams.groupby(['gender_from', 'grams']).size().unstack()

grams,"(-, back)","(-, crawford)","(-, forget)","(-, i)","(-, mesa)","(-, way)","(0, 200)","(0, i)","(00, he)","(00, insurance)",...,"(zuul, refers)","(zuul, roylance)","(, catch)","(, you)","(-, your)","(, i)","(, it)","(, run)","(£, 150)","(­, quarter)"
gender_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f,,,,,,,,1.0,,,...,1.0,1.0,,,1.0,,,,,1.0
m,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,...,,,1.0,1.0,,2.0,1.0,1.0,1.0,
