In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk
import gensim, logging
import cython
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Emit timestamped INFO-level log records so library progress messages appear in cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

## Load Data

In [2]:
# Load the pre-processed movie-dialogue table from disk.
# A context manager guarantees the file handle is closed even if unpickling fails.
# NOTE: pickle can execute arbitrary code while loading — only unpickle files you trust.
with open("../data/movies_small.p", 'rb') as movies_file:
    movies = pickle.load(movies_file)

In [3]:
# Preview the first rows to confirm the table loaded with the expected columns.
movies.head()

Unnamed: 0,gender_to,movie_id,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m,m0,f,u0,u2,L194,we make quick roxanne korrine andrew barrett i...,1999,comedy
1,m,m0,f,u0,u2,L195,well i think we start pronunciation okay you,1999,comedy
2,m,m0,f,u0,u2,L196,hacking gagging spit part please,1999,comedy
3,m,m0,f,u0,u2,L197,okay bout we try french cuisine saturday night,1999,comedy
4,m,m0,f,u0,u2,L198,you ask me cute your name,1999,comedy


### Train Vector Model

In [4]:
# Tokenise every non-empty dialogue line by splitting on single spaces, producing
# the list of token lists that is passed to Word2Vec for training.
sentences = [line.split(' ') for line in movies['words'] if line != ""]
# Show the first few tokenised lines as a quick sanity check.
sentences[:5]

[['we',
  'make',
  'quick',
  'roxanne',
  'korrine',
  'andrew',
  'barrett',
  'incredibly',
  'horrendous',
  'public',
  'break',
  'quad'],
 ['well', 'i', 'think', 'we', 'start', 'pronunciation', 'okay', 'you'],
 ['hacking', 'gagging', 'spit', 'part', 'please'],
 ['okay', 'bout', 'we', 'try', 'french', 'cuisine', 'saturday', 'night'],
 ['you', 'ask', 'me', 'cute', 'your', 'name']]

In [5]:
# Train word embeddings on all tokenised dialogue lines.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in gensim 4) —
# confirm the installed gensim version before re-running.
model_all = gensim.models.Word2Vec(
    sentences,
    size=100,      # dimensionality of each word vector
    min_count=3,   # ignore words that occur fewer than 3 times
    workers=4,     # number of training worker threads
)

2018-05-28 18:09:06,216 : INFO : collecting all words and their counts
2018-05-28 18:09:06,220 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-28 18:09:06,338 : INFO : PROGRESS: at sentence #10000, processed 66935 words, keeping 6555 word types
2018-05-28 18:09:06,445 : INFO : PROGRESS: at sentence #20000, processed 134947 words, keeping 9794 word types
2018-05-28 18:09:06,563 : INFO : PROGRESS: at sentence #30000, processed 205174 words, keeping 12281 word types
2018-05-28 18:09:06,666 : INFO : PROGRESS: at sentence #40000, processed 274660 words, keeping 14378 word types
2018-05-28 18:09:06,738 : INFO : PROGRESS: at sentence #50000, processed 338950 words, keeping 16004 word types
2018-05-28 18:09:06,807 : INFO : PROGRESS: at sentence #60000, processed 410874 words, keeping 17817 word types
2018-05-28 18:09:06,904 : INFO : PROGRESS: at sentence #70000, processed 482188 words, keeping 19217 word types
2018-05-28 18:09:06,965 : INFO : PROGRESS: at sen

2018-05-28 18:09:29,684 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-28 18:09:29,708 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-28 18:09:29,713 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-28 18:09:29,715 : INFO : EPOCH - 5 : training on 2048050 raw words (1481119 effective words) took 3.7s, 399878 effective words/s
2018-05-28 18:09:29,717 : INFO : training on a 10240250 raw words (7404484 effective words) took 18.7s, 395748 effective words/s


### Test Model

In [6]:
# Sanity check: similarity score between the learned vectors of two related words.
model_all.wv.similarity('fork', 'spoon')

0.9298682953442836

In [7]:
# Sanity check: ask the model which word in the list fits least well with the others.
model_all.wv.doesnt_match("breakfast cereal dinner lunch".split())

2018-05-28 18:09:29,756 : INFO : precomputing L2-norms of word weight vectors


'cereal'

In [8]:
# Inspect the raw embedding vector learned for a single word.
model_all.wv['woman']

array([ 1.8887823 ,  0.5843662 , -1.4433612 ,  1.6528975 , -1.1323974 ,
        1.8598673 , -0.6699696 , -0.79856366, -0.4462283 , -1.124497  ,
        0.09772055,  0.06851473, -1.9076995 , -0.49779946, -0.16395466,
        1.0017685 , -0.65013546,  1.4275025 , -0.33479625, -0.27996236,
       -0.23167257, -0.40326747,  1.4194703 , -0.34521198,  0.534384  ,
        0.93979645,  0.8191274 ,  1.8529729 , -1.1244892 , -1.0976552 ,
       -0.9274573 ,  0.7849394 , -0.4446491 ,  0.48743358,  0.4943941 ,
        0.07061413,  0.9832355 , -0.8781738 , -1.3890562 ,  0.38256395,
        0.25277007, -1.1496593 ,  1.4726934 ,  1.1602194 ,  0.5442263 ,
       -0.50361717,  1.0379058 , -0.585961  , -0.6740674 ,  0.7058485 ,
       -0.3550793 ,  0.22099417,  0.7024282 ,  0.38005868, -0.2126683 ,
       -0.7860658 , -0.27896193, -0.6637623 , -1.1320485 ,  0.08148002,
        0.80921525,  0.01282703, -1.4499536 ,  1.1806929 , -1.1590331 ,
       -0.01213041,  0.6948338 , -1.08083   , -0.6157887 ,  0.75

### Save Model

In [9]:
# Persist the trained model to disk so it can be reused without retraining.
# NOTE(review): assumes the mdl/ directory already exists — confirm on a fresh checkout.
model_all.save("mdl/model_all")
# To load: model_all = gensim.models.Word2Vec.load("mdl/model_all")

2018-05-28 18:09:29,885 : INFO : saving Word2Vec object under mdl/model_all, separately None
2018-05-28 18:09:29,889 : INFO : not storing attribute vectors_norm
2018-05-28 18:09:29,896 : INFO : not storing attribute cum_table
2018-05-28 18:09:30,683 : INFO : saved mdl/model_all


## Exploration

In [10]:
# Count the non-empty dialogue lines available for each genre.
df11 = (
    movies.groupby('genre')['words']
    .apply(lambda genre_lines: (genre_lines != '').sum())
    .reset_index(name='count')
)
df11

Unnamed: 0,genre,count
0,action,60933
1,adventure,10490
2,animation,3562
3,biography,13131
4,comedy,69155
5,crime,33927
6,documentary,1428
7,drama,71651
8,family,534
9,fantasy,4502


In [11]:
# Dialogue lines from drama films, split by `gender_from` (presumably the speaker's gender).
male_drama = movies.loc[
    (movies['genre'] == 'drama') & (movies['gender_from'] == 'm'),
    'words',
]
female_drama = movies.loc[
    (movies['genre'] == 'drama') & (movies['gender_from'] == 'f'),
    'words',
]

In [12]:
# Collapse each group's lines into one space-separated document per gender.
male_drama_raw, female_drama_raw = (
    " ".join(lines) for lines in (male_drama, female_drama)
)

In [13]:
# Dialogue lines from action films, split by `gender_from` (presumably the speaker's gender).
male_action = movies.loc[
    (movies['genre'] == 'action') & (movies['gender_from'] == 'm'),
    'words',
]
female_action = movies.loc[
    (movies['genre'] == 'action') & (movies['gender_from'] == 'f'),
    'words',
]

In [14]:
# Collapse each group's lines into one space-separated document per gender.
male_action_raw, female_action_raw = (
    " ".join(lines) for lines in (male_action, female_action)
)

In [15]:
# Dialogue lines from comedy films, split by `gender_from` (presumably the speaker's gender).
male_comedy = movies.loc[
    (movies['genre'] == 'comedy') & (movies['gender_from'] == 'm'),
    'words',
]
female_comedy = movies.loc[
    (movies['genre'] == 'comedy') & (movies['gender_from'] == 'f'),
    'words',
]

In [16]:
# Collapse each group's lines into one space-separated document per gender.
male_comedy_raw, female_comedy_raw = (
    " ".join(lines) for lines in (male_comedy, female_comedy)
)

In [17]:
# Same male/female comedy split, restricted to the single film with movie_id 'm0'.
male_comedy_m0 = movies.loc[
    (movies['genre'] == 'comedy') & (movies['gender_from'] == 'm') & (movies['movie_id'] == 'm0'),
    'words',
]
female_comedy_m0 = movies.loc[
    (movies['genre'] == 'comedy') & (movies['gender_from'] == 'f') & (movies['movie_id'] == 'm0'),
    'words',
]
# Collapse each group's lines into one space-separated document per gender.
male_comedy_m0_raw, female_comedy_m0_raw = (
    " ".join(lines) for lines in (male_comedy_m0, female_comedy_m0)
)

### Using TF-IDF

In [18]:
# Fit a fresh TF-IDF vectorizer on a two-document corpus:
# row 0 = all male drama dialogue, row 1 = all female drama dialogue.
v_dra = TfidfVectorizer()
tfidf_dra = v_dra.fit_transform(
    [male_drama_raw, female_drama_raw]
)

In [19]:
# Similarity between the male (row 0) and female (row 1) drama TF-IDF vectors;
# the result is a 1x1 matrix, so [0,0] extracts the scalar.
cosine_similarity(tfidf_dra[0], tfidf_dra[1])[0,0]

0.9967581870544995

In [20]:
# Fit a fresh TF-IDF vectorizer on a two-document corpus:
# row 0 = all male action dialogue, row 1 = all female action dialogue.
v_act = TfidfVectorizer()
tfidf_act = v_act.fit_transform(
    [male_action_raw, female_action_raw]
)

In [21]:
# Similarity between the male (row 0) and female (row 1) action TF-IDF vectors;
# the result is a 1x1 matrix, so [0,0] extracts the scalar.
cosine_similarity(tfidf_act[0], tfidf_act[1])[0,0]

0.9956236092312521

In [22]:
# Fit a fresh TF-IDF vectorizer on a two-document corpus:
# row 0 = all male comedy dialogue, row 1 = all female comedy dialogue.
v_com = TfidfVectorizer()
tfidf_com = v_com.fit_transform(
    [male_comedy_raw, female_comedy_raw]
)

In [23]:
# Similarity between the male (row 0) and female (row 1) comedy TF-IDF vectors;
# the result is a 1x1 matrix, so [0,0] extracts the scalar.
cosine_similarity(tfidf_com[0], tfidf_com[1])[0,0]

0.9968362064155483

In [24]:
# TF-IDF comparison of male vs. female dialogue within the single comedy film 'm0'.
# Fit the vectorizer created for this comparison (`v_com0`). Fitting the comedy-wide
# `v_com` here instead would overwrite its learned vocabulary and leave `v_com0` unused.
v_com0 = TfidfVectorizer()
tfidf_com0 = v_com0.fit_transform([male_comedy_m0_raw, female_comedy_m0_raw])
# Row 0 is the male document, row 1 the female one; [0,0] extracts the scalar similarity.
cosine_similarity(tfidf_com0[0], tfidf_com0[1])[0,0]

0.8620607491773777