In [7]:
import pandas as pd
import numpy as np
import pickle
import nltk
import gensim, logging
import cython
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Load Data

In [9]:
# Load the pickled movie-lines frame. The context manager closes the file
# handle as soon as unpickling finishes instead of leaking it.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open("../data/movies.p", 'rb') as movies_file:
    movies = pickle.load(movies_file)

### Train Vector Model

In [4]:
# Split every non-empty entry of movies['words'] on single spaces, giving one
# token list per dialogue line.
sentences = [line.split(' ') for line in movies['words'] if line != ""]
# Peek at the first five token lists.
sentences[:5]

[['we',
  'make',
  'quick',
  'roxanne',
  'korrine',
  'andrew',
  'barrett',
  'incredibly',
  'horrendous',
  'public',
  'break',
  'quad'],
 ['well', 'i', 'think', 'we', 'start', 'pronunciation', 'okay', 'you'],
 ['hacking', 'gagging', 'spit', 'part', 'please'],
 ['okay', 'bout', 'we', 'try', 'french', 'cuisine', 'saturday', 'night'],
 ['you', 'ask', 'me', 'cute', 'your', 'name']]

In [5]:
# Fit a Word2Vec model on the tokenized dialogue lines in `sentences`.
# NOTE(review): the `size` keyword was renamed `vector_size` in gensim 4 —
# confirm the installed gensim version before re-running this cell.
# NOTE(review): no `seed` is passed and workers=4, so the learned vectors are
# presumably not reproducible from run to run — confirm if that matters.
model_all = gensim.models.Word2Vec(sentences, size=100, min_count=3, workers=4)

2018-05-28 18:39:31,793 : INFO : collecting all words and their counts
2018-05-28 18:39:31,797 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-28 18:39:31,848 : INFO : PROGRESS: at sentence #10000, processed 66935 words, keeping 6555 word types
2018-05-28 18:39:31,932 : INFO : PROGRESS: at sentence #20000, processed 134947 words, keeping 9794 word types
2018-05-28 18:39:31,994 : INFO : PROGRESS: at sentence #30000, processed 205174 words, keeping 12281 word types
2018-05-28 18:39:32,044 : INFO : PROGRESS: at sentence #40000, processed 274660 words, keeping 14378 word types
2018-05-28 18:39:32,101 : INFO : PROGRESS: at sentence #50000, processed 338950 words, keeping 16004 word types
2018-05-28 18:39:32,153 : INFO : PROGRESS: at sentence #60000, processed 410874 words, keeping 17817 word types
2018-05-28 18:39:32,198 : INFO : PROGRESS: at sentence #70000, processed 482188 words, keeping 19217 word types
2018-05-28 18:39:32,253 : INFO : PROGRESS: at sen

### Test Model

In [6]:
model_all.wv.similarity('fork', 'spoon')

0.9276335988352492

In [7]:
model_all.wv.doesnt_match("breakfast cereal dinner lunch".split())

2018-05-28 18:39:46,230 : INFO : precomputing L2-norms of word weight vectors


'cereal'

In [8]:
model_all.wv['woman']

array([-1.0416529 , -0.3685856 ,  0.2857633 ,  0.7566998 , -0.36599946,
       -1.1307472 ,  1.0142924 , -0.64130735, -0.506308  ,  0.40040955,
        0.10267434,  1.4729713 , -0.23874965, -0.2390065 , -0.61051136,
        0.17040619,  0.8940419 ,  1.9174225 ,  0.02209527,  1.5247627 ,
        0.3151395 ,  0.96852213,  0.583615  , -1.1568229 ,  0.81528264,
       -0.7475498 ,  0.8398345 , -0.22440444, -0.58057714,  0.08234546,
       -0.9970148 , -2.3366795 , -0.5780174 ,  0.8116196 ,  0.37080163,
       -1.3802438 ,  0.42913648, -0.39433882,  0.5978944 , -0.7098544 ,
       -1.3424009 , -1.6490427 ,  1.0018047 , -0.601419  , -0.16713834,
        1.0979226 , -0.5841204 ,  0.01826851, -0.41685054, -0.24803858,
       -0.93822026, -0.64984745, -0.9849708 ,  1.2248198 , -0.12188079,
       -0.00418284,  0.55344915, -0.287866  , -0.3271172 , -0.21001542,
        0.5070263 ,  0.02648011, -0.9358283 , -0.24930817, -1.1819657 ,
        0.24828771, -0.09647191, -0.12114934,  0.1779565 , -0.56

### Save Model

In [9]:
# Persist the trained model under the relative path "mdl/model_all".
# NOTE(review): assumes the "mdl/" directory already exists — confirm.
model_all.save("mdl/model_all")
# To load: model_all = gensim.models.Word2Vec.load("mdl/model_all")

2018-05-28 18:39:46,281 : INFO : saving Word2Vec object under mdl/model_all, separately None
2018-05-28 18:39:46,284 : INFO : not storing attribute vectors_norm
2018-05-28 18:39:46,286 : INFO : not storing attribute cum_table
2018-05-28 18:39:46,564 : INFO : saved mdl/model_all


## Exploration

In [10]:
# For each genre, count the rows whose movie_id is not the empty string.
df11 = (
    movies
    .groupby('genre')['movie_id']
    .apply(lambda ids: (ids != '').sum())
    .reset_index(name='count')
)
df11

Unnamed: 0,genre,count
0,action,62758
1,adventure,10805
2,animation,3618
3,biography,13467
4,comedy,71129
5,crime,35011
6,documentary,1458
7,drama,73909
8,family,543
9,fantasy,4657


### Using TF-IDF & Calculating Cosine Similarities

In [11]:
def _tfidf_cosine(text_f, text_m):
    """Return the cosine similarity between the TF-IDF vectors of two texts."""
    # The vectorizer is fitted on just this pair, so the vocabulary and IDF
    # weights are specific to the two texts being compared.
    tfidf = TfidfVectorizer().fit_transform([text_f, text_m])
    return cosine_similarity(tfidf[0], tfidf[1])[0, 0]


def calc_cosim_gender(movie_df):
    """Score, per movie, how similar female-spoken and male-spoken dialogue is.

    All ``words`` of a movie are concatenated per ``gender_from`` value; the
    'f' text and the 'm' text are then compared with ``_tfidf_cosine``.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Line-level frame with the columns ``movie_id``, ``gender_from``,
        ``words`` and ``genre``.

    Returns
    -------
    pandas.DataFrame
        Unique rows of ``movie_id``, ``gender_cosim`` and ``genre`` with a
        fresh 0..n-1 index.

    Notes
    -----
    A movie without any lines for one gender gets the placeholder text
    'Empty' for that side, so its score is 0 unless the other side's text
    happens to contain the word "empty" itself.
    """
    mcosim = (
        movie_df
        .groupby(['movie_id', 'gender_from'])['words']
        .agg(' '.join)
        .reset_index(name='raw_text')
        .pivot(index='movie_id', columns='gender_from', values='raw_text')
        .reset_index()
        .fillna('Empty')
    )

    # If the whole frame lacks one gender the pivot has no such column at all;
    # give it the same placeholder a partially missing gender receives.
    for gender in ('f', 'm'):
        if gender not in mcosim.columns:
            mcosim[gender] = 'Empty'

    # Label-based access to the two texts (positional x[0]/x[1] on a
    # label-indexed Series is deprecated in recent pandas).
    mcosim['gender_cosim'] = [
        _tfidf_cosine(text_f, text_m)
        for text_f, text_m in zip(mcosim['f'], mcosim['m'])
    ]

    # De-duplicate the movie -> genre mapping before merging so the join does
    # not fan out to one row per dialogue line.
    genres = movie_df[['movie_id', 'genre']].drop_duplicates()
    mcosim = pd.merge(mcosim[['movie_id', 'gender_cosim']], genres, how='inner', on='movie_id')

    return mcosim[['movie_id', 'gender_cosim', 'genre']].drop_duplicates().reset_index(drop=True)

In [12]:
cosim_score = calc_cosim_gender(movies)
cosim_score.head()

Unnamed: 0,movie_id,gender_cosim,genre
0,m0,0.862061,comedy
1,m1,0.74255,adventure
2,m10,0.880711,drama
3,m100,0.0,action
4,m101,0.763138,biography


In [13]:
cosim_score.describe()

Unnamed: 0,gender_cosim
count,616.0
mean,0.618904
std,0.321106
min,0.0
25%,0.50752
50%,0.75391
75%,0.851891
max,1.0


## Calculating Cosine Similarity of Training Dataset

#### Read and Calculate Gender Cosine Similarity Score

In [14]:
# Load the training split; the context manager closes the file handle once
# unpickling is done instead of leaking it.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open("../data/movies_lines_train.p", 'rb') as train_file:
    movies_train = pickle.load(train_file)

In [15]:
# Load the holdout split; the context manager closes the file handle once
# unpickling is done instead of leaking it.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open("../data/movies_lines_holdout.p", 'rb') as holdout_file:
    movies_holdout = pickle.load(holdout_file)

In [16]:
movies_holdout.head()

Unnamed: 0,movie_id,gender_to,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m49,m,f,u761,u765,L163186,thanks miss,1999,comedy
1,m49,m,f,u761,u765,L163187,you kind i amanda,1999,comedy
2,m49,m,f,u761,u765,L163188,right well thanks drink stuff amanda reason me...,1999,comedy
3,m49,m,f,u761,u765,L163189,glum hawk night still young fill plenty compen...,1999,comedy
4,m49,m,f,u761,u765,L163190,huh,1999,comedy


In [66]:
len(movies_holdout)

101960

In [17]:
movies_train.head()

Unnamed: 0,movie_id,gender_to,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m18,m,m,u284,u288,L34746,hello tom,1932,drama
1,m18,m,m,u284,u288,L34747,you you want,1932,drama
2,m18,m,m,u284,u288,L34748,you suppose anybody want money money money,1932,drama
3,m18,m,m,u284,u288,L34749,listen i tell you i interested deal i,1932,drama
4,m18,m,m,u284,u288,L34750,i want know,1932,drama


In [18]:
cosim_train = calc_cosim_gender(movies_train)
cosim_train.head()

Unnamed: 0,movie_id,gender_cosim,genre
0,m100,0.0,action
1,m101,0.763138,biography
2,m104,0.001068,biography
3,m105,0.942188,crime
4,m106,0.743309,drama


In [19]:
cosim_desc = cosim_train.describe()
cosim_desc

Unnamed: 0,gender_cosim
count,412.0
mean,0.624933
std,0.319369
min,0.0
25%,0.523451
50%,0.756726
75%,0.853729
max,1.0


In [20]:
cosim_desc.loc['mean', 'gender_cosim']

0.6249326174795061

#### Calculate Normalized Cosine Similarity Scores

In [21]:
# HELPER: Look up one per-genre summary statistic of gender_cosim
def genre_dstat(desc_df, genre, type='mean'):
    """Return a single describe() statistic of ``gender_cosim`` for one genre.

    Parameters
    ----------
    desc_df : pandas.DataFrame
        Result of ``groupby('genre').describe()`` on a frame holding a
        ``gender_cosim`` column. Both layouts are accepted: genre as the
        index (the direct groupby output) or genre as a column (after
        ``reset_index()``).
    genre : str
        Genre whose statistic is wanted.
    type : str, default 'mean'
        Second-level column name under ``gender_cosim`` (e.g. 'mean', 'std').

    Returns
    -------
    float

    Raises
    ------
    TypeError
        If ``genre`` does not match exactly one row.
    KeyError
        If ``desc_df`` has neither a ``genre`` column nor an index named
        ``genre``, or lacks the requested statistic.
    """
    # groupby('genre').describe() keeps genre in the index; only a
    # reset_index() turns it into a column. Support both.
    if 'genre' not in desc_df.columns and desc_df.index.name == 'genre':
        is_genre = desc_df.index == genre
    else:
        is_genre = desc_df['genre'] == genre

    matches = desc_df.loc[is_genre, ('gender_cosim', type)]
    if len(matches) != 1:
        # Same exception type float(Series) raises for a non-singleton Series.
        raise TypeError(
            "expected exactly one row for genre %r, found %d" % (genre, len(matches))
        )
    return float(matches.iloc[0])

# HELPER: z-score one cosine similarity against its genre's mean and std
def norm_cosim(cosim_score, desc_df, genre):
    """Standardise ``cosim_score`` with the mean and std of ``genre``.

    The genre statistics are read from ``desc_df`` through ``genre_dstat``.
    When the genre's standard deviation is exactly 0 the raw score is
    returned unchanged.
    """
    centered = cosim_score - genre_dstat(desc_df, genre)
    genre_std = genre_dstat(desc_df, genre, 'std')

    # A zero spread makes the ratio undefined, so hand back the raw score.
    if genre_std == 0:
        return cosim_score
    return centered / genre_std

In [22]:
# Normalize cosine similarity scores within each genre and return the score dataframe
def normalize_cosim_by_genre(cosim_df):
    """Add ``norm_cosim``: ``gender_cosim`` z-scored within its genre.

    Parameters
    ----------
    cosim_df : pandas.DataFrame
        Frame with ``gender_cosim`` and ``genre`` columns. It is modified in
        place (the ``norm_cosim`` column is added or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``cosim_df`` object.

    Notes
    -----
    * A genre whose standard deviation is exactly 0 keeps the raw score.
    * A genre with a single row has an undefined (NaN) sample standard
      deviation, so its ``norm_cosim`` is NaN.
    """
    scores = cosim_df['gender_cosim']
    by_genre = scores.groupby(cosim_df['genre'])

    # transform() broadcasts each genre's statistic back onto its rows, which
    # avoids a per-row lookup into a summary table.
    genre_mean = by_genre.transform('mean')
    genre_std = by_genre.transform('std')

    z_scores = (scores - genre_mean) / genre_std
    # Keep the z-score wherever std != 0 (NaN std included, giving NaN);
    # fall back to the raw score only where std is exactly 0.
    cosim_df['norm_cosim'] = z_scores.where(genre_std != 0, scores)

    return cosim_df

In [23]:
# Normalize cosine similarity scores over the whole frame and return the score dataframe
def normalize_cosim(cosim_df, mean=None, std=None):
    """Add ``norm_cosim``: ``gender_cosim`` expressed as a z-score.

    Parameters
    ----------
    cosim_df : pandas.DataFrame
        Frame with a ``gender_cosim`` column. It is modified in place (the
        ``norm_cosim`` column is added or overwritten).
    mean, std : float, optional
        Reference statistics to standardise against, e.g. those of a training
        set. Each defaults to the corresponding statistic of
        ``cosim_df['gender_cosim']`` (sample standard deviation).

    Returns
    -------
    pandas.DataFrame
        The same ``cosim_df`` object.
    """
    scores = cosim_df['gender_cosim']

    # Default to the frame's own statistics rather than a notebook-level
    # global, so the result depends only on the arguments.
    if mean is None:
        mean = scores.mean()
    if std is None:
        std = scores.std()

    cosim_df['norm_cosim'] = (scores - mean) / std

    return cosim_df

### Class: Train mean and std on training data, calculate normalized cosine similarity score

In [36]:
# Pick the training lines of a single movie (movie_id 'm105')
is_m105 = movies_train['movie_id'].eq('m105')
one_movie = movies_train[is_m105]

In [60]:
class cosim:
    """Gender-dialogue cosine similarity, optionally z-scored against training data.

    Typical use: call ``train_movies`` once with the training lines to learn
    the mean and standard deviation of per-movie similarity, then call
    ``norm_similarity`` (or ``similarity``) with the lines of one movie.

    Every input frame needs the columns ``movie_id``, ``gender_from``,
    ``words`` and ``genre``.
    """

    def __init__(self):
        # Not read or written by any method of this class; kept so existing
        # code that touches the attribute keeps working.
        self.train_df = pd.DataFrame()
        # Until train_movies() runs these stay 0, so normalising divides by
        # zero and produces inf/NaN.
        self.train_mean = 0
        self.train_std = 0

    @staticmethod
    def _tfidf_cosine(text_f, text_m):
        """Return the cosine similarity between the TF-IDF vectors of two texts."""
        tfidf = TfidfVectorizer().fit_transform([text_f, text_m])
        return cosine_similarity(tfidf[0], tfidf[1])[0, 0]

    @staticmethod
    def _only_value(values):
        """Return the single element of ``values`` as a float.

        Raises TypeError when ``values`` does not hold exactly one element,
        the same exception type ``float(Series)`` raises in that case.
        """
        if len(values) != 1:
            raise TypeError(
                "expected exactly one (movie, genre) row, found %d" % len(values)
            )
        return float(values.iloc[0])

    def similarity(self, movie_df, train=False):
        """Compute the female-vs-male dialogue similarity per movie.

        Parameters
        ----------
        movie_df : pandas.DataFrame
            Line-level frame for one or more movies.
        train : bool, default False
            When True return the full score frame; when False return a single
            float, which requires ``movie_df`` to yield exactly one row.

        Returns
        -------
        pandas.DataFrame or float
            Frame with ``movie_id``, ``gender_cosim`` and ``genre`` when
            ``train`` is True, otherwise the lone ``gender_cosim`` value.

        Raises
        ------
        TypeError
            If ``train`` is False and the frame does not reduce to one row.
        """
        movie_cm = (
            movie_df
            .groupby(['movie_id', 'gender_from'])['words']
            .agg(' '.join)
            .reset_index(name='raw_text')
            .pivot(index='movie_id', columns='gender_from', values='raw_text')
            .reset_index()
            .fillna('Empty')
        )

        # A frame with no female (or no male) speakers at all has no such
        # column after the pivot; use the same placeholder text as fillna.
        for gender in ('f', 'm'):
            if gender not in movie_cm.columns:
                movie_cm[gender] = 'Empty'

        # Label-based access to both texts (positional x[0]/x[1] on a
        # label-indexed Series is deprecated in recent pandas).
        movie_cm['gender_cosim'] = [
            self._tfidf_cosine(text_f, text_m)
            for text_f, text_m in zip(movie_cm['f'], movie_cm['m'])
        ]

        # De-duplicate movie -> genre first so the merge does not fan out to
        # one row per dialogue line.
        genres = movie_df[['movie_id', 'genre']].drop_duplicates()
        return_df = (
            pd.merge(movie_cm[['movie_id', 'gender_cosim']], genres, how='inner', on='movie_id')
            .drop_duplicates()
            .reset_index(drop=True)
        )

        if train:
            return return_df

        return self._only_value(return_df['gender_cosim'])

    def train_movies(self, train_df):
        """Learn the mean and sample std of ``gender_cosim`` over ``train_df``."""
        train_scores = self.similarity(train_df, train=True)['gender_cosim']

        self.train_mean = train_scores.mean()
        self.train_std = train_scores.std()

    def normalize_cosim(self, cosim_df):
        """Add ``norm_cosim`` (z-score against the training statistics).

        ``cosim_df`` is modified in place and also returned.
        """
        cosim_df['norm_cosim'] = (cosim_df['gender_cosim'] - self.train_mean) / self.train_std

        return cosim_df

    def norm_similarity(self, movie_df):
        """Return the training-normalised similarity of a single movie as a float.

        Raises TypeError if ``movie_df`` does not reduce to exactly one row.
        """
        cosim_df = self.normalize_cosim(self.similarity(movie_df, train=True))

        return self._only_value(cosim_df['norm_cosim'])
        

In [61]:
mvs = cosim()

In [62]:
mvs.train_movies(movies_train)

In [63]:
mvs.train_mean

0.6249326174795061

In [64]:
mvs.train_std

0.31936910205753566

In [65]:
mvs.norm_similarity(one_movie)

0.9933830047143571

In [67]:
mvs.similarity(one_movie)

0.9421884556943471