In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk
import gensim, logging
import cython
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Load Data

In [257]:
# Load the pickled movie-lines dataset; the context manager closes the file
# handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open("../data/movies.p", 'rb') as movies_file:
    movies = pickle.load(movies_file)

### Train Vector Model

In [4]:
# Split every non-empty entry of `words` on single spaces, giving one token
# list per dialogue line, then preview the first five.
sentences = [line.split(' ') for line in movies['words'] if line != ""]
sentences[:5]

[['we',
  'make',
  'quick',
  'roxanne',
  'korrine',
  'andrew',
  'barrett',
  'incredibly',
  'horrendous',
  'public',
  'break',
  'quad'],
 ['well', 'i', 'think', 'we', 'start', 'pronunciation', 'okay', 'you'],
 ['hacking', 'gagging', 'spit', 'part', 'please'],
 ['okay', 'bout', 'we', 'try', 'french', 'cuisine', 'saturday', 'night'],
 ['you', 'ask', 'me', 'cute', 'your', 'name']]

In [5]:
# Train a Word2Vec model on the tokenized dialogue lines.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` -- confirm the
# installed gensim version before re-running.
model_all = gensim.models.Word2Vec(
    sentences,
    size=100,
    min_count=3,
    workers=4,
)

2018-05-28 18:39:31,793 : INFO : collecting all words and their counts
2018-05-28 18:39:31,797 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-28 18:39:31,848 : INFO : PROGRESS: at sentence #10000, processed 66935 words, keeping 6555 word types
2018-05-28 18:39:31,932 : INFO : PROGRESS: at sentence #20000, processed 134947 words, keeping 9794 word types
2018-05-28 18:39:31,994 : INFO : PROGRESS: at sentence #30000, processed 205174 words, keeping 12281 word types
2018-05-28 18:39:32,044 : INFO : PROGRESS: at sentence #40000, processed 274660 words, keeping 14378 word types
2018-05-28 18:39:32,101 : INFO : PROGRESS: at sentence #50000, processed 338950 words, keeping 16004 word types
2018-05-28 18:39:32,153 : INFO : PROGRESS: at sentence #60000, processed 410874 words, keeping 17817 word types
2018-05-28 18:39:32,198 : INFO : PROGRESS: at sentence #70000, processed 482188 words, keeping 19217 word types
2018-05-28 18:39:32,253 : INFO : PROGRESS: at sen

### Test Model

In [6]:
# Sanity check: similarity score between the vectors of two related words.
model_all.wv.similarity('fork', 'spoon')

0.9276335988352492

In [7]:
# Sanity check: ask the model which of the four words does not fit the rest.
model_all.wv.doesnt_match("breakfast cereal dinner lunch".split())

2018-05-28 18:39:46,230 : INFO : precomputing L2-norms of word weight vectors


'cereal'

In [8]:
# Inspect the learned vector for a single word.
model_all.wv['woman']

array([-1.0416529 , -0.3685856 ,  0.2857633 ,  0.7566998 , -0.36599946,
       -1.1307472 ,  1.0142924 , -0.64130735, -0.506308  ,  0.40040955,
        0.10267434,  1.4729713 , -0.23874965, -0.2390065 , -0.61051136,
        0.17040619,  0.8940419 ,  1.9174225 ,  0.02209527,  1.5247627 ,
        0.3151395 ,  0.96852213,  0.583615  , -1.1568229 ,  0.81528264,
       -0.7475498 ,  0.8398345 , -0.22440444, -0.58057714,  0.08234546,
       -0.9970148 , -2.3366795 , -0.5780174 ,  0.8116196 ,  0.37080163,
       -1.3802438 ,  0.42913648, -0.39433882,  0.5978944 , -0.7098544 ,
       -1.3424009 , -1.6490427 ,  1.0018047 , -0.601419  , -0.16713834,
        1.0979226 , -0.5841204 ,  0.01826851, -0.41685054, -0.24803858,
       -0.93822026, -0.64984745, -0.9849708 ,  1.2248198 , -0.12188079,
       -0.00418284,  0.55344915, -0.287866  , -0.3271172 , -0.21001542,
        0.5070263 ,  0.02648011, -0.9358283 , -0.24930817, -1.1819657 ,
        0.24828771, -0.09647191, -0.12114934,  0.1779565 , -0.56

### Save Model

In [9]:
# Persist the trained model so it can be reused without retraining.
# NOTE(review): presumably the `mdl/` directory must already exist -- confirm.
model_all.save("mdl/model_all")
# To load: model_all = gensim.models.Word2Vec.load("mdl/model_all")

2018-05-28 18:39:46,281 : INFO : saving Word2Vec object under mdl/model_all, separately None
2018-05-28 18:39:46,284 : INFO : not storing attribute vectors_norm
2018-05-28 18:39:46,286 : INFO : not storing attribute cum_table
2018-05-28 18:39:46,564 : INFO : saved mdl/model_all


## Exploration

In [84]:
# Per genre, count the rows whose movie_id is not an empty string.
df11 = (
    movies
    .groupby('genre')['movie_id']
    .apply(lambda ids: (ids != '').sum())
    .reset_index(name='count')
)
df11

Unnamed: 0,genre,count
0,action,62758
1,adventure,10805
2,animation,3618
3,biography,13467
4,comedy,71129
5,crime,35011
6,documentary,1458
7,drama,73909
8,family,543
9,fantasy,4657


### Using TF-IDF to Calculate Cosine Similarities

In [176]:
def calc_cosim_gender(movie_df):
    """Score, per movie, how similar female and male dialogue is.

    All ``words`` spoken by each ``gender_from`` value within a movie are
    joined into one document. The female ('f') and male ('m') documents are
    TF-IDF-vectorized together and compared with cosine similarity.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Must contain the columns ``movie_id``, ``gender_from``, ``words``
        and ``genre``.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id``, ``gender_cosim`` and ``genre``, one row per
        distinct (movie, genre) pair, with a fresh 0..n-1 index.

    Notes
    -----
    A gender with no dialogue in a movie is represented by the placeholder
    document ``'Empty'`` (behavior kept from the original implementation).
    NOTE(review): a movie lacking both 'f' and 'm' dialogue therefore compares
    'Empty' with 'Empty' and scores 1.0 -- confirm this is intended.
    """
    # One concatenated document per (movie, speaker gender).
    raw_text = (
        movie_df.groupby(['movie_id', 'gender_from'])['words']
        .agg(' '.join)
        .reset_index(name='raw_text')
    )

    # Keep exactly the two compared genders. Unlike drop('?'), reindex does not
    # raise when the '?' column is absent, and it creates 'f'/'m' when the data
    # contains no dialogue at all for one of them.
    mcosim = (
        raw_text.pivot(index='movie_id', columns='gender_from', values='raw_text')
        .reindex(columns=['f', 'm'])
        .fillna('Empty')
        .reset_index()
    )

    def _pair_cosim(female_text, male_text):
        # Fit TF-IDF on the two documents only, then compare their row vectors.
        tfidf = TfidfVectorizer().fit_transform([female_text, male_text])
        return cosine_similarity(tfidf[0], tfidf[1])[0, 0]

    # Iterate by column label rather than by position inside a row Series
    # (positional `row[0]` on a label-indexed Series is deprecated in pandas).
    mcosim['gender_cosim'] = [
        _pair_cosim(f_text, m_text)
        for f_text, m_text in zip(mcosim['f'], mcosim['m'])
    ]

    # De-duplicate the genre lookup before merging so the join does not expand
    # to one row per dialogue line.
    genres = movie_df[['movie_id', 'genre']].drop_duplicates()
    scored = pd.merge(mcosim[['movie_id', 'gender_cosim']], genres, how='inner', on='movie_id')

    return scored[['movie_id', 'gender_cosim', 'genre']].drop_duplicates().reset_index(drop=True)

In [254]:
# Score every movie in the full dataset and preview the first rows.
cosim_score = calc_cosim_gender(movies)
cosim_score.head()

Unnamed: 0,movie_id,gender_cosim,genre
0,m100,0.0,action
1,m101,0.763138,biography
2,m104,0.001068,biography
3,m105,0.942188,crime
4,m106,0.743309,drama


In [182]:
# Summary statistics of the similarity scores within each genre.
cosim_score.groupby('genre').describe()

Unnamed: 0_level_0,gender_cosim,gender_cosim,gender_cosim,gender_cosim,gender_cosim,gender_cosim,gender_cosim,gender_cosim
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
action,151.0,0.568041,0.341886,0.0,0.360406,0.715904,0.831122,1.0
adventure,27.0,0.576467,0.290256,0.0,0.482959,0.69867,0.778962,0.889845
animation,10.0,0.492413,0.390296,0.0,0.046342,0.713862,0.814783,0.846943
biography,23.0,0.588373,0.350862,0.0,0.378852,0.763138,0.877579,0.945393
comedy,118.0,0.679655,0.314736,0.0,0.653381,0.829528,0.882248,0.956988
crime,67.0,0.605394,0.358645,0.0,0.427927,0.783201,0.869661,0.942188
documentary,3.0,0.532347,0.461164,0.0,0.393635,0.78727,0.798521,0.809772
drama,137.0,0.672491,0.272052,0.0,0.580605,0.774413,0.858866,0.950163
family,1.0,0.74831,,0.74831,0.74831,0.74831,0.74831,0.74831
fantasy,14.0,0.478423,0.310591,0.0,0.296773,0.505159,0.737766,0.852766


## Calculating Cosine Similarity of Training Dataset

#### Read and Calculate Gender Cosine Similarity Score

In [None]:
# Load the pickled training split; the context manager closes the file handle
# as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open("../data/movies_lines_train.p", 'rb') as movies_train_file:
    movies_train = pickle.load(movies_train_file)

In [262]:
# Preview the first rows of the training data.
movies_train.head()

Unnamed: 0,movie_id,gender_to,gender_from,char_id_from,char_id_to,line_id,words,movie_year,genre
0,m18,m,m,u284,u288,L34746,hello tom,1932,drama
1,m18,m,m,u284,u288,L34747,you you want,1932,drama
2,m18,m,m,u284,u288,L34748,you suppose anybody want money money money,1932,drama
3,m18,m,m,u284,u288,L34749,listen i tell you i interested deal i,1932,drama
4,m18,m,m,u284,u288,L34750,i want know,1932,drama


In [263]:
# Score every movie in the training data and preview the first rows.
cosim_train = calc_cosim_gender(movies_train)
cosim_train.head()

Unnamed: 0,movie_id,gender_cosim,genre
0,m100,0.0,action
1,m101,0.763138,biography
2,m104,0.001068,biography
3,m105,0.942188,crime
4,m106,0.743309,drama


In [266]:
# Per-genre summary statistics of the training similarity scores.
cosim_train.groupby('genre').describe()

Unnamed: 0_level_0,gender_cosim,gender_cosim,gender_cosim,gender_cosim,gender_cosim,gender_cosim,gender_cosim,gender_cosim
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
action,104.0,0.591046,0.332723,0.0,0.540836,0.722971,0.838851,1.0
adventure,19.0,0.541697,0.29447,0.0,0.436321,0.651766,0.751266,0.889845
animation,8.0,0.42543,0.411422,0.0,0.0,0.455159,0.814226,0.846943
biography,19.0,0.542869,0.362641,0.0,0.224997,0.694098,0.857557,0.945393
comedy,81.0,0.687361,0.310663,0.0,0.666792,0.829357,0.878311,0.956988
crime,47.0,0.639596,0.336804,0.0,0.537164,0.78401,0.876972,0.942188
documentary,3.0,0.532347,0.461164,0.0,0.393635,0.78727,0.798521,0.809772
drama,85.0,0.687257,0.260323,0.0,0.638457,0.782265,0.849287,0.950163
family,1.0,0.74831,,0.74831,0.74831,0.74831,0.74831,0.74831
fantasy,10.0,0.476056,0.311599,0.0,0.296773,0.505159,0.726099,0.852766


#### Calculate Normalized Cosine Similarity Scores

In [242]:
# HELPERS: Indexing cosim mean and std 
def genre_dstat(desc_df, genre, type='mean'):
    """Return one descriptive statistic of ``gender_cosim`` for a genre.

    Parameters
    ----------
    desc_df : pandas.DataFrame
        Output of ``groupby('genre').describe().reset_index()``: a ``genre``
        column plus two-level ``('gender_cosim', <statistic>)`` columns.
    genre : str
        Genre whose statistic is requested.
    type : str, default 'mean'
        Name of the statistic column to read (e.g. 'mean', 'std').

    Returns
    -------
    float
        The requested statistic (may be NaN, e.g. the std of a one-row genre).

    Raises
    ------
    KeyError
        If ``genre`` has no row in ``desc_df``.
    """
    stat = desc_df.loc[desc_df['genre'] == genre, ('gender_cosim', type)]
    if stat.empty:
        raise KeyError("genre {!r} not found in descriptive statistics".format(genre))
    # Pull the scalar out explicitly: calling float() on a one-element Series
    # is deprecated in pandas.
    return float(stat.iloc[0])

# HELPER: Calculate normalized cosine similarity score 
def norm_cosim(cosim_score, desc_df, genre):
    """Standardize a similarity score against its genre's mean and std.

    Parameters
    ----------
    cosim_score : float
        Raw gender cosine similarity of one movie.
    desc_df : pandas.DataFrame
        Per-genre descriptive statistics, as accepted by ``genre_dstat``.
    genre : str
        Genre of the movie.

    Returns
    -------
    float
        ``(cosim_score - mean) / std`` for the genre. When the genre's std is
        zero or undefined (NaN), the raw ``cosim_score`` is returned unchanged.
    """
    genre_std = genre_dstat(desc_df, genre, 'std')

    # The std is NaN for a genre with a single movie and 0 when all its scores
    # are identical. `NaN != 0` is True, so NaN must be checked explicitly or
    # the division would silently return NaN instead of the fallback.
    if pd.isna(genre_std) or genre_std == 0:
        return cosim_score

    return (cosim_score - genre_dstat(desc_df, genre)) / genre_std

In [243]:
# Normalized cosine similarity score and return score dataframe  
def normalize_cosim(cosim_df):
    """Attach a per-genre normalized similarity score to every row.

    Computes per-genre descriptive statistics from ``cosim_df`` and uses them
    to normalize each row's ``gender_cosim`` via ``norm_cosim``.

    Note: the ``norm_cosim`` column is added to (or overwritten on)
    ``cosim_df`` itself, and that same DataFrame object is returned.
    """
    genre_stats = cosim_df.groupby('genre').describe().reset_index()

    def _row_score(row):
        # Normalize one movie's score against the statistics of its own genre.
        return norm_cosim(row['gender_cosim'], genre_stats, row['genre'])

    cosim_df['norm_cosim'] = cosim_df.apply(_row_score, axis=1)
    return cosim_df

In [246]:
# Normalize the gender cosine similarity of each movie in the training dataset.
# `cosim_train` holds the training scores computed above; the previous
# self-referential call used `movie_cosim_train` before it was ever defined,
# which fails on a fresh kernel. A copy is passed because normalize_cosim adds
# its column in place, which would otherwise alter `cosim_train`.
movie_cosim_train = normalize_cosim(cosim_train.copy())
movie_cosim_train.head()

Unnamed: 0,movie_id,gender_cosim,genre,norm_cosim
0,m100,0.0,action,-1.776389
1,m101,0.763138,biography,0.607401
2,m104,0.001068,biography,-1.494041
3,m105,0.942188,crime,0.898422
4,m106,0.743309,drama,0.215317
