In [197]:
import pickle
import pandas as pd

# Load centrality data

In [198]:
df = pickle.load(open('../data/holdout_centrality.p', 'rb'))

# Load movie metadata

In [199]:
headers = ['movie_id', 'movie_title', 'movie_year', 'imdb_rating', 'imdb_vote', 'genre']

filepath = '../cornell_movie_dialogs_corpus/movie_titles_metadata.txt'

lines = []

with open(filepath, 'r', encoding = 'iso-8859-1') as f:
    for line in f:
        cols = line.split(' +++$+++ ')
        lines.append(cols)
            
movies = pd.DataFrame(lines, columns = headers)
    
movies = movies[['movie_id', 'movie_title', 'imdb_rating']]

movies.head()

Unnamed: 0,movie_id,movie_title,imdb_rating
0,m0,10 things i hate about you,6.9
1,m1,1492: conquest of paradise,6.2
2,m2,15 minutes,6.1
3,m3,2001: a space odyssey,8.4
4,m4,48 hrs.,6.9


# Limit genres to top 10

In [200]:
top_10 = pd.DataFrame(df.groupby('genre').size().sort_values(ascending = False).head(10)).reset_index()

In [201]:
top_10 = list(top_10['genre'])
top_10

['drama',
 'action',
 'comedy',
 'crime',
 'horror',
 'adventure',
 'thriller',
 'biography',
 'fantasy',
 'sci-fi']

In [202]:
df = df[df['genre'].isin(top_10)]

# Join centrality and movie metadata

In [203]:
df = df.merge(movies, on = 'movie_id')

df.head(2)

Unnamed: 0,gender,degree,betweenness,movie_id,year,genre,movie_title,imdb_rating
0,f,0.066667,0.0,m49,1999,comedy,detroit rock city,6.5
1,m,0.253333,0.24381,m49,1999,comedy,detroit rock city,6.5


# By gender

In [204]:
df.groupby('gender').mean().reset_index()[['gender', 'degree', 'betweenness']]

Unnamed: 0,gender,degree,betweenness
0,f,0.240497,0.105373
1,m,0.288901,0.161116


# By genre

In [205]:
by_genre_f = by_genre[by_genre['gender'] == 'f']
by_genre_m = by_genre[by_genre['gender'] == 'm']

by_genre_wide = by_genre_f.merge(by_genre_m, on = 'genre', suffixes=('_f', '_m'))

by_genre_wide['diff'] = by_genre_wide['degree_m'] - by_genre_wide['degree_f']

cols = ['genre', 'degree_f', 'degree_m', 'diff']

by_genre_wide = by_genre_wide[cols]

by_genre_wide.sort_values('diff')

Unnamed: 0,genre,degree_f,degree_m,diff
7,fantasy,0.328153,0.283929,-0.044224
8,horror,0.293525,0.255583,-0.037942
4,comedy,0.213803,0.237539,0.023736
9,thriller,0.220445,0.262914,0.042468
2,animation,0.207559,0.261831,0.054271
6,drama,0.21321,0.270435,0.057225
0,action,0.221869,0.297258,0.075389
5,crime,0.209388,0.286129,0.076741
1,adventure,0.26543,0.346556,0.081126
3,biography,0.127744,0.210523,0.082778


In [206]:
by_genre_f = by_genre[by_genre['gender'] == 'f']
by_genre_m = by_genre[by_genre['gender'] == 'm']

by_genre_wide = by_genre_f.merge(by_genre_m, on = 'genre', suffixes=('_f', '_m'))

by_genre_wide['diff'] = by_genre_wide['betweenness_m'] - by_genre_wide['betweenness_f']

cols = ['genre', 'betweenness_f', 'betweenness_m', 'diff']

by_genre_wide = by_genre_wide[cols]

by_genre_wide.sort_values('diff')

Unnamed: 0,genre,betweenness_f,betweenness_m,diff
7,fantasy,0.25239,0.140287,-0.112103
8,horror,0.164886,0.115213,-0.049673
4,comedy,0.1006,0.124415,0.023816
9,thriller,0.095204,0.125604,0.0304
6,drama,0.093069,0.152022,0.058953
2,animation,0.076092,0.153424,0.077332
5,crime,0.098863,0.184586,0.085723
1,adventure,0.07796,0.177057,0.099097
3,biography,0.05155,0.150933,0.099382
0,action,0.083339,0.185458,0.102119


# By movie

In [207]:
df_f = df[df['gender'] == 'f']
df_m = df[df['gender'] == 'm']

df_wide = df_f.merge(df_m, on = ['genre', 'movie_id'], suffixes=('_f', '_m'))

df_wide['diff'] = df_wide['degree_m'] - df_wide['degree_f']

cols = ['genre', 'degree_f', 'degree_m', 'diff', 'movie_title_f', 'movie_id', 'imdb_rating_f']

df_wide = df_wide[cols]

In [208]:
top_5 = df_wide.sort_values('diff').head()
bottom_5 = df_wide.sort_values('diff', ascending = False).head()

In [209]:
top_5

Unnamed: 0,genre,degree_f,degree_m,diff,movie_title_f,movie_id,imdb_rating_f
182,drama,0.928571,0.112245,-0.816327,contact,m304,7.4
135,sci-fi,1.0,0.25,-0.75,arcade,m447,4.6
98,drama,0.857143,0.309524,-0.547619,mimic,m440,5.7
145,drama,1.0,0.5,-0.5,white angel,m604,4.4
137,sci-fi,0.590909,0.166667,-0.424242,the curse,m314,4.5


In [210]:
bottom_5

Unnamed: 0,genre,degree_f,degree_m,diff,movie_title_f,movie_id,imdb_rating_f
159,crime,0.138889,0.888889,0.75,vertigo,m594,8.6
177,drama,0.25,0.75,0.5,solaris,m187,6.2
13,crime,0.25,0.625,0.375,crash,m307,8.0
32,drama,0.166667,0.541667,0.375,signs,m179,6.9
180,action,0.166667,0.5,0.333333,point break,m477,6.9


In [279]:
df_f = df[df['gender'] == 'f']
df_m = df[df['gender'] == 'm']

df_wide = df_f.merge(df_m, on = ['genre', 'movie_id'], suffixes=('_f', '_m'))

df_wide['diff'] = df_wide['betweenness_m'] - df_wide['betweenness_f']

cols = ['genre', 'betweenness_f', 'betweenness_m', 'diff', 'movie_title_f', 'movie_id', 'imdb_rating_f', 'year_f']

df_wide = df_wide[cols]

In [280]:
top_5 = df_wide.sort_values('diff').head()
bottom_5 = df_wide.sort_values('diff', ascending = False).head()

In [281]:
top_5

Unnamed: 0,genre,betweenness_f,betweenness_m,diff,movie_title_f,movie_id,imdb_rating_f,year_f
182,drama,0.945055,0.021193,-0.923862,contact,m304,7.4,1997
135,sci-fi,0.875,0.004464,-0.870536,arcade,m447,4.6,1993
145,drama,0.75,0.025,-0.725,white angel,m604,4.4,1994
98,drama,0.738095,0.043651,-0.694444,mimic,m440,5.7,1997
54,action,0.833333,0.166667,-0.666667,entrapment,m335,6.1,1999


In [282]:
bottom_5

Unnamed: 0,genre,betweenness_f,betweenness_m,diff,movie_title_f,movie_id,imdb_rating_f,year_f
159,crime,0.055556,0.972222,0.916667,vertigo,m594,8.6,1958
13,crime,0.0,0.5,0.5,crash,m307,8.0,2004/I
36,action,0.0,0.5,0.5,the rock,m206,7.3,1996
83,action,0.0,0.5,0.5,three kings,m570,7.3,1999
177,drama,0.0,0.416667,0.416667,solaris,m187,6.2,2002


# IMDB ratings

In [283]:
best_for_f = df_wide.sort_values('diff')[:20]
best_for_m = df_wide.sort_values('diff')[168:]

In [284]:
best_for_f['imdb_rating_f'] = pd.to_numeric(best_for_f['imdb_rating_f'])
best_for_m['imdb_rating_f'] = pd.to_numeric(best_for_m['imdb_rating_f'])

In [285]:
best_for_f['imdb_rating_f'].mean()

6.279999999999999

In [286]:
best_for_m['imdb_rating_f'].mean()

6.92

# By year

In [300]:
def clean_year(x):
    if len(x) > 4:
        x = x[:4]
    return x

df_wide['year'] = df_wide['year_f'].apply(clean_year)

In [307]:
df_wide['year'] = df_wide['year'].astype('datetime64[ns]')
df_wide.head()

Unnamed: 0,genre,betweenness_f,betweenness_m,diff,movie_title_f,movie_id,imdb_rating_f,year_f,year
0,comedy,0.0,0.24381,0.24381,detroit rock city,m49,6.5,1999,1999-01-01
1,action,0.0,0.284444,0.284444,rush hour,m497,6.8,1998,1998-01-01
2,comedy,0.173611,0.150463,-0.023148,the lost boys,m212,7.0,1987,1987-01-01
3,action,0.222222,0.296296,0.074074,assassins,m250,6.0,1995,1995-01-01
4,action,0.030303,0.247475,0.217172,mystery men,m143,5.9,1999,1999-01-01


In [327]:
def get_decade(x):
    return str(x.year)[2]

df_wide['decade'] = df_wide['year'].apply(get_decade)

In [329]:
df_wide.groupby('decade')['diff'].mean()

decade
0    0.097202
3   -0.038600
4   -0.103406
5    0.074829
6    0.197487
7    0.093065
8   -0.007137
9    0.055590
Name: diff, dtype: float64