In [148]:
import pickle
import pandas as pd

# Load centrality data

In [149]:
# Load the pickled centrality table into `df`. The context manager closes the
# file handle as soon as loading finishes instead of leaving it open until the
# garbage collector gets to it.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open('../data/centrality.p', 'rb') as centrality_file:
    df = pickle.load(centrality_file)

# Load movie metadata

In [150]:
# Parse the movie-title metadata file: each line becomes one row, with fields
# separated by the literal token ' +++$+++ '.
headers = ['movie_id', 'movie_title', 'movie_year', 'imdb_rating', 'imdb_vote', 'genre']

filepath = '../cornell_movie_dialogs_corpus/movie_titles_metadata.txt'

lines = []

with open(filepath, 'r', encoding = 'iso-8859-1') as f:
    for line in f:
        # Strip the line terminator first so it does not end up inside the
        # last field of the row.
        cols = line.rstrip('\r\n').split(' +++$+++ ')
        lines.append(cols)

# Keep only the columns used later in the notebook and turn the rating, which
# is read as text, into a number. Values that cannot be parsed become NaN
# rather than raising. `.assign` builds a new frame, so no chained assignment
# on a column slice is involved.
movies = (
    pd.DataFrame(lines, columns = headers)[['movie_id', 'movie_title', 'imdb_rating']]
    .assign(imdb_rating = lambda frame: pd.to_numeric(frame['imdb_rating'], errors = 'coerce'))
)

movies.head()

Unnamed: 0,movie_id,movie_title,imdb_rating
0,m0,10 things i hate about you,6.9
1,m1,1492: conquest of paradise,6.2
2,m2,15 minutes,6.1
3,m3,2001: a space odyssey,8.4
4,m4,48 hrs.,6.9


# Limit genres to top 10

In [151]:
# Count rows per genre, order the counts from largest to smallest, and keep the
# first ten as a frame in which 'genre' is a regular column.
genre_sizes = df.groupby('genre').size()
genre_sizes_desc = genre_sizes.sort_values(ascending = False)
top_10 = pd.DataFrame(genre_sizes_desc.head(10)).reset_index()

In [152]:
# Reduce the genre table to a plain Python list of genre names.
# Note: this rebinds `top_10` from a table to a list, so the cell cannot be
# re-run on its own without re-running the cell that builds the table.
top_10 = top_10['genre'].tolist()
top_10

['action',
 'drama',
 'comedy',
 'crime',
 'horror',
 'adventure',
 'biography',
 'fantasy',
 'animation',
 'thriller']

In [153]:
# Keep only the rows whose genre appears in `top_10`.
genre_is_selected = df['genre'].isin(top_10)
df = df[genre_is_selected]

# Join centrality and movie metadata

In [154]:
# Attach the movie metadata to every centrality row via movie_id.
#
# Any metadata columns already present in `df` are dropped first, so that
# re-running this cell does not merge a second time and produce
# movie_title_x / movie_title_y style duplicates (which would break the later
# lookups of movie_title and imdb_rating).
metadata_cols = [col for col in movies.columns if col != 'movie_id']

df = (
    df.drop(columns = metadata_cols, errors = 'ignore')
      # many_to_one: raise instead of silently duplicating rows if a movie_id
      # ever appears more than once in `movies`.
      .merge(movies, on = 'movie_id', how = 'inner', validate = 'many_to_one')
)

df.head(2)

Unnamed: 0,gender,degree,closeness,betweenness,eigenvector,overall_avg,movie_id,year,genre,movie_title,imdb_rating
0,f,0.272727,0.504914,0.123273,0.226173,0.281772,m0,1999,comedy,10 things i hate about you,6.9
1,m,0.318182,0.526121,0.100303,0.291073,0.30892,m0,1999,comedy,10 things i hate about you,6.9


# By gender

In [129]:
# Mean of each centrality measure per gender.
# The metric columns are selected explicitly: `df` also holds text columns
# (genre, movie_title, ...), and recent pandas versions raise a TypeError when
# asked to average those instead of silently dropping them.
centrality_cols = ['degree', 'closeness', 'betweenness', 'eigenvector', 'overall_avg']

df.groupby('gender')[centrality_cols].mean()

Unnamed: 0_level_0,degree,closeness,betweenness,eigenvector,overall_avg
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
f,0.222084,0.468528,0.098725,0.266861,0.26405
m,0.272597,0.489536,0.1566,0.291997,0.302683


# By genre

In [130]:
# `by_genre` is not defined anywhere in the visible notebook, so on a fresh
# kernel this cell would fail with a NameError. Build it here as the mean
# overall_avg for every (genre, gender) pair.
# NOTE(review): reconstructed from how the frame is used below - confirm that
# the original by_genre was a plain mean over `df`.
by_genre = df.groupby(['genre', 'gender'])['overall_avg'].mean().reset_index()

# Split into one frame per gender ...
by_genre_f = by_genre[by_genre['gender'] == 'f']
by_genre_m = by_genre[by_genre['gender'] == 'm']

# ... and put them side by side, one row per genre, with _f / _m suffixes.
by_genre_wide = by_genre_f.merge(by_genre_m, on = 'genre', suffixes=('_f', '_m'))

# Positive diff: the male average is higher; negative: the female average is higher.
by_genre_wide['diff'] = by_genre_wide['overall_avg_m'] - by_genre_wide['overall_avg_f']

cols = ['genre', 'overall_avg_f', 'overall_avg_m', 'diff']

by_genre_wide = by_genre_wide[cols]

In [131]:
# Show the genres ordered from the smallest to the largest 'diff' value.
by_genre_wide.sort_values(by = 'diff', ascending = True)

Unnamed: 0,genre,overall_avg_f,overall_avg_m,diff
7,fantasy,0.367086,0.324687,-0.042399
8,horror,0.307255,0.27066,-0.036595
9,thriller,0.262997,0.275967,0.01297
4,comedy,0.259546,0.275997,0.016451
6,drama,0.258788,0.301287,0.042499
2,animation,0.22793,0.280516,0.052586
5,crime,0.263162,0.320144,0.056983
1,adventure,0.284263,0.345986,0.061723
0,action,0.260214,0.322901,0.062687
3,biography,0.203854,0.276876,0.073022


# By movie

In [143]:
# Separate the rows by gender.
is_female = df['gender'] == 'f'
is_male = df['gender'] == 'm'

df_f = df[is_female]
df_m = df[is_male]

# Pair female and male rows that share the same genre and movie; every other
# column appears twice, suffixed with _f and _m.
df_wide = pd.merge(df_f, df_m, on = ['genre', 'movie_id'], suffixes = ('_f', '_m'))

# Male minus female overall average.
df_wide['diff'] = df_wide['overall_avg_m'] - df_wide['overall_avg_f']

# Columns to keep; title and rating are taken from the female-side copy.
cols = [
    'genre',
    'overall_avg_f',
    'overall_avg_m',
    'diff',
    'movie_title_f',
    'movie_id',
    'imdb_rating_f',
]

df_wide = df_wide.loc[:, cols]

In [144]:
# Five rows with the smallest 'diff' values.
top_5 = df_wide.sort_values(by = 'diff', ascending = True).head(5)

# Five rows with the largest 'diff' values.
bottom_5 = df_wide.sort_values(by = 'diff', ascending = False).head(5)

In [145]:
# Display the five rows with the smallest 'diff' (overall_avg_m - overall_avg_f).
top_5

Unnamed: 0,genre,overall_avg_f,overall_avg_m,diff,movie_title_f,movie_id,imdb_rating_f
287,drama,0.866915,0.211642,-0.655273,contact,m304,7.4
534,fantasy,0.926777,0.335395,-0.591381,the magic toyshop,m580,6.2
470,drama,0.765404,0.312124,-0.453281,seven days to live,m510,5.2
409,drama,0.755146,0.307316,-0.447829,mimic,m440,5.7
553,drama,0.838377,0.408664,-0.429713,white angel,m604,4.4


In [146]:
# Display the five rows with the largest 'diff' (overall_avg_m - overall_avg_f).
bottom_5

Unnamed: 0,genre,overall_avg_f,overall_avg_m,diff,movie_title_f,movie_id,imdb_rating_f
93,drama,0.293745,0.926777,0.633031,i am legend,m95,7.1
545,crime,0.241027,0.865346,0.624319,vertigo,m594,8.6
423,action,0.266602,0.835773,0.569171,neuromancer,m457,9.3
240,crime,0.339397,0.862568,0.523171,badlands,m254,7.9
58,drama,0.221916,0.629513,0.407597,fear and loathing in las vegas,m60,7.6
