In [1]:
import itertools
import json
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Part 1: Listing Movies to Query

In [27]:
# Columns removed from the movie metadata right after loading.
unused_movie_columns = [
    'homepage', 'keywords', 'original_language', 'overview', 'release_date',
    'spoken_languages', 'status', 'title', 'tagline', 'vote_count',
]

# drop() returns a new frame, so `movies` is built in a single expression
# rather than being read and then mutated in place.
movies = pd.read_csv('data/tmdb_5000_movies.csv').drop(unused_movie_columns, axis=1)

In [28]:
def _cast_ids(cast_json):
    """Return the distinct person ids found in one film's JSON cast list.

    json.loads is used instead of pd.read_json because passing a literal
    JSON string to pd.read_json is deprecated in recent pandas releases, and
    building a DataFrame per row just to read one field is wasteful.
    """
    return list({member['id'] for member in json.loads(cast_json)})


credits_raw = pd.read_csv('data/tmdb_5000_credits.csv')

# Keep only films with a non-empty cast list, drop the columns that are not
# used, and add the per-film list of distinct cast ids.  Chaining
# drop()/assign() produces a fresh frame, so nothing is written onto the
# filtered slice (which would raise SettingWithCopyWarning).
credits = (
    credits_raw[credits_raw['cast'] != '[]']
    .drop(['title', 'crew'], axis=1)
    .assign(cast_id=lambda df: df['cast'].apply(_cast_ids))
)

In [29]:
# Flatten every film's cast list into one long frame with one row per
# (cast member, film) pair.  The `credits` column of new_df stores the index
# label of the film's row in the `credits` frame.
frames = []

for idx, cast_json in credits['cast'].items():
    # The cast column is JSON text read from a file; json.loads parses it
    # without executing it, unlike eval().
    cast_df = pd.DataFrame(json.loads(cast_json))
    cast_df['credits'] = idx
    frames.append(cast_df.drop(['character', 'order', 'credit_id', 'cast_id'], axis=1))

# Concatenate once at the end: calling pd.concat inside the loop copies all
# previously accumulated rows on every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame.
new_df = pd.concat(frames, join='outer', ignore_index=True) if frames else pd.DataFrame()

In [30]:
# Count in how many films each cast id appears and keep the ids that appear
# in more than `film_count_threshold` films.
# The per-film id lists are flattened directly; apply(pd.Series).stack()
# would build one Series per film, pad every row with NaN up to the longest
# cast and turn the ids into floats along the way.
film_count_threshold = 4

appearances = pd.Series(
    list(itertools.chain.from_iterable(credits['cast_id'])),
    dtype='int64',
).value_counts()

discount_old = [
    int(person_id)
    for person_id in appearances[appearances > film_count_threshold].index
]

In [31]:
# One row per (gender, id, name) with the list of `credits` labels that
# person appears in.  Columns are selected with [] because attribute access
# (new_df.name) silently breaks if the frame ever gains an attribute of the
# same name.
nodes_df = (
    new_df['credits']
    .groupby([new_df['gender'], new_df['id'], new_df['name']])
    .apply(list)
    .reset_index()
)

# Keep gender codes 1 and 2 only.  The codes are compared numerically:
# Series.isin treats the string '1' and the integer 1 as different values,
# so isin(['1', '2']) on an integer column selects nothing.  to_numeric
# makes the filter work whether the column holds ints or numeric strings.
gender_code = pd.to_numeric(nodes_df['gender'], errors='coerce')
nodes_df = nodes_df[gender_code.isin([1, 2])]

discount_1 = nodes_df['id'].tolist()

# Frequent cast ids that also passed the gender filter; a set gives
# constant-time membership instead of rescanning the discount_1 list.
ids_with_gender = set(discount_1)
discount = [x for x in discount_old if x in ids_with_gender]

In [32]:
# `discount` is a list, so testing every cast id against it rescans the
# whole list each time; a set makes each membership check constant-time.
retained_ids = set(discount)

# Restrict each film's cast to the retained ids (order within a film is kept).
credits['cast_id'] = credits['cast_id'].apply(
    lambda cast_ids: [person_id for person_id in cast_ids if person_id in retained_ids]
)

# Every unordered pair of retained cast members in a film becomes one edge.
credits['edges'] = credits['cast_id'].apply(
    lambda cast_ids: list(itertools.combinations(cast_ids, 2))
)

In [33]:
# Collect the movie ids of films whose edge list is empty, i.e. films left
# with fewer than two retained cast members.
has_no_edges = credits['edges'].apply(len) == 0
discarded_movies = set(credits.loc[has_no_edges, 'movie_id'])

print(len(discarded_movies))

663


In [34]:
# Remove the discarded movies and add a `bechdel` column initialised to the
# empty string.  assign() returns a new frame, so the column is not written
# onto a filtered slice (which raises SettingWithCopyWarning).
movies = movies[~movies['id'].isin(discarded_movies)].assign(bechdel='')
movies.head()

Unnamed: 0,budget,genres,id,original_title,popularity,production_companies,production_countries,revenue,runtime,vote_average,bechdel
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,Avatar,150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2787965087,162.0,7.2,
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,Pirates of the Caribbean: At World's End,139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",961000000,169.0,6.9,
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,Spectre,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",880674609,148.0,6.3,
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,The Dark Knight Rises,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1084939099,165.0,7.6,
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,John Carter,43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",284139100,132.0,6.1,


In [5]:
# Load the movie frame with its `bechdel` column from disk.
# NOTE: unpickling can execute arbitrary code, so only load a pickle that
# this project wrote itself.
with open('data/bechdel.pkl', 'rb') as bechdel_pickle:
    movies_bechdel = pickle.load(bechdel_pickle)

movies_bechdel.head()

Unnamed: 0,budget,genres,id,original_title,popularity,production_companies,production_countries,revenue,runtime,vote_average,bechdel
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,Avatar,150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2787965087,162.0,7.2,0.0
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,Pirates of the Caribbean: At World's End,139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",961000000,169.0,6.9,1.0
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,Spectre,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",880674609,148.0,6.3,0.0
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,The Dark Knight Rises,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1084939099,165.0,7.6,0.0
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,John Carter,43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",284139100,132.0,6.1,1.0


In [3]:
# Collapse the stored rating into a binary label: '3' becomes 1; '0', '1',
# '2' and the blank ' ' become 0; -1 is kept as -1.
# NOTE(review): -1 presumably marks films without a rating -- confirm.
#
# The frame is loaded from and written back to data/bechdel.pkl, so this cell
# can be handed values it has already converted.  The numeric pass-through
# keys (0 and 1) keep those from turning into NaN on a second run; any value
# not listed here still maps to NaN.
bechdel_labels = {
    -1: -1, '0': 0, '1': 0, '2': 0, '3': 1, ' ': 0,  # stored ratings
    0: 0, 1: 1,                                       # already binarised
}
movies_bechdel['bechdel'] = movies_bechdel['bechdel'].map(bechdel_labels)
movies_bechdel['bechdel'].value_counts()

-1.0    1646
 1.0    1358
 0.0    1130
Name: bechdel, dtype: int64

In [42]:
# Pairwise correlation of the numeric columns.  The numeric columns are
# selected explicitly because DataFrame.corr() no longer drops text columns
# (genres, original_title, ...) on its own in pandas 2.x and raises instead.
# NOTE(review): rows whose `bechdel` is -1 take part in the correlation as
# ordinary numbers, and `id` is included as well -- confirm this is intended.
movies_bechdel.select_dtypes(include=[np.number]).corr()

Unnamed: 0,budget,id,popularity,revenue,runtime,vote_average,bechdel
budget,1.0,-0.055604,0.489507,0.723982,0.279442,0.087631,0.202766
id,-0.055604,1.0,0.090769,-0.020148,-0.133421,-0.185161,-0.020614
popularity,0.489507,0.090769,1.0,0.636844,0.229755,0.292053,0.292492
revenue,0.723982,-0.020148,0.636844,1.0,0.26457,0.22039,0.266546
runtime,0.279442,-0.133421,0.229755,0.26457,1.0,0.402501,0.156131
vote_average,0.087631,-0.185161,0.292053,0.22039,0.402501,1.0,0.288393
bechdel,0.202766,-0.020614,0.292492,0.266546,0.156131,0.288393,1.0


In [48]:
# Per-column count of rows that have a non-zero revenue, a non-zero budget
# and a `bechdel` value other than -1.
has_financials = movies_bechdel['revenue'].ne(0) & movies_bechdel['budget'].ne(0)
is_labelled = movies_bechdel['bechdel'].ne(-1)
movies_bechdel[has_financials & is_labelled].count()

budget                  2117
genres                  2117
id                      2117
original_title          2117
popularity              2117
production_companies    2117
production_countries    2117
revenue                 2117
runtime                 2117
vote_average            2117
bechdel                 2117
dtype: int64

In [46]:
# Preview the films whose `bechdel` label is 1.  Only this one filter is
# applied: a trailing `&` with no right-hand operand is a SyntaxError, so the
# dangling operator is removed.  head(10) keeps the output short.
movies_bechdel[movies_bechdel['bechdel'] == 1].head(10)

Unnamed: 0,budget,genres,id,original_title,popularity,production_companies,production_countries,revenue,runtime,vote_average,bechdel
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,Pirates of the Caribbean: At World's End,139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",961000000,169.0,6.9,1.0
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,John Carter,43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",284139100,132.0,6.1,1.0
6,260000000,"[{""id"": 16, ""name"": ""Animation""}, {""id"": 10751...",38757,Tangled,48.681969,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",591794936,100.0,7.4,1.0
7,280000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",99861,Avengers: Age of Ultron,134.279229,"[{""name"": ""Marvel Studios"", ""id"": 420}, {""name...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1405403694,141.0,7.3,1.0
8,250000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",767,Harry Potter and the Half-Blood Prince,98.885637,"[{""name"": ""Warner Bros."", ""id"": 6194}, {""name""...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",933959197,153.0,7.4,1.0
14,225000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49521,Man of Steel,99.398009,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",662845518,143.0,6.5,1.0
15,225000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 10751...",2454,The Chronicles of Narnia: Prince Caspian,53.978602,"[{""name"": ""Walt Disney"", ""id"": 5888}, {""name"":...","[{""iso_3166_1"": ""CZ"", ""name"": ""Czech Republic""...",419651413,150.0,6.3,1.0
21,200000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",20662,Robin Hood,37.668301,"[{""name"": ""Imagine Entertainment"", ""id"": 23}, ...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",310669540,140.0,6.2,1.0
22,250000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",57158,The Hobbit: The Desolation of Smaug,94.370564,"[{""name"": ""WingNut Films"", ""id"": 11}, {""name"":...","[{""iso_3166_1"": ""NZ"", ""name"": ""New Zealand""}, ...",958400000,161.0,7.6,1.0
23,180000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",2268,The Golden Compass,42.990906,"[{""name"": ""New Line Cinema"", ""id"": 12}, {""name...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",372234864,113.0,5.8,1.0


In [4]:
# Write the labelled frame back to disk.  This is the same path the frame is
# loaded from, so the previous contents of the file are replaced.
with open('data/bechdel.pkl', 'wb') as bechdel_pickle:
    pickle.dump(movies_bechdel, bechdel_pickle)