**SUMMARY**

Performs simple checks on the movies data, drops duplicate rows, derives a binary Bechdel-test pass/fail flag and re-saves the data, ready for modelling.

In [1]:
import pandas as pd
import numpy as np

# Load Data

In [2]:
# Movie-level data: the first CSV column holds the row labels, so it becomes the index.
movies_csv = "../data/female-representation-in-cinema/movies.csv"
df_movies = pd.read_csv(movies_csv, index_col=0)
print(df_movies.shape)

# Oscar data, read with pandas' default index.
oscars_csv = "../data/female-representation-in-cinema/oscar.csv"
df_oscars = pd.read_csv(oscars_csv)
print(df_oscars.shape)

(7279, 21)
(5856, 6)


In [3]:
# Column names, non-null counts and dtypes of the movies frame.
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7279 entries, 0 to 7634
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   title                       7279 non-null   object 
 1   year                        7279 non-null   int64  
 2   bt_score                    7279 non-null   int64  
 3   dubious                     7279 non-null   int64  
 4   imdbid                      7279 non-null   int64  
 5   tmdbId                      7279 non-null   int64  
 6   genres                      7279 non-null   object 
 7   popularity                  7279 non-null   float64
 8   production_companies        7279 non-null   object 
 9   production_countries        7279 non-null   object 
 10  release_date                7279 non-null   object 
 11  revenue                     7279 non-null   float64
 12  vote_average                7279 non-null   float64
 13  vote_count                  7279 non-n

In [4]:
# Column names, non-null counts and dtypes of the oscars frame.
df_oscars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5856 entries, 0 to 5855
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   year      5856 non-null   int64 
 1   category  5856 non-null   object
 2   film      5856 non-null   object
 3   name      5856 non-null   object
 4   status    5856 non-null   object
 5   gender    5856 non-null   object
dtypes: int64(1), object(5)
memory usage: 274.6+ KB


# Drop Duplicates

In [5]:
# Remove rows that are identical across every column and report how many went.
n_before = len(df_movies)
df_movies = df_movies.drop_duplicates()
n_dropped = n_before - len(df_movies)
print(f"{n_dropped} rows dropped as duplicates")
print(df_movies.shape)

7 rows dropped as duplicates
(7272, 21)


In [6]:
# Do we have a unique key?
# For each column: (name, number of distinct values, whether that equals the row count).
n_rows = df_movies.shape[0]
key_candidates = []
for col in df_movies.columns:
    n_unique = df_movies[col].nunique()
    key_candidates.append((col, n_unique, n_unique == n_rows))
key_candidates

[('title', 7101, False),
 ('year', 125, False),
 ('bt_score', 4, False),
 ('dubious', 2, False),
 ('imdbid', 7271, False),
 ('tmdbId', 7271, False),
 ('genres', 1647, False),
 ('popularity', 6255, False),
 ('production_companies', 5859, False),
 ('production_countries', 763, False),
 ('release_date', 5187, False),
 ('revenue', 4166, False),
 ('vote_average', 63, False),
 ('vote_count', 2711, False),
 ('cast', 7271, False),
 ('crew', 7271, False),
 ('budget', 651, False),
 ('cast_gender', 7108, False),
 ('crew_gender', 6648, False),
 ('cast_female_representation', 902, False),
 ('crew_female_representation', 1212, False)]

In [7]:
# How about the title and year combination - is this unique?
# Count the rows left after keeping one row per (title, year) pair.
len(df_movies.drop_duplicates(subset=["title", "year"]))

7271

In [8]:
# Almost: (title, year) gives the same number of distinct rows as the distinct
# imdbid/tmdbId values, i.e. one short of the row count, so it seems to be
# equivalent to imdbid/tmdbId.
# Define the unique key used by the remaining duplicate checks.
key_col = "imdbid"

In [9]:
# Check remaining duplicates: show every row whose key value occurs more than once.
# `duplicated(..., keep=False)` already returns a boolean mask, so it is used
# directly rather than being compared against True.
is_dup_key = df_movies.duplicated(key_col, keep=False)
df_movies[is_dup_key].head(10)

Unnamed: 0,title,year,bt_score,dubious,imdbid,tmdbId,genres,popularity,production_companies,production_countries,...,revenue,vote_average,vote_count,cast,crew,budget,cast_gender,crew_gender,cast_female_representation,crew_female_representation
3681,Into the Woods,2014,3,0,2180411,224141,"['Fantasy', 'Comedy']",19.911,"['Walt Disney Pictures', 'Marc Platt Productio...",['United States of America'],...,212902372.0,5.8,3855.0,"[{'adult': False, 'gender': 1, 'id': 5064, 'kn...","[{'adult': False, 'gender': 1, 'id': 8384, 'kn...",50000000.0,"[1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, ...","[1, 0, 2, 1, 2, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, ...",57.142857,12.5
3682,Into the Woods,2014,3,1,2180411,224141,"['Fantasy', 'Comedy']",19.911,"['Walt Disney Pictures', 'Marc Platt Productio...",['United States of America'],...,212902372.0,5.8,3855.0,"[{'adult': False, 'gender': 1, 'id': 5064, 'kn...","[{'adult': False, 'gender': 1, 'id': 8384, 'kn...",50000000.0,"[1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, ...","[1, 0, 2, 1, 2, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, ...",57.142857,12.5


In [10]:
# Compare the two duplicate rows.
# The pair is looked up through the key rather than by hard-coded index labels,
# so the cell does not silently depend on the row labels of the source file.
dup_rows = df_movies[df_movies.duplicated(key_col, keep=False)]
assert len(dup_rows) == 2, f"expected exactly one duplicated pair, found {len(dup_rows)} rows"
# Shows only the fields in which the two rows differ (self = first row, other = second).
dup_rows.iloc[0].compare(dup_rows.iloc[1])

Unnamed: 0,self,other
dubious,0,1


In [11]:
# Dubious is the only field that differs between the duplicated rows.
# Better to be conservative and keep the row where dubious == 1.
#
# The rows to drop are derived from the data (via key_col) instead of a
# hard-coded imdbid, so the rule applies to any duplicated key.
dup_rows = df_movies[df_movies.duplicated(key_col, keep=False)]

# Stable sort puts the dubious == 1 row first within each key; every later row
# with the same key is then marked for removal, leaving exactly one row per key.
ranked = dup_rows.sort_values("dubious", ascending=False, kind="stable")
index_to_drop = ranked[ranked.duplicated(key_col, keep="first")].index

df_movies = df_movies.drop(index_to_drop, axis=0)
print(df_movies.shape)

(7271, 21)


In [12]:
# Double check that we have one row per key.
# Fail loudly instead of only displaying the result, so the export further down
# cannot run on a frame that still contains duplicated keys.
has_unique_key = df_movies[key_col].nunique() == df_movies.shape[0]
assert has_unique_key, f"{key_col} is not a unique key"
has_unique_key

True

# Bechdel Score Target

In [13]:
# Score distribution: share of movies at each bt_score value
# (normalize=True returns proportions rather than counts).
df_movies["bt_score"].value_counts(normalize=True)

bt_score
3    0.553982
1    0.232293
0    0.107688
2    0.106038
Name: proportion, dtype: float64

In [14]:
# Which score were reviewers most unsure about?
# i.e. Were reviewers more dubious when awarding a 3?
# Each score column is normalised to sum to 1, so the dubious == 1 row is the
# share of movies with that score that were flagged as dubious.
pd.crosstab(
    index=df_movies["dubious"],
    columns=df_movies["bt_score"],
    normalize="columns",
)

bt_score,0,1,2,3
dubious,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.977011,0.964476,0.885863,0.877358
1,0.022989,0.035524,0.114137,0.122642


It seems that reviewers were most sure about scores 0 and 1.

Let's use this information to define a binary flag representing 'pass' and 'fail'.

i.e. A movie passes when it has at least two (named) women in it, who talk to each other.

In [15]:
y_col = "bt_pass"
# Scores 2 and 3 count as a pass (1); scores 0 and 1 count as a fail (0).
is_pass = df_movies["bt_score"].isin([2, 3])
df_movies[y_col] = is_pass.astype(int)

# Check the class balance of the new target
df_movies[y_col].value_counts(normalize=True)

bt_pass
1    0.660019
0    0.339981
Name: proportion, dtype: float64

# TODO: Add df_oscars

# Export

In [16]:
# Write the de-duplicated frame, including the derived bt_pass column, to CSV.
# index=False: the row labels (read from the first column of movies.csv) are not written.
df_movies.to_csv("../data/raw.csv", index=False)

In [17]:
# Preview the first rows of the frame that was just exported.
df_movies.head()

Unnamed: 0,title,year,bt_score,dubious,imdbid,tmdbId,genres,popularity,production_companies,production_countries,...,vote_average,vote_count,cast,crew,budget,cast_gender,crew_gender,cast_female_representation,crew_female_representation,bt_pass
0,Cinderella,1899,3,0,230,114108,"['Drama', 'Family', 'Fantasy', 'Romance']",3.762,"['Star-Film', 'Georges Méliès']",['France'],...,6.1,79.0,"[{'adult': False, 'gender': 2, 'id': 11523, 'k...","[{'adult': False, 'gender': 2, 'id': 11523, 'k...",0.0,"[2, 0, 1, 0, 1, 0]","[2, 2, 2, 2]",33.333333,0.0,1
1,Gretchen the Greenhorn,1916,3,0,6745,126925,[],1.942,[],[],...,6.1,7.0,"[{'adult': False, 'gender': 1, 'id': 30779, 'k...","[{'adult': False, 'gender': 2, 'id': 42060, 'k...",0.0,"[1, 2, 2, 2]","[2, 2, 2]",25.0,0.0,1
2,Snow White,1916,3,0,7361,174598,"['Fantasy', 'Drama']",0.817,[],[],...,5.7,9.0,"[{'adult': False, 'gender': 0, 'id': 1658947, ...","[{'adult': False, 'gender': 2, 'id': 28968, 'k...",0.0,"[0, 1, 2, 2, 0]",[2],20.0,0.0,1
3,The Poor Little Rich Girl,1917,3,0,8443,95866,"['Romance', 'Comedy', 'Drama', 'Fantasy']",2.023,['Artcraft Pictures Corporation'],['United States of America'],...,5.8,23.0,"[{'adult': False, 'gender': 1, 'id': 100047, '...","[{'adult': False, 'gender': 2, 'id': 13335, 'k...",0.0,"[1, 0, 0, 0, 2, 0, 1, 1, 0, 2, 0, 1, 1]","[2, 2, 1, 1, 2, 2, 0]",38.461538,28.571429,1
4,Stella Maris,1918,3,0,9652,70753,['Drama'],1.31,['Mary Pickford Company'],['United States of America'],...,6.9,19.0,"[{'adult': False, 'gender': 1, 'id': 100047, '...","[{'adult': False, 'gender': 1, 'id': 34741, 'k...",0.0,"[1, 1, 2, 2, 1, 0, 0, 2]","[1, 2, 2, 0, 2, 2]",37.5,16.666667,1
