# Gender distribution across the years

To infer a person's gender, the `gender_guesser` library can be used to analyse their first name. Running
```python
import sys
!{sys.executable} -m pip install gender_guesser --user
```
should install the package.

In [1]:
import gender_guesser.detector as gender # Gender detector for first names.
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import timer

## Across time

In [None]:
# Load the parsed dataset, keep only the columns used below and drop the
# films where any of them is missing.
df = pd.read_json("imdb_dataset_parsed.json")[['year', 'actors', 'directors']].dropna()

# Cast the year column to int.
df = df.astype({'year': int})

In [16]:
# Keep only first three (protagonists) actors for each film.
def createProtagonists(row):
    """Return the first three names of the row's 'actors' list.

    A list holding fewer than three names is returned in full.
    """
    return row['actors'][:3]

# One list of (at most) three leading actors per film.
df['protagonists'] = df.apply(createProtagonists, axis = 1)

In [None]:
# Collapse the table to one row per year: summing the list-valued columns
# concatenates the name lists of all films of that year.
df = df.groupby('year', as_index = False).agg(
    {column: 'sum' for column in ('actors', 'directors', 'protagonists')})

# 2019 has very few entries, drop that row.
df = df[~df['year'].eq(2019)]

In [None]:
# Preview the first rows of the per-year table.
df.head(n = 5)

In [15]:
def malePercentage(row, column_name, detector = None):
    """Return the fraction of male names among the classifiable names of a row.

    Every entry of ``row[column_name]`` is split on whitespace, tokens ending
    with a dot (initials such as 'J.') are discarded, and the first remaining
    token is handed to the gender detector as the first name. Only answers
    that are exactly 'male' or 'female' are counted; any other answer
    ('unknown', 'mostly_male', 'mostly_female', ...) is ignored.

    Parameters
    ----------
    row : mapping or pandas.Series
        Row whose ``column_name`` entry is an iterable of full-name strings.
    column_name : str
        Key of the name list inside ``row``.
    detector : object, optional
        Any object exposing ``get_gender(first_name)``. When omitted, a
        case-insensitive ``gender.Detector`` is created on the first call
        and reused by every later call.

    Returns
    -------
    float
        ``males / (males + females)``, or ``np.nan`` when no name was
        classified as 'male' or 'female'.
    """
    if detector is None:
        # Creating a Detector is costly, so it is built once and kept on the
        # function object instead of being rebuilt for every row.
        detector = getattr(malePercentage, '_detector', None)
        if detector is None:
            detector = gender.Detector(case_sensitive = False)
            malePercentage._detector = detector

    males = 0
    females = 0
    for full_name in row[column_name]:
        # split() without argument ignores leading, trailing and repeated
        # whitespace, so ' Mary Poppins' still yields 'Mary' as first name.
        # Turns names like ['J.', 'Stuart', 'Blackton'] into
        # ['Stuart', 'Blackton'].
        tokens = [token for token in full_name.split()
                  if not token.endswith(".")]
        # Nothing is left for names made only of initials.
        if not tokens:
            continue

        guess = detector.get_gender(tokens[0])
        if guess == 'male':
            males += 1
        elif guess == 'female':
            females += 1

    classified = males + females
    if classified == 0:
        return np.nan
    return males / classified

In [None]:
# For every name-list column, add the male fraction computed by
# malePercentage on that column.
for perc_column, names_column in (('act_male_perc', 'actors'),
                                  ('dir_male_perc', 'directors'),
                                  ('pro_male_perc', 'protagonists')):
    df[perc_column] = df.apply(malePercentage, axis = 1,
                               args = (names_column,))

In [None]:
# Preview the first rows, now including the *_male_perc columns.
df.head(n = 5)

In [None]:
# Male share of the actor names per year. The value is a fraction in [0, 1],
# computed only over names classified as 'male' or 'female'.
ax = df.plot(x = 'year', y = 'act_male_perc', ylim = (0, 1),
             title = 'Male share of actor names per year')
# Trailing semicolon keeps the Text repr out of the cell output.
ax.set_ylabel('fraction of male names');

Some useful info for annotation:
 * 1913: Alice Guy-Blaché, French filmmaker and the first woman to direct a film, appears here with "Dick Whittington and his Cat", an _elaborate and spectacular production adapted from Old English fairy tale!_;
 * 2008: Kathryn Bigelow's "The Hurt Locker", for which she became the first woman to win the Best Director Oscar.

In [None]:
# Male share of the director names per year. The value is a fraction in
# [0, 1], computed only over names classified as 'male' or 'female'.
ax = df.plot(x = 'year', y = 'dir_male_perc', ylim = (0, 1),
             title = 'Male share of director names per year')
# Trailing semicolon keeps the Text repr out of the cell output.
ax.set_ylabel('fraction of male names');

In [None]:
# Male share of the protagonist names (first three listed actors of each
# film) per year. The value is a fraction in [0, 1], computed only over
# names classified as 'male' or 'female'.
ax = df.plot(x = 'year', y = 'pro_male_perc', ylim = (0, 1),
             title = 'Male share of protagonist names per year')
# Trailing semicolon keeps the Text repr out of the cell output.
ax.set_ylabel('fraction of male names');

In [None]:
#df.drop(columns = ['actors', 'directors']).to_json("gender.json",
#                                                   orient = 'records')

## Last years

In [4]:
# Start again from the per-film data: keep only the columns used below and
# drop the films where any of them is missing.
df = pd.read_json("imdb_dataset_parsed.json")[['year', 'actors', 'directors']].dropna()

# Cast the year column to int.
df = df.astype({'year': int})

# Keep only the films from 2000 onwards, as an independent frame.
df = df.loc[df['year'] > 1999].copy()

In [5]:
def directorGender(row, detector = None):
    """Return the majority gender of the directors listed in a row.

    Every entry of ``row['directors']`` is split on whitespace, tokens ending
    with a dot (initials such as 'J.') are discarded, and the first remaining
    token is handed to the gender detector as the first name. Only answers
    that are exactly 'male' or 'female' are counted; any other answer
    ('unknown', 'mostly_male', 'mostly_female', ...) is ignored.

    Parameters
    ----------
    row : mapping or pandas.Series
        Row whose 'directors' entry is an iterable of full-name strings.
    detector : object, optional
        Any object exposing ``get_gender(first_name)``. When omitted, a
        case-insensitive ``gender.Detector`` is created on the first call
        and reused by every later call.

    Returns
    -------
    str or float
        "male" when strictly more than half of the classified directors are
        male, "female" otherwise (an even split therefore gives "female"),
        and ``np.nan`` when no director could be classified.
    """
    if detector is None:
        # Creating a Detector is costly, so it is built once and kept on the
        # function object instead of being rebuilt for every film.
        detector = getattr(directorGender, '_detector', None)
        if detector is None:
            detector = gender.Detector(case_sensitive = False)
            directorGender._detector = detector

    males = 0
    females = 0
    for full_name in row['directors']:
        # split() without argument ignores leading, trailing and repeated
        # whitespace, so ' Mary Harron' still yields 'Mary' as first name.
        # Turns names like ['J.', 'Stuart', 'Blackton'] into
        # ['Stuart', 'Blackton'].
        tokens = [token for token in full_name.split()
                  if not token.endswith(".")]
        # Nothing is left for names made only of initials.
        if not tokens:
            continue

        guess = detector.get_gender(tokens[0])
        if guess == 'male':
            males += 1
        elif guess == 'female':
            females += 1

    classified = males + females
    if classified == 0:
        return np.nan
    # Strict majority: a 50/50 split is labelled "female".
    if males / classified > 0.5:
        return "male"
    return "female"

In [6]:
# Label every film with the majority gender of its directors.
# Took approx 2h45min when this cell was last run.
with timer.codeTimer("Director genders"):
    df['dir_gen'] = df.apply(directorGender, axis = 1)

Executed 'Director genders'.  Elapsed time: 9915.071698s


In [7]:
#df.to_pickle('gender_df.pkl')

In [41]:
# Load the pickled per-film frame and drop the rows with missing values.
gender_df = pd.read_pickle("gender_df.pkl").dropna()
# First three listed actors of each film.
gender_df['protagonists'] = gender_df.apply(createProtagonists, axis = 1)
# Constant column: summing it per group gives the number of films.
gender_df["count"] = 1
gender_df.head()

Unnamed: 0,year,actors,directors,dir_gen,protagonists,count
0,2018,"[Chris Hemsworth, Michael Shannon, Michael Peñ...",[Nicolai Fuglsig],male,"[Chris Hemsworth, Michael Shannon, Michael Peña]",1
3,2017,"[Hugh Jackman, Patrick Stewart, Dafne Keen, Bo...",[James Mangold],male,"[Hugh Jackman, Patrick Stewart, Dafne Keen]",1
5,2018,"[Jodie Foster, Sterling K. Brown, Sofia Boutel...",[Drew Pearce],male,"[Jodie Foster, Sterling K. Brown, Sofia Boutella]",1
8,2014,"[Ralph Fiennes, F. Murray Abraham, Mathieu Ama...",[Wes Anderson],male,"[Ralph Fiennes, F. Murray Abraham, Mathieu Ama...",1
11,2011,"[Chris Evans, Hayley Atwell, Sebastian Stan, T...",[Joe Johnston],male,"[Chris Evans, Hayley Atwell, Sebastian Stan]",1


In [42]:
# Pool the films by director gender: the name lists are concatenated and
# 'count' adds up to the number of films in each group.
# May take a couple of minutes.
gender_df = gender_df.groupby('dir_gen', as_index = False).agg({
    'actors': 'sum',
    'protagonists': 'sum',
    'count': 'sum',
})
gender_df

Unnamed: 0,dir_gen,actors,protagonists,count
0,female,"[Graham Verchere, Judah Lewis, Caleb Emery, Co...","[Graham Verchere, Judah Lewis, Caleb Emery, Be...",2620
1,male,"[Chris Hemsworth, Michael Shannon, Michael Peñ...","[Chris Hemsworth, Michael Shannon, Michael Peñ...",22206


In [43]:
# Male fraction of the pooled actor and protagonist names, computed by
# malePercentage for each director-gender group.
for perc_column, names_column in (('act_male_perc', 'actors'),
                                  ('pro_male_perc', 'protagonists')):
    gender_df[perc_column] = gender_df.apply(malePercentage, axis = 1,
                                             args = (names_column,))

gender_df

Unnamed: 0,dir_gen,actors,protagonists,count,act_male_perc,pro_male_perc
0,female,"[Graham Verchere, Judah Lewis, Caleb Emery, Co...","[Graham Verchere, Judah Lewis, Caleb Emery, Be...",2620,0.572273,0.512658
1,male,"[Chris Hemsworth, Michael Shannon, Michael Peñ...","[Chris Hemsworth, Michael Shannon, Michael Peñ...",22206,0.663395,0.66519


In [None]:
# Export the per-director-gender summary as a list of JSON records.
# NOTE(review): the original call had no receiver (a bare `to_json` raises
# NameError); `gender_df` is assumed to be the intended frame - confirm.
# The 'actors' and 'protagonists' name lists are exported as well; drop them
# first if only the counts and percentages are needed.
gender_df.to_json("gender.json", orient = 'records')