# Analysis of Oscar Genre Bias

We're conducting an analysis of how the genres that get rewarded at the Oscars differ from the genres that filmmakers make. This involves a few things. First, we're going to load in our data and count the genres.

In [9]:
import collections
import pandas as pd
import numpy as np
# Locations of the two input CSVs, given relative to the notebook's working directory.
IMDB_FILE = "data/imdb_movie_data.csv"
OSCAR_FILE = "data/merged_oscar_data_raw.csv"
# Read the Oscar file with pandas' default dtype inference and preview the first rows.
oscar_data = pd.read_csv(OSCAR_FILE)
oscar_data.head()

Unnamed: 0,year,award,won,nominee,nominated_film,addl_notes,special_citation,start_year,oscar_id,title_lower,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,title
0,1927/28,DIRECTING (Comedy Picture),True,Lewis Milestone,Two Arabian Knights,,,1927,0,two arabian knights,tt0018515,movie,Two Arabian Knights,Two Arabian Knights,0,1927,,92,"Adventure,Comedy,Romance",two arabian knights
1,1927/28,DIRECTING (Comedy Picture),False,Ted Wilde,Speedy,,,1927,1,speedy,tt0019412,movie,Speedy,Speedy,0,1928,,85,"Action,Comedy,Family",speedy
2,1927/28,DIRECTING (Dramatic Picture),True,Frank Borzage,7th Heaven,,,1927,2,7th heaven,tt0018379,movie,7th Heaven,7th Heaven,0,1927,,110,"Drama,Romance",7th heaven
3,1927/28,DIRECTING (Dramatic Picture),False,Herbert Brenon,Sorrell and Son,,,1927,3,sorrell and son,tt0018429,movie,Sorrell and Son,Sorrell and Son,0,1927,,100,Drama,sorrell and son
4,1927/28,DIRECTING (Dramatic Picture),False,King Vidor,The Crowd,,,1927,4,the crowd,tt0018806,movie,The Crowd,The Crowd,0,1928,,98,"Drama,Romance",the crowd


In [15]:
# Load the IMDb titles with every column kept as text, treat the literal
# string \N as a missing value, and keep only the rows that have both a
# start year and a genre string.
imdb_data = (
    pd.read_csv(IMDB_FILE, dtype=str)
    .replace("\\N", np.nan)
    .dropna(subset=["startYear", "genres"])
)
imdb_data.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45.0,Romance
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,20.0,"Documentary,News,Sport"
2,tt0000335,movie,Soldiers of the Cross,Soldiers of the Cross,0,1900,,,"Biography,Drama"
4,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70.0,"Biography,Crime,Drama"
5,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,,,Drama


In [18]:
# Build one Counter of genre tags per Oscar row. Rows without a genre string
# are dropped first: splitting NaN yields NaN, and Counter(NaN) raises
# TypeError.
oscar_genres = oscar_data.genres.dropna().str.split(",").apply(collections.Counter)
# Tally the per-row Counters in place rather than with `oscar_genres.sum()`:
# Counter addition allocates a new Counter for every row, and summing an
# empty Series yields 0 instead of an empty Counter.
oscar_genre_g = collections.Counter()
for row_genres in oscar_genres:
    oscar_genre_g.update(row_genres)
oscar_genre_g

Counter({'Adventure': 43,
         'Comedy': 92,
         'Romance': 118,
         'Action': 23,
         'Family': 19,
         'Drama': 403,
         'Musical': 17,
         'Western': 13,
         'History': 44,
         'Thriller': 35,
         'War': 34,
         'Crime': 65,
         'Sport': 9,
         'Music': 17,
         'Mystery': 30,
         'Biography': 76,
         'Film-Noir': 16,
         'Fantasy': 21,
         'Horror': 3,
         'Sci-Fi': 8})

In [17]:
# Build one Counter of genre tags per IMDb title. `imdb_data` was already
# restricted to rows with a genre string when it was loaded, so every split
# result is a list.
imdb_genres = imdb_data.genres.str.split(",").apply(collections.Counter)
# Tally the per-row Counters in place rather than with `imdb_genres.sum()`:
# Counter addition allocates a new Counter for every row, which is wasteful
# on a frame this large, and summing an empty Series yields 0 instead of an
# empty Counter.
imdb_genre_g = collections.Counter()
for row_genres in imdb_genres:
    imdb_genre_g.update(row_genres)
imdb_genre_g

Counter({'Romance': 37633,
         'Documentary': 95632,
         'News': 1729,
         'Sport': 5442,
         'Biography': 13394,
         'Drama': 171596,
         'Crime': 27811,
         'Adventure': 21401,
         'Fantasy': 10002,
         'Comedy': 83724,
         'War': 7729,
         'Family': 13872,
         'History': 11335,
         'Sci-Fi': 7558,
         'Western': 6238,
         'Thriller': 26154,
         'Mystery': 11949,
         'Horror': 21804,
         'Action': 38558,
         'Music': 9584,
         'Short': 16,
         'Animation': 5814,
         'Musical': 8687,
         'Film-Noir': 783,
         'Talk-Show': 67,
         'Adult': 7798,
         'Reality-TV': 171,
         'Game-Show': 11})

In [23]:
# Each genre's share of all Oscar genre tags, largest first. The denominator
# is the total number of tags, not the number of rows: a row tagged with
# several genres contributes once per genre.
oscar_tag_total = sum(oscar_genre_g.values())  # computed once, not once per genre
{
    genre: f"{count / oscar_tag_total * 100:.2f}%"
    for genre, count in oscar_genre_g.most_common()
}

{'Drama': '37.11%',
 'Romance': '10.87%',
 'Comedy': '8.47%',
 'Biography': '7.00%',
 'Crime': '5.99%',
 'History': '4.05%',
 'Adventure': '3.96%',
 'Thriller': '3.22%',
 'War': '3.13%',
 'Mystery': '2.76%',
 'Action': '2.12%',
 'Fantasy': '1.93%',
 'Family': '1.75%',
 'Musical': '1.57%',
 'Music': '1.57%',
 'Film-Noir': '1.47%',
 'Western': '1.20%',
 'Sport': '0.83%',
 'Sci-Fi': '0.74%',
 'Horror': '0.28%'}

In [24]:
# Each genre's share of all IMDb genre tags, largest first. The denominator
# is the total number of tags, not the number of titles: a title tagged with
# several genres contributes once per genre.
imdb_tag_total = sum(imdb_genre_g.values())  # computed once, not once per genre
{
    genre: f"{count / imdb_tag_total * 100:.2f}%"
    for genre, count in imdb_genre_g.most_common()
}

{'Drama': '26.54%',
 'Documentary': '14.79%',
 'Comedy': '12.95%',
 'Action': '5.96%',
 'Romance': '5.82%',
 'Crime': '4.30%',
 'Thriller': '4.05%',
 'Horror': '3.37%',
 'Adventure': '3.31%',
 'Family': '2.15%',
 'Biography': '2.07%',
 'Mystery': '1.85%',
 'History': '1.75%',
 'Fantasy': '1.55%',
 'Music': '1.48%',
 'Musical': '1.34%',
 'Adult': '1.21%',
 'War': '1.20%',
 'Sci-Fi': '1.17%',
 'Western': '0.96%',
 'Animation': '0.90%',
 'Sport': '0.84%',
 'News': '0.27%',
 'Film-Noir': '0.12%',
 'Reality-TV': '0.03%',
 'Talk-Show': '0.01%',
 'Short': '0.00%',
 'Game-Show': '0.00%'}