# Analysis of Oscar Genre Bias

In [14]:
import collections
import pandas as pd
import numpy as np
# Input CSV locations, relative to the notebook's working directory.
IMDB_FILE = "data/imdb_movie_data.csv"
OSCAR_FILE = "data/merged_major_categories.csv"

# Load both tables. low_memory=False makes pandas parse the IMDb file in one
# pass, so each column's dtype is inferred from the whole file.
oscar_data = pd.read_csv(OSCAR_FILE)
imdb_data = pd.read_csv(IMDB_FILE, low_memory=False)

# Summary statistics of start_year for each award label. Selecting the column
# before describe() avoids computing statistics for every other column only
# to throw them away.
oscar_data.groupby("award")["start_year"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
award,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ACTOR,232.0,1952.086207,13.763978,1927.0,1941.0,1952.0,1964.0,1975.0
ACTOR IN A LEADING ROLE,215.0,1997.0,12.438634,1976.0,1986.0,1997.0,2008.0,2018.0
ACTOR IN A SUPPORTING ROLE,415.0,1977.0,23.987215,1936.0,1956.0,1977.0,1998.0,2018.0
ACTRESS,236.0,1951.728814,13.923903,1927.0,1940.0,1952.0,1964.0,1975.0
ACTRESS IN A LEADING ROLE,215.0,1997.0,12.438634,1976.0,1986.0,1997.0,2008.0,2018.0
ACTRESS IN A SUPPORTING ROLE,415.0,1977.0,23.987215,1936.0,1956.0,1977.0,1998.0,2018.0
BEST MOTION PICTURE,90.0,1952.5,5.217193,1944.0,1948.0,1952.5,1957.0,1961.0
BEST PICTURE,324.0,1992.783951,17.224153,1962.0,1978.0,1994.0,2009.0,2018.0
DIRECTING,444.0,1973.984234,25.856388,1928.0,1952.0,1974.0,1996.0,2018.0
DIRECTING (Comedy Picture),2.0,1927.0,0.0,1927.0,1927.0,1927.0,1927.0,1927.0


In [8]:
# Award labels that this analysis treats as Best Picture categories; used for
# membership tests against the `award` column.
best_picture_categories = {
    "BEST MOTION PICTURE",
    "BEST PICTURE",
    "OUTSTANDING MOTION PICTURE",
    "OUTSTANDING PICTURE",
    "OUTSTANDING PRODUCTION",
}

# Peek at the first five rows of the Oscar table.
oscar_data.head(5)

Unnamed: 0,year,award,won,nominee,nominated_film,addl_notes,special_citation,start_year,oscar_id,title_lower,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,title
0,1927/28,ACTOR,False,Richard Barthelmess,The Noose,,,1927,0,the noose,tt0019217,movie,The Noose,The Noose,0,1928,,65.0,Drama,the noose
1,1927/28,ACTOR,True,Emil Jannings,The Last Command,,,1927,1,the last command,tt0019071,movie,The Last Command,The Last Command,0,1928,,88.0,"Drama,History,Romance",the last command
2,1927/28,ACTRESS,False,Louise Dresser,A Ship Comes In,,,1927,2,a ship comes in,tt0018389,movie,A Ship Comes In,A Ship Comes In,0,1928,,70.0,Drama,a ship comes in
3,1927/28,ACTRESS,True,Janet Gaynor,7th Heaven,,,1927,3,7th heaven,tt0018379,movie,7th Heaven,7th Heaven,0,1927,,110.0,"Drama,Romance",7th heaven
4,1927/28,ACTRESS,False,Gloria Swanson,Sadie Thompson,,,1927,4,sadie thompson,tt0019344,movie,Sadie Thompson,Sadie Thompson,0,1928,,97.0,Drama,sadie thompson


In [70]:
# Rows whose award label is one of the Best Picture categories and whose
# start_year is 1930 or later. The copy makes later column additions safe.
is_best_picture = oscar_data["award"].isin(best_picture_categories)
from_1930_on = oscar_data["start_year"] >= 1930
best_picture = oscar_data.loc[is_best_picture & from_1930_on].copy()

# One Counter per row, keyed by each comma-separated genre label.
genres = best_picture["genres"].str.split(",").apply(collections.Counter)

# Add the per-row Counters together and rank genres by how often they occur.
oscar_genre_totals = genres.sum()
oscar_data_g = oscar_genre_totals.most_common()
oscar_data_g

[('Drama', 477),
 ('Romance', 167),
 ('Comedy', 117),
 ('Biography', 108),
 ('History', 64),
 ('Adventure', 63),
 ('Crime', 61),
 ('War', 38),
 ('Action', 32),
 ('Thriller', 32),
 ('Musical', 29),
 ('Fantasy', 27),
 ('Family', 26),
 ('Mystery', 25),
 ('Music', 22),
 ('Western', 14),
 ('Film-Noir', 13),
 ('Sport', 10),
 ('Sci-Fi', 10),
 ('Animation', 3),
 ('Horror', 2)]

In [56]:
import re
# Keep only rows whose startYear consists entirely of digits. Casting to str
# and anchoring the pattern with `$` rejects missing values and partially
# numeric strings, which would otherwise break the mask or the int conversion.
has_numeric_year = imdb_data.startYear.astype(str).str.match(r"[0-9]+$")

# .copy() yields an independent frame, so adding the `year` column does not
# write into a slice of the original (avoids SettingWithCopyWarning).
imdb_data = imdb_data[has_numeric_year].copy()
imdb_data["year"] = imdb_data.startYear.astype(int)

# Restrict to 1930-2018; between() is inclusive at both ends.
imdb_data = imdb_data[imdb_data.year.between(1930, 2018)]

# Remove titles with no usable genre (missing or the "\N" placeholder) and
# every documentary. Missing genres are removed too because the later
# Counter-based genre tally cannot process them. Filtering with a mask
# returns a new frame instead of mutating the current one in place.
unusable_genre = (
    imdb_data.genres.isna()
    | imdb_data.genres.eq("\\N")
    | imdb_data.genres.str.contains("Documentary", na=False)
)
imdb_data = imdb_data[~unusable_genre].copy()

In [57]:
# One Counter per title, keyed by each comma-separated genre label.
imdb_genres = imdb_data.genres.str.split(",").apply(collections.Counter)

# Accumulate into a single Counter. Series.sum() would add the Counters
# pairwise, building a brand-new Counter for every row; update() folds each
# row into the running total in place and gives the same counts and key order.
imdb_genre_totals = collections.Counter()
for title_genres in imdb_genres:
    imdb_genre_totals.update(title_genres)

# Genres ranked from most to least frequent as (genre, count) pairs.
imdb_genre_g = imdb_genre_totals.most_common()
imdb_genre_g

[('Drama', 145324),
 ('Comedy', 74777),
 ('Action', 35084),
 ('Romance', 34007),
 ('Crime', 24642),
 ('Thriller', 23637),
 ('Horror', 19416),
 ('Adventure', 17311),
 ('Family', 10931),
 ('Mystery', 10532),
 ('Fantasy', 9093),
 ('Musical', 8025),
 ('Adult', 7668),
 ('Sci-Fi', 6800),
 ('War', 5926),
 ('Music', 5197),
 ('History', 5133),
 ('Animation', 4853),
 ('Biography', 4519),
 ('Western', 4393),
 ('Sport', 3185),
 ('Film-Noir', 778),
 ('Reality-TV', 107),
 ('News', 79),
 ('Talk-Show', 55),
 ('Short', 11),
 ('Game-Show', 8)]

In [58]:
# Total number of genre tags across the Best Picture rows, computed once
# rather than re-summing every per-row Counter for each genre in the loop.
oscar_genre_total = sum(count for _, count in oscar_data_g)

# Each genre's share of all tags, formatted as a percentage with two decimals.
{genre: f"{count / oscar_genre_total * 100:.2f}%" for genre, count in oscar_data_g}

{'Drama': '35.60%',
 'Romance': '12.46%',
 'Comedy': '8.73%',
 'Biography': '8.06%',
 'History': '4.78%',
 'Adventure': '4.70%',
 'Crime': '4.55%',
 'War': '2.84%',
 'Action': '2.39%',
 'Thriller': '2.39%',
 'Musical': '2.16%',
 'Fantasy': '2.01%',
 'Family': '1.94%',
 'Mystery': '1.87%',
 'Music': '1.64%',
 'Western': '1.04%',
 'Film-Noir': '0.97%',
 'Sport': '0.75%',
 'Sci-Fi': '0.75%',
 'Animation': '0.22%',
 'Horror': '0.15%'}

In [59]:
# Denominator: the total number of genre tags over all remaining IMDb titles.
num_genres = sum(count for _, count in imdb_genre_g)

# Each genre's share of all tags, formatted as a percentage with two decimals.
imdb_genre_share = {}
for genre, count in imdb_genre_g:
    imdb_genre_share[genre] = f"{(count / num_genres) * 100:.2f}%"
imdb_genre_share

{'Drama': '31.49%',
 'Comedy': '16.20%',
 'Action': '7.60%',
 'Romance': '7.37%',
 'Crime': '5.34%',
 'Thriller': '5.12%',
 'Horror': '4.21%',
 'Adventure': '3.75%',
 'Family': '2.37%',
 'Mystery': '2.28%',
 'Fantasy': '1.97%',
 'Musical': '1.74%',
 'Adult': '1.66%',
 'Sci-Fi': '1.47%',
 'War': '1.28%',
 'Music': '1.13%',
 'History': '1.11%',
 'Animation': '1.05%',
 'Biography': '0.98%',
 'Western': '0.95%',
 'Sport': '0.69%',
 'Film-Noir': '0.17%',
 'Reality-TV': '0.02%',
 'News': '0.02%',
 'Talk-Show': '0.01%',
 'Short': '0.00%',
 'Game-Show': '0.00%'}

In [91]:
# Decade label built from the first three digits of the year, e.g. 1994 ->
# "1990s". expand=False makes str.extract return a Series, so the label is
# assigned as a plain column rather than through a one-column DataFrame.
best_picture["decade"] = (
    best_picture.start_year.astype(str).str.extract(r"(^[0-9]{3})", expand=False) + "0s"
)

# Number of times "Drama" appears in the row's comma-separated genre list
# (presumably 0 or 1, if a genre is never repeated within one film).
best_picture["drama_count"] = best_picture.genres.str.split(",").apply(
    lambda film_genres: film_genres.count("Drama")
)

# Percentage of rows per decade tagged as Drama. Only drama_count is
# aggregated: summing every column just to pick one out is wasteful and, in
# pandas >= 2.0, also attempts to add up the text columns.
drama_by_decade = best_picture.groupby("decade")["drama_count"]
drama_by_decade.sum() / drama_by_decade.count() * 100

decade
1930s    75.862069
1940s    88.571429
1950s    90.000000
1960s    82.000000
1970s    90.000000
1980s    96.000000
1990s    98.000000
2000s    90.909091
2010s    89.873418
Name: drama_count, dtype: float64

In [92]:
# Decade label built from the first three digits of startYear, e.g. "1994" ->
# "1990s". expand=False makes str.extract return a Series, so the label is
# assigned as a plain column rather than through a one-column DataFrame.
imdb_data["decade"] = (
    imdb_data.startYear.astype(str).str.extract(r"(^[0-9]{3})", expand=False) + "0s"
)

# Number of times "Drama" appears in the title's comma-separated genre list
# (presumably 0 or 1, if a genre is never repeated within one title).
imdb_data["drama_count"] = imdb_data.genres.str.split(",").apply(
    lambda title_genres: title_genres.count("Drama")
)

# Share of titles per decade tagged as Drama. Only drama_count is aggregated:
# summing every column just to pick one out is wasteful and, in pandas >= 2.0,
# also attempts to add up the text columns.
# NOTE(review): this is a 0-1 fraction, whereas the Best Picture per-decade
# cell multiplies by 100; scale one of them before comparing the two tables.
imdb_drama_by_decade = imdb_data.groupby("decade")["drama_count"]
imdb_drama_by_decade.sum() / imdb_drama_by_decade.count()

decade
1930s    0.575422
1940s    0.537599
1950s    0.569858
1960s    0.522761
1970s    0.499103
1980s    0.493603
1990s    0.502042
2000s    0.535839
2010s    0.508984
Name: drama_count, dtype: float64

In [93]:
# Number of Best Picture rows (non-null drama_count values) in each decade.
best_picture.groupby("decade")["drama_count"].count()

decade
1930s    87
1940s    70
1950s    50
1960s    50
1970s    50
1980s    50
1990s    50
2000s    55
2010s    79
Name: drama_count, dtype: int64

In [98]:
# Overall fraction (0-1) of Best Picture rows tagged as Drama.
best_picture_drama_total = best_picture["drama_count"].sum()
best_picture_drama_total / best_picture.shape[0]

0.8817005545286506

In [99]:
# Overall fraction (0-1) of the remaining IMDb titles tagged as Drama.
imdb_drama_total = imdb_data["drama_count"].sum()
imdb_drama_total / imdb_data.shape[0]

0.5192498025890314