In [1]:
# Import dependecies
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Note that encoder.inverse_transform(df['popularity_encoded']) will return the original
# string values
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from imblearn.ensemble import BalancedRandomForestClassifier


In [2]:
# Import data for pre-processing
df = pd.read_csv('merged_dataframe.csv')
df.head(10)

Unnamed: 0,id,imdb_id,popularity,budget,revenue,cast,homepage,director,tagline,keywords,...,release_year,budget_adj,revenue_adj,film,year_film,year_ceremony,ceremony,category,name,winner
0,135397,tt0369610,32.985763,150000000,1513528810,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,monster|dna|tyrannosaurus rex|velociraptor|island,...,2015,137999900.0,1392446000.0,Jurassic World,,,,,,
1,76341,tt1392190,28.419936,150000000,378436354,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,future|chase|post-apocalyptic|dystopia|australia,...,2015,137999900.0,348161300.0,Mad Max: Fury Road,2015.0,2016.0,88.0,CINEMATOGRAPHY,John Seale,False
2,262500,tt2908446,13.112507,110000000,295238201,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,based on novel|revolution|dystopia|sequel|dyst...,...,2015,101200000.0,271619000.0,Insurgent,,,,,,
3,140607,tt2488496,11.173104,200000000,2068178225,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,android|spaceship|jedi|space opera|3d,...,2015,183999900.0,1902723000.0,Star Wars: The Force Awakens,2015.0,2016.0,88.0,FILM EDITING,Maryann Brandon and Mary Jo Markey,False
4,168259,tt2820852,9.335014,190000000,1506249360,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,car race|speed|revenge|suspense|car,...,2015,174799900.0,1385749000.0,Furious 7,,,,,,
5,281957,tt1663202,9.1107,135000000,532950503,Leonardo DiCaprio|Tom Hardy|Will Poulter|Domhn...,http://www.foxmovies.com/movies/the-revenant,Alejandro GonzÃ¡lez IÃ±Ã¡rritu,"(n. One who has returned, as if from the dead.)",father-son relationship|rape|based on novel|mo...,...,2015,124199900.0,490314200.0,The Revenant,2015.0,2016.0,88.0,ACTOR IN A LEADING ROLE,Leonardo DiCaprio,True
6,87101,tt1340138,8.654359,155000000,440603537,Arnold Schwarzenegger|Jason Clarke|Emilia Clar...,http://www.terminatormovie.com/,Alan Taylor,Reset the future,saving the world|artificial intelligence|cybor...,...,2015,142599900.0,405355100.0,Terminator Genisys,,,,,,
7,286217,tt3659388,7.6674,108000000,595380321,Matt Damon|Jessica Chastain|Kristen Wiig|Jeff ...,http://www.foxmovies.com/movies/the-martian,Ridley Scott,Bring Him Home,based on novel|mars|nasa|isolation|botanist,...,2015,99359960.0,547749700.0,The Martian,2015.0,2016.0,88.0,ACTOR IN A LEADING ROLE,Matt Damon,False
8,211672,tt2293640,7.404165,74000000,1156730962,Sandra Bullock|Jon Hamm|Michael Keaton|Allison...,http://www.minionsmovie.com/,Kyle Balda|Pierre Coffin,"Before Gru, they had a history of bad bosses",assistant|aftercreditsstinger|duringcreditssti...,...,2015,68079970.0,1064192000.0,Minions,,,,,,
9,150540,tt2096673,6.326804,175000000,853708609,Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...,http://movies.disney.com/inside-out,Pete Docter,Meet the little voices inside your head.,dream|cartoon|imaginary friend|animation|kid,...,2015,160999900.0,785411600.0,Inside Out,2015.0,2016.0,88.0,ANIMATED FEATURE FILM,Pete Docter and Jonas Rivera,True


In [3]:
df['profit_adj'] = df['revenue_adj'] - df['budget_adj']
df['profit_adj']

0        1.254446e+09
1        2.101614e+08
2        1.704191e+08
3        1.718723e+09
4        1.210949e+09
             ...     
10541    0.000000e+00
10542    0.000000e+00
10543    0.000000e+00
10544    0.000000e+00
10545   -1.276423e+05
Name: profit_adj, Length: 10546, dtype: float64

In [4]:
# Drop irrelevant or redundant columns
df = df.drop(['id', 'imdb_id', 'budget', 'revenue', 'homepage', 'tagline',
              'overview', 'year_film', 'year_ceremony', 'ceremony', 'name', 'film', 'keywords',
              'release_year', 'category', 'vote_count', 'vote_average'], axis = 1)
df.head(10)

Unnamed: 0,popularity,cast,director,runtime,genres,production_companies,release_date,budget_adj,revenue_adj,winner,profit_adj
0,32.985763,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,137999900.0,1392446000.0,,1254446000.0
1,28.419936,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,137999900.0,348161300.0,False,210161400.0
2,13.112507,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,101200000.0,271619000.0,,170419100.0
3,11.173104,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,183999900.0,1902723000.0,False,1718723000.0
4,9.335014,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,174799900.0,1385749000.0,,1210949000.0
5,9.1107,Leonardo DiCaprio|Tom Hardy|Will Poulter|Domhn...,Alejandro GonzÃ¡lez IÃ±Ã¡rritu,156,Western|Drama|Adventure|Thriller,Regency Enterprises|Appian Way|CatchPlay|Anony...,12/25/2015,124199900.0,490314200.0,True,366114300.0
6,8.654359,Arnold Schwarzenegger|Jason Clarke|Emilia Clar...,Alan Taylor,125,Science Fiction|Action|Thriller|Adventure,Paramount Pictures|Skydance Productions,6/23/2015,142599900.0,405355100.0,,262755100.0
7,7.6674,Matt Damon|Jessica Chastain|Kristen Wiig|Jeff ...,Ridley Scott,141,Drama|Adventure|Science Fiction,Twentieth Century Fox Film Corporation|Scott F...,9/30/2015,99359960.0,547749700.0,False,448389700.0
8,7.404165,Sandra Bullock|Jon Hamm|Michael Keaton|Allison...,Kyle Balda|Pierre Coffin,91,Family|Animation|Adventure|Comedy,Universal Pictures|Illumination Entertainment,6/17/2015,68079970.0,1064192000.0,,996112000.0
9,6.326804,Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...,Pete Docter,94,Comedy|Animation|Family,Walt Disney Pictures|Pixar Animation Studios|W...,6/9/2015,160999900.0,785411600.0,True,624411600.0


In [5]:
print(df['release_date'].dtype)

object


In [6]:
# release_month[0:10]
df['release_date'] = pd.to_datetime(df['release_date'])
df['release_month'] = df['release_date'].dt.month
df['release_month']

0         6
1         5
2         3
3        12
4         4
         ..
10541     6
10542    12
10543     1
10544    11
10545    11
Name: release_month, Length: 10546, dtype: int64

In [7]:
df = df.drop('release_date', axis = 1)
df.head(10)

Unnamed: 0,popularity,cast,director,runtime,genres,production_companies,budget_adj,revenue_adj,winner,profit_adj,release_month
0,32.985763,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,137999900.0,1392446000.0,,1254446000.0,6
1,28.419936,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,137999900.0,348161300.0,False,210161400.0,5
2,13.112507,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,101200000.0,271619000.0,,170419100.0,3
3,11.173104,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,183999900.0,1902723000.0,False,1718723000.0,12
4,9.335014,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,174799900.0,1385749000.0,,1210949000.0,4
5,9.1107,Leonardo DiCaprio|Tom Hardy|Will Poulter|Domhn...,Alejandro GonzÃ¡lez IÃ±Ã¡rritu,156,Western|Drama|Adventure|Thriller,Regency Enterprises|Appian Way|CatchPlay|Anony...,124199900.0,490314200.0,True,366114300.0,12
6,8.654359,Arnold Schwarzenegger|Jason Clarke|Emilia Clar...,Alan Taylor,125,Science Fiction|Action|Thriller|Adventure,Paramount Pictures|Skydance Productions,142599900.0,405355100.0,,262755100.0,6
7,7.6674,Matt Damon|Jessica Chastain|Kristen Wiig|Jeff ...,Ridley Scott,141,Drama|Adventure|Science Fiction,Twentieth Century Fox Film Corporation|Scott F...,99359960.0,547749700.0,False,448389700.0,9
8,7.404165,Sandra Bullock|Jon Hamm|Michael Keaton|Allison...,Kyle Balda|Pierre Coffin,91,Family|Animation|Adventure|Comedy,Universal Pictures|Illumination Entertainment,68079970.0,1064192000.0,,996112000.0,6
9,6.326804,Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...,Pete Docter,94,Comedy|Animation|Family,Walt Disney Pictures|Pixar Animation Studios|W...,160999900.0,785411600.0,True,624411600.0,6


In [8]:
df['director']

0           Colin Trevorrow
1             George Miller
2          Robert Schwentke
3               J.J. Abrams
4                 James Wan
                ...        
10541           Bruce Brown
10542    John Frankenheimer
10543        Eldar Ryazanov
10544           Woody Allen
10545      Harold P. Warren
Name: director, Length: 10546, dtype: object

In [9]:
df['cast'][0]

"Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vincent D'Onofrio|Nick Robinson"

In [10]:
type(df['cast'][0])

str

In [11]:
df['cast'] = df['cast'].str.split("|")
df['cast'][0]

['Chris Pratt',
 'Bryce Dallas Howard',
 'Irrfan Khan',
 "Vincent D'Onofrio",
 'Nick Robinson']

In [12]:
df['director'] = df['director'].str.split("|")
df['director'][0]

['Colin Trevorrow']

In [13]:
df['genres'] = df['genres'].str.split("|")
df['genres'][0]

['Action', 'Adventure', 'Science Fiction', 'Thriller']

In [14]:
df['production_companies'] = df['production_companies'].str.split("|")
df['production_companies'][0]

['Universal Studios',
 'Amblin Entertainment',
 'Legendary Pictures',
 'Fuji Television Network',
 'Dentsu']

In [15]:
df.shape

(10546, 11)

In [16]:
# Because some of the column values are lists of strings, rather than string or numeric types, 
# we separate the column and create a stacked Pandas Series object. This is then converted to a 
# dataframe.
new_df = df.drop('cast', axis = 1).join(
             pd.get_dummies(
             pd.DataFrame(df.cast.tolist(), df.index).stack(),
             prefix = 'cast_').astype(int).groupby(level=0).sum()
        )
new_df.head(10)

Unnamed: 0,popularity,director,runtime,genres,production_companies,budget_adj,revenue_adj,winner,profit_adj,release_month,...,cast__Ã‰milie Dequenne,cast__Ã‰ric Caravaca,cast__Ã‰va IgÃ³,cast__Ã€lex BrendemÃ¼hl,cast__Ã€lex Monner,cast__Äá»— Thá»‹ Háº£i Yáº¿n,cast__Å arÅ«nas Bartas,cast__Åžafak Sezer,cast__à¸ªà¸£à¸žà¸‡à¸©à¹Œ à¸Šà¸²à¸•à¸£à¸µ,cast__é‚±æ·‘è²ž
0,32.985763,[Colin Trevorrow],124,"[Action, Adventure, Science Fiction, Thriller]","[Universal Studios, Amblin Entertainment, Lege...",137999900.0,1392446000.0,,1254446000.0,6,...,0,0,0,0,0,0,0,0,0,0
1,28.419936,[George Miller],120,"[Action, Adventure, Science Fiction, Thriller]","[Village Roadshow Pictures, Kennedy Miller Pro...",137999900.0,348161300.0,False,210161400.0,5,...,0,0,0,0,0,0,0,0,0,0
2,13.112507,[Robert Schwentke],119,"[Adventure, Science Fiction, Thriller]","[Summit Entertainment, Mandeville Films, Red W...",101200000.0,271619000.0,,170419100.0,3,...,0,0,0,0,0,0,0,0,0,0
3,11.173104,[J.J. Abrams],136,"[Action, Adventure, Science Fiction, Fantasy]","[Lucasfilm, Truenorth Productions, Bad Robot]",183999900.0,1902723000.0,False,1718723000.0,12,...,0,0,0,0,0,0,0,0,0,0
4,9.335014,[James Wan],137,"[Action, Crime, Thriller]","[Universal Pictures, Original Film, Media Righ...",174799900.0,1385749000.0,,1210949000.0,4,...,0,0,0,0,0,0,0,0,0,0
5,9.1107,[Alejandro GonzÃ¡lez IÃ±Ã¡rritu],156,"[Western, Drama, Adventure, Thriller]","[Regency Enterprises, Appian Way, CatchPlay, A...",124199900.0,490314200.0,True,366114300.0,12,...,0,0,0,0,0,0,0,0,0,0
6,8.654359,[Alan Taylor],125,"[Science Fiction, Action, Thriller, Adventure]","[Paramount Pictures, Skydance Productions]",142599900.0,405355100.0,,262755100.0,6,...,0,0,0,0,0,0,0,0,0,0
7,7.6674,[Ridley Scott],141,"[Drama, Adventure, Science Fiction]","[Twentieth Century Fox Film Corporation, Scott...",99359960.0,547749700.0,False,448389700.0,9,...,0,0,0,0,0,0,0,0,0,0
8,7.404165,"[Kyle Balda, Pierre Coffin]",91,"[Family, Animation, Adventure, Comedy]","[Universal Pictures, Illumination Entertainment]",68079970.0,1064192000.0,,996112000.0,6,...,0,0,0,0,0,0,0,0,0,0
9,6.326804,[Pete Docter],94,"[Comedy, Animation, Family]","[Walt Disney Pictures, Pixar Animation Studios...",160999900.0,785411600.0,True,624411600.0,6,...,0,0,0,0,0,0,0,0,0,0


In [17]:
new_df.shape

(10546, 18648)

In [18]:
new_df = new_df.drop('genres', axis = 1).join(
             pd.get_dummies(
             pd.DataFrame(df.genres.tolist(), df.index).stack(),
             prefix = 'genre_').astype(int).groupby(level=0).sum()
        )
new_df.head(10)

Unnamed: 0,popularity,director,runtime,production_companies,budget_adj,revenue_adj,winner,profit_adj,release_month,cast__ Larry Mullen Jr.,...,genre__Horror,genre__Music,genre__Mystery,genre__None,genre__Romance,genre__Science Fiction,genre__TV Movie,genre__Thriller,genre__War,genre__Western
0,32.985763,[Colin Trevorrow],124,"[Universal Studios, Amblin Entertainment, Lege...",137999900.0,1392446000.0,,1254446000.0,6,0,...,0,0,0,0,0,1,0,1,0,0
1,28.419936,[George Miller],120,"[Village Roadshow Pictures, Kennedy Miller Pro...",137999900.0,348161300.0,False,210161400.0,5,0,...,0,0,0,0,0,1,0,1,0,0
2,13.112507,[Robert Schwentke],119,"[Summit Entertainment, Mandeville Films, Red W...",101200000.0,271619000.0,,170419100.0,3,0,...,0,0,0,0,0,1,0,1,0,0
3,11.173104,[J.J. Abrams],136,"[Lucasfilm, Truenorth Productions, Bad Robot]",183999900.0,1902723000.0,False,1718723000.0,12,0,...,0,0,0,0,0,1,0,0,0,0
4,9.335014,[James Wan],137,"[Universal Pictures, Original Film, Media Righ...",174799900.0,1385749000.0,,1210949000.0,4,0,...,0,0,0,0,0,0,0,1,0,0
5,9.1107,[Alejandro GonzÃ¡lez IÃ±Ã¡rritu],156,"[Regency Enterprises, Appian Way, CatchPlay, A...",124199900.0,490314200.0,True,366114300.0,12,0,...,0,0,0,0,0,0,0,1,0,1
6,8.654359,[Alan Taylor],125,"[Paramount Pictures, Skydance Productions]",142599900.0,405355100.0,,262755100.0,6,0,...,0,0,0,0,0,1,0,1,0,0
7,7.6674,[Ridley Scott],141,"[Twentieth Century Fox Film Corporation, Scott...",99359960.0,547749700.0,False,448389700.0,9,0,...,0,0,0,0,0,1,0,0,0,0
8,7.404165,"[Kyle Balda, Pierre Coffin]",91,"[Universal Pictures, Illumination Entertainment]",68079970.0,1064192000.0,,996112000.0,6,0,...,0,0,0,0,0,0,0,0,0,0
9,6.326804,[Pete Docter],94,"[Walt Disney Pictures, Pixar Animation Studios...",160999900.0,785411600.0,True,624411600.0,6,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
new_df.shape

(10546, 18668)

In [20]:
director_dummies = pd.get_dummies(new_df['director'].apply(pd.Series).stack()).groupby(level=0).sum()
director_dummies.columns = [f"{col}_director" for col in director_dummies.columns]

# concatenate the one-hot encoded columns back into the original dataframe
new_df = pd.concat([new_df.drop(['director'], axis=1), director_dummies], axis=1)
new_df.head(10)

Unnamed: 0,popularity,runtime,production_companies,budget_adj,revenue_adj,winner,profit_adj,release_month,cast__ Larry Mullen Jr.,cast__ Nicole MuÃ±oz,...,Zoya Akhtar_director,ÃdÃ¡m CsÃ¡szi_director,Ãlex de la Iglesia_director,Ãlvaro de ArmiÃ±Ã¡n_director,Ã–zgÃ¼r Yildirim_director,Ã‡aÄŸan Irmak_director,Ã‰mile Gaudreault_director,Ã‰ric Besnard_director,Ã‰tienne Chatiliez_director,Ã€lex Pastor_director
0,32.985763,124,"[Universal Studios, Amblin Entertainment, Lege...",137999900.0,1392446000.0,,1254446000.0,6,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,28.419936,120,"[Village Roadshow Pictures, Kennedy Miller Pro...",137999900.0,348161300.0,False,210161400.0,5,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,13.112507,119,"[Summit Entertainment, Mandeville Films, Red W...",101200000.0,271619000.0,,170419100.0,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11.173104,136,"[Lucasfilm, Truenorth Productions, Bad Robot]",183999900.0,1902723000.0,False,1718723000.0,12,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.335014,137,"[Universal Pictures, Original Film, Media Righ...",174799900.0,1385749000.0,,1210949000.0,4,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,9.1107,156,"[Regency Enterprises, Appian Way, CatchPlay, A...",124199900.0,490314200.0,True,366114300.0,12,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,8.654359,125,"[Paramount Pictures, Skydance Productions]",142599900.0,405355100.0,,262755100.0,6,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7.6674,141,"[Twentieth Century Fox Film Corporation, Scott...",99359960.0,547749700.0,False,448389700.0,9,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,7.404165,91,"[Universal Pictures, Illumination Entertainment]",68079970.0,1064192000.0,,996112000.0,6,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,6.326804,94,"[Walt Disney Pictures, Pixar Animation Studios...",160999900.0,785411600.0,True,624411600.0,6,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
new_df.shape

(10546, 23957)

In [22]:
new_df = new_df.drop('production_companies', axis = 1).join(
             pd.get_dummies(
             pd.DataFrame(df.production_companies.tolist(), df.index).stack(),
             prefix = 'production_companies_').astype(int).groupby(level=0).sum()
        )
new_df.head(10)

Unnamed: 0,popularity,runtime,budget_adj,revenue_adj,winner,profit_adj,release_month,cast__ Larry Mullen Jr.,cast__ Nicole MuÃ±oz,cast__ Therese Giehse,...,production_companies__platinum studios,production_companies__teamWorx Produktion fÃ¼r Kino und Fernsehen GmbH,production_companies__the filmgroup,production_companies__thefyzz,production_companies__thinkfilm,production_companies__uFilm,production_companies__uFund,production_companies__verture Films,production_companies__Ã–sterreichischer Rundfunk (ORF),production_companies__í”„ë¡œë•ì…˜M
0,32.985763,124,137999900.0,1392446000.0,,1254446000.0,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,28.419936,120,137999900.0,348161300.0,False,210161400.0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13.112507,119,101200000.0,271619000.0,,170419100.0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,11.173104,136,183999900.0,1902723000.0,False,1718723000.0,12,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9.335014,137,174799900.0,1385749000.0,,1210949000.0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,9.1107,156,124199900.0,490314200.0,True,366114300.0,12,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,8.654359,125,142599900.0,405355100.0,,262755100.0,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,7.6674,141,99359960.0,547749700.0,False,448389700.0,9,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,7.404165,91,68079970.0,1064192000.0,,996112000.0,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,6.326804,94,160999900.0,785411600.0,True,624411600.0,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
new_df.shape

(10546, 31676)

In [24]:
new_df['winner']

0          NaN
1        False
2          NaN
3        False
4          NaN
         ...  
10541      NaN
10542     True
10543      NaN
10544      NaN
10545      NaN
Name: winner, Length: 10546, dtype: object

In [25]:
# Convert null values to False
new_df['winner'] = df['winner'].fillna(0).astype(float)

In [26]:
# Convert boolean values to binary values
new_df['winner'] = df['winner'].astype(float)

In [27]:
new_df['winner']

0        NaN
1        0.0
2        NaN
3        0.0
4        NaN
        ... 
10541    NaN
10542    1.0
10543    NaN
10544    NaN
10545    NaN
Name: winner, Length: 10546, dtype: float64

In [28]:
new_df = new_df.fillna(0)

In [29]:
new_df.isnull().sum()

popularity                                                0
runtime                                                   0
budget_adj                                                0
revenue_adj                                               0
winner                                                    0
                                                         ..
production_companies__uFilm                               0
production_companies__uFund                               0
production_companies__verture Films                       0
production_companies__Ã–sterreichischer Rundfunk (ORF)    0
production_companies__í”„ë¡œë•ì…˜M                        0
Length: 31676, dtype: int64

In [30]:
y = new_df['winner']
X = new_df.copy().drop(columns='winner')
X.head(10)

Unnamed: 0,popularity,runtime,budget_adj,revenue_adj,profit_adj,release_month,cast__ Larry Mullen Jr.,cast__ Nicole MuÃ±oz,cast__ Therese Giehse,"cast__""Weird Al"" Yankovic",...,production_companies__platinum studios,production_companies__teamWorx Produktion fÃ¼r Kino und Fernsehen GmbH,production_companies__the filmgroup,production_companies__thefyzz,production_companies__thinkfilm,production_companies__uFilm,production_companies__uFund,production_companies__verture Films,production_companies__Ã–sterreichischer Rundfunk (ORF),production_companies__í”„ë¡œë•ì…˜M
0,32.985763,124,137999900.0,1392446000.0,1254446000.0,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,28.419936,120,137999900.0,348161300.0,210161400.0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13.112507,119,101200000.0,271619000.0,170419100.0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,11.173104,136,183999900.0,1902723000.0,1718723000.0,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9.335014,137,174799900.0,1385749000.0,1210949000.0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,9.1107,156,124199900.0,490314200.0,366114300.0,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,8.654359,125,142599900.0,405355100.0,262755100.0,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,7.6674,141,99359960.0,547749700.0,448389700.0,9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,7.404165,91,68079970.0,1064192000.0,996112000.0,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,6.326804,94,160999900.0,785411600.0,624411600.0,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Check the balance of our target values
y.value_counts()

0.0    10314
1.0      232
Name: winner, dtype: int64

In [32]:
X.shape

(10546, 31675)

In [33]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [34]:
# Now that we have the results from the neural network, train and evaluate a random forest model for comparision.
# Resample the training data with the BalancedRandomForestClassifier
balanced_rf = BalancedRandomForestClassifier(random_state = 1)
balanced_rf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [35]:
y_pred = balanced_rf.predict(X_test)

In [36]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.7591575091575091

In [37]:
# List the features sorted in descending order by feature importance
# Get the feature importances
importances = balanced_rf.feature_importances_

# Sort the feature importances in descending order
feature_importances = sorted(zip(importances, X_train.columns), reverse=True)

# Print the feature importances
for feature_importance in feature_importances:
    print(feature_importance)

(0.041809958125734195, 'runtime')
(0.03995493952665378, 'popularity')
(0.03930454813539783, 'revenue_adj')
(0.035871199216358896, 'profit_adj')
(0.02558886328387114, 'budget_adj')
(0.018531152473151038, 'release_month')
(0.0073743841833165055, 'genre__Comedy')
(0.007319571175760352, 'genre__Music')
(0.00668965570585445, 'genre__Romance')
(0.006151677934413818, 'genre__Animation')
(0.0054756639512062435, 'genre__Thriller')
(0.005403070018123408, 'genre__Drama')
(0.004643662943935921, 'genre__Horror')
(0.004605104113192036, 'genre__History')
(0.00399414891334056, 'genre__Adventure')
(0.003799443643491458, 'genre__Action')
(0.0037724716549158233, 'genre__Science Fiction')
(0.003196450811654921, 'production_companies__Universal Pictures')
(0.0031778693047999147, 'production_companies__Columbia Pictures Corporation')
(0.003043583814463905, 'production_companies__Pixar Animation Studios')
(0.0029421807445639088, 'production_companies__None')
(0.0028883464423375927, 'genre__Documentary')
(0.0