In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
# Load the cleaned IMDB table and discard two columns that are not features:
# 'Unnamed: 0' (presumably an index written by an earlier to_csv -- confirm)
# and the IMDB link.
df = pd.read_csv('../Data/imdb_clean.csv')
df = df.drop(columns=['Unnamed: 0', 'movie_imdb_link'])

# NOTE: despite its name, 'ROI' holds gross minus budget (an absolute
# difference), not a ratio.
df['ROI'] = df['gross'].sub(df['budget'])
df.head()

Unnamed: 0,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,actor_1_name,movie_title,...,Sport,Crime,Horror,War,Biography,Music,Documentary,Short,Film-Noir,ROI
0,James Cameron,723,178,0,855.0,Joel David Moore,1000.0,760505847,CCH Pounder,Avatar,...,0,0,0,0,0,0,0,0,0,523505847
1,Gore Verbinski,302,169,563,1000.0,Orlando Bloom,40000.0,309404152,Johnny Depp,Pirates of the Caribbean: At World's End,...,0,0,0,0,0,0,0,0,0,9404152
2,Sam Mendes,602,148,0,161.0,Rory Kinnear,11000.0,200074175,Christoph Waltz,Spectre,...,0,0,0,0,0,0,0,0,0,-44925825
3,Christopher Nolan,813,164,22000,23000.0,Christian Bale,27000.0,448130642,Tom Hardy,The Dark Knight Rises,...,0,0,0,0,0,0,0,0,0,198130642
4,Andrew Stanton,462,132,475,530.0,Samantha Morton,640.0,73058679,Daryl Sabara,John Carter,...,0,0,0,0,0,0,0,0,0,-190641321


In [3]:
def one_hot_encode(df0, col, prefix=None):
    """Replace a categorical column with one 0/1 indicator column per value.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Frame containing ``col``. It is not modified, and any row index is
        accepted (not only the default 0..n-1 range).
    col : hashable
        Name of the column to encode.
    prefix : str, optional
        When given, each indicator column is named ``prefix + str(value)``
        instead of the bare value, which keeps the new names from clashing
        with columns that already exist.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``col``. The added int64 indicator columns are
        appended in sorted order of the distinct values. Rows where ``col``
        is missing get 0 in every indicator.

    Warns
    -----
    UserWarning
        If an indicator name equals an existing column name. That column is
        overwritten by the indicator, but the overwrite is reported instead
        of happening silently.
    """
    import warnings

    # sort=True yields the distinct values in sorted order; missing values
    # receive the sentinel code -1 and are excluded from `labels`.
    codes, labels = pd.factorize(df0[col], sort=True)

    # Build the whole indicator matrix at once instead of writing cell by cell.
    indicators = np.zeros((len(df0), len(labels)), dtype=np.int64)
    known = np.flatnonzero(codes >= 0)
    indicators[known, codes[known]] = 1

    if prefix is None:
        names = list(labels)
    else:
        names = [str(prefix) + str(label) for label in labels]
    dummies = pd.DataFrame(indicators, index=df0.index, columns=names)

    # drop() returns a new frame, so the caller's frame stays untouched.
    result = df0.drop(columns=[col])

    clashes = [name for name in names if name in result.columns]
    if clashes:
        warnings.warn(
            "one_hot_encode(%r): indicator columns overwrite existing columns %r; "
            "pass prefix=... to keep both" % (col, clashes),
            UserWarning,
            stacklevel=2,
        )
        # Overwrite in place so the clashing columns keep their position.
        for name in clashes:
            result[name] = dummies[name]
        dummies = dummies.drop(columns=clashes)

    return pd.concat([result, dummies], axis=1)

In [4]:
# Columns that are expanded into indicator columns, one source column at a
# time, via one_hot_encode.
cols_with_strings = [
    'director_name',
    'actor_2_name',
    'actor_1_name',
    'actor_3_name',
    'language',
    'country',
    'content_rating',
    'movie_title',
]

# one_hot_encode receives a copy on every pass and its result is rebound to df.
for col in cols_with_strings:
    df = one_hot_encode(df.copy(), col)
    

In [5]:
# Preview the frame after the columns in cols_with_strings were one-hot encoded.
df.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,plot_keywords,...,Zombieland,Zookeeper,Zoolander,Zoolander 2,Zoom,[Rec] 2,eXistenZ,xXx,xXx: State of the Union,Æon Flux
0,723,178,0,855.0,1000.0,760505847,886204,4834,0,"avatar,future,marine,native,paraplegic",...,0,0,0,0,0,0,0,0,0,0
1,302,169,563,1000.0,40000.0,309404152,471220,48350,0,"goddess,marriage ceremony,marriage proposal,pi...",...,0,0,0,0,0,0,0,0,0,0
2,602,148,0,161.0,11000.0,200074175,275868,11700,1,"bomb,espionage,sequel,spy,terrorist",...,0,0,0,0,0,0,0,0,0,0
3,813,164,22000,23000.0,27000.0,448130642,1144337,106759,0,"deception,imprisonment,lawlessness,police offi...",...,0,0,0,0,0,0,0,0,0,0
4,462,132,475,530.0,640.0,73058679,212204,1873,1,"alien,american civil war,male nipple,mars,prin...",...,0,0,0,0,0,0,0,0,0,0


In [6]:
def nested_one_hot_encode(df0, col, prefix=None):
    """Replace a comma-separated multi-label column with 0/1 indicator columns.

    Each cell of ``col`` is split on ``','`` (no whitespace stripping) and
    every distinct token becomes one indicator column.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Frame containing ``col``. It is not modified, and any row index is
        accepted (not only the default 0..n-1 range).
    col : hashable
        Name of the column holding comma-separated strings.
    prefix : str, optional
        When given, each indicator column is named ``prefix + token`` instead
        of the bare token, which keeps the new names from clashing with
        columns that already exist.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``col``. The added int64 indicator columns are
        appended in sorted token order. Rows where ``col`` is missing get 0
        in every indicator.

    Warns
    -----
    UserWarning
        If an indicator name equals an existing column name. That column is
        overwritten by the indicator, but the overwrite is reported instead
        of happening silently.
    """
    import warnings

    # Missing cells contribute no tokens; any other non-string cell still
    # fails on .split, so bad input is not hidden.
    token_lists = [
        cell.split(',') if isinstance(cell, str) or not pd.isna(cell) else []
        for cell in df0[col]
    ]
    labels = sorted({token for tokens in token_lists for token in tokens})
    column_of = {label: position for position, label in enumerate(labels)}

    # Fill a dense matrix once instead of writing into the frame cell by cell.
    indicators = np.zeros((len(df0), len(labels)), dtype=np.int64)
    for row, tokens in enumerate(token_lists):
        for token in tokens:
            indicators[row, column_of[token]] = 1

    if prefix is None:
        names = list(labels)
    else:
        names = [str(prefix) + str(label) for label in labels]
    dummies = pd.DataFrame(indicators, index=df0.index, columns=names)

    # drop() returns a new frame, so the caller's frame stays untouched.
    result = df0.drop(columns=[col])

    clashes = [name for name in names if name in result.columns]
    if clashes:
        warnings.warn(
            "nested_one_hot_encode(%r): indicator columns overwrite existing "
            "columns %r; pass prefix=... to keep both" % (col, clashes),
            UserWarning,
            stacklevel=2,
        )
        # Overwrite in place so the clashing columns keep their position.
        for name in clashes:
            result[name] = dummies[name]
        dummies = dummies.drop(columns=clashes)

    return pd.concat([result, dummies], axis=1)

In [7]:
# Columns that are expanded into indicator columns via nested_one_hot_encode,
# which splits each cell on ','.
cols_with_nested_strings = [
    'plot_keywords',
]

# nested_one_hot_encode receives a copy on every pass and its result is rebound to df.
for col in cols_with_nested_strings:
    df = nested_one_hot_encode(df.copy(), col)

In [8]:
# Preview the frame after the columns in cols_with_nested_strings were expanded.
df.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,...,zeus,zodiac,zodiac killer,zoloft,zombie,zombie apocalypse,zombie spoof,zoo,zoologist,zorro
0,723,178,0,855.0,1000.0,760505847,886204,4834,0,3054,...,0,0,0,0,0,0,0,0,0,0
1,302,169,563,1000.0,40000.0,309404152,471220,48350,0,1238,...,0,0,0,0,0,0,0,0,0,0
2,602,148,0,161.0,11000.0,200074175,275868,11700,1,994,...,0,0,0,0,0,0,0,0,0,0
3,813,164,22000,23000.0,27000.0,448130642,1144337,106759,0,2701,...,0,0,0,0,0,0,0,0,0,0
4,462,132,475,530.0,640.0,73058679,212204,1873,1,738,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Columns rescaled below to the [0, 1] range with min-max scaling.
# NOTE: despite the name, these are the columns that ARE scaled; every other
# column is left as is.
ignore_cols = ['num_critic_for_reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_1_facebook_likes', 'gross', 'num_voted_users', 'cast_total_facebook_likes', 'num_user_for_reviews', 'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio', 'movie_facebook_likes', 'ROI', 'facenumber_in_poster']

for col in ignore_cols:
    col_min = df[col].min()
    col_span = df[col].max() - col_min
    # A constant column has zero span; dividing by it would turn the whole
    # column into NaN. Divide by 1 instead so such a column maps to 0.
    if col_span == 0:
        col_span = 1
    df[col] = (df[col] - col_min) / col_span

In [14]:
# Preview the frame after the columns in ignore_cols were min-max scaled.
df.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,...,zeus,zodiac,zodiac killer,zoloft,zombie,zombie apocalypse,zombie spoof,zoo,zoologist,zorro
0,0.889026,0.481229,0.0,0.037174,0.001563,1.0,0.524429,0.007361,0.0,0.603244,...,0,0,0,0,0,0,0,0,0,0
1,0.369914,0.450512,0.024478,0.043478,0.0625,0.40684,0.278829,0.073622,0.0,0.244066,...,0,0,0,0,0,0,0,0,0,0
2,0.739827,0.37884,0.0,0.007,0.017188,0.26308,0.163213,0.017816,0.023256,0.195807,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.433447,0.956522,1.0,0.042188,0.589253,0.6772,0.162561,0.0,0.533426,...,0,0,0,0,0,0,0,0,0,0
4,0.567201,0.324232,0.020652,0.023043,0.001,0.096066,0.125535,0.002852,0.023256,0.145174,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Write the processed frame to disk. The row index is written as well
# (index=True is the to_csv default, spelled out here).
df.to_csv('../Data/AI Dataset.csv', index=True)