In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the movies table written to the interim data folder (file name suggests it is the post-EDA output).
df = pd.read_csv('data/interim/movies_post_eda.csv')

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,budget,popularity,runtime,revenue,homepage_bool,belongs_to_collection_bool,prod_company_num,prod_country_num,prod_United States of America,prod_United Kingdom,...,Music,Mystery,None,Romance,Science Fiction,TV Movie,Thriller,War,Western,key_num
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,...,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,22661350.0,8.463274,107.856571,15.970738,0.315333,0.201333,2.698333,1.326333,0.776667,0.126667,...,0.033333,0.075,0.002333,0.190333,0.096667,0.000333,0.263,0.033333,0.014333,7.226667
std,37026620.0,12.104,22.079069,3.045649,0.464726,0.401063,2.014121,0.752349,0.416549,0.332655,...,0.179535,0.263435,0.048256,0.39263,0.295553,0.018257,0.440336,0.179535,0.118881,6.665891
min,0.0,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,4.018053,94.0,14.691625,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
50%,8000000.0,7.374861,104.0,16.63731,0.0,0.0,2.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
75%,30000000.0,10.890983,118.0,18.046365,1.0,0.0,4.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0
max,380000000.0,294.337037,338.0,21.141685,1.0,1.0,17.0,8.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,149.0


In [4]:
# Count missing entries per column and show the ten columns with the most.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(10)

budget                   0
crew_Robert Rodriguez    0
act_John Turturro        0
act_Willem Dafoe         0
act_Forest Whitaker      0
act_Nicolas Cage         0
act_Bill Murray          0
act_Owen Wilson          0
act_Keith David          0
act_Jason Statham        0
dtype: int64

In [5]:
# Show the column labels of the loaded frame.
df.columns

Index(['budget', 'original_language', 'popularity', 'runtime', 'revenue',
       'homepage_bool', 'belongs_to_collection_bool', 'prod_company_num',
       'prod_country_num', 'prod_United States of America',
       ...
       'Music', 'Mystery', 'None', 'Romance', 'Science Fiction', 'TV Movie',
       'Thriller', 'War', 'Western', 'key_num'],
      dtype='object', length=195)

In [6]:
# Tally how many columns there are of each dtype.
column_dtypes = df.dtypes
column_dtypes.value_counts()

int64      182
float64     12
object       1
dtype: int64

We will now create dummy variables for our last categorical feature: original language.

In [7]:
# One-hot encode 'original_language' into 'orig_<code>' indicator columns.
# dtype=int keeps the indicators as 0/1 on every pandas version
# (pandas >= 2.0 would otherwise produce booleans).
# The guard makes the cell safe to re-run: once the text column has been
# replaced by its dummies there is nothing left to encode.
if 'original_language' in df.columns:
    dummies = pd.get_dummies(df['original_language'], prefix='orig', dtype=int)
    # Swap the text column for its encoded counterpart.
    df = df.drop('original_language', axis=1).join(dummies)
df.head()

Unnamed: 0,budget,popularity,runtime,revenue,homepage_bool,belongs_to_collection_bool,prod_company_num,prod_country_num,prod_United States of America,prod_United Kingdom,...,orig_ro,orig_ru,orig_sr,orig_sv,orig_ta,orig_te,orig_tr,orig_ur,orig_vi,orig_zh
0,14000000,6.575393,93.0,16.3263,0,1,3,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,40000000,8.248895,113.0,18.370959,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3300000,64.29999,105.0,16.387512,1,0,3,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1200000,3.174936,122.0,16.588099,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1.14807,118.0,15.182614,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Impute first, then report, so the printed counts reflect the frame's final state.
# NOTE(review): the fill value 1 presumably means "one language" as a default — confirm.
lang_num_missing = df['lang_num'].isna()
df.loc[lang_num_missing, 'lang_num'] = 1
print(df.isna().sum())

budget           0
popularity       0
runtime          0
revenue          0
homepage_bool    0
                ..
orig_te          0
orig_tr          0
orig_ur          0
orig_vi          0
orig_zh          0
Length: 230, dtype: int64


No more missing values!

In [9]:
df.isna().sum().sort_values(ascending=False).head(10)

budget                      0
crew_Avy Kaufman            0
crew_Deborah Aquila         0
crew_James Newton Howard    0
crew_Mary Vernieu           0
crew_Steven Spielberg       0
crew_Jerry Goldsmith        0
crew_Luc Besson             0
crew_Francine Maisler       0
crew_Tricia Wood            0
dtype: int64

In [10]:
# Hold out 30% of the rows for testing; 'revenue' is the target, every other column is a feature.
# A fixed random_state makes the split (and the CSVs written from it) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(labels='revenue', axis=1),
    df['revenue'],
    test_size=0.3,
    random_state=42,
)

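All we would have to do is replace any missing runtime values with the mean, since runtime is fairly normally distributed (as shown in the data wrangling stage); the check above shows that no missing values remain, so no imputation is needed here.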

We don't have any categorical variables left, so all we have to do is scale our data.

We will use this scaled data if we end up using models where scaling is particularly important, but we will also save the unscaled data.

In [11]:
# Keep the feature names so the scaled arrays can be turned back into labelled frames.
names = X_train.columns

# Fit the scaler on the training features only, then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train.values)
X_test_scaled = scaler.transform(X_test.values)
X_train_scaled = pd.DataFrame(scaler.transform(X_train.values), columns=names)
X_train_scaled.head()


Unnamed: 0,budget,popularity,runtime,homepage_bool,belongs_to_collection_bool,prod_company_num,prod_country_num,prod_United States of America,prod_United Kingdom,prod_France,...,orig_ro,orig_ru,orig_sr,orig_sv,orig_ta,orig_te,orig_tr,orig_ur,orig_vi,orig_zh
0,-0.074702,-0.462508,-0.123293,-0.671026,-0.5,-0.871652,-0.436716,0.530348,-0.379198,-0.292013,...,-0.037823,-0.116248,-0.021827,-0.048853,-0.057831,-0.030875,-0.037823,0.0,0.0,-0.087622
1,-0.33627,-0.327206,-0.168534,-0.671026,-0.5,0.652892,-0.436716,0.530348,-0.379198,-0.292013,...,-0.037823,-0.116248,-0.021827,-0.048853,-0.057831,-0.030875,-0.037823,0.0,0.0,-0.087622
2,0.059912,0.063854,0.73629,1.490255,-0.5,-0.871652,-0.436716,0.530348,-0.379198,-0.292013,...,-0.037823,-0.116248,-0.021827,-0.048853,-0.057831,-0.030875,-0.037823,0.0,0.0,-0.087622
3,-0.594313,-0.016319,-0.620946,-0.671026,2.0,-0.871652,-0.436716,0.530348,-0.379198,-0.292013,...,-0.037823,-0.116248,-0.021827,-0.048853,-0.057831,-0.030875,-0.037823,0.0,0.0,-0.087622
4,-0.155471,-0.20654,-0.575705,-0.671026,-0.5,-0.871652,-0.436716,0.530348,-0.379198,-0.292013,...,-0.037823,-0.116248,-0.021827,-0.048853,-0.057831,-0.030875,-0.037823,0.0,0.0,-0.087622


In [12]:
# Wrap the scaled test array in a DataFrame that carries the feature names.
X_test_scaled = pd.DataFrame(data=X_test_scaled, columns=names)
X_test_scaled.head()

Unnamed: 0,budget,popularity,runtime,homepage_bool,belongs_to_collection_bool,prod_company_num,prod_country_num,prod_United States of America,prod_United Kingdom,prod_France,...,orig_ro,orig_ru,orig_sr,orig_sv,orig_ta,orig_te,orig_tr,orig_ur,orig_vi,orig_zh
0,-0.613159,-0.028002,0.781531,-0.671026,2.0,0.144711,4.918629,0.530348,-0.379198,-0.292013,...,-0.037823,-0.116248,-0.021827,-0.048853,-0.057831,-0.030875,-0.037823,0.0,0.0,-0.087622
1,2.886808,0.480665,0.555325,1.490255,2.0,1.161073,-0.436716,0.530348,-0.379198,-0.292013,...,-0.037823,-0.116248,-0.021827,-0.048853,-0.057831,-0.030875,-0.037823,0.0,0.0,-0.087622
2,-0.613159,-0.22403,-0.123293,-0.671026,-0.5,-0.871652,-0.436716,0.530348,-0.379198,-0.292013,...,-0.037823,-0.116248,-0.021827,-0.048853,-0.057831,-0.030875,-0.037823,0.0,0.0,-0.087622
3,-0.605351,-0.649602,-1.028117,-0.671026,-0.5,-0.871652,-0.436716,0.530348,-0.379198,-0.292013,...,-0.037823,-0.116248,-0.021827,-0.048853,-0.057831,-0.030875,-0.037823,0.0,0.0,-0.087622
4,-0.330469,-0.509416,1.86732,1.490255,-0.5,-0.871652,-0.436716,-1.885554,-0.379198,-0.292013,...,-0.037823,-0.116248,-0.021827,-0.048853,-0.057831,-0.030875,-0.037823,0.0,0.0,-0.087622


In [13]:
# Write the scaled and unscaled splits to the preprocessed data folder.
output_dir = Path('data/preprocessed')
# Create the folder if it is missing so a fresh checkout does not fail on the first write.
output_dir.mkdir(parents=True, exist_ok=True)

outputs = {
    'X_sc_train.csv': X_train_scaled,
    'X_sc_test.csv': X_test_scaled,
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
# index=False: the row index is not written to any of the files.
for filename, data in outputs.items():
    data.to_csv(output_dir / filename, index=False)

Save our data to the preprocessed data folder.