In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
plt.style.use('ggplot')
#Plot styling/configurations
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = 14,8
plt.rcParams['axes.facecolor']='white'
plt.rcParams.update({'font.size': 13})
import missingno as msno
import warnings
warnings.filterwarnings('ignore')
seq_col_brew = sns.color_palette("YlGnBu_r", 4)
sns.set_palette(seq_col_brew)
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
ls

box-office.ipynb       [31mtest.csv[m[m*
[31msample_submission.csv[m[m* [31mtrain.csv[m[m*


In [4]:
data = pd.read_csv('train.csv')

In [5]:
cols = ['budget', 'release_date', 'runtime', 'popularity', 'revenue']

In [6]:
x_train = pd.read_csv('train.csv', usecols=[*cols])
y_train = x_train['revenue']
test = pd.read_csv('test.csv')

In [7]:
x_train.drop(labels='revenue', axis=1, inplace=True)

In [9]:
x_train.head()

Unnamed: 0,budget,popularity,release_date,runtime
0,14000000,6.575393,2/20/15,93.0
1,40000000,8.248895,8/6/04,113.0
2,3300000,64.29999,10/10/14,105.0
3,1200000,3.174936,3/9/12,122.0
4,0,1.14807,2/5/09,118.0


# Data Analysis

# Feature Engineering

### pd.to_datetime

In [10]:
date = pd.to_datetime(x_train['release_date'])

In [11]:
date

0      2015-02-20
1      2004-08-06
2      2014-10-10
3      2012-03-09
4      2009-02-05
5      1987-08-06
6      2012-08-30
7      2004-01-15
8      1996-02-16
9      2003-04-16
10     1976-11-21
11     1987-07-10
12     1999-09-15
13     2005-03-04
14     2002-06-20
15     2010-10-06
16     2005-08-04
17     2013-12-25
18     2011-02-02
19     2005-08-02
20     1998-04-03
21     1982-08-13
22     2012-07-28
23     2011-09-02
24     2006-09-09
25     1992-10-23
26     1997-09-08
27     2013-09-07
28     1986-08-08
29     2002-04-07
30     2011-11-02
31     2005-05-02
32     2015-10-21
33     2014-10-02
34     2007-08-08
35     2012-04-16
36     2000-01-28
37     1995-03-10
38     2013-09-13
39     2065-04-07
40     2006-04-26
41     2067-06-22
42     2000-03-03
43     2014-09-05
44     2005-02-04
45     2009-10-09
46     2006-03-23
47     2015-12-25
48     2017-03-24
49     1994-11-17
50     2008-01-30
51     2065-07-01
52     1980-06-24
53     1988-01-15
54     2016-02-19
55     199

In [12]:
x_train['release_month'] = date.dt.month

In [13]:
x_train['release_year'] = date.dt.year

In [14]:
x_train['release_day'] = date.dt.day

In [15]:
x_train.drop(labels='release_date', axis=1, inplace=True)

In [16]:
x_train.head()

Unnamed: 0,budget,popularity,runtime,release_month,release_year,release_day
0,14000000,6.575393,93.0,2,2015,20
1,40000000,8.248895,113.0,8,2004,6
2,3300000,64.29999,105.0,10,2014,10
3,1200000,3.174936,122.0,3,2012,9
4,0,1.14807,118.0,2,2009,5


### Indicating Missingness

In [19]:
x_train['Is_Missing'] = np.where(x_train['runtime'].isnull(), 1, 0)

In [20]:
x_train.head()

Unnamed: 0,budget,popularity,runtime,release_month,release_year,release_day,Is_Missing
0,14000000,6.575393,93.0,2,2015,20,0
1,40000000,8.248895,113.0,8,2004,6,0
2,3300000,64.29999,105.0,10,2014,10,0
3,1200000,3.174936,122.0,3,2012,9,0
4,0,1.14807,118.0,2,2009,5,0


In [22]:
x_train[x_train.isnull().any(axis=1)]

Unnamed: 0,budget,popularity,runtime,release_month,release_year,release_day,Is_Missing
1335,6000000,0.292296,,10,2007,29,1
2302,0,0.002229,,3,1996,14,1


### Handling Missing Values

In [24]:
x_train['runtime'].mean()

107.85657104736491

In [27]:
x_train.fillna(x_train['runtime'].mean(), inplace=True)

In [28]:
x_train.isnull().sum()

budget           0
popularity       0
runtime          0
release_month    0
release_year     0
release_day      0
Is_Missing       0
dtype: int64

In [30]:
x_train.head(100)

Unnamed: 0,budget,popularity,runtime,release_month,release_year,release_day,Is_Missing
0,14000000,6.575393,93.0,2,2015,20,0
1,40000000,8.248895,113.0,8,2004,6,0
2,3300000,64.29999,105.0,10,2014,10,0
3,1200000,3.174936,122.0,3,2012,9,0
4,0,1.14807,118.0,2,2009,5,0
5,8000000,0.743274,83.0,8,1987,6,0
6,14000000,7.286477,92.0,8,2012,30,0
7,0,1.949044,84.0,1,2004,15,0
8,0,6.902423,100.0,2,1996,16,0
9,6000000,4.672036,91.0,4,2003,16,0


### Budget

In [31]:
x_train['no_budget'] = np.where(x_train['budget'] == 0, 1, 0)

In [32]:
x_train.head()

Unnamed: 0,budget,popularity,runtime,release_month,release_year,release_day,Is_Missing,no_budget
0,14000000,6.575393,93.0,2,2015,20,0,0
1,40000000,8.248895,113.0,8,2004,6,0,0
2,3300000,64.29999,105.0,10,2014,10,0,0
3,1200000,3.174936,122.0,3,2012,9,0,0
4,0,1.14807,118.0,2,2009,5,0,1


### Feature Scaling

In [83]:
budget = x_train['budget']

In [86]:
type(budget)

pandas.core.series.Series

In [88]:
budget.reshape(1, -1)

array([[14000000, 40000000,  3300000, ..., 65000000, 42000000, 35000000]])

In [84]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [87]:
# This does the same as the following code (data – data.min()) / (data.max() – data.min())
x = MinMaxScaler().fit_transform(budget)

ValueError: Expected 2D array, got 1D array instead:
array=[14000000. 40000000.  3300000. ... 65000000. 42000000. 35000000.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

# Choosing The Right Model

In [54]:
# Linear Regression Models
from sklearn.linear_model import LinearRegression

In [68]:
lg = LinearRegression()
fitted = lg.fit(x_train, y_train)
fitted.score(x_train, y_train)

0.6182797298380005

In [67]:
# fitted.predict(test)