In [335]:
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib.ticker as ticker
import seaborn as sns
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import r2_score
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from scipy.stats import boxcox
from collections import Counter
import warnings

In [348]:
# loading the imdb_full_info_df and inflation_date
imdb_full_info_df = pd.read_csv('imdb_full_info_df.csv')
inflation_df = pd.read_csv('inflation_data.csv')

In [349]:
# dropping irrelevant columns
imdb_full_info_df.drop(columns=['release_date', 'domestic_gross', 'title', 'cast_1', 'cast_2', 'cast_3', 'movie_url', 'imdb_id', 'director', 'director.1','rating_check'], inplace=True)

In [355]:
# calculating the compounded inflation with base year 2000
for i in range(0, len(inflation_df)):
    if i == 0:
        inflation_df['dollar'][i] = 1
    else:
        inflation_df['dollar'][i] = inflation_df['dollar'][i-1]*(1+inflation_df['inflation'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inflation_df['dollar'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inflation_df['dollar'][i] = inflation_df['dollar'][i-1]*(1+inflation_df['inflation'][i])


In [358]:
# creating a dictionary for year and compounded inflation
inflation = pd.Series(inflation_df.dollar.values, index=inflation_df.year).to_dict()

In [363]:
inflation[2019]

1.4846035042424859

In [344]:
# extracting runtime values
runtime = []
for i in imdb_full_info_df['runtime']:
    runtime.append(int(i[2:][:-2]))
    
imdb_full_info_df['runtime'] = runtime

In [339]:
# one hot encoding for genre (multiple values)
x = imdb_full_info_df['genre'].apply(eval)
x = x.apply(Counter)
x = pd.json_normalize(x)
x = x.replace(np.nan, 0)
x = x.astype('int')

In [340]:
# merging two dataframes
model1_df = pd.concat([imdb_full_info_df, x], axis=1)
model1_df

Unnamed: 0,budget,worldwide_gross,imdb_rating,runtime,genre,Action,Adventure,Drama,Sci-Fi,Fantasy,...,Crime,Western,History,Sport,Horror,Music,War,Biography,Documentary,Short
0,400000000,2797800564,8.4,181,"['Action', 'Adventure', 'Drama', 'Sci-Fi']",1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,379000000,1045713802,6.6,137,"['Action', 'Adventure', 'Fantasy']",1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,365000000,1395316979,7.3,141,"['Action', 'Adventure', 'Sci-Fi']",1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,306000000,2064615817,7.9,138,"['Action', 'Adventure', 'Sci-Fi']",1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,300000000,2048359754,8.5,149,"['Action', 'Adventure', 'Sci-Fi']",1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3309,5000000,5476793,7.5,89,"['Comedy', 'Horror']",0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3310,5000000,861325,6.1,90,"['Crime', 'Thriller', 'War']",0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3311,5000000,584499,7.4,105,"['Comedy', 'Drama', 'Music']",0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3312,5000000,9109597,7.2,102,"['Crime', 'Drama', 'Thriller']",0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [315]:
# one hot encoding for genre (multiple values) - alternative
genre = []
for i in imdb_full_info_df['genre']:
    genre.append((i[2:][:-2]).replace("'","").replace(', ',','))
    
imdb_full_info_df['genre'] = genre

# finding unique values for genre
genre_cat = imdb_full_info_df['genre'].value_counts().reset_index()
genre_cat.head

genre_cat.rename(columns={'index':'genre_name', 'genre':'count'}, inplace=True)

genre_cat['genre_name'] = genre_cat['genre_name'].str.split(',')

df1 = []
for i in range(genre_cat.shape[0]):
    for j in genre_cat['genre_name'][i]:
        df1.append(j)
        
# creating a list of unique genres
list1 = list(set(df1))
list1

# creating a new dataframe with genre columns
df2 = pd.DataFrame(columns=list1)

# merging two dataframes
df3 = pd.concat([imdb_full_info_df, df2], axis=1)
df3

# filling binary values for genre columns
for i in range(0, len(df3)):
    for j in range(0, len(list1)):
        if df3.columns[5+j]in df3['genre'][i]:
            df3[df3.columns[5+j]][i] = 1
        else:
            df3[df3.columns[5+j]][i] = 0

In [277]:
# X, y split
X = df3.drop('worldwide_gross', axis=1)
y = df3['worldwide_gross']

In [281]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 100)

In [282]:
# standardizing the data
std_scaler = StandardScaler().fit(X_train)
X_train_scaled = std_scaler.transform(X_train)

In [283]:
X_test_scaled = std_scaler.transform(X_test)

In [284]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)

LinearRegression()

In [285]:
model.coef_

array([ 1.48606886e+08,  4.68340476e+07,  9.04901343e+06,  4.26395635e+06,
        3.85628204e+03,  1.05685015e+07, -1.91296318e+06, -5.51420546e+06,
        2.13069253e+06,  7.47738759e+05, -3.53057517e+06,  1.15543201e+07,
       -6.36616581e+06, -8.51088006e+06,  2.94787470e+05, -2.15155072e+07,
        1.29329283e+05, -6.90774294e+06,  4.30706245e+06, -1.01092988e+07,
       -7.53272203e+06, -2.55763203e+06, -5.98759908e+06, -9.06090542e+06,
        5.95589466e+05])

In [287]:
model.score(X_train_scaled, y_train)

0.6274430751260955

In [286]:
model.score(X_test_scaled, y_test)

0.6158188013122146

In [288]:
features_importance = pd.DataFrame(data = {
    'Attribute': X_train.columns,
    'Importance': abs(model.coef_)
})
features_importance = features_importance.sort_values(by='Importance', ascending=False)

In [289]:
features_importance

Unnamed: 0,Attribute,Importance
0,budget,148606900.0
1,imdb_rating,46834050.0
15,Drama,21515510.0
11,Horror,11554320.0
5,Adventure,10568500.0
19,Western,10109300.0
23,Action,9060905.0
2,runtime,9049013.0
13,Thriller,8510880.0
20,History,7532722.0


In [299]:
# checking the model prediction
y_pred = model.predict(X_test_scaled)

In [302]:
R2 = r2_score(y_test, y_pred)
R2

0.6158188013122146

In [305]:
Adj_R2 = 1-((1-R2)*(len(y_test)-1)/((len(y_test)-X_test.shape[1]-1)))
Adj_R2

0.6007410462616736

In [304]:
X_test_scaled

array([[ 0.13381509,  0.04459814, -0.93693982, ..., -0.3248147 ,
         1.53759982, -0.53620725],
       [-0.51235066,  1.26977595,  0.74326293, ...,  3.07867843,
        -0.65036428, -0.53620725],
       [-0.637415  ,  0.65718705, -0.12393849, ..., -0.3248147 ,
        -0.65036428,  1.86495055],
       ...,
       [-0.84585556,  1.37187411, -0.17813858, ..., -0.3248147 ,
        -0.65036428, -0.53620725],
       [-0.78332339,  1.06557965,  0.14706195, ..., -0.3248147 ,
        -0.65036428, -0.53620725],
       [-0.66868108, -0.56799077, -0.23233867, ..., -0.3248147 ,
        -0.65036428, -0.53620725]])