In [76]:
import ast
import inspect
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
## fixing random for lesson generation
## (seeds NumPy's global random state; train_test_split further down passes its own random_state)
np.random.seed(321)

In [77]:
## Load the gzip-compressed combined-years CSV (path is relative to the notebook's working directory) and preview it
df = pd.read_csv('big_data/combined_years.csv.gz')
df


Unnamed: 0,imdb_id,belongs_to_collection,original_language,budget,popularity,revenue,certification,runtime,genres
0,0,,,,,,,,
1,tt0249516,,,65000000.0,,73706.0,PG-13,87.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam..."
2,tt0285252,,,4000000.0,,0.0,R,100.0,"[{'id': 35, 'name': 'Comedy'}]"
3,tt0293069,,,0.0,,0.0,,86.0,"[{'id': 53, 'name': 'Thriller'}]"
4,tt0337692,,,25000000.0,,8784318.0,R,137.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '..."
...,...,...,...,...,...,...,...,...,...
78307,tt9895024,,de,0.0,1.138,0.0,,118.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name..."
78308,tt9896876,,en,0.0,4.059,0.0,PG-13,101.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
78309,tt9898844,,en,0.0,198.016,0.0,,91.0,"[{'id': 27, 'name': 'Horror'}]"
78310,tt9900940,,en,0.0,3.102,0.0,,87.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name..."


In [78]:
# Discard placeholder rows whose imdb_id is the literal string '0'
has_real_id = df['imdb_id'].ne('0')
df = df[has_real_id]
df

Unnamed: 0,imdb_id,belongs_to_collection,original_language,budget,popularity,revenue,certification,runtime,genres
1,tt0249516,,,65000000.0,,73706.0,PG-13,87.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam..."
2,tt0285252,,,4000000.0,,0.0,R,100.0,"[{'id': 35, 'name': 'Comedy'}]"
3,tt0293069,,,0.0,,0.0,,86.0,"[{'id': 53, 'name': 'Thriller'}]"
4,tt0337692,,,25000000.0,,8784318.0,R,137.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '..."
5,tt0383010,,,30000000.0,,54819301.0,PG,92.0,"[{'id': 35, 'name': 'Comedy'}]"
...,...,...,...,...,...,...,...,...,...
78307,tt9895024,,de,0.0,1.138,0.0,,118.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name..."
78308,tt9896876,,en,0.0,4.059,0.0,PG-13,101.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
78309,tt9898844,,en,0.0,198.016,0.0,,91.0,"[{'id': 27, 'name': 'Horror'}]"
78310,tt9900940,,en,0.0,3.102,0.0,,87.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name..."


In [79]:
## Renumber the rows 0..n-1 after the filter above; drop=True discards the old index instead of keeping it as a column
df = df.reset_index(drop=True)
df

Unnamed: 0,imdb_id,belongs_to_collection,original_language,budget,popularity,revenue,certification,runtime,genres
0,tt0249516,,,65000000.0,,73706.0,PG-13,87.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam..."
1,tt0285252,,,4000000.0,,0.0,R,100.0,"[{'id': 35, 'name': 'Comedy'}]"
2,tt0293069,,,0.0,,0.0,,86.0,"[{'id': 53, 'name': 'Thriller'}]"
3,tt0337692,,,25000000.0,,8784318.0,R,137.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '..."
4,tt0383010,,,30000000.0,,54819301.0,PG,92.0,"[{'id': 35, 'name': 'Comedy'}]"
...,...,...,...,...,...,...,...,...,...
78287,tt9895024,,de,0.0,1.138,0.0,,118.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name..."
78288,tt9896876,,en,0.0,4.059,0.0,PG-13,101.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
78289,tt9898844,,en,0.0,198.016,0.0,,91.0,"[{'id': 27, 'name': 'Horror'}]"
78290,tt9900940,,en,0.0,3.102,0.0,,87.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name..."


In [80]:

## Use .notna() to get True if it belongs to a collection.
## Guarded by a dtype check so that re-running this cell is a no-op: once the
## column is boolean it has no missing values, and a second .notna() would
## flip every row to True.
if not pd.api.types.is_bool_dtype(df['belongs_to_collection']):
    df['belongs_to_collection'] = df['belongs_to_collection'].notna()
df['belongs_to_collection'].value_counts()

False    75974
True      2318
Name: belongs_to_collection, dtype: int64

In [81]:
## Preview the first rows; belongs_to_collection should now hold booleans
df.head()

Unnamed: 0,imdb_id,belongs_to_collection,original_language,budget,popularity,revenue,certification,runtime,genres
0,tt0249516,False,,65000000.0,,73706.0,PG-13,87.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam..."
1,tt0285252,False,,4000000.0,,0.0,R,100.0,"[{'id': 35, 'name': 'Comedy'}]"
2,tt0293069,False,,0.0,,0.0,,86.0,"[{'id': 53, 'name': 'Thriller'}]"
3,tt0337692,False,,25000000.0,,8784318.0,R,137.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '..."
4,tt0383010,False,,30000000.0,,54819301.0,PG,92.0,"[{'id': 35, 'name': 'Comedy'}]"


In [82]:
## (rows, columns) after removing the placeholder ids
df.shape

(78292, 9)

In [83]:
#turn genre into lists

def get_genre(x):
    """Extract the genre names from one raw ``genres`` cell.

    Parameters
    ----------
    x : str
        String form of a list of genre dicts, e.g.
        "[{'id': 28, 'name': 'Action'}, {'id': 16, 'name': 'Animation'}]".
        A missing cell (None or float NaN) is also accepted.

    Returns
    -------
    list of str
        The ``'name'`` of every genre in its original order; an empty list
        when the cell is missing.

    Raises
    ------
    ValueError
        If the string is neither a valid Python literal nor valid JSON
        (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # A missing cell arrives as None or as a float NaN (NaN != NaN), not a string.
    if x is None or (isinstance(x, float) and x != x):
        return []

    try:
        # The cell is written with single quotes, i.e. it is a Python literal.
        # Parsing it as one avoids the quote-swapping trick, which corrupts
        # any value that itself contains an apostrophe.
        records = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        # JSON-only tokens (null/true/false) are not Python literals, so fall
        # back to the original quote-swap + JSON route for such input.
        records = json.loads(x.replace("'", '"'))

    return [record['name'] for record in records]

In [84]:
# Parse every raw genres string into a list of genre names
df['genres_list'] = df['genres'].map(get_genre)

# One row per (movie, genre) pair; used below to collect the distinct genre names
df_explode = df.explode(column='genres_list')
df_explode

Unnamed: 0,imdb_id,belongs_to_collection,original_language,budget,popularity,revenue,certification,runtime,genres,genres_list
0,tt0249516,False,,65000000.0,,73706.0,PG-13,87.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...",Action
0,tt0249516,False,,65000000.0,,73706.0,PG-13,87.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...",Animation
0,tt0249516,False,,65000000.0,,73706.0,PG-13,87.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...",Comedy
0,tt0249516,False,,65000000.0,,73706.0,PG-13,87.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...",Family
1,tt0285252,False,,4000000.0,,0.0,R,100.0,"[{'id': 35, 'name': 'Comedy'}]",Comedy
...,...,...,...,...,...,...,...,...,...,...
78290,tt9900940,False,en,0.0,3.102,0.0,,87.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",Drama
78290,tt9900940,False,en,0.0,3.102,0.0,,87.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",Thriller
78291,tt9907782,False,en,0.0,19.292,4588389.0,R,111.0,"[{'id': 27, 'name': 'Horror'}, {'id': 14, 'nam...",Horror
78291,tt9907782,False,en,0.0,19.292,4588389.0,R,111.0,"[{'id': 27, 'name': 'Horror'}, {'id': 14, 'nam...",Fantasy


In [85]:
## Collect the distinct genre names (movies with no genres explode to NaN, hence the dropna)
genre_names = df_explode['genres_list'].dropna()
unique_genres = pd.unique(genre_names)
unique_genres

array(['Action', 'Animation', 'Comedy', 'Family', 'Thriller', 'Adventure',
       'Drama', 'Science Fiction', 'Crime', 'Horror', 'History', 'War',
       'Romance', 'Western', 'Fantasy', 'Mystery', 'Music', 'Documentary',
       'TV Movie'], dtype=object)

In [86]:
## Manually One-Hot-Encode Genres
## Test exact membership in the parsed genres_list instead of substring-matching
## the raw string: a substring search would also fire when one genre name is
## contained in another, and it would yield NaN (not a bool) for a missing cell.
for genre in unique_genres:
    df[f"Genre_{genre}"] = [genre in names for names in df['genres_list']]
df

Unnamed: 0,imdb_id,belongs_to_collection,original_language,budget,popularity,revenue,certification,runtime,genres,genres_list,...,Genre_Horror,Genre_History,Genre_War,Genre_Romance,Genre_Western,Genre_Fantasy,Genre_Mystery,Genre_Music,Genre_Documentary,Genre_TV Movie
0,tt0249516,False,,65000000.0,,73706.0,PG-13,87.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...","[Action, Animation, Comedy, Family]",...,False,False,False,False,False,False,False,False,False,False
1,tt0285252,False,,4000000.0,,0.0,R,100.0,"[{'id': 35, 'name': 'Comedy'}]",[Comedy],...,False,False,False,False,False,False,False,False,False,False
2,tt0293069,False,,0.0,,0.0,,86.0,"[{'id': 53, 'name': 'Thriller'}]",[Thriller],...,False,False,False,False,False,False,False,False,False,False
3,tt0337692,False,,25000000.0,,8784318.0,R,137.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...","[Adventure, Drama]",...,False,False,False,False,False,False,False,False,False,False
4,tt0383010,False,,30000000.0,,54819301.0,PG,92.0,"[{'id': 35, 'name': 'Comedy'}]",[Comedy],...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78287,tt9895024,False,de,0.0,1.138,0.0,,118.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...","[Drama, Comedy]",...,False,False,False,False,False,False,False,False,False,False
78288,tt9896876,False,en,0.0,4.059,0.0,PG-13,101.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[Comedy, Drama, Romance]",...,False,False,False,True,False,False,False,False,False,False
78289,tt9898844,False,en,0.0,198.016,0.0,,91.0,"[{'id': 27, 'name': 'Horror'}]",[Horror],...,True,False,False,False,False,False,False,False,False,False
78290,tt9900940,False,en,0.0,3.102,0.0,,87.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...","[Crime, Drama, Thriller]",...,False,False,False,False,False,False,False,False,False,False


In [87]:
## The Genre_* indicator columns replace both the raw string and the parsed list
df = df.drop(['genres', 'genres_list'], axis=1)
df

Unnamed: 0,imdb_id,belongs_to_collection,original_language,budget,popularity,revenue,certification,runtime,Genre_Action,Genre_Animation,...,Genre_Horror,Genre_History,Genre_War,Genre_Romance,Genre_Western,Genre_Fantasy,Genre_Mystery,Genre_Music,Genre_Documentary,Genre_TV Movie
0,tt0249516,False,,65000000.0,,73706.0,PG-13,87.0,True,True,...,False,False,False,False,False,False,False,False,False,False
1,tt0285252,False,,4000000.0,,0.0,R,100.0,False,False,...,False,False,False,False,False,False,False,False,False,False
2,tt0293069,False,,0.0,,0.0,,86.0,False,False,...,False,False,False,False,False,False,False,False,False,False
3,tt0337692,False,,25000000.0,,8784318.0,R,137.0,False,False,...,False,False,False,False,False,False,False,False,False,False
4,tt0383010,False,,30000000.0,,54819301.0,PG,92.0,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78287,tt9895024,False,de,0.0,1.138,0.0,,118.0,False,False,...,False,False,False,False,False,False,False,False,False,False
78288,tt9896876,False,en,0.0,4.059,0.0,PG-13,101.0,False,False,...,False,False,False,True,False,False,False,False,False,False
78289,tt9898844,False,en,0.0,198.016,0.0,,91.0,False,False,...,True,False,False,False,False,False,False,False,False,False
78290,tt9900940,False,en,0.0,3.102,0.0,,87.0,False,False,...,False,False,False,False,False,False,False,False,False,False


In [88]:
## Checking Certification values (dropna=False so missing certifications are counted too)
df['certification'].value_counts(dropna=False)

NaN        61912
R           5970
NR          4780
PG-13       3340
PG          1560
G            510
NC-17        206
Unrated        8
R              2
PG-13          2
10             2
Name: certification, dtype: int64

In [89]:
# fix extra space certs: the counts above list 'R' and 'PG-13' twice, so strip
# surrounding whitespace to merge the duplicated labels
df['certification'] = df['certification'].str.strip()

In [90]:
## Normalise the remaining odd certification labels:
##   'Unrated' is folded into 'NR'; '10' is treated as missing.
repl_cert = {'Unrated': 'NR', '10': np.nan}
cert_fixed = df['certification'].replace(repl_cert)
df['certification'] = cert_fixed
df['certification'].value_counts(dropna=False)

NaN      61914
R         5972
NR        4788
PG-13     3342
PG        1560
G          510
NC-17      206
Name: certification, dtype: int64

In [91]:
#original language: distribution of values, including missing ones
df['original_language'].value_counts(dropna=False)

NaN    39146
en     22738
ja      1717
es      1690
fr      1624
       ...  
fo         1
mi         1
nb         1
cr         1
sh         1
Name: original_language, Length: 116, dtype: int64

In [92]:
## Drop original_language — presumably because half of the rows (39146 of 78292) have no value; confirm intent
df = df.drop(columns='original_language')

In [93]:
## Preview the cleaned frame before modelling
df.head()

Unnamed: 0,imdb_id,belongs_to_collection,budget,popularity,revenue,certification,runtime,Genre_Action,Genre_Animation,Genre_Comedy,...,Genre_Horror,Genre_History,Genre_War,Genre_Romance,Genre_Western,Genre_Fantasy,Genre_Mystery,Genre_Music,Genre_Documentary,Genre_TV Movie
0,tt0249516,False,65000000.0,,73706.0,PG-13,87.0,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,tt0285252,False,4000000.0,,0.0,R,100.0,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,tt0293069,False,0.0,,0.0,,86.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,tt0337692,False,25000000.0,,8784318.0,R,137.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,tt0383010,False,30000000.0,,54819301.0,PG,92.0,False,False,True,...,False,False,False,False,False,False,False,False,False,False


### Preparing Features for the Model

In [94]:

## Modelling frame: everything except the imdb_id identifier column
df_model= df.drop(columns='imdb_id')

In [95]:
## Separate the target (revenue) from the features
target_col = 'revenue'
X = df_model.drop(columns=target_col)
y = df_model[target_col]

## Hold out a test split; random_state pins the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.head()

Unnamed: 0,belongs_to_collection,budget,popularity,certification,runtime,Genre_Action,Genre_Animation,Genre_Comedy,Genre_Family,Genre_Thriller,...,Genre_Horror,Genre_History,Genre_War,Genre_Romance,Genre_Western,Genre_Fantasy,Genre_Mystery,Genre_Music,Genre_Documentary,Genre_TV Movie
57309,True,175000000.0,135.24,PG-13,133.0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
15696,False,0.0,,,79.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
75059,False,0.0,3.342,,85.0,False,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
11728,False,0.0,,,86.0,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
5849,False,0.0,,G,75.0,False,True,False,True,False,...,False,False,False,False,False,True,False,False,False,False


In [96]:
## Count missing values per feature in the training split
X_train.isna().sum()

belongs_to_collection        0
budget                       0
popularity               29341
certification            46553
runtime                      0
Genre_Action                 0
Genre_Animation              0
Genre_Comedy                 0
Genre_Family                 0
Genre_Thriller               0
Genre_Adventure              0
Genre_Drama                  0
Genre_Science Fiction        0
Genre_Crime                  0
Genre_Horror                 0
Genre_History                0
Genre_War                    0
Genre_Romance                0
Genre_Western                0
Genre_Fantasy                0
Genre_Mystery                0
Genre_Music                  0
Genre_Documentary            0
Genre_TV Movie               0
dtype: int64

In [97]:
## make cat selector and using it to save list of column names
## (the selector is a callable: applied to a frame it returns the names of the object-dtype columns)
cat_select = make_column_selector(dtype_include='object')
cat_cols = cat_select(X_train)
cat_cols

['certification']

In [98]:
## make num selector and using it to save list of column names
## (the boolean indicator columns are not picked up here — see the output below)
num_select = make_column_selector(dtype_include='number')
num_cols = num_select(X_train)
num_cols

['budget', 'popularity', 'runtime']

In [99]:
## select manually OHE cols for later
## (must run while these columns are still bool dtype, i.e. before they are cast to int)
bool_select = make_column_selector(dtype_include='bool')
already_ohe_cols = bool_select(X_train)
already_ohe_cols

['belongs_to_collection',
 'Genre_Action',
 'Genre_Animation',
 'Genre_Comedy',
 'Genre_Family',
 'Genre_Thriller',
 'Genre_Adventure',
 'Genre_Drama',
 'Genre_Science Fiction',
 'Genre_Crime',
 'Genre_Horror',
 'Genre_History',
 'Genre_War',
 'Genre_Romance',
 'Genre_Western',
 'Genre_Fantasy',
 'Genre_Mystery',
 'Genre_Music',
 'Genre_Documentary',
 'Genre_TV Movie']

In [100]:
## convert manual ohe to int
## astype with a per-column mapping returns new frames, so nothing is assigned
## into the frames handed back by train_test_split (which pandas can flag with
## a SettingWithCopyWarning). Column order is unchanged.
ohe_int_dtypes = {col: int for col in already_ohe_cols}
X_train = X_train.astype(ohe_int_dtypes)
X_test = X_test.astype(ohe_int_dtypes)

In [101]:
## make pipelines
## The keyword that requests a dense array from OneHotEncoder was renamed from
## `sparse` to `sparse_output` in scikit-learn 1.2, and `sparse` was later removed.
## Pick whichever one the installed version accepts so the cell runs on both.
if 'sparse_output' in inspect.signature(OneHotEncoder).parameters:
    dense_ohe_kwargs = {'sparse_output': False}
else:
    dense_ohe_kwargs = {'sparse': False}

## Categorical: label missing values explicitly, then one-hot encode.
## NOTE(review): every certification level (including 'MISSING') gets its own
## column, so these columns always sum to 1. Once a constant is added for
## statsmodels the design matrix is rank-deficient and the individual
## certification coefficients are not identifiable — consider drop='first'.
cat_pipe = make_pipeline(SimpleImputer(strategy='constant',
                                       fill_value='MISSING'),
                         OneHotEncoder(handle_unknown='ignore', **dense_ohe_kwargs))

## Numeric: fill missing values with the training mean (no scaling).
num_pipe = make_pipeline(SimpleImputer(strategy='mean'))

## The already-encoded indicator columns pass through untouched.
## Output column order follows the order of these tuples.
preprocessor = make_column_transformer((cat_pipe, cat_cols),
                                       (num_pipe, num_cols),
                                       ('passthrough', already_ohe_cols))
preprocessor

In [102]:
## fit the col transformer (on the training split only, so no test-set statistics leak into the imputers/encoder)
preprocessor.fit(X_train)

## Finding the categorical pipeline in our col transformer.
## 'pipeline-1' is the name make_column_transformer auto-assigned to the first pipeline (cat_pipe).
preprocessor.named_transformers_['pipeline-1']

In [103]:
## Index the last step of the categorical pipeline ([-1] is the fitted OneHotEncoder)
## and ask it for the names of the columns it generates.
cat_features = preprocessor.named_transformers_['pipeline-1'][-1].get_feature_names_out(cat_cols)


## Assemble the output column names in the same order the column transformer
## emits them: encoded categoricals, then numerics, then the passthrough indicators.
final_features = [*cat_features,*num_cols,*already_ohe_cols]
len(final_features)

30

In [104]:
## Sanity check: the number of transformed columns should equal len(final_features)
preprocessor.transform(X_train).shape

(58719, 30)

In [105]:
## Wrap the transformed training array in a DataFrame, restoring column names and the original row index
train_matrix = preprocessor.transform(X_train)
X_train_tf = pd.DataFrame(train_matrix,
                          index=X_train.index,
                          columns=final_features)
X_train_tf.head()

Unnamed: 0,certification_G,certification_MISSING,certification_NC-17,certification_NR,certification_PG,certification_PG-13,certification_R,budget,popularity,runtime,...,Genre_Horror,Genre_History,Genre_War,Genre_Romance,Genre_Western,Genre_Fantasy,Genre_Mystery,Genre_Music,Genre_Documentary,Genre_TV Movie
57309,0.0,0.0,0.0,0.0,0.0,1.0,0.0,175000000.0,135.24,133.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15696,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.819979,79.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75059,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.342,85.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11728,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.819979,86.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5849,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.819979,75.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [106]:
## Same wrapping for the test split, using the preprocessor fitted on the training data
test_matrix = preprocessor.transform(X_test)
X_test_tf = pd.DataFrame(test_matrix,
                         index=X_test.index,
                         columns=final_features)
X_test_tf.head()

Unnamed: 0,certification_G,certification_MISSING,certification_NC-17,certification_NR,certification_PG,certification_PG-13,certification_R,budget,popularity,runtime,...,Genre_Horror,Genre_History,Genre_War,Genre_Romance,Genre_Western,Genre_Fantasy,Genre_Mystery,Genre_Music,Genre_Documentary,Genre_TV Movie
77135,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.167,119.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
51496,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.965,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29154,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.819979,75.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9687,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.819979,65.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51594,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.943,79.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Adding a Constant for Statsmodels

In [107]:
## Make final X_train_df and X_test_df with constants added.
## has_constant='add' forces the intercept column onto both frames; with the
## default ('skip') statsmodels leaves it out whenever a frame already contains
## a constant column, which could give the train and test designs different columns.
X_train_df = sm.add_constant(X_train_tf, prepend=False, has_constant='add')
X_test_df = sm.add_constant(X_test_tf, prepend=False, has_constant='add')
display(X_train_df.head(2),X_test_df.head(2))

Unnamed: 0,certification_G,certification_MISSING,certification_NC-17,certification_NR,certification_PG,certification_PG-13,certification_R,budget,popularity,runtime,...,Genre_History,Genre_War,Genre_Romance,Genre_Western,Genre_Fantasy,Genre_Mystery,Genre_Music,Genre_Documentary,Genre_TV Movie,const
57309,0.0,0.0,0.0,0.0,0.0,1.0,0.0,175000000.0,135.24,133.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
15696,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.819979,79.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Unnamed: 0,certification_G,certification_MISSING,certification_NC-17,certification_NR,certification_PG,certification_PG-13,certification_R,budget,popularity,runtime,...,Genre_History,Genre_War,Genre_Romance,Genre_Western,Genre_Fantasy,Genre_Mystery,Genre_Music,Genre_Documentary,Genre_TV Movie,const
77135,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.167,119.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
51496,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.965,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Modeling

In [108]:
## instantiate an OLS model WITH the training data.
## (statsmodels takes the target first, then the design matrix, which already carries the 'const' column)
model = sm.OLS(y_train, X_train_df)

## Fit the model and view the summary
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,revenue,R-squared:,0.633
Model:,OLS,Adj. R-squared:,0.633
Method:,Least Squares,F-statistic:,3495.0
Date:,"Tue, 08 Nov 2022",Prob (F-statistic):,0.0
Time:,10:00:19,Log-Likelihood:,-1105900.0
No. Observations:,58719,AIC:,2212000.0
Df Residuals:,58689,BIC:,2212000.0
Df Model:,29,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
certification_G,-3.281e+06,1.72e+06,-1.905,0.057,-6.66e+06,9.53e+04
certification_MISSING,1.608e+06,5.23e+05,3.076,0.002,5.83e+05,2.63e+06
certification_NC-17,7.84e+05,2.66e+06,0.295,0.768,-4.43e+06,6e+06
certification_NR,1.156e+06,7.36e+05,1.572,0.116,-2.86e+05,2.6e+06
certification_PG,2.13e+06,1.1e+06,1.940,0.052,-2.24e+04,4.28e+06
certification_PG-13,-1.097e+06,8.37e+05,-1.310,0.190,-2.74e+06,5.44e+05
certification_R,-4.644e+06,7.04e+05,-6.596,0.000,-6.02e+06,-3.26e+06
budget,3.1773,0.012,270.843,0.000,3.154,3.200
popularity,1.799e+05,8125.485,22.141,0.000,1.64e+05,1.96e+05

0,1,2,3
Omnibus:,99298.287,Durbin-Watson:,2.014
Prob(Omnibus):,0.0,Jarque-Bera (JB):,404895069.463
Skew:,10.999,Prob(JB):,0.0
Kurtosis:,409.211,Cond. No.,2.87e+16


In [109]:
## Predict on both splits with the fitted OLS result
y_hat_train = result.predict(X_train_df)
y_hat_test = result.predict(X_test_df)

## Score with sklearn's r2_score; the training value should match the
## R-squared reported in the OLS summary
print(f'Training R^2: {r2_score(y_train, y_hat_train):.3f}')
print(f'Testing R^2: {r2_score(y_test, y_hat_test):.3f}')

Training R^2: 0.633
Testing R^2: 0.651
