In [1]:
%matplotlib inline
import pandas as pd
import re
import seaborn as sns
import csv
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import pickle
import multiprocessing as mp
import numpy as np

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LassoCV, RidgeCV, ElasticNetCV, lars_path

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

from scipy import stats

In [5]:
some_features = pickle.load(open("some_features.pickle", "rb"))
no_nans = pickle.load(open('no_nan.pickle', 'rb'))

In [25]:
X, y = no_nans.drop('domestic_total_gross', axis=1), no_nans['domestic_total_gross']
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state = 11)

In [26]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = .25, random_state = 3)

In [27]:
lm = LinearRegression()
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train.values)
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(X_test.values)


lm_reg = Ridge(alpha=1)
poly = PolynomialFeatures(degree=2)

X_train_poly = poly.fit_transform(X_train.values)
X_val_poly = poly.transform(X_val.values)
X_test_poly = poly.transform(X_test.values)

lm_poly = LinearRegression()

In [28]:
lm.fit(X_train, y_train)
print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.3f}')

lm_reg.fit(X_train_scaled, y_train)
print(f'Ridge Regression val R^2: {lm_reg.score(X_val_scaled, y_val):.3f}')

lm_poly.fit(X_train_poly, y_train)
print(f'Degree 2 polynomial regression val R^2: {lm_poly.score(X_val_poly, y_val): .3f}')

Linear Regression val R^2: 0.452
Ridge Regression val R^2: 0.453
Degree 2 polynomial regression val R^2:  0.413


In [29]:
lm_poly.fit(X, y)
print(f'Polynomial Regression test R^2: {lm_poly.score(X_test, y_test):.3f}')

Polynomial Regression test R^2: 0.699


In [32]:
X, y = no_nans.drop(columns=['domestic_total_gross']), no_nans['domestic_total_gross']
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state = 69)
X, y = np.array(X), np.array(y)

In [34]:
kf = KFold(n_splits=5, shuffle=True, random_state=71)
cv_lm_r2s, cv_lm_reg_r2s = [], []

for train_ind, val_ind in kf.split(X, y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]
    
    lm = LinearRegression()
    lm_reg = Ridge(alpha=1)
    
    lm.fit(X_train, y_train)
    cv_lm_r2s.append(lm.score(X_val, y_val))
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    lm_reg.fit(X_train_scaled, y_train)
    cv_lm_reg_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
print('Simple regression scores: ', cv_lm_r2s)
print('Ridge scores: ', cv_lm_reg_r2s, '\n')

print(f'Simple mean cv R^2: {np.mean(cv_lm_r2s): .3f} += {np.std(cv_lm_r2s): .3f}')
print(f'Ridge mean cv R^2: {np.mean(cv_lm_reg_r2s): .3f} += {np.std(cv_lm_reg_r2s): .3f}')

Simple regression scores:  [0.7751223662084907, 0.6731210945770573, 0.804860307892181, 0.026628753029898644, 0.6283776408709962]
Ridge scores:  [0.774900853780679, 0.6730836574685535, 0.8045110903299021, 0.027959600955594577, 0.6285739169339921] 

Simple mean cv R^2:  0.582 +=  0.285
Ridge mean cv R^2:  0.582 +=  0.284


In [35]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

lm_reg = Ridge(alpha = 1)
lm_reg.fit(X_scaled, y)
print(f'Ridge Regression test R^2: {lm_reg.score(X_test_scaled, y_test): .3f}')

Ridge Regression test R^2:  0.542


In [36]:
lm = LinearRegression()
cross_val_score(lm, X, y, cv=5, scoring='r2')

array([0.5748915 , 0.50370713, 0.75628264, 0.76811653, 0.71966858])

In [None]:
kf = 