# Intro to Multiple Linear Regression

In this notebook, we'll be working with data on sales prices for homes in King County, Washington. This dataset was obtained from https://www.kaggle.com/harlfoxem/housesalesprediction.

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor

In [27]:
train = pd.read_csv('../data/train.csv')

In [28]:
test = pd.read_csv('../data/test.csv')

In [29]:
train.head()

Unnamed: 0,permeability,phi,s,tau,Fss_1,Fss_2,Fss_3,Fss_4,Fss_5,Fss_6,...,Fvv_87,Fvv_88,Fvv_89,Fvv_90,Fvv_91,Fvv_92,Fvv_93,Fvv_94,Fvv_95,Fvv_96
0,1.50168,0.348046,0.066175,1.076254,0.218608,0.016738,0.013973,0.010761,0.007845,0.006058,...,0.123109,0.12342,0.123285,0.124253,0.123834,0.124025,0.123978,0.123162,0.123861,0.123608
1,1.06523,0.372941,0.091209,1.066492,0.303711,0.022067,0.020279,0.015333,0.011599,0.010443,...,0.137641,0.138798,0.138436,0.139147,0.139919,0.139985,0.140295,0.139937,0.140854,0.141176
2,5.89208,0.543737,0.067494,1.044616,0.202395,0.013323,0.017857,0.010725,0.007723,0.007268,...,0.296769,0.296148,0.296531,0.295164,0.295948,0.295024,0.295062,0.294175,0.295345,0.294795
3,2.90822,0.348942,0.051625,1.079729,0.151966,0.006017,0.007843,0.009391,0.006032,0.005419,...,0.123245,0.122754,0.12345,0.122395,0.122467,0.12206,0.122641,0.122185,0.122437,0.122321
4,1.0491,0.359812,0.069486,1.139012,0.217286,0.011861,0.014697,0.009664,0.008409,0.006183,...,0.129377,0.129125,0.128754,0.12866,0.129041,0.128493,0.129539,0.129319,0.12896,0.128993


In [30]:
train_easy = train[['permeability', 'phi', 's', 'tau', 'Fss_1', 'Fsv_1', 'Fvv_1']]

In [None]:
train_easy.head()

In [None]:
train_easy.columns

In [None]:
train_log = train_easy

In [None]:
to_log = ['permeability', 's', 'tau', 'Fss_1', 'Fsv_1']
train_log = train_log.apply(lambda x: np.log(x + 1) if x.name in to_log else x)

In [None]:
train_log.head()

In [None]:
sns.set(style="ticks", color_codes=True)
sns.pairplot(train_log);

In [None]:
sns.heatmap(train_easy.corr());

In [None]:
plt.figure(figsize = (10, 6))
plt.scatter(data=train_log, x='s', y='permeability', alpha=0.03);

In [None]:
X = train_log[['s']]
y = train_log['permeability']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=21)

In [None]:
MSE = {}

for degree in range(1,11):
    
    pipe = Pipeline(steps = [
        ('poly', PolynomialFeatures(degree = degree, include_bias = False)),
        ('linreg', LinearRegression())
    ])
    
    pipe.fit(X_train, y_train)
    
    MSE[degree] = mean_squared_error(y_test, pipe.predict(X_test))
    
MSE

In [None]:
pipe = Pipeline(steps = [
    ('poly', PolynomialFeatures(degree = 2, include_bias = False)),
    ('linreg', LinearRegression())
])

pipe.fit(X, y)

MSE_full = mean_squared_error(y, pipe.predict(X))

In [None]:
x_grid = np.linspace(min(X['s']), max(X['s']), num = 100).reshape(-1,1)

pipe = Pipeline(steps = [
    ('poly', PolynomialFeatures(degree = 2, include_bias = False)),
    ('linreg', LinearRegression())
])

pipe.fit(X, y)

plt.figure(figsize = (10,6))
plt.scatter(X, y, alpha=0.05)
plt.plot(x_grid, pipe.predict(x_grid), color = 'black', linewidth = 2);

In [None]:
pipe['linreg'].intercept_

In [None]:
pipe['linreg'].coef_

In [None]:
linreg = LinearRegression().fit(X_train, y_train)

In [None]:
linreg.coef_

In [None]:
linreg.intercept_

In [None]:
mean_squared_error(y_test, linreg.predict(X_test))

In [None]:
y_pred_dumb = [y_train.mean()]*len(X_test)

In [None]:
mean_squared_error(y_test, y_pred_dumb)

In [None]:
linreg.predict(X_test).shape

In [None]:
#Create an "X_test" using the test data from kaggle
X_test_final = test[['s']]

In [None]:
#Run the regression on this new X_test, assigning it to result
result = linreg.predict(X_test_final)

In [None]:
result

In [None]:
result.shape

In [None]:
#Convert to DataFrame and (in following cells), format to match submission requirement
result_df = pd.DataFrame(result)

In [None]:
result_df = result_df.reset_index()

In [None]:
result_df.columns = ['id', 'permeability']

In [None]:
result_df

In [None]:
#Export, making sure to remove index
result_df.to_csv('scary_submission_mmkay.csv', index=False)

## Add Column Transformations - Starting with only Log

In [43]:
X = train_easy.drop('permeability', axis=1)
y = train_easy['permeability']

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 21)

In [45]:
# Create list of columns that are skewed (we'll later run this through a yeo-johnson Power Transformer)
train_skewed = pd.DataFrame(train_easy.drop('permeability', axis=1).skew())
train_skewed.columns = ['skew']

In [46]:
train_skewed

Unnamed: 0,skew
phi,0.014769
s,1.185875
tau,1.054007
Fss_1,1.880758
Fsv_1,1.188534
Fvv_1,0.024736


In [47]:
min(train_skewed['skew'])

0.014768806876057179

In [48]:
train_skewed = train_skewed[(train_skewed['skew'] > .5) | (train_skewed['skew'] < -0.5)]

In [49]:
train_skewed = list(train_skewed.index.values)

In [50]:
# Create list of all columns (we'll later run this through a Polynomial Features)
train_poly = pd.DataFrame(train_easy.drop('permeability', axis=1))

In [51]:
train_poly = list(train_poly.columns)

In [52]:
# Run linear regression, with a ColumnTransformer to correct skewed predictor variables.
ct = ColumnTransformer(transformers = [
    ('yeo-johnson', PowerTransformer(), train_skewed),
], remainder = 'passthrough')

In [53]:
pipe = Pipeline(steps=[
    ('transform', ct),
    ('inreg', LinearRegression())
])

In [54]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('yeo-johnson',
                                                  PowerTransformer(),
                                                  ['s', 'tau', 'Fss_1',
                                                   'Fsv_1'])])),
                ('inreg', LinearRegression())])

In [55]:
np.sqrt(mean_squared_error(y_test, pipe.predict(X_test)))

4.903110711335676

In [56]:
mean_squared_error(y_test, pipe.predict(X_test))

24.040494647614647

In [57]:
# Add a log value to the target variable
ttr = TransformedTargetRegressor(regressor = pipe, func = np.log, inverse_func = np.exp)

In [58]:
ttr.fit(X_train, y_train)

TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>,
                           regressor=Pipeline(steps=[('transform',
                                                      ColumnTransformer(remainder='passthrough',
                                                                        transformers=[('yeo-johnson',
                                                                                       PowerTransformer(),
                                                                                       ['s',
                                                                                        'tau',
                                                                                        'Fss_1',
                                                                                        'Fsv_1'])])),
                                                     ('inreg',
                                                      LinearRegression())]))

In [59]:
np.sqrt(mean_squared_error(y_test, ttr.predict(X_test)))

1.8354812563687244

In [60]:
mean_squared_error(y_test, ttr.predict(X_test))

3.368991442480911

## Add Polynomial Transformations

In [61]:
# How do we determine which columns to do a Polynomial transformation on??
# How do we determine how many degrees for each column?

In [79]:
# Perform the same ColumnTransformer as last time, but now adding Polynomial Features to all predictor variables.
ct = ColumnTransformer(transformers = [
    ('yeo-johnson', PowerTransformer(), train_skewed),
    ('poly', PolynomialFeatures(
        degree = 2, include_bias = False), train_poly)], remainder = 'passthrough')

pipe = Pipeline(steps = [
    ('transform', ct),
    ('linear', LinearRegression())
])

In [80]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('yeo-johnson',
                                                  PowerTransformer(),
                                                  ['s', 'tau', 'Fss_1',
                                                   'Fsv_1']),
                                                 ('poly',
                                                  PolynomialFeatures(include_bias=False),
                                                  ['phi', 's', 'tau', 'Fss_1',
                                                   'Fsv_1', 'Fvv_1'])])),
                ('linear', LinearRegression())])

In [81]:
np.sqrt(mean_squared_error(y_test, pipe.predict(X_test)))

2.6947514147824507

In [82]:
mean_squared_error(y_test, ttr.predict(X_test))

1.3972550639398646

In [83]:
ttr = TransformedTargetRegressor(regressor = pipe, func = np.log, inverse_func = np.exp)

In [84]:
ttr.fit(X_train, y_train)

TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>,
                           regressor=Pipeline(steps=[('transform',
                                                      ColumnTransformer(remainder='passthrough',
                                                                        transformers=[('yeo-johnson',
                                                                                       PowerTransformer(),
                                                                                       ['s',
                                                                                        'tau',
                                                                                        'Fss_1',
                                                                                        'Fsv_1']),
                                                                                      ('poly',
                                                                         

In [85]:
np.sqrt(mean_squared_error(y_test, ttr.predict(X_test)))

1.162549848221901

In [86]:
mean_squared_error(y_test, ttr.predict(X_test))

1.351522149600765

In [88]:
#Create an "X_test" using the test data from kaggle
X_test_final = test[['phi', 's', 'tau', 'Fss_1', 'Fsv_1', 'Fvv_1']]

In [90]:
#Run the regression on this new X_test, assigning it to result
result = ttr.predict(X_test_final)

In [91]:
result

array([41.50045764,  0.91702175,  5.42796411, ...,  3.94024604,
        3.16254574,  0.08238435])

In [92]:
result.shape

(5000,)

In [93]:
#Convert to DataFrame and (in following cells), format to match submission requirement
result_df = pd.DataFrame(result)

In [94]:
result_df = result_df.reset_index()

In [95]:
result_df.columns = ['id', 'permeability']

In [96]:
result_df

Unnamed: 0,id,permeability
0,0,41.500458
1,1,0.917022
2,2,5.427964
3,3,5.400764
4,4,4.393820
...,...,...
4995,4995,0.576879
4996,4996,1.497834
4997,4997,3.940246
4998,4998,3.162546


In [97]:
#Export, making sure to remove index
result_df.to_csv('submission_parker_21_03_18.csv', index=False)