# Intro to Multiple Linear Regression

In this notebook, we'll be working with data on sales prices for homes in King County, Washington. This dataset was obtained from https://www.kaggle.com/harlfoxem/housesalesprediction.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor

In [2]:
train = pd.read_csv('../data/train.csv')

In [3]:
test = pd.read_csv('../data/test.csv')

In [4]:
train.head()

Unnamed: 0,permeability,phi,s,tau,Fss_1,Fss_2,Fss_3,Fss_4,Fss_5,Fss_6,...,Fvv_87,Fvv_88,Fvv_89,Fvv_90,Fvv_91,Fvv_92,Fvv_93,Fvv_94,Fvv_95,Fvv_96
0,1.50168,0.348046,0.066175,1.076254,0.218608,0.016738,0.013973,0.010761,0.007845,0.006058,...,0.123109,0.12342,0.123285,0.124253,0.123834,0.124025,0.123978,0.123162,0.123861,0.123608
1,1.06523,0.372941,0.091209,1.066492,0.303711,0.022067,0.020279,0.015333,0.011599,0.010443,...,0.137641,0.138798,0.138436,0.139147,0.139919,0.139985,0.140295,0.139937,0.140854,0.141176
2,5.89208,0.543737,0.067494,1.044616,0.202395,0.013323,0.017857,0.010725,0.007723,0.007268,...,0.296769,0.296148,0.296531,0.295164,0.295948,0.295024,0.295062,0.294175,0.295345,0.294795
3,2.90822,0.348942,0.051625,1.079729,0.151966,0.006017,0.007843,0.009391,0.006032,0.005419,...,0.123245,0.122754,0.12345,0.122395,0.122467,0.12206,0.122641,0.122185,0.122437,0.122321
4,1.0491,0.359812,0.069486,1.139012,0.217286,0.011861,0.014697,0.009664,0.008409,0.006183,...,0.129377,0.129125,0.128754,0.12866,0.129041,0.128493,0.129539,0.129319,0.12896,0.128993


## Find columns with highest coefficients

In [5]:
corrs = pd.DataFrame(train.corr()['permeability'][1:] ** 2)

In [6]:
corrs

Unnamed: 0,permeability
phi,0.248081
s,0.243645
tau,0.208658
Fss_1,0.164430
Fss_2,0.142624
...,...
Fvv_92,0.262242
Fvv_93,0.262159
Fvv_94,0.262023
Fvv_95,0.261848


In [7]:
Fss_rows = corrs[corrs.index.str.contains('Fss')]

In [8]:
Fss_rows = Fss_rows.sort_values('permeability', ascending=False).head(20)
Fss_rows

Unnamed: 0,permeability
Fss_3,0.172427
Fss_4,0.166726
Fss_1,0.16443
Fss_5,0.1514
Fss_95,0.145588
Fss_86,0.145514
Fss_96,0.145481
Fss_88,0.14547
Fss_90,0.14547
Fss_83,0.145444


In [9]:
Fss_rows = Fss_rows.index.to_list()
Fss_rows

['Fss_3',
 'Fss_4',
 'Fss_1',
 'Fss_5',
 'Fss_95',
 'Fss_86',
 'Fss_96',
 'Fss_88',
 'Fss_90',
 'Fss_83',
 'Fss_82',
 'Fss_89',
 'Fss_79',
 'Fss_73',
 'Fss_92',
 'Fss_85',
 'Fss_94',
 'Fss_93',
 'Fss_45',
 'Fss_91']

In [10]:
Fsv_rows = corrs[corrs.index.str.contains('Fsv')]
Fsv_rows = Fsv_rows.sort_values('permeability', ascending=False).head(20)
Fsv_rows = Fsv_rows.index.to_list()

In [11]:
Fvv_rows = corrs[corrs.index.str.contains('Fvv')]
Fvv_rows = Fvv_rows.sort_values('permeability', ascending=False).head(20)
Fvv_rows = Fvv_rows.index.to_list()

## Create smaller dataframe with only the desired variables
#### This is not an intelligent approach...it will be replaced with Lasso eventually.

In [12]:
train_columns = ['permeability', 'phi', 's', 'tau'] + Fss_rows + Fsv_rows + Fvv_rows
train_columns_X = ['phi', 's', 'tau'] + Fss_rows + Fsv_rows + Fvv_rows

In [13]:
train_easy = train[train_columns]

In [14]:
train_easy.head()

Unnamed: 0,permeability,phi,s,tau,Fss_3,Fss_4,Fss_1,Fss_5,Fss_95,Fss_86,...,Fvv_7,Fvv_18,Fvv_6,Fvv_19,Fvv_20,Fvv_5,Fvv_21,Fvv_22,Fvv_4,Fvv_23
0,1.50168,0.348046,0.066175,1.076254,0.013973,0.010761,0.218608,0.007845,0.004582,0.004467,...,0.244991,0.136362,0.260208,0.131388,0.126601,0.274696,0.122591,0.119673,0.290951,0.117293
1,1.06523,0.372941,0.091209,1.066492,0.020279,0.015333,0.303711,0.011599,0.007851,0.00827,...,0.236961,0.129262,0.254868,0.126844,0.12547,0.274845,0.124586,0.124901,0.295869,0.124567
2,5.89208,0.543737,0.067494,1.044616,0.017857,0.010725,0.202395,0.007723,0.00429,0.004526,...,0.436792,0.311829,0.452433,0.305588,0.299631,0.468244,0.295003,0.289764,0.484594,0.286323
3,2.90822,0.348942,0.051625,1.079729,0.007843,0.009391,0.151966,0.006032,0.002575,0.002827,...,0.266265,0.165364,0.278829,0.159048,0.1537,0.290655,0.148284,0.143463,0.303257,0.139164
4,1.0491,0.359812,0.069486,1.139012,0.014697,0.009664,0.217286,0.008409,0.004713,0.004604,...,0.250218,0.121196,0.266567,0.115799,0.111502,0.282751,0.108594,0.107502,0.298948,0.106044


## Create Training/Test Sets and Column Lists for Transformations

In [15]:
X = train_easy.drop('permeability', axis=1)
y = train_easy['permeability']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 21)

In [17]:
# Create list of columns that are skewed (we'll later run this through a yeo-johnson Power Transformer)
train_skewed = pd.DataFrame(train_easy.drop('permeability', axis=1).skew())
train_skewed.columns = ['skew']

In [18]:
train_skewed

Unnamed: 0,skew
phi,0.014769
s,1.185875
tau,1.054007
Fss_3,1.882528
Fss_4,2.042620
...,...
Fvv_5,0.062866
Fvv_21,0.272125
Fvv_22,0.275241
Fvv_4,0.054720


In [19]:
min(train_skewed['skew'])

0.014768806876057179

In [20]:
train_skewed = train_skewed[(train_skewed['skew'] > .5) | (train_skewed['skew'] < -0.5)]

In [21]:
train_skewed = list(train_skewed.index.values)

In [22]:
# Create list of all columns (we'll later run this through a Polynomial Features)
train_poly = pd.DataFrame(train_easy.drop('permeability', axis=1))

In [23]:
train_poly = list(train_poly.columns)

## Begin Creating Regression

In [24]:
# How do we determine which columns to do a Polynomial transformation on??
# How do we determine how many degrees for each column?

In [25]:
# Perform the same ColumnTransformer as last time, but now adding Polynomial Features to all predictor variables.
ct = ColumnTransformer(transformers = [
    ('yeo-johnson', PowerTransformer(), train_skewed),
    ('poly', PolynomialFeatures(
        degree = 2, include_bias = False), train_poly)], remainder = 'passthrough')

pipe = Pipeline(steps = [
    ('transform', ct),
    ('scaler', StandardScaler()),
    ('linear', LinearRegression())
])

In [26]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('yeo-johnson',
                                                  PowerTransformer(),
                                                  ['s', 'tau', 'Fss_3', 'Fss_4',
                                                   'Fss_1', 'Fss_5', 'Fss_95',
                                                   'Fss_86', 'Fss_96', 'Fss_88',
                                                   'Fss_90', 'Fss_83', 'Fss_82',
                                                   'Fss_89', 'Fss_79', 'Fss_73',
                                                   'Fss_92', 'Fss_85', 'Fss_94',
                                                   'Fss_93', 'Fss_45', 'Fss_91',
                                                   'Fsv_1', 'Fsv_2', 'Fsv_3',
                                                   'Fsv_4', 'Fsv_5', 'Fsv...
                                                  Polyn

In [27]:
np.sqrt(mean_squared_error(y_test, pipe.predict(X_test)))

1.1838330068099225

In [28]:
mean_squared_error(y_test, pipe.predict(X_test))

1.401460588012622

In [29]:
ttr = TransformedTargetRegressor(regressor = pipe, func = np.log, inverse_func = np.exp)

In [30]:
ttr.fit(X_train, y_train)

TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>,
                           regressor=Pipeline(steps=[('transform',
                                                      ColumnTransformer(remainder='passthrough',
                                                                        transformers=[('yeo-johnson',
                                                                                       PowerTransformer(),
                                                                                       ['s',
                                                                                        'tau',
                                                                                        'Fss_3',
                                                                                        'Fss_4',
                                                                                        'Fss_1',
                                                                         

In [31]:
np.sqrt(mean_squared_error(y_test, ttr.predict(X_test)))

0.9270834481352397

In [32]:
mean_squared_error(y_test, ttr.predict(X_test))

0.8594837198063257

In [33]:
#Create an "X_test" using the test data from kaggle
X_test_final = test[train_columns_X]

In [34]:
#Run the regression on this new X_test, assigning it to result
result = ttr.predict(X_test_final)

In [35]:
result

array([39.981426  ,  0.92365598,  5.44648639, ...,  4.13526507,
        3.27612789,  0.08225299])

In [36]:
result.shape

(5000,)

In [37]:
#Convert to DataFrame and (in following cells), format to match submission requirement
result_df = pd.DataFrame(result)

In [38]:
result_df = result_df.reset_index()

In [39]:
result_df.columns = ['id', 'permeability']

In [40]:
result_df

Unnamed: 0,id,permeability
0,0,39.981426
1,1,0.923656
2,2,5.446486
3,3,5.287280
4,4,4.677603
...,...,...
4995,4995,0.517740
4996,4996,1.512336
4997,4997,4.135265
4998,4998,3.276128


In [277]:
#Export, making sure to remove index
#result_df.to_csv('submission_parker_21_03_20.csv', index=False)