In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [2]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which
# look like index columns written out by earlier to_csv calls - confirm. The numeric
# feature slice used later ([2:30]) skips the first two numeric columns.
df = pd.read_csv('../data/cleaned.csv')

In [3]:
# Preview the first rows to sanity-check the load and the column layout.
df.head()

Unnamed: 0.1,Unnamed: 0,Id,PID,MS_SubClass,MS_Zoning,Lot_Frontage,Lot_Area,Street,Alley,Lot_Shape,...,Screen_Porch,Pool_Area,Pool_QC,Fence,Misc_Feature,Misc_Val,Mo_Sold,Yr_Sold,Sale_Type,SalePrice
0,0,109,533352170,60,RL,0.0,13517,Pave,,IR1,...,0,0,,,,0,3,2010,WD,130500
1,1,544,531379050,60,RL,43.0,11492,Pave,,IR1,...,0,0,,,,0,4,2009,WD,220000
2,2,153,535304180,20,RL,68.0,7922,Pave,,Reg,...,0,0,,,,0,1,2010,WD,109000
3,3,318,916386060,60,RL,73.0,9802,Pave,,Reg,...,0,0,,,,0,4,2010,WD,174000
4,4,255,906425045,50,RL,82.0,14235,Pave,,IR1,...,0,0,,,,0,3,2010,WD,138500


In [4]:
# Results table: one row per model/split, one column per recorded metric.
# Rows Model_1 .. Model_5 (each with a "_test" twin) are created empty up front;
# all_the_metrics() fills the cells via .loc, and rows for any other model name
# are added when they are first written.
models_df = pd.DataFrame(
    index=[
        'Model_{}{}'.format(number, suffix)
        for number in range(1, 6)
        for suffix in ('', '_test')
    ],
    columns=[
        'score',
        'explained_variance',
        'mean_absolute_error',
        'mean_squared_error',
        'median_absolute_error',
        'r2',
    ],
)

In [5]:
def all_the_metrics(model_name,score_1,y_t,y_p,score_2,y_t_test,y_p_test):
    """Record train and test metrics for one model in the global ``models_df``.

    Two rows are written: ``model_name`` for the training split and
    ``model_name + "_test"`` for the test split. Each row gets the supplied
    score plus explained variance, mean absolute error, mean squared error,
    median absolute error and R^2, all computed with ``sklearn.metrics``.
    (mean_squared_log_error is not recorded.)

    Parameters
    ----------
    model_name : str
        Row label for the training split; "_test" is appended for the test row.
    score_1 : float
        Value stored as-is in the 'score' column of the training row.
    y_t, y_p : array-like
        True and predicted targets for the training split.
    score_2 : float
        Value stored as-is in the 'score' column of the test row.
    y_t_test, y_p_test : array-like
        True and predicted targets for the test split.

    Returns
    -------
    None
        Results are written into ``models_df`` in place.
    """
    # Column name -> sklearn metric taking (y_true, y_pred).
    metric_table = (
        ('explained_variance', metrics.explained_variance_score),
        ('mean_absolute_error', metrics.mean_absolute_error),
        ('mean_squared_error', metrics.mean_squared_error),
        ('median_absolute_error', metrics.median_absolute_error),
        ('r2', metrics.r2_score),
    )
    # Training row first, then the test row, so a new model's rows are
    # appended to the table in that order.
    splits = (
        (model_name, score_1, y_t, y_p),
        (model_name+"_test", score_2, y_t_test, y_p_test),
    )
    for row_label, split_score, actual, predicted in splits:
        models_df.loc[row_label,'score'] = split_score
        for column, metric_fn in metric_table:
            models_df.loc[row_label,column] = metric_fn(actual,predicted)

### All Numeric Models

I'm going to start with the numeric columns (35 in total, of which the slice below keeps 28 as features), StandardScale them, and then try varying feature selections on them.

In [6]:
# Features: the non-object (numeric) columns at positions 2..29 of that list,
# i.e. the first two numeric columns are skipped and at most 28 are kept.
# NOTE(review): going by the df.head() column order, the slice appears to begin
# at the Id / PID identifier columns - confirm they are meant to be model inputs.
num_features = df.select_dtypes(exclude='object').columns[2:30].tolist()
X = df[num_features]
y = df['SalePrice']

# Hold out 33% of the rows for testing; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Fit the scaler on the training rows only, then apply the same scaling to both splits.
ss = StandardScaler().fit(X_train)
X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

# Single LinearRegression instance that the model cells below refit in turn.
lr = LinearRegression()

#### Model 1 - Using all numeric columns

In [7]:
# Baseline: ordinary least squares on every scaled numeric feature.
lr.fit(X_train_ss, y_train)
all_the_metrics(
    model_name="Model_1",
    score_1=lr.score(X_train_ss, y_train),
    y_t=y_train,
    y_p=lr.predict(X_train_ss),
    score_2=lr.score(X_test_ss, y_test),
    y_t_test=y_test,
    y_p_test=lr.predict(X_test_ss),
)

#### Model 2 - SelectKBest = 9

In [8]:
# Keep the 9 scaled features that score highest under f_regression,
# choosing them on the training split only.
kb = SelectKBest(f_regression, k=9).fit(X_train_ss, y_train)
X_train_kb = kb.transform(X_train_ss)
X_test_kb = kb.transform(X_test_ss)

lr.fit(X_train_kb, y_train)
all_the_metrics(
    model_name="Model_2",
    score_1=lr.score(X_train_kb, y_train),
    y_t=y_train,
    y_p=lr.predict(X_train_kb),
    score_2=lr.score(X_test_kb, y_test),
    y_t_test=y_test,
    y_p_test=lr.predict(X_test_kb),
)

#### Model 3 - SelectKBest = 15

In [9]:
# Keep the 15 scaled features that score highest under f_regression,
# choosing them on the training split only.
kb = SelectKBest(f_regression, k=15).fit(X_train_ss, y_train)
X_train_kb = kb.transform(X_train_ss)
X_test_kb = kb.transform(X_test_ss)

lr.fit(X_train_kb, y_train)
all_the_metrics(
    model_name="Model_3",
    score_1=lr.score(X_train_kb, y_train),
    y_t=y_train,
    y_p=lr.predict(X_train_kb),
    score_2=lr.score(X_test_kb, y_test),
    y_t_test=y_test,
    y_p_test=lr.predict(X_test_kb),
)

#### Model 4 - SelectKBest = 5

In [10]:
# Keep the 5 scaled features that score highest under f_regression,
# choosing them on the training split only.
kb = SelectKBest(f_regression, k=5).fit(X_train_ss, y_train)
X_train_kb = kb.transform(X_train_ss)
X_test_kb = kb.transform(X_test_ss)

lr.fit(X_train_kb, y_train)
all_the_metrics(
    model_name="Model_4",
    score_1=lr.score(X_train_kb, y_train),
    y_t=y_train,
    y_p=lr.predict(X_train_kb),
    score_2=lr.score(X_test_kb, y_test),
    y_t_test=y_test,
    y_p_test=lr.predict(X_test_kb),
)

#### Model 5 - SelectKBest = 21

In [11]:
# Keep the 21 scaled features that score highest under f_regression,
# choosing them on the training split only.
kb = SelectKBest(f_regression, k=21).fit(X_train_ss, y_train)
X_train_kb = kb.transform(X_train_ss)
X_test_kb = kb.transform(X_test_ss)

lr.fit(X_train_kb, y_train)
all_the_metrics(
    model_name="Model_5",
    score_1=lr.score(X_train_kb, y_train),
    y_t=y_train,
    y_p=lr.predict(X_train_kb),
    score_2=lr.score(X_test_kb, y_test),
    y_t_test=y_test,
    y_p_test=lr.predict(X_test_kb),
)

#### Model 6 - SelectKBest = 27

In [13]:
# Keep the 27 scaled features that score highest under f_regression,
# choosing them on the training split only.
kb = SelectKBest(f_regression, k=27).fit(X_train_ss, y_train)
X_train_kb = kb.transform(X_train_ss)
X_test_kb = kb.transform(X_test_ss)

lr.fit(X_train_kb, y_train)
all_the_metrics(
    model_name="Model_6",
    score_1=lr.score(X_train_kb, y_train),
    y_t=y_train,
    y_p=lr.predict(X_train_kb),
    score_2=lr.score(X_test_kb, y_test),
    y_t_test=y_test,
    y_p_test=lr.predict(X_test_kb),
)

### It really seems like having more numeric columns helps. Let's try other feature selection types.

Let's look into adjusting the weight of our coefficients.

#### Model 7 - ElasticNet l1 = .5

In [14]:
# ElasticNet on all scaled features; l1_ratio is left at the estimator's
# default (the heading above gives it as .5).
elnet = ElasticNet(random_state=42).fit(X_train_ss, y_train)

all_the_metrics(
    model_name="Model_7",
    score_1=elnet.score(X_train_ss, y_train),
    y_t=y_train,
    y_p=elnet.predict(X_train_ss),
    score_2=elnet.score(X_test_ss, y_test),
    y_t_test=y_test,
    y_p_test=elnet.predict(X_test_ss),
)

#### Model 8 - ElasticNet l1 = .75

In [15]:
# ElasticNet on all scaled features, with the penalty mix set to l1_ratio=0.75.
elnet = ElasticNet(l1_ratio=0.75, random_state=42).fit(X_train_ss, y_train)

all_the_metrics(
    model_name="Model_8",
    score_1=elnet.score(X_train_ss, y_train),
    y_t=y_train,
    y_p=elnet.predict(X_train_ss),
    score_2=elnet.score(X_test_ss, y_test),
    y_t_test=y_test,
    y_p_test=elnet.predict(X_test_ss),
)

#### Model 9 - ElasticNet l1 = .25

In [16]:
# ElasticNet on all scaled features, with the penalty mix set to l1_ratio=0.25.
elnet = ElasticNet(l1_ratio=0.25, random_state=42).fit(X_train_ss, y_train)

all_the_metrics(
    model_name="Model_9",
    score_1=elnet.score(X_train_ss, y_train),
    y_t=y_train,
    y_p=elnet.predict(X_train_ss),
    score_2=elnet.score(X_test_ss, y_test),
    y_t_test=y_test,
    y_p_test=elnet.predict(X_test_ss),
)

#### Model 10 - ElasticNet l1 = 1

In [17]:
# ElasticNet on all scaled features with l1_ratio=1 (the L1-only end of the mix).
elnet = ElasticNet(l1_ratio=1, random_state=42).fit(X_train_ss, y_train)

all_the_metrics(
    model_name="Model_10",
    score_1=elnet.score(X_train_ss, y_train),
    y_t=y_train,
    y_p=elnet.predict(X_train_ss),
    score_2=elnet.score(X_test_ss, y_test),
    y_t_test=y_test,
    y_p_test=elnet.predict(X_test_ss),
)

#### Model 11 - ElasticNet l1 = 0

In [18]:
# ElasticNet on all scaled features with l1_ratio=0 (the L2-only end of the mix).
elnet = ElasticNet(l1_ratio=0, random_state=42).fit(X_train_ss, y_train)

all_the_metrics(
    model_name="Model_11",
    score_1=elnet.score(X_train_ss, y_train),
    y_t=y_train,
    y_p=elnet.predict(X_train_ss),
    score_2=elnet.score(X_test_ss, y_test),
    y_t_test=y_test,
    y_p_test=elnet.predict(X_test_ss),
)



In [19]:
# Five best rows by R^2; training rows and "_test" rows are ranked together.
models_df.sort_values('r2',ascending=False).head()

Unnamed: 0,score,explained_variance,mean_absolute_error,mean_squared_error,median_absolute_error,r2
Model_6_test,0.869553,0.869683,21506.3,915806000.0,15418.4,0.869553
Model_5_test,0.868634,0.868693,21542.5,922259000.0,15638.6,0.868634
Model_1_test,0.867291,0.867393,21668.8,931689000.0,15504.1,0.867291
Model_10_test,0.867288,0.86739,21668.6,931708000.0,15497.3,0.867288
Model_3_test,0.861654,0.861755,22174.8,971261000.0,15758.7,0.861654


In [20]:
# Same ranking keyed on the 'score' column.
# NOTE(review): 'score' is filled from estimator.score(), which scikit-learn documents
# as R^2 for regressors, so it duplicates the 'r2' column - the output here is
# identical to the 'r2' ranking rather than an independent second metric.
models_df.sort_values('score',ascending=False).head()

Unnamed: 0,score,explained_variance,mean_absolute_error,mean_squared_error,median_absolute_error,r2
Model_6_test,0.869553,0.869683,21506.3,915806000.0,15418.4,0.869553
Model_5_test,0.868634,0.868693,21542.5,922259000.0,15638.6,0.868634
Model_1_test,0.867291,0.867393,21668.8,931689000.0,15504.1,0.867291
Model_10_test,0.867288,0.86739,21668.6,931708000.0,15497.3,0.867288
Model_3_test,0.861654,0.861755,22174.8,971261000.0,15758.7,0.861654


### So here's what I'm seeing. Models 6, 5, 1, and 10 are doing the best by both metrics.

- Model_6_test: SelectKBest = 27 (using the majority of the numeric columns)
- Model_5_test: SelectKBest = 21 (using the majority of the numeric columns)
- Model_1_test: All Numeric (using all of the selected numeric columns)
- Model_10_test: ElasticNet with l1_ratio=1 (Fully Lasso)