In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [2]:
train = pd.read_csv('modeling_train.csv')

In [3]:
train['logSalePrice'] = np.log(train['SalePrice'])

In [4]:
train['log_sq_ft'] = np.log(train['total_sq_ft'])

In [5]:
train['log_porch_sf'] = np.log(train['total_porch_sf'] + 1)

In [6]:
train['log_garage_area'] = np.log(train['GarageArea'] + 1)

In [47]:
y = train[['logSalePrice']]
X = train[['log_sq_ft','bathrooms','TotRmsAbvGrd','log_garage_area','log_porch_sf','OverallQual']]
X = pd.get_dummies(X, columns=['OverallQual'], drop_first=True)
X.head()

Unnamed: 0,log_sq_ft,bathrooms,TotRmsAbvGrd,log_garage_area,log_porch_sf,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10
0,7.789869,3.5,8,6.308098,4.127134,0,0,0,0,0,1,0,0,0
1,7.714231,2.5,6,6.133398,5.700444,0,0,0,0,1,0,0,0,0
2,7.728416,3.5,6,6.411818,3.7612,0,0,0,0,0,1,0,0,0
3,7.566828,2.0,7,6.466145,5.7301,0,0,0,0,0,1,0,0,0
4,7.956126,3.5,9,6.729824,5.624018,0,0,0,0,0,0,1,0,0


In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [49]:
model = Lasso(fit_intercept=True)
alpha = np.arange(0.0001, 20, 100)
param_grid = {'alpha': alpha}
gs = GridSearchCV(model, param_grid, cv=5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([0.0001])}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [50]:
pred = gs.predict(X_test)
model_r = r2_score(y_test, pred)
model_mse = mean_squared_error(y_test, pred)
model_rmse = np.sqrt(model_mse)
adjustedr = 1 - (1-model_r)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)

In [51]:
print('Model R Squared: ' + str(model_r))
print('Adjusted R Squared: ' + str(adjustedr))
print('RMSE: ' + str(model_rmse)) 
print('MSE: ' + str(model_mse))

Model R Squared: 0.8235804187453666
Adjusted R Squared: 0.8165236354951813
RMSE: 0.16740160391206935
MSE: 0.02802329699233335


In [12]:
gs.best_params_

{'alpha': 0.0001}

In [52]:
test = pd.read_csv('modeling_test.csv')

In [53]:
test['log_sq_ft'] = np.log(test['total_sq_ft'])
test['log_porch_sf'] = np.log(test['total_porch_sf'] + 1)
test['log_garage_area'] = np.log(test['GarageArea'] + 1)

In [54]:
test_model = test[['log_sq_ft','bathrooms','TotRmsAbvGrd','log_garage_area','log_porch_sf','OverallQual']]
test_model = pd.get_dummies(test_model, columns=['OverallQual'], drop_first=True)

In [55]:
test_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 14 columns):
log_sq_ft          1459 non-null float64
bathrooms          1459 non-null float64
TotRmsAbvGrd       1459 non-null int64
log_garage_area    1459 non-null float64
log_porch_sf       1459 non-null float64
OverallQual_2      1459 non-null uint8
OverallQual_3      1459 non-null uint8
OverallQual_4      1459 non-null uint8
OverallQual_5      1459 non-null uint8
OverallQual_6      1459 non-null uint8
OverallQual_7      1459 non-null uint8
OverallQual_8      1459 non-null uint8
OverallQual_9      1459 non-null uint8
OverallQual_10     1459 non-null uint8
dtypes: float64(4), int64(1), uint8(9)
memory usage: 69.9 KB


In [56]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1095 entries, 1066 to 254
Data columns (total 14 columns):
log_sq_ft          1095 non-null float64
bathrooms          1095 non-null float64
TotRmsAbvGrd       1095 non-null int64
log_garage_area    1095 non-null float64
log_porch_sf       1095 non-null float64
OverallQual_2      1095 non-null uint8
OverallQual_3      1095 non-null uint8
OverallQual_4      1095 non-null uint8
OverallQual_5      1095 non-null uint8
OverallQual_6      1095 non-null uint8
OverallQual_7      1095 non-null uint8
OverallQual_8      1095 non-null uint8
OverallQual_9      1095 non-null uint8
OverallQual_10     1095 non-null uint8
dtypes: float64(4), int64(1), uint8(9)
memory usage: 61.0 KB


In [57]:
test_pred = gs.predict(test_model)

In [58]:
test_pred = np.exp(test_pred)

In [59]:
df_dict = {'Id':np.array(list(test['Id'])), 'SalePrice':test_pred.ravel()}
df_dict

{'Id': array([1461, 1462, 1463, ..., 2917, 2918, 2919]),
 'SalePrice': array([124551.38588631, 161981.08149193, 158692.16214116, ...,
        159213.79303348,  98936.97530559, 233682.2098036 ])}

In [60]:
sol_df = pd.DataFrame.from_dict(df_dict)
sol_df.head()

Unnamed: 0,Id,SalePrice
0,1461,124551.385886
1,1462,161981.081492
2,1463,158692.162141
3,1464,176587.384753
4,1465,212529.727469


In [61]:
sol_df[['Id','SalePrice']].to_csv('lasso1.csv', index=False)

In [62]:
y1 = train[['logSalePrice']]
X1 = train[['log_sq_ft','bathrooms','TotRmsAbvGrd','log_garage_area','log_porch_sf','OverallQual','SaleCondition']]
X1 = pd.get_dummies(X1, columns=['OverallQual','SaleCondition'], drop_first=True)
X1.head()

Unnamed: 0,log_sq_ft,bathrooms,TotRmsAbvGrd,log_garage_area,log_porch_sf,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,7.789869,3.5,8,6.308098,4.127134,0,0,0,0,0,1,0,0,0,0,0,0,1,0
1,7.714231,2.5,6,6.133398,5.700444,0,0,0,0,1,0,0,0,0,0,0,0,1,0
2,7.728416,3.5,6,6.411818,3.7612,0,0,0,0,0,1,0,0,0,0,0,0,1,0
3,7.566828,2.0,7,6.466145,5.7301,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,7.956126,3.5,9,6.729824,5.624018,0,0,0,0,0,0,1,0,0,0,0,0,1,0


In [63]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.25)

In [64]:
model1 = Lasso()
gs1 = GridSearchCV(model, param_grid, cv=5)
gs1.fit(X1_train, y1_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([0.0001])}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [65]:
pred1 = gs1.predict(X1_test)
model_r1 = r2_score(y1_test, pred1)
model_mse1 = mean_squared_error(y1_test, pred1)
model_rmse1 = np.sqrt(model_mse)
adjustedr1 = 1 - (1-model_r1)*(len(y1_test)-1)/(len(y1_test)-X1_test.shape[1]-1)

In [66]:
print('Model R Squared: ' + str(model_r1))
print('Adjusted R Squared: ' + str(adjustedr1))
print('RMSE: ' + str(model_rmse1)) 
print('MSE: ' + str(model_mse1))

Model R Squared: 0.8420546919423587
Adjusted R Squared: 0.8333562546870104
RMSE: 0.16740160391206935
MSE: 0.02869360942790788


In [67]:
test_model1 = test[['log_sq_ft','bathrooms','TotRmsAbvGrd','log_garage_area','log_porch_sf','OverallQual','SaleCondition']]
test_model1 = pd.get_dummies(test_model1, columns=['OverallQual','SaleCondition'], drop_first=True)

In [68]:
test_model1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 19 columns):
log_sq_ft                1459 non-null float64
bathrooms                1459 non-null float64
TotRmsAbvGrd             1459 non-null int64
log_garage_area          1459 non-null float64
log_porch_sf             1459 non-null float64
OverallQual_2            1459 non-null uint8
OverallQual_3            1459 non-null uint8
OverallQual_4            1459 non-null uint8
OverallQual_5            1459 non-null uint8
OverallQual_6            1459 non-null uint8
OverallQual_7            1459 non-null uint8
OverallQual_8            1459 non-null uint8
OverallQual_9            1459 non-null uint8
OverallQual_10           1459 non-null uint8
SaleCondition_AdjLand    1459 non-null uint8
SaleCondition_Alloca     1459 non-null uint8
SaleCondition_Family     1459 non-null uint8
SaleCondition_Normal     1459 non-null uint8
SaleCondition_Partial    1459 non-null uint8
dtypes: float64(4), int64(1), 

In [69]:
X1_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1095 entries, 141 to 1063
Data columns (total 19 columns):
log_sq_ft                1095 non-null float64
bathrooms                1095 non-null float64
TotRmsAbvGrd             1095 non-null int64
log_garage_area          1095 non-null float64
log_porch_sf             1095 non-null float64
OverallQual_2            1095 non-null uint8
OverallQual_3            1095 non-null uint8
OverallQual_4            1095 non-null uint8
OverallQual_5            1095 non-null uint8
OverallQual_6            1095 non-null uint8
OverallQual_7            1095 non-null uint8
OverallQual_8            1095 non-null uint8
OverallQual_9            1095 non-null uint8
OverallQual_10           1095 non-null uint8
SaleCondition_AdjLand    1095 non-null uint8
SaleCondition_Alloca     1095 non-null uint8
SaleCondition_Family     1095 non-null uint8
SaleCondition_Normal     1095 non-null uint8
SaleCondition_Partial    1095 non-null uint8
dtypes: float64(4), int64(1)

In [70]:
test_pred1 = gs1.predict(test_model1)

In [71]:
test_pred1 = np.exp(test_pred1)

In [72]:
df_dict1 = {'Id':np.array(list(test['Id'])), 'SalePrice':test_pred1.ravel()}
df_dict1

{'Id': array([1461, 1462, 1463, ..., 2917, 2918, 2919]),
 'SalePrice': array([126581.56633226, 164475.74152026, 161104.53332045, ...,
        146960.75803781,  99443.77552906, 231790.60019687])}

In [73]:
sol_df1 = pd.DataFrame.from_dict(df_dict1)
sol_df1.head()

Unnamed: 0,Id,SalePrice
0,1461,126581.566332
1,1462,164475.74152
2,1463,161104.53332
3,1464,177410.932731
4,1465,203785.508614


In [74]:
sol_df1[['Id','SalePrice']].to_csv('lasso2.csv', index=False)