In [202]:
import sys
src_path = "../src/"
if src_path not in sys.path:
    sys.path.append(src_path)

from helpers_module import helpers as hlp

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression

from math import sqrt
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error,mean_squared_log_error
from sklearn.model_selection import cross_validate

## Load data

In [203]:
RANDOM_STATE=27

train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

test_id_col = test_df['Id']
n_train = train_df.shape[0]
n_test = test_df.shape[0]

print(f"Train rows: {n_train}, Test rows: {n_test}")
train_df


Train rows: 1460, Test rows: 1459


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


## Common preprocessing

In [204]:
# drop train rows without target value
train_df.dropna(subset = ['SalePrice'], inplace=True)

# keep target column from train and keep it in variable
target = train_df['SalePrice']
train_df.drop(columns=['SalePrice'], inplace=True)

# Concat train and test to common prepearing 
union_df = pd.concat([train_df, test_df]).reset_index(drop=True)

## Prepearing data

Hard drop id column

In [205]:
union_df = union_df.drop('Id', axis=1)

In [206]:
union_df = hlp.drop_str_cols(union_df)

for col in hlp.get_numeric_cols(union_df):
    union_df[col].fillna(value=0, inplace=True)

## Merge data back

In [207]:
train_df = union_df[:n_train]
test_df = union_df[n_train:]

train_df

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,548.0,0,61,0,0,0,0,0,2,2008
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,460.0,298,0,0,0,0,0,0,5,2007
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,608.0,0,42,0,0,0,0,0,9,2008
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,642.0,0,35,272,0,0,0,0,2,2006
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,836.0,192,84,0,0,0,0,0,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,6,5,1999,2000,0.0,0.0,0.0,...,460.0,0,40,0,0,0,0,0,8,2007
1456,20,85.0,13175,6,6,1978,1988,119.0,790.0,163.0,...,500.0,349,0,0,0,0,0,0,2,2010
1457,70,66.0,9042,7,9,1941,2006,0.0,275.0,0.0,...,252.0,0,60,0,0,0,0,2500,5,2010
1458,20,68.0,9717,5,6,1950,1996,0.0,49.0,1029.0,...,240.0,366,0,112,0,0,0,0,4,2010


## Prepearing for model -  get X, X_test and y

In [208]:
X = train_df
y = np.log1p(target)

X_test = test_df

## Cross score check by rmse and r2

In [209]:
need_make_cross_score=True

if need_make_cross_score: 
    model_for_cross_val = LinearRegression()

    scores = cross_validate(model_for_cross_val, X, y,
                            n_jobs=4, cv=5,
                            scoring=('r2', 'neg_mean_squared_error'),
                            return_train_score=True)

    rmse = abs(scores['test_neg_mean_squared_error'].mean())
    
    score_train_rmse = scores['train_neg_mean_squared_error']
    score_train_r2 = scores['train_r2']
    score_test_rmse = scores['test_neg_mean_squared_error']
    score_test_r2 = scores['test_r2']

    print("RMSE train mean: {:10.5f}+-{:.3f}".format(abs(score_train_rmse.mean()), abs(score_train_rmse.std())))
    print("R2 train mean:   {:10.5f}+-{:.3f}".format(abs(score_train_r2.mean()), abs(score_train_r2.std())))
    print("----")
    print("RMSE test mean:  {:10.5f}+-{:.3f}".format(abs(score_test_rmse.mean()), abs(score_test_rmse.std())))
    print("R2 test mean:    {:10.5f}+-{:.3f}".format(abs(score_test_r2.mean()), abs(score_test_r2.std())))

RMSE train mean:    0.02065+-0.002
R2 train mean:      0.87048+-0.010
----
RMSE test mean:     0.02436+-0.009
R2 test mean:       0.84661+-0.056


## Train model

In [210]:
model_final = LinearRegression()
model_final.fit(X, y)

y_train_pred = model_final.predict(X)

print("RMSE train: {:.5f}".format(sqrt(mean_squared_error(y, y_train_pred))))
print("RMSLE train: {:.5f}".format(sqrt(mean_squared_log_error(y, y_train_pred))))
print("R2 train: {:.5f}".format(r2_score(y, y_train_pred)))

RMSE train: 0.14483
RMSLE train: 0.01112
R2 train: 0.86845


## Prediction and save result

In [211]:
y_test_pred = np.expm1(model_final.predict(X_test))

filename = f"../data/SimpleLinearRegression.csv"

output = pd.DataFrame({'Id': test_id_col, 'SalePrice': y_test_pred})
output.to_csv(filename, index=False)

print("Save resutls to ", filename)

Save resutls to  ../data/SimpleLinearRegression.csv
