In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

from math import sqrt
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error,mean_squared_log_error
from sklearn.model_selection import cross_validate

In [37]:
RANDOM_STATE=27

train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

test_id_col = test_df['Id']
n_train = train_df.shape[0]
n_test = test_df.shape[0]

print(f"Train rows: {n_train}, Test rows: {n_test}")
train_df

Train rows: 1460, Test rows: 1459


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [38]:
# drop train rows without target value
train_df.dropna(subset = ['SalePrice'], inplace=True)

# keep target column from train and keep it in variable
target = train_df['SalePrice']
train_df.drop(columns=['SalePrice'], inplace=True)

# Concat train and test to common prepearing 
union_df = pd.concat([train_df, test_df]).reset_index(drop=True)

In [39]:
X = train_df
y = np.log1p(target)

X_test = test_df

In [40]:
need_make_cross_score=True

if need_make_cross_score: 
    model_for_cross_val = DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE)

    scores = cross_validate(model_for_cross_val, X, y,
                            n_jobs=4, cv=5,
                            scoring=('r2', 'neg_mean_squared_error'),
                            return_train_score=True)

    rmse = abs(scores['test_neg_mean_squared_error'].mean())
    
    score_train_rmse = scores['train_neg_mean_squared_error']
    score_train_r2 = scores['train_r2']
    score_test_rmse = scores['test_neg_mean_squared_error']
    score_test_r2 = scores['test_r2']

    print("RMSE train mean: {:10.5f}+-{:.3f}".format(abs(score_train_rmse.mean()), abs(score_train_rmse.std())))
    print("R2 train mean:   {:10.5f}+-{:.3f}".format(abs(score_train_r2.mean()), abs(score_train_r2.std())))
    print("----")
    print("RMSE test mean:  {:10.5f}+-{:.3f}".format(abs(score_test_rmse.mean()), abs(score_test_rmse.std())))
    print("R2 test mean:    {:10.5f}+-{:.3f}".format(abs(score_test_r2.mean()), abs(score_test_r2.std())))

RMSE train mean:    0.19549+-0.009
R2 train mean:      0.22576+-0.036
----
RMSE test mean:     0.19584+-0.017
R2 test mean:       0.23261+-0.077


In [41]:
model_final = DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE)
model_final.fit(X, y)

y_train_pred = model_final.predict(X)

print("RMSE train: {:.5f}".format(sqrt(mean_squared_error(y, y_train_pred))))
print("RMSLE train: {:.5f}".format(sqrt(mean_squared_log_error(y, y_train_pred))))
print("R2 train: {:.5f}".format(r2_score(y, y_train_pred)))

RMSE train: 0.43584
RMSLE train: 0.03330
R2 train: -0.19130


In [42]:
y_test_pred = np.expm1(model_final.predict(X_test))

filename = f"../data/DummyClassifier.csv"

output = pd.DataFrame({'Id': test_id_col, 'SalePrice': y_test_pred})
output.to_csv(filename, index=False)

print("Save resutls to ", filename)

Save resutls to  ../data/DummyClassifier.csv
