In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the raw dataset into a DataFrame. The path is relative, so
# "sberbank.csv" must be present in the notebook's working directory.
alldata = pd.read_csv("sberbank.csv")

# Convert the timestamp string to an integer representing the year.
def get_year(timestamp):
    """Return the year of a timestamp string as an integer.

    Parameters
    ----------
    timestamp : str
        Text whose first four characters are the year digits,
        e.g. ``"2014-06-30"``.

    Returns
    -------
    int
        The leading four characters parsed as an integer.

    Raises
    ------
    ValueError
        If the first four characters cannot be parsed as an integer.
    """
    year_digits = timestamp[:4]
    return int(year_digits)
# Derive the 'year' column from each row's timestamp string.
alldata['year'] = alldata['timestamp'].apply(get_year)

# Keep the output column (price_doc) plus the 7 input columns,
# and discard every row that has a missing value in any of them.
selected_columns = [
    'price_doc',
    'year', 'full_sq', 'life_sq', 'floor', 'num_room', 'kitch_sq', 'full_all',
]
alldata = alldata[selected_columns].dropna()

# Shuffle all rows; the fixed random_state keeps the ordering reproducible.
alldata_shuffled = alldata.sample(frac=1.0, random_state=0)

# Output: the log of the sales price. Inputs: every remaining column.
Y = alldata_shuffled['price_doc'].apply(np.log)
X = alldata_shuffled.drop(columns='price_doc')

# Hold out 20% of the rows as the test set (seeded for reproducibility).
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

The code cells below try out several regressors and compare their cross-validated scores. The metric is the negative mean squared error (`neg_mean_squared_error`), so scores closer to zero are better. For the best regressor we also tune a hyperparameter to see how far the score can be improved.

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_validate
# Baseline regressor with default settings; its cross-validated score is the
# reference point for the real models evaluated in the following cells.
m1 = DummyRegressor()
cross_validate(estimator=m1, X=Xtrain, y=Ytrain, scoring='neg_mean_squared_error')

{'fit_time': array([0.00189114, 0.00213599, 0.00138235, 0.00137305, 0.00109386]),
 'score_time': array([0.00050831, 0.00032997, 0.00038385, 0.00026655, 0.00026512]),
 'test_score': array([-0.39897319, -0.37113485, -0.38083108, -0.39057156, -0.40475168])}

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares linear regression, scored with cross-validation on
# the training set only.
reg = LinearRegression()
cross_validate(estimator=reg, X=Xtrain, y=Ytrain, scoring='neg_mean_squared_error')


{'fit_time': array([0.01147294, 0.01252604, 0.01252556, 0.01225662, 0.03260326]),
 'score_time': array([0.00759077, 0.00638628, 0.00571728, 0.00623417, 0.00565815]),
 'test_score': array([-0.30222063, -0.32537384, -0.29377903, -0.29296258, -0.29265721])}

In [None]:
from sklearn.linear_model import Ridge
# Ridge regression with regularisation strength alpha=0.5, scored with
# cross-validation on the training set only.
reg = Ridge(alpha=0.5)
cross_validate(estimator=reg, X=Xtrain, y=Ytrain, scoring='neg_mean_squared_error')


{'fit_time': array([0.01300049, 0.0117476 , 0.01122737, 0.02596211, 0.00482178]),
 'score_time': array([0.00507998, 0.00535345, 0.00525331, 0.00498056, 0.00286078]),
 'test_score': array([-0.30222063, -0.32537215, -0.29377867, -0.29296257, -0.29265723])}

In [None]:
from sklearn import linear_model
# Lasso regression with regularisation strength alpha=0.1, scored with
# cross-validation on the training set only.
reg = linear_model.Lasso(alpha=0.1)
cross_validate(estimator=reg, X=Xtrain, y=Ytrain, scoring='neg_mean_squared_error')


{'fit_time': array([0.00780582, 0.0056355 , 0.00527096, 0.00544238, 0.0053246 ]),
 'score_time': array([0.0040915 , 0.00273085, 0.00265765, 0.00312972, 0.00304341]),
 'test_score': array([-0.30610375, -0.29565704, -0.29538912, -0.29693918, -0.29680415])}

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Single decision tree limited to depth 7; random_state is fixed so the
# result is reproducible between runs.
reg = DecisionTreeRegressor(max_depth=7, random_state=0)
cross_validate(estimator=reg, X=Xtrain, y=Ytrain, scoring='neg_mean_squared_error')

{'fit_time': array([0.02644587, 0.03170538, 0.02386308, 0.02276182, 0.0233674 ]),
 'score_time': array([0.00199318, 0.00345159, 0.00390291, 0.00198483, 0.00645828]),
 'test_score': array([-0.29285216, -0.26996091, -0.27846808, -0.29068246, -0.28085178])}

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Mean cross-validated score per candidate depth, in the order tried (7..14).
scores = []

# RandomForestRegressor gives the best score of the models tried, so tune its
# max_depth. Note that after the loop `reg` stays bound to the last model
# built (max_depth=14).
for i in range(7, 15):
    reg = RandomForestRegressor(max_depth=i, random_state=0)
    score = cross_validate(
        estimator=reg, X=Xtrain, y=Ytrain, scoring='neg_mean_squared_error'
    )
    scores.append(np.mean(score['test_score']))
    # Progress marker: the depth that has just been evaluated.
    print(i)
print(scores)

7
8
9
10
11
12
13
14
[-0.2689194348262911, -0.26757810905261775, -0.26636105332193394, -0.2659229822429888, -0.2663712691758204, -0.2669586142977932, -0.26755837751863226, -0.2685012580286584]


The cell above evaluates the RandomForestRegressor for several values of max_depth. From the results we can see that max_depth = 10 gives the best score, i.e. the negative mean squared error closest to zero (-0.2659...).

In [None]:
# Re-score the tuned model. `reg` is rebuilt here with the best depth found by
# the sweep (max_depth=10). Without this, `reg` would still be the last model
# left over from the tuning loop (max_depth=14), and this cell would silently
# evaluate the wrong configuration.
reg = RandomForestRegressor(random_state=0, max_depth=10)
score = cross_validate(reg, Xtrain, Ytrain, scoring='neg_mean_squared_error')
print(np.mean(score['test_score']))

-0.2685012580286584


In [None]:
from sklearn.neural_network import MLPRegressor
# Multi-layer perceptron with a fixed seed and at most 500 training iterations.
# NOTE(review): the inputs are passed in unscaled; that may be why the errors
# are so much larger than for the other models. Consider standardising the
# features before drawing conclusions about this model.
reg = MLPRegressor(max_iter=500, random_state=2)
cross_validate(estimator=reg, X=Xtrain, y=Ytrain, scoring='neg_mean_squared_error')

{'fit_time': array([2.45369554, 3.58236217, 2.65997005, 5.92295527, 3.09123421]),
 'score_time': array([0.00534439, 0.00554371, 0.00542736, 0.00527644, 0.00528073]),
 'test_score': array([ -18.16659949, -273.76033253,  -19.43349626,  -46.93306547,
          -7.78907333])}

RandomForestRegressor achieves the best cross-validation score, so it is trained on the full training set and evaluated on the test set below. Its max_depth is set to 10, the value that scored best in the search above.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Final model: random forest at the tuned depth, fitted on the whole training
# set and scored once on the held-out test set (plain MSE, so lower is better).
regr = RandomForestRegressor(max_depth=10, random_state=0)
regr.fit(X=Xtrain, y=Ytrain)
test_predictions = regr.predict(Xtest)
mean_squared_error(y_true=Ytest, y_pred=test_predictions)

0.2740850405500036

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=42461cb6-6f72-410b-bb93-ced1edaf8704' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>