In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston

### Task 1

In [9]:
boston = load_boston()

In [10]:
data = boston['data']
feature_names = boston['feature_names']
target = boston['target']

#### Make dataframes

In [11]:
X = pd.DataFrame(data, columns=feature_names)
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [12]:
y = pd.DataFrame(target, columns=['price'])
y.head()

Unnamed: 0,price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


#### Split dataframes to test and train ones

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Train linear regression model

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
lr = LinearRegression()

In [17]:
lr.fit(X_train, y_train)

LinearRegression()

In [18]:
y_pred = lr.predict(X_test)

In [19]:
check_test = pd.DataFrame({
    'y_test': y_test['price'],
    'y_pred': y_pred.flatten(),
})

In [20]:
check_test['error'] = check_test['y_pred'] - check_test['y_test']
check_test

Unnamed: 0,y_test,y_pred,error
173,23.6,28.648960,5.048960
274,32.4,36.495014,4.095014
491,13.6,15.411193,1.811193
72,22.8,25.403213,2.603213
452,16.1,18.855280,2.755280
...,...,...,...
441,17.1,17.403672,0.303672
23,14.5,13.385941,-1.114059
225,50.0,39.983425,-10.016575
433,14.3,16.682863,2.382863


In [21]:
from sklearn.metrics import r2_score

In [43]:
R2 = r2_score(y_pred, y_test)
R2

0.6693702691495628

### Task 2

In [108]:
from sklearn.ensemble import RandomForestRegressor

In [109]:
model = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42) 

In [110]:
model.fit(X_train, y_train.values[:, 0])

RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)

In [98]:
y_pred_model = model.predict(X_test)

In [99]:
check_test_model = pd.DataFrame({
    'y_test': y_test['price'],
    'y_pred_model': y_pred_model.flatten(),
})

In [100]:
check_test_model['error'] = check_test_model['y_pred_model'] - check_test['y_test']
check_test_model

Unnamed: 0,y_test,y_pred_model,error
173,23.6,22.806412,-0.793588
274,32.4,31.131464,-1.268536
491,13.6,16.339125,2.739125
72,22.8,23.810726,1.010726
452,16.1,17.139521,1.039521
...,...,...,...
441,17.1,13.521640,-3.578360
23,14.5,15.112621,0.612621
225,50.0,42.808700,-7.191300
433,14.3,15.586103,1.286103


In [102]:
R2_model = r2_score(y_pred_model, y_test)
R2_model

0.8479049999699443

In [103]:
R2_better_on = R2_model - R2
R2_better_on

0.17853473082038152

#### RandomForestRegressor works better then LinearRegression model on this dataset

### Task 3

In [113]:
?RandomForestRegressor

#### Sum of all feature importance coefficients

In [147]:
fi = model.feature_importances_
sum(fi)

1.0

#### Two most important features

In [148]:
feats = {}
for importance, feature in zip(fi, feature_names):
    feats[feature] = importance
sorted_feats = sorted(feats.items(), key=lambda item: item[1], reverse=True)
top_two_feats = sorted_feats[:2]
top_two_feats

[('LSTAT', 0.4158473181914483), ('RM', 0.4026817857034993)]