In [1]:
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, \
    GradientBoostingClassifier

from sklearn.datasets import california_housing, load_breast_cancer
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from supervised.linear_regression import LinearRegression as CustomLinearRegression
from supervised.ridge_regression import RidgeRegression as CustomRidgeRegression
from supervised.lasso_regression import LassoRegression as CustomLassoRegression

from supervised.logistic_regression import LogisticRegression as CustomLogisticRegression
from supervised.knn_regression import NearestNeighborsRegression
from supervised.knn_classification import NearestNeighborsClassification
from supervised.naive_bayes import GaussianNaiveBayes

from supervised.decision_tree_regression import DecisionTreeRegressor as CustomDecisionTreeRegressor
from supervised.decision_tree_classification import DecisionTreeClassifier as CustomDecisionTreeClassifier

from supervised.random_forest_regression import RandomForestRegressor as CustomRandomForestRegressor
from supervised.random_forest_classification import RandomForestClassifier as CustomRandomForestClassifier

from supervised.gradient_boosting_regression import GradientBoostingRegressor as CustomGradientBoostingRegressor

In [2]:
%load_ext autoreload
%autoreload 2

### Regression dataset preprocessing

In [176]:
dataset = california_housing.fetch_california_housing()

In [177]:
X = dataset['data']

In [178]:
y = dataset['target']

In [179]:
X.shape, y.shape

((20640, 8), (20640,))

In [180]:
scaled_X = StandardScaler().fit_transform(X)

In [181]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.33, random_state=5)

In [182]:
# baseline
mean_squared_error(y_test, [np.mean(y_train) for _ in range(y_test.shape[0])])

1.374278607063601

# Regression

### Linear Regression

In [58]:
sklearn_lr = LinearRegression(copy_X=False)
sklearn_lr.fit(scaled_X, y)

LinearRegression(copy_X=False, fit_intercept=True, n_jobs=1, normalize=False)

In [59]:
mean_squared_error(y, sklearn_lr.predict(scaled_X))

0.5243209861846072

In [60]:
custom_lr = CustomLinearRegression(learning_rate=0.1, n_iter=15500, verbose=False)
custom_lr.fit(scaled_X, y)

In [61]:
mean_squared_error(y, custom_lr.predict(scaled_X))

0.5243209861846072

### Ridge regression

In [99]:
sklearn_ridge = Ridge(alpha=0.1)
sklearn_ridge.fit(scaled_X, y)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [110]:
sklearn_ridge.coef_

array([ 0.82961664,  0.11875818, -0.26551388,  0.30567906, -0.00450071,
       -0.03932662, -0.89982369, -0.87047846])

In [109]:
mean_squared_error(y, sklearn_ridge.predict(scaled_X))

0.5243209867676113

In [184]:
custom_ridge = CustomRidgeRegression(learning_rate=0.1, alpha=0.1, n_iter=35000)
custom_ridge.fit(scaled_X, y)

In [185]:
custom_ridge.W

array([[ 0.73409381],
       [ 0.15667408],
       [-0.0803548 ],
       [ 0.09597593],
       [ 0.00923195],
       [-0.03800215],
       [-0.40643877],
       [-0.36826385]])

In [186]:
mean_squared_error(y, custom_ridge.predict(scaled_X))

0.6081481442614606

### Lasso

In [167]:
sklearn_lasso = Lasso(alpha=0.01)
sklearn_lasso.fit(scaled_X, y)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [168]:
sklearn_lasso.coef_

array([ 0.77722333,  0.12486709, -0.12940585,  0.16912537, -0.        ,
       -0.02944551, -0.79543737, -0.75899738])

In [200]:
sklearn_lasso.intercept_

2.068558169089147

In [169]:
mean_squared_error(y, sklearn_lasso.predict(scaled_X))

0.5297785450827036

In [196]:
custom_lasso = CustomLassoRegression(learning_rate=0.001, alpha=0.01, n_iter=45000)
custom_lasso.fit(scaled_X, y)

In [197]:
custom_lasso.W

array([[ 8.08523745e-01],
       [ 1.33244759e-01],
       [-1.82408997e-01],
       [ 2.09829971e-01],
       [ 4.42989854e-06],
       [-3.06704447e-02],
       [-6.97976600e-01],
       [-6.64591089e-01]])

In [201]:
custom_lasso.b

2.0585581690889256

In [198]:
mean_squared_error(y, custom_lasso.predict(scaled_X))

0.5321723656424243

In [199]:
for sklearn_coef, custom_coef in zip(sklearn_lasso.coef_, custom_lasso.W):
    print(sklearn_coef, custom_coef)

0.7772233323932395 [0.80852375]
0.12486709258188558 [0.13324476]
-0.1294058524989564 [-0.182409]
0.1691253735078468 [0.20982997]
-0.0 [4.42989854e-06]
-0.029445513755132043 [-0.03067044]
-0.7954373735863367 [-0.6979766]
-0.7589973815681857 [-0.66459109]


### kNN

In [271]:
sklearn_knn = KNeighborsRegressor(n_neighbors=5)
sklearn_knn.fit(X, y)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [272]:
mean_squared_error(y, sklearn_knn.predict(X))

0.7042321942059601

In [276]:
custom_knn = NearestNeighborsRegression(k=5)
custom_knn.fit(X, y)

In [285]:
mean_squared_error(y, custom_knn.predict(X))

0.7042321942059601

### Decision tree regressor

In [30]:
sklearn_dt = DecisionTreeRegressor(min_samples_leaf=10, max_depth=5)
sklearn_dt.fit(X_train[:], y_train[:])

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [31]:
sklearn_dt.feature_importances_

array([0.75643439, 0.03997163, 0.03655617, 0.        , 0.00668862,
       0.13993108, 0.02041811, 0.        ])

In [32]:
mean_squared_error(y_train, sklearn_dt.predict(X_train))

0.4762111014069104

In [33]:
mean_squared_error(y_test, sklearn_dt.predict(X_test))

0.5300075082409097

In [10]:
custom_dt = CustomDecisionTreeRegressor(max_objects_in_leaf_num=10, max_depth=5, use_binning=False)
custom_dt.fit(X_train[:], y_train[:])

In [11]:
mean_squared_error(y_train, custom_dt.predict(X_train))

0.5365924773102908

In [12]:
mean_squared_error(y_test, custom_dt.predict(X_test))

0.5690078489487218

In [13]:
custom_dt._tree[0]['split_feature_index']

0

In [14]:
custom_dt._tree.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32])

### Random Forest

In [34]:
sklearn_rf = RandomForestRegressor(n_estimators=10, max_depth=10)
sklearn_rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [35]:
sklearn_rf.feature_importances_

array([0.58833808, 0.05039887, 0.04660856, 0.01686735, 0.02348846,
       0.14054961, 0.0655957 , 0.06815336])

In [36]:
mean_squared_error(y_test, sklearn_rf.predict(X_test))

0.3253190084738393

In [44]:
custom_rf = CustomRandomForestRegressor(
    n_estimators=20, max_depth=20, max_objects_in_leaf_num=10, subsample=1., use_binning=False
)
custom_rf.fit(X_train, y_train)

In [45]:
mean_squared_error(y_test, custom_rf.predict(X_test))

1.0038719135609997

### Gradient Boosting

In [46]:
sklearn_gb = GradientBoostingRegressor()
sklearn_gb.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [47]:
mean_squared_error(y_test, sklearn_gb.predict(X_test))

1.5238461717559286

In [51]:
custom_gb = CustomGradientBoostingRegressor(
    learning_rate=0.5, n_estimators=10, max_depth=5, use_binning=True, verbose=True
)
custom_gb.fit(X, y)

  bin_means = [X[:,j][digitized == i].mean() for i in range(1, len(bins))]
  ret = ret.dtype.type(ret / rcount)


Train MSE: 0.6669742719946016


  left_indexes = np.where(X[:,j] < bin_value)[0]
  right_indexes = np.where(X[:,j] >= bin_value)[0]


Train MSE: 0.6306313563015759
Train MSE: 0.6263323101938219
Train MSE: 0.6227673762736794
Train MSE: 0.6100919965743458
Train MSE: 0.6014944894055813
Train MSE: 0.5868465767073081
Train MSE: 0.5834446678850213
Train MSE: 0.5821492370859827
Train MSE: 0.5813300829723521


In [52]:
mean_squared_error(y_test, custom_gb.predict(X_test))

1.5083467592390727

In [53]:
custom_gb._models_weights

[1,
 0.3999999999999808,
 0.09999999999998188,
 0.24999999999998135,
 0.349999999999981,
 -0.20000000000001705,
 0.44999999999998064,
 1.0499999999999785,
 -0.10000000000001741,
 -0.10000000000001741,
 -0.050000000000017586]

### Neural Network

In [190]:
from supervised.neural_net_regression import NeuralNetRegressor
nnr = NeuralNetRegressor(learning_rate=0.001, epochs=1000)
nnr.fit(X_train, y_train)

In [191]:
mean_squared_error(y_train, nnr.predict(X_train))

1.7731847216000292

# classification

### classification dataset preprocessing

In [3]:
classification_dataset = load_breast_cancer()

In [4]:
X = classification_dataset['data']

In [5]:
X = StandardScaler().fit_transform(X)

In [6]:
X.shape

(569, 30)

In [7]:
y = classification_dataset['target']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=5)

### LogReg

In [230]:
sklearn_logistic = LogisticRegression()
sklearn_logistic.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [254]:
accuracy_score(y, sklearn_logistic.predict(X))

0.9876977152899824

In [255]:
roc_auc_score(y, sklearn_logistic.predict(X))

0.985406426721632

In [259]:
custom_logistic = CustomLogisticRegression(learning_rate=0.1, n_iter=1000)
custom_logistic.fit(X, y)

In [260]:
accuracy_score(y, custom_logistic.predict_classes(X))

0.9859402460456942

In [261]:
roc_auc_score(y, custom_logistic.predict(X))

0.9965778764335923

In [263]:
for sklearn_coef, custom_coef in zip(sklearn_logistic.coef_[0], custom_logistic.W):
    print(sklearn_coef, custom_coef)

-0.3537224454864797 [-0.15220402]
-0.3850941013363152 [-0.645054]
-0.3423723775899215 [-0.65973023]
-0.4413844621012303 [-0.77149653]
-0.15523715897804158 [-0.42792702]
0.5681635034061283 [0.02565893]
-0.8685186043004927 [-0.80045156]
-0.9681144308936946 [-0.64796378]
0.07328189005198375 [-0.00301748]
0.3112206205399327 [0.46284311]
-1.2952736471318964 [-0.53201201]
0.26995006498914403 [0.00849789]
-0.6662383040329483 [-0.61021687]
-1.0295450787470024 [-1.0277617]
-0.281267800842992 [-0.26559321]
0.7424178833755155 [0.41489704]
0.1135225817011748 [-0.03896074]
-0.3200668495449314 [0.02274222]
0.2898267182938627 [0.517325]
0.6715268871167237 [0.57965855]
-1.030487601943374 [-0.72900505]
-1.3131883039302072 [-0.90020721]
-0.8256397345121426 [-1.05601534]
-1.0291551602948503 [-0.86215098]
-0.6718530104784467 [-0.58129061]
0.04896118824600135 [-0.11483527]
-0.8716223934416776 [-0.39126648]
-0.9113156270892301 [-0.73431424]
-0.8839543014249704 [-0.97562716]
-0.4835462359280868 [-0.17787674]

### kNN

In [297]:
sklearn_knn = KNeighborsClassifier(n_neighbors=5)
sklearn_knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [299]:
accuracy_score(y, sklearn_knn.predict(X))

0.9806678383128296

In [305]:
custom_knn = NearestNeighborsClassification(k=5)
custom_knn.fit(X, y)

In [306]:
accuracy_score(y, custom_knn.predict(X))

0.9806678383128296

### Gaussian Naive Bayes

In [53]:
custom_gaussian_bayes = GaussianNaiveBayes()
custom_gaussian_bayes.fit(X, y)

In [54]:
# считаем вероятности по исходной формуле, без логарифма
accuracy_score(y, custom_gaussian_bayes.predict_classes(X))

0.8804920913884007

In [56]:
# а вот тут уже с логарифмом
accuracy_score(y, custom_gaussian_bayes.predict_classes_loglikelihood(X))

0.9402460456942003

In [59]:
roc_auc_score(y, custom_gaussian_bayes.predict_classes_loglikelihood(X))

0.9322644152000423

In [17]:
sklearn_gaussian_bayes = GaussianNB()
sklearn_gaussian_bayes.fit(X, y)

GaussianNB(priors=None)

In [18]:
accuracy_score(y, sklearn_gaussian_bayes.predict(X))

0.9402460456942003

In [19]:
roc_auc_score(y, sklearn_gaussian_bayes.predict(X))

0.9322644152000423

In [58]:
np.corrcoef(custom_gaussian_bayes.predict_classes_loglikelihood(X), sklearn_gaussian_bayes.predict(X))

array([[1., 1.],
       [1., 1.]])

### Decision Tree

In [9]:
sklearn_dt = DecisionTreeClassifier(max_depth=5)
sklearn_dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [10]:
np.argmax(sklearn_dt.feature_importances_)

27

In [11]:
accuracy_score(y_test, sklearn_dt.predict(X_test))

0.9042553191489362

In [12]:
roc_auc_score(y_test, sklearn_dt.predict(X_test))

0.9018877297565822

In [13]:
custom_dt = CustomDecisionTreeClassifier(max_depth=5)
custom_dt.fit(X_train, y_train)

In [14]:
accuracy_score(y_test, custom_dt.predict(X_test))

0.9414893617021277

In [15]:
roc_auc_score(y_test, custom_dt.predict(X_test))

0.9340536512667661

In [16]:
custom_dt._tree[0]['split_feature_index']

27

### Random Forest

In [17]:
sklearn_rf = RandomForestClassifier()
sklearn_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
accuracy_score(y_test, sklearn_rf.predict(X_test))

0.9787234042553191

In [19]:
roc_auc_score(y_test, sklearn_rf.predict(X_test))

0.9731743666169897

In [20]:
custom_rf = CustomRandomForestClassifier(
    max_depth=20, subsample=0.85, col_subsample=1., max_objects_in_leaf_num=1, use_binning=True
)
custom_rf.fit(X_train, y_train)

In [21]:
accuracy_score(y_test, custom_rf.predict(X_test))

0.9787234042553191

In [22]:
roc_auc_score(y_test, custom_rf.predict(X_test))

0.9766517635370096