In [1]:
# Notebook setup: render matplotlib figures inline in the notebook.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 reloads all imported modules before
# each cell executes, so edits to imported modules such as GradientBoost are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
from sklearn.datasets import load_digits, load_boston, load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from GradientBoost import GradientBoost

# Let's try binary classification

In [3]:
cancer = load_breast_cancer()

In [4]:
# Hold out 30% of the breast-cancer samples for testing; the seed is fixed so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.3,
    random_state=17,
)

#### Let's try sklearn's GradientBoostingClassifier

In [5]:
from sklearn.ensemble import GradientBoostingClassifier

In [6]:
# Baseline: scikit-learn's gradient boosting classifier with 10 trees and a fixed seed.
# NOTE(review): criterion='mse' was renamed to 'squared_error' in scikit-learn 1.0
# and removed in 1.2; the repr printed below comes from an older release —
# confirm the installed version before re-running.
gb = GradientBoostingClassifier(n_estimators=10, random_state=17, criterion='mse')

In [7]:
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=10,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=17, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [8]:
preds = gb.predict(X_test)

In [9]:
accuracy_score(y_test, preds)

0.9649122807017544

In [10]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.98      0.92      0.95        61
           1       0.96      0.99      0.97       110

    accuracy                           0.96       171
   macro avg       0.97      0.95      0.96       171
weighted avg       0.97      0.96      0.96       171



#### Let's try our GradientBoost implementation

In [11]:
gb_my = GradientBoost(n_estimators=10, random_seed=17)

In [12]:
gb_my.fit(X_train, y_train)

In [13]:
preds = gb_my.predict(X_test)

In [14]:
accuracy_score(y_test, preds)

0.9649122807017544

In [15]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.98      0.92      0.95        61
           1       0.96      0.99      0.97       110

    accuracy                           0.96       171
   macro avg       0.97      0.95      0.96       171
weighted avg       0.97      0.96      0.96       171



# Now let's try multiclass classification

In [16]:
digits = load_digits()

In [17]:
# Hold out 30% of the digits samples for testing; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.3,
    random_state=17,
)

#### Let's try sklearn's GradientBoostingClassifier

In [18]:
# Same sklearn baseline configuration as in the binary case (10 trees, seed 17),
# to be fitted on the digits split.
# NOTE(review): criterion='mse' was renamed to 'squared_error' in scikit-learn 1.0
# and removed in 1.2 — confirm the installed version before re-running.
gb = GradientBoostingClassifier(n_estimators=10, random_state=17, criterion='mse')

In [19]:
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=10,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=17, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [20]:
preds = gb.predict(X_test)

In [21]:
accuracy_score(y_test, preds)

0.8777777777777778

In [22]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        51
           1       0.86      0.86      0.86        59
           2       0.94      0.96      0.95        49
           3       0.90      0.81      0.85        57
           4       0.96      0.82      0.88        56
           5       0.89      0.80      0.84        59
           6       0.93      0.87      0.90        47
           7       0.86      1.00      0.93        56
           8       0.76      0.92      0.83        48
           9       0.74      0.79      0.77        58

    accuracy                           0.88       540
   macro avg       0.88      0.88      0.88       540
weighted avg       0.88      0.88      0.88       540



#### Let's try our GradientBoost implementation

In [23]:
gb_my = GradientBoost(n_estimators=10, random_seed=17)

In [24]:
gb_my.fit(X_train, y_train)

In [25]:
preds = gb_my.predict(X_test)

In [26]:
accuracy_score(y_test, preds)

0.8796296296296297

In [27]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        51
           1       0.85      0.88      0.87        59
           2       0.94      0.96      0.95        49
           3       0.92      0.82      0.87        57
           4       0.94      0.80      0.87        56
           5       0.88      0.78      0.83        59
           6       0.93      0.91      0.92        47
           7       0.86      1.00      0.93        56
           8       0.75      0.90      0.82        48
           9       0.77      0.79      0.78        58

    accuracy                           0.88       540
   macro avg       0.89      0.88      0.88       540
weighted avg       0.88      0.88      0.88       540



#### Just increasing the number of estimators (10 → 50) improves the model a lot: accuracy rises from 0.88 to 0.95.

In [28]:
gb_my = GradientBoost(n_estimators=50, random_seed=17)

In [29]:
gb_my.fit(X_train, y_train)

In [30]:
preds = gb_my.predict(X_test)

In [31]:
accuracy_score(y_test, preds)

0.9462962962962963

In [32]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        51
           1       0.93      0.93      0.93        59
           2       0.96      0.98      0.97        49
           3       0.96      0.89      0.93        57
           4       0.98      0.95      0.96        56
           5       0.92      0.93      0.92        59
           6       1.00      0.94      0.97        47
           7       0.93      1.00      0.97        56
           8       0.83      1.00      0.91        48
           9       0.98      0.88      0.93        58

    accuracy                           0.95       540
   macro avg       0.95      0.95      0.95       540
weighted avg       0.95      0.95      0.95       540



# Regression

Let's try our hand at regression.

In [33]:
# Load the Boston dataset used for the regression comparison.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell (and the import at the top of the notebook) only works on older
# releases — confirm the pinned scikit-learn version.
ds = load_boston()

In [34]:
# Hold out 30% of the regression samples for testing; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    ds.data,
    ds.target,
    test_size=0.3,
    random_state=17,
)

#### Let's try sklearn's GradientBoostingRegressor

In [35]:
from sklearn.ensemble import GradientBoostingRegressor

In [36]:
# Baseline: scikit-learn's gradient boosting regressor with 10 trees and a fixed seed.
# NOTE(review): criterion='mse' was renamed to 'squared_error' in scikit-learn 1.0
# and removed in 1.2; the repr printed below comes from an older release —
# confirm the installed version before re-running.
gb_reg = GradientBoostingRegressor(n_estimators=10, random_state=17, criterion='mse')

In [37]:
gb_reg.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=10,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=17, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [38]:
preds = gb_reg.predict(X_test)

In [39]:
mean_squared_error(y_test, preds)

19.211175508346717

#### Let's try our implementation

In [40]:
# Our GradientBoost class with is_classification=False for the regression task,
# using the same tree count and seed as the sklearn baseline above.
gb_reg_my = GradientBoost(n_estimators=10, is_classification=False, random_seed=17)

In [41]:
gb_reg_my.fit(X_train, y_train)

In [42]:
preds = gb_reg_my.predict(X_test)

In [43]:
mean_squared_error(y_test, preds)

19.032410516174092

# Using pandas with another dataset.

#### Problem

Predict the presence or absence of cardiovascular disease (CVD) using the patient examination results.

#### Data description

There are 3 types of input features:

- *Objective*: factual information;
- *Examination*: results of medical examination;
- *Subjective*: information given by the patient.

| Feature | Variable Type | Variable      | Value Type |
|---------|--------------|---------------|------------|
| Age | Objective Feature | age | int (days) |
| Height | Objective Feature | height | int (cm) |
| Weight | Objective Feature | weight | float (kg) |
| Gender | Objective Feature | gender | categorical code |
| Systolic blood pressure | Examination Feature | ap_hi | int |
| Diastolic blood pressure | Examination Feature | ap_lo | int |
| Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
| Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
| Smoking | Subjective Feature | smoke | binary |
| Alcohol intake | Subjective Feature | alco | binary |
| Physical activity | Subjective Feature | active | binary |
| Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

All of the dataset values were collected at the moment of medical examination.

In [48]:
import pandas as pd
import numpy as np

In [49]:
# Load the cardiovascular dataset: the file is ';'-separated and its 'id' column
# becomes the index.
# NOTE(review): the relative path points into a sibling DecisionTree directory,
# so the notebook must be run from its own folder for the file to be found.
df = pd.read_csv('../DecisionTree/train.csv', index_col='id', sep=';')

In [50]:
# 'age' is recorded in days (see the data description table); convert it to
# completed years.
df['age_in_years'] = np.floor(df['age']/365.25)

In [51]:
# Keep the target column aside; 'cardio' is dropped from the feature frame
# further down, before the train/validation split.
labels = df['cardio']

In [52]:
# Shift the gender codes down by one (the head() output further down shows 0/1
# afterwards). Plain column arithmetic replaces the per-element lambda.
# NOTE: this cell is not idempotent — re-running it without reloading `df`
# subtracts one again.
df['gender'] = df['gender'] - 1

In [53]:
# Indicator (0/1) columns for the age bands; each band is the half-open interval
# [low, high) in whole years. The comparison is vectorised over the whole column
# instead of calling a Python lambda per row, and the four copy-pasted
# assignments are driven from a single table of band limits.
age_years = df['age_in_years']
age_bands = {
    'Age_40-50': (40, 50),
    'Age_50-55': (50, 55),
    'Age_55-60': (55, 60),
    'Age_60-65': (60, 65),
}
for column, (low, high) in age_bands.items():
    df[column] = ((age_years >= low) & (age_years < high)).astype('int64')

In [54]:
# Indicator (0/1) columns for systolic blood pressure ('ap_hi') bands; each band
# is the half-open interval [low, high). The comparison is vectorised over the
# whole column instead of calling a Python lambda per row, and the repeated
# assignments are driven from a single table of band limits.
systolic = df['ap_hi']
pressure_bands = {
    'aphi_120-140': (120, 140),
    'aphi_140-160': (140, 160),
    'aphi_160-180': (160, 180),
}
for column, (low, high) in pressure_bands.items():
    df[column] = ((systolic >= low) & (systolic < high)).astype('int64')

In [55]:
# One-hot encode the cholesterol level into cholesterol_1 .. cholesterol_3.
# The dtype is pinned to uint8 (the historical default) because pandas >= 2.0
# returns bool dummies by default, which would turn the 0/1 columns shown in
# the head() output below into True/False.
# NOTE: re-running this cell without reloading `df` fails, because the
# 'cholesterol' column no longer exists after the first run.
df = pd.get_dummies(df, prefix=['cholesterol'], columns=['cholesterol'], dtype='uint8')

In [56]:
def f(a, b):
    """Return the body-mass index for height ``a`` (cm) and weight ``b`` (kg).

    The height is converted from centimetres to metres before it is squared,
    i.e. the result is ``b / (a / 100) ** 2``.
    """
    height_m = a / 100
    return b / height_m ** 2

In [57]:
# Body-mass index per patient. `f` only uses arithmetic operators, so it can be
# applied to the whole height/weight columns at once; the previous row-wise
# DataFrame.apply invoked Python once per row and is far slower on a frame of
# this size. Results agree up to floating-point rounding.
df['bmi'] = f(df['height'], df['weight'])

In [58]:
# Remove the raw measurements that were replaced by engineered features
# (age, age_in_years, ap_hi, height, weight), the columns not used as features
# (gluc, ap_lo, alco), and the target itself, which is kept in `labels`.
dropped_columns = [
    'gluc', 'ap_lo', 'alco', 'age', 'cardio',
    'age_in_years', 'ap_hi', 'height', 'weight',
]
df = df.drop(columns=dropped_columns)

In [59]:
# Round BMI down to a whole number (presumably to coarsen the feature —
# confirm intent).
df['bmi'] = np.floor(df['bmi'])

In [60]:
df.head()

Unnamed: 0_level_0,gender,smoke,active,Age_40-50,Age_50-55,Age_55-60,Age_60-65,aphi_120-140,aphi_140-160,aphi_160-180,cholesterol_1,cholesterol_2,cholesterol_3,bmi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1,0,1,0,1,0,0,0,0,0,1,0,0,21.0
1,0,0,1,0,0,1,0,0,1,0,0,0,1,34.0
2,0,0,0,0,1,0,0,1,0,0,0,0,1,23.0
3,1,0,1,1,0,0,0,0,1,0,1,0,0,28.0
4,0,0,0,1,0,0,0,0,0,0,1,0,0,23.0


In [61]:
# 70/30 train/validation split of the engineered features and the 'cardio'
# target; the seed is fixed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    df,
    labels,
    test_size=0.3,
    random_state=17,
)

In [62]:
tr_pd = GradientBoost(n_estimators=10, random_seed=17)

In [63]:
tr_pd.fit(X_train, y_train)

In [65]:
preds = tr_pd.predict(X_valid)

In [66]:
accuracy_score(y_valid, preds)

0.7065238095238096

#### Let's try more estimators and an explicit learning rate

In [72]:
# 50 boosting rounds with an explicit learning_rate of 0.01.
# NOTE(review): this cell and the fit below carry execution counts 72/73, while
# the predict/score cells that follow are numbered 69-71, i.e. the displayed
# scores were produced before this cell last ran and may belong to a different
# configuration. Restart the kernel and run all cells to confirm the numbers.
tr_pd = GradientBoost(n_estimators=50, random_seed=17, learning_rate=0.01)

In [73]:
tr_pd.fit(X_train, y_train)

In [69]:
preds = tr_pd.predict(X_valid)

In [70]:
accuracy_score(y_valid, preds)

0.716047619047619

In [71]:
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

           0       0.68      0.82      0.74     10490
           1       0.77      0.62      0.69     10510

    accuracy                           0.72     21000
   macro avg       0.72      0.72      0.71     21000
weighted avg       0.72      0.72      0.71     21000

