# Predicting income levels in Amsterdam

We will use our dataset of remote sensing-based and GSV-based indicators to predict five income categories (low, below average, average, above average, and high) using the CatBoost gradient boosting algorithm: https://github.com/catboost

## Libraries installation

In [None]:
# Enable the ipywidgets notebook extension; the training cells below request
# interactive charts with plot=True.
!jupyter nbextension enable --py widgetsnbextension

In [None]:
#!pip install catboost
#!pip install ipywidgets
#!pip install shap
#!pip install scikit-learn
#!pip install --upgrade numpy

In [None]:
# Import CatBoost and report the library and Python interpreter versions in use
import catboost
print(catboost.__version__)
!python --version

In [None]:
# Render inline figures as PNG images
%config InlineBackend.figure_format = 'png'

## Reading the data

In [None]:
# Importing required libraries
import pandas as pd
import os
import numpy as np
np.set_printoptions(precision=4)
import catboost
from catboost import *
from catboost import datasets

In [None]:
# Load the labelled dataset and the rows that predictions will be made for
ams = pd.read_csv('ams_med.csv')
preds = pd.read_csv('ams_med_preds.csv')

In [None]:
# Preview the first rows of the labelled dataframe
ams.head()

## Preparing data

In [None]:
# Separate the target column (income) from the predictor columns
y = ams['income']
X = ams.drop(columns='income')

In [None]:
# Directory that receives CSV copies of both dataframes
dataset_dir = './ams'
# exist_ok avoids the check-then-create race and keeps the cell safe to re-run
os.makedirs(dataset_dir, exist_ok=True)

# Both files are written comma-separated, with a header row and without the
# dataframe index.
csv_options = dict(index=False, sep=',', header=True)
ams.to_csv(os.path.join(dataset_dir, 'train.csv'), **csv_options)
preds.to_csv(os.path.join(dataset_dir, 'preds.csv'), **csv_options)

In [None]:
from catboost.utils import create_cd

# Locate the label by name instead of assuming it is the first column, so the
# column description stays correct whatever the column order of the CSV is.
label_index = ams.columns.get_loc('income')

# Map the index of every non-label column to its name
feature_names = {
    index: name
    for index, name in enumerate(ams.columns)
    if index != label_index
}

create_cd(
    label=label_index,
    feature_names=feature_names,
    output_path=os.path.join(dataset_dir, 'train.cd')
)

In [None]:
# Wrap the features and labels in a CatBoost Pool
pool1 = Pool(data=X, label=y)

# Report the pool's dimensions followed by its feature names
print('Dataset shape')
print('dataset 1:{}'.format(pool1.shape))

print('\n')
print('Column names')
print('dataset 1:')
pool_feature_names = pool1.get_feature_names()
print(pool_feature_names)

## Splitting Amsterdam data into train and validation

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training; the fixed random_state makes the split
# reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1234,
)

## Overfitting detector

In [None]:
# Classifier evaluated with AUC on the validation set; early_stopping_rounds=20
# lets training stop before the 200-iteration cap.
early_stop_params = {
    'eval_metric': 'AUC',
    'iterations': 200,
    'random_seed': 63,
    'learning_rate': 0.5,
    'early_stopping_rounds': 20,
}
model_with_early_stop = CatBoostClassifier(**early_stop_params)
model_with_early_stop.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    verbose=False,
    plot=True,
)

In [None]:
# Compute the MultiClass and AUC metrics of the early-stopped model on the
# full-dataset pool (pool1) and plot them.
# NOTE(review): ntree_end=0 presumably means "evaluate up to the last tree" —
# confirm against the CatBoost eval_metrics documentation.
metrics = model_with_early_stop.eval_metrics(
    data=pool1,
    metrics=['MultiClass','AUC'],
    ntree_start=0,
    ntree_end=0,
    eval_period=1,
    plot=True
)

In [None]:
# Number of trees in the early-stopped model
print(model_with_early_stop.tree_count_)

In [None]:
# Print the AUC series returned by eval_metrics
print('AUC values:')
auc_values = np.array(metrics['AUC:type=Mu'])
print(auc_values)

## Hyperparameter tuning

### Training speed

In [None]:
from catboost import CatBoost

# Configuration used for the training-speed experiment
fast_params = {
    'random_seed': 63,
    'iterations': 150,
    'learning_rate': 0.01,
    'boosting_type': 'Ordered',
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.5,
    'one_hot_max_size': 20,
    'rsm': 0.5,
    'leaf_estimation_iterations': 5,
    'max_ctr_complexity': 1,
}
fast_model = CatBoostClassifier(**fast_params)

# Fit on the training split only (no validation set is passed here)
fast_model.fit(X_train, y_train, verbose=False, plot=True)

### Training accuracy

In [None]:
# Configuration used for the training-accuracy experiment
tuned_params = {
    'random_seed': 63,
    'iterations': 1000,
    'task_type': "CPU",
    'learning_rate': 0.05,
    'l2_leaf_reg': 3,
    'bagging_temperature': 1,
    'random_strength': 1,
    'one_hot_max_size': 2,
    'leaf_estimation_method': 'Newton',
}
tuned_model = CatBoostClassifier(**tuned_params)

# Fit on the training split and track metrics on the validation split
tuned_model.fit(
    X_train,
    y_train,
    verbose=False,
    eval_set=(X_validation, y_validation),
    plot=True,
)

## Training the model after parameter tuning

In [None]:
# Train the final model with 20% more iterations than the number of trees in
# tuned_model; every other hyperparameter is left at its default.
final_iterations = int(tuned_model.tree_count_ * 1.2)
best_model = CatBoostClassifier(random_seed=63, iterations=final_iterations)

# verbose=100 sets the logging period for the training output
best_model.fit(X_train, y_train, verbose=100)

In [None]:
# Score the final model on the held-out validation split
best_model.score(X_validation, y_validation)

In [None]:
# Confusion matrix to assess prediction accuracy.
# plot_confusion_matrix was removed in scikit-learn 1.2, so the matrix is
# computed explicitly and drawn with ConfusionMatrixDisplay instead.
# NOTE(review): the matrix is computed on the full dataset (X, y), which
# includes the training rows — consider X_validation / y_validation for a
# held-out estimate.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

class_names = ['00-20 low', '20-40 below average', '40-60 average', '60-80 above average', '80-100 high']

# ravel() flattens the predictions in case they come back as a single column
predicted_labels = np.asarray(best_model.predict(X)).ravel()
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y, predicted_labels),
    display_labels=class_names,
).plot(xticks_rotation='vertical')

In [None]:
# Compute the MultiClass and AUC metrics of the final model on the
# full-dataset pool (pool1) and plot them.
# F1 score, precision, recall may be used in addition to AUC
metrics = best_model.eval_metrics(
    data=pool1,
    metrics=['MultiClass','AUC'],
    ntree_start=0,
    ntree_end=0,
    eval_period=1,
    plot=True
)

In [None]:
# CatBoost's built-in feature importance (not SHAP), returned in prettified form
best_model.get_feature_importance(prettified=True)

# SHAP values

In [None]:
# SHAP values computed by CatBoost itself for every row of pool1.
# `type` is used because the `fstr_type` keyword is deprecated.
shap_values = best_model.get_feature_importance(pool1, type='ShapValues')
print(shap_values.shape)

In [None]:
import shap

In [None]:
# Explain the final model with SHAP's tree explainer; this replaces the
# shap_values computed by CatBoost above.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(Pool(X, y))

# Initialise SHAP's JavaScript support, then draw a force plot from the first
# expected value and the first entry of shap_values.
# NOTE(review): presumably index 0 corresponds to the lowest income class —
# confirm against the model's class order.
shap.initjs()
shap.force_plot(explainer.expected_value[0],shap_values[0])

In [None]:
# matplotlib is imported here because this cell runs before the notebook's
# plotting imports; without it plt.title raises a NameError.
import matplotlib.pyplot as plt

# show=False keeps the figure open so the title is applied to the SHAP chart
# rather than to a new, empty figure.
shap.summary_plot(
    shap_values,
    X.values,
    plot_type="bar",
    class_names=class_names,
    feature_names=X.columns,
    show=False,
)
# The previous second title line described classes from an unrelated example
# (Best / Premium / Value) and has been dropped.
plt.title('The Summary Plot for the Multiclass Model')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

labels = [
    "00-20 low",
    "20-40 below average",
    "40-60 average",
    "60-80 above average",
    "80-100 high",
]

# The encoded values get their own variable: assigning them to `y` would
# overwrite the income labels that later cells (e.g. cross-validation) pass
# to Pool together with X.
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)

# Show which integer code corresponds to which income category
encoding_scheme = dict(zip(encoded_labels, labels))
print(encoding_scheme)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar summary of SHAP values across all classes.
# NOTE(review): the title is set before plotting, which relies on
# summary_plot drawing on the current figure — confirm the title is shown.
plt.title('Feature importances for Amsterdam')
shap.summary_plot(shap_values, X.values, plot_type="bar", class_names= class_names, feature_names = X.columns)

In [None]:
# SHAP summary for shap_values[0], titled as the low-income class
plt.title('Impact on low income in Amsterdam')
shap.summary_plot(shap_values[0], X.values, feature_names = X.columns)

In [None]:
# SHAP summary for shap_values[1], titled as the below-average income class
plt.title('Impact on income below average in Amsterdam')
shap.summary_plot(shap_values[1], X.values, feature_names = X.columns)

In [None]:
# SHAP summary for shap_values[2], titled as the average income class
plt.title('Impact on average income in Amsterdam')
shap.summary_plot(shap_values[2], X.values, feature_names = X.columns)

In [None]:
# SHAP summary for shap_values[3], titled as the above-average income class
plt.title('Impact on income above average in Amsterdam')
shap.summary_plot(shap_values[3], X.values, feature_names = X.columns)

In [None]:
# SHAP summary for shap_values[4], titled as the high-income class
plt.title('Impact on high income in Amsterdam')
shap.summary_plot(shap_values[4], X.values, feature_names = X.columns)

In [None]:
# Side-by-side SHAP summaries for the low-income and high-income classes.
fig = plt.figure(figsize=(20,10))

# Left panel (slot 1 of a 1x3 grid). The subplot is created before
# summary_plot is called; show=False defers rendering until plt.show().
ax0 = fig.add_subplot(131)
ax0.title.set_text('Low income')
shap.summary_plot(shap_values[0], X.values, feature_names = X.columns, show=False)
ax0.set_xlabel(r'SHAP values', fontsize=11)

# Right panel (slot 3 of the same grid; slot 2 is left empty).
ax2 = fig.add_subplot(133)
ax2.title.set_text('High income')
shap.summary_plot(shap_values[4], X.values, feature_names = X.columns, show=False)
ax2.set_xlabel(r'SHAP values', fontsize=11)

# plt.tight_layout(pad=3) # You can also use plt.tight_layout() instead of using plt.subplots_adjust() to add space between plots
plt.show()

## Cross-validation assessment of model's performance

In [None]:
# 5-fold cross-validation of a MultiClass model, additionally tracking AUC.
# AUC, F1, Precision, Recall etc. may be used
from catboost import cv

params = {
    'loss_function': 'MultiClass',
    'iterations': 100,
    'custom_loss': 'AUC',
    'random_seed': 63,
    'learning_rate': 0.18,
}

# Folds are shuffled with a fixed partition seed and are not stratified
cv_pool = Pool(X, label=y)
cv_data = cv(
    params=params,
    pool=cv_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    stratified=False,
    verbose=False,
)

## Save model for further usage

In [None]:
# Persist the final model twice: once with save_model's default format and
# once as JSON.
best_model.save_model('AMSterdam.bin')
best_model.save_model('AMSterdam.json', format='json')

## Calculate predictions of the income levels in Amsterdam

In [None]:
# Predict income classes for the unlabelled rows; the id column is dropped so
# it is not passed to the model as a feature.
X_test = preds.drop(columns='id')
test_pool = Pool(data=X_test)
contest_predictions = best_model.predict(test_pool)

## Prepare the final file

In [None]:
# Write one "id,income" row per prediction.
# Fixes over the previous version:
#   * the header line now ends with a newline (it used to run straight into
#     the first data row);
#   * the file is closed by the context manager even if a write fails;
#   * predictions are flattened first, so a column-shaped result is written
#     as plain values instead of bracketed one-element arrays.
predicted_income = np.asarray(contest_predictions).ravel()
with open('AMSterdam_predictions.csv', 'w') as f:
    f.write('id,income\n')
    f.writelines(
        '{},{}\n'.format(row_id, income)
        for row_id, income in zip(preds['id'], predicted_income)
    )