# Predicting income in NYC

We will use our dataset of GSV- and remote-sensing-based indicators.

## Libraries installation

In [None]:
# Enable the ipywidgets notebook extension from the shell; several cells
# below call CatBoost with plot=True.
# NOTE(review): presumably required for those interactive plots to render —
# confirm for the Jupyter version in use.
!jupyter nbextension enable --py widgetsnbextension

In [None]:
#!pip install catboost
#!pip install ipywidgets
#!pip install shap
#!pip install sklearn
#!pip install --upgrade numpy
# Before running this notebook, run "jupyter nbextension enable --py widgetsnbextension" from the command line.

In [None]:
# Report the CatBoost and Python versions this notebook is running against.
import catboost
print(catboost.__version__)
!python --version

In [None]:
# Ask the inline matplotlib backend to emit figures as SVG.
%config InlineBackend.figure_format = 'svg'

## Reading the data

In [None]:
import pandas as pd
import os
import numpy as np
np.set_printoptions(precision=4)
import catboost
from catboost import *
from catboost import datasets
import matplotlib.pyplot as plt
import seaborn as sns
import jupyterlab_widgets

In [None]:
# Load the table that contains the income target and the table of rows
# for which predictions will be produced at the end of the notebook.
nyc = pd.read_csv('nyc_med.csv')
preds = pd.read_csv('nyc_med_preds.csv')

In [None]:
# Preview the first rows of the prediction table.
preds.head()

## Preparing the data

Label values extraction

In [None]:
# Separate the feature columns from the `income` target column.
X = nyc.drop(columns='income')
y = nyc['income']

In [None]:
# Directory that receives the on-disk copies of both tables.
dataset_dir = './nyc'
if not os.path.exists(dataset_dir):
    os.makedirs(dataset_dir)

# Write each table as a comma-separated file with a header row and
# without the pandas index column.
for frame, file_name in ((nyc, 'train.csv'), (preds, 'preds.csv')):
    frame.to_csv(
        os.path.join(dataset_dir, file_name),
        index=False, sep=',', header=True
    )

In [None]:
from catboost.utils import create_cd

# Build the column-description file that accompanies train.csv.
# The label position is looked up by name instead of being assumed to be
# column 0, so the description stays correct if `income` is not the first
# column of the table (the label is selected by name everywhere else).
label_column = nyc.columns.get_loc('income')

# Map every non-label column index to its column name.
feature_names = {
    column: name
    for column, name in enumerate(nyc.columns)
    if column != label_column
}

create_cd(
    label=label_column,
    feature_names=feature_names,
    output_path=os.path.join(dataset_dir, 'train.cd')
)

In [None]:
# Wrap the full feature matrix and target in a CatBoost Pool, then report
# the pool's shape and feature names.
pool1 = Pool(data=X, label=y)

print('Dataset shape', 'dataset 1:' + str(pool1.shape), sep='\n')

print('\n')
print('Column names', 'dataset 1:', sep='\n')
print(pool1.get_feature_names())

## Splitting NYC data into train and validation

In [None]:
from sklearn.model_selection import train_test_split

# Use 80% of the rows for training and keep the rest for validation;
# the fixed random_state makes the split reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y,
    train_size=0.8,
    random_state=42,
)

## Overfitting detector

In [None]:
# Regressor evaluated with RMSE on the validation split during training;
# early_stopping_rounds=20 lets the overfitting detector end training
# before the 200-iteration budget is used up.
model_with_early_stop = CatBoostRegressor(
    iterations=200,
    learning_rate=0.5,
    eval_metric='RMSE',
    early_stopping_rounds=20,
    random_seed=63,
)
model_with_early_stop.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    plot=True,
    verbose=False,
)

In [None]:
# Evaluate RMSE of the early-stopped model on the full-dataset pool,
# sampling the metric with eval_period=1 and plotting the curve.
metrics = model_with_early_stop.eval_metrics(
    pool1,
    ['RMSE'],
    ntree_start=0,
    ntree_end=0,
    eval_period=1,
    plot=True,
)

In [None]:
# Show how many trees the early-stopped model ended up with.
print(model_with_early_stop.tree_count_)

In [None]:
# Print the RMSE series returned by eval_metrics as a NumPy array.
print('RMSE values:')
print(np.array(metrics['RMSE']))

## Hyperparameter tuning

### Training speed

In [None]:
from catboost import CatBoost, CatBoostRegressor

# Configuration aimed at fast training: few iterations, Bernoulli row
# subsampling (subsample=0.5) and random feature subsampling (rsm=0.5).
# The target is `income`, and every other model in this notebook is a
# CatBoostRegressor scored with RMSE, so a regressor is used here as well
# (the previous CatBoostClassifier treated the target as class labels).
fast_model = CatBoostRegressor(
    random_seed=63,
    iterations=150,
    learning_rate=0.01,
    boosting_type='Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.5,
    one_hot_max_size=20,
    rsm=0.5,
    leaf_estimation_iterations=5,
    max_ctr_complexity=1)

fast_model.fit(
    X_train, y_train,
    verbose=False,
    plot=True
)

### Training for accuracy

In [None]:
# Accuracy-oriented configuration: a larger iteration budget with a small
# learning rate; training is monitored on the validation split and stopped
# via early_stopping_rounds=50.
tuned_model = CatBoostRegressor(
    task_type="CPU",
    iterations=1000,
    learning_rate=0.025,
    leaf_estimation_method='Newton',
    l2_leaf_reg=3,
    bagging_temperature=1,
    random_strength=1,
    one_hot_max_size=2,
    random_seed=63,
)
tuned_model.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    early_stopping_rounds=50,
    plot=True,
    verbose=False,
)

## Training the model after parameter tuning

In [None]:
# Size the final model at 120% of the tree count reached by tuned_model.
# NOTE(review): only the iteration count is carried over from tuned_model;
# its learning rate and regularisation settings are not — confirm that
# falling back to CatBoost defaults here is intended.
final_iterations = int(tuned_model.tree_count_ * 1.2)

best_model = CatBoostRegressor(
    iterations=final_iterations,
    random_seed=42,
)
# verbose=100 logs training progress every 100 iterations.
best_model.fit(X_train, y_train, verbose=100)

In [None]:
# Score the final model on the held-out validation split.
# NOTE(review): for a CatBoostRegressor this is presumably R^2 — confirm
# against the CatBoost documentation.
best_model.score(X_validation, y_validation)

In [None]:
# Evaluate RMSE and R2 of the final model on the full-dataset pool,
# sampling the metrics with eval_period=1 and plotting the curves.
# This overwrites the `metrics` result of the early-stopped model.
metrics = best_model.eval_metrics(
    pool1,
    ['RMSE', 'R2'],
    ntree_start=0,
    ntree_end=0,
    eval_period=1,
    plot=True,
)

In [None]:
# Display the final model's feature importances in prettified (tabular) form.
best_model.get_feature_importance(prettified=True)

In [None]:
# Collect the prettified feature importances into a DataFrame.
df_feature_importance = pd.DataFrame(best_model.get_feature_importance(prettified=True))

# Draw one horizontal bar per feature on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 6))
feature_plot = sns.barplot(
    x="Importances",
    y="Feature Id",
    data=df_feature_importance,
    palette="cool",
    ax=ax,
)
ax.set_title('Feature importance for New York City');

In [None]:
# Show the first 20 rows of the feature-importance table.
df_feature_importance.head(20)

In [None]:
# Plot feature importances computed on the full-dataset pool.
# `type=` is used instead of the deprecated `fstr_type=` keyword, matching
# the SHAP cell further down in this notebook.
importances = best_model.get_feature_importance(pool1, type=catboost.EFstrType.FeatureImportance)

# `indices` orders features from most to least important; barh draws its
# first entry at the bottom, so the ascending order is what gets plotted
# to place the most important feature at the top.
indices = np.argsort(importances)[::-1]
ascending = indices[::-1]

plt.figure(figsize=(12, 12))
plt.title('Feature importance for New York City')
plt.barh(X_train.columns[ascending], importances[ascending]);

In [None]:
# Pairwise feature-interaction strengths computed on the full-dataset pool.
# `type=` replaces the deprecated `fstr_type=` keyword.
interactions = best_model.get_feature_importance(pool1, type=catboost.EFstrType.Interaction)

# Each entry is (first feature index, second feature index, strength).
# The indices are cast to int so they can index the column labels even if
# they arrive as floats.
feature_interaction = [
    [X_train.columns[int(first)], X_train.columns[int(second)], strength]
    for first, second, strength in interactions
]
feature_interaction_df = pd.DataFrame(
    feature_interaction,
    columns=['feature1', 'feature2', 'interaction_strength'],
)
feature_interaction_df.head(10)

## SHAP values

In [None]:
# SHAP values of the final model on the full-dataset pool. `type=` is
# used instead of the deprecated `fstr_type=` keyword, consistent with the
# validation-split SHAP cell later in the notebook.
shap_values = best_model.get_feature_importance(pool1, type='ShapValues')
print(shap_values.shape)

In [None]:
import shap

In [None]:
# Explain the final model with SHAP's tree explainer on the full dataset.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(Pool(X, y))

# expected_value can be a plain scalar (single-output model) or an array;
# np.ravel normalises both cases, so taking element 0 no longer fails on a
# scalar.
base_value = np.ravel(explainer.expected_value)[0]

# Force plot for the first row, with its feature values attached so the
# plot is labelled the same way as the validation-row plot below.
shap.initjs()
shap.force_plot(base_value, shap_values[0], X.iloc[0])

In [None]:
# Compute SHAP values on the validation split only.
validation_pool = catboost.Pool(X_validation, label=y_validation)
shap_matrix = best_model.get_feature_importance(validation_pool, type="ShapValues")

# The last column is taken as the expected value; the remaining columns
# are the per-feature contributions.
expected_value = shap_matrix[0, -1]
shap_values = shap_matrix[:, :-1]

# Force plot for one validation row (positional index 3).
row = 3
shap.initjs()
shap.force_plot(expected_value, shap_values[row, :], X_validation.iloc[row, :])

In [None]:
# Bar summary of mean absolute SHAP values. shap_values was computed on the
# validation split, so the feature matrix must be X_validation as well;
# passing the full X gives a row-count mismatch.
shap.summary_plot(
    shap_values,
    X_validation.values,
    plot_type="bar",
    feature_names=X_validation.columns,
)

In [None]:
# Summary plot of the validation-split SHAP values against the
# validation features.
shap.summary_plot(shap_values, X_validation)

In [None]:
# summary_plot needs the full (rows x features) SHAP matrix together with
# the matching feature rows. shap_values[4] is a single row (a 1-D vector)
# and X is the full dataset, so the original call could not be plotted.
# NOTE(review): the [4] index looks like a leftover from a multi-class
# example — confirm which plot was intended here.
shap.summary_plot(
    shap_values,
    X_validation.values,
    feature_names=X_validation.columns,
)

In [None]:
# Dependence plot for the feature at positional index 5. The plot needs the
# full SHAP matrix and the feature rows it was computed from (the
# validation split), not a single SHAP row and the full dataset.
shap.dependence_plot(
    5,
    shap_values,
    X_validation.values,
    feature_names=X_validation.columns,
)

## Cross-validation

In [None]:
from catboost import cv

# Training parameters shared by every fold.
params = {
    'loss_function': 'RMSE',
    'iterations': 200,
    'custom_loss': 'R2',
    'random_seed': 63,
    'learning_rate': 0.07,
}

# 5-fold, shuffled, non-stratified cross-validation over the full dataset.
cv_data = cv(
    pool=Pool(X, label=y),
    params=params,
    fold_count=5,
    shuffle=True,
    stratified=False,
    partition_random_seed=0,
    plot=True,
    verbose=False,
)

In [None]:
# Persist the final model twice: once with save_model's default format
# and once explicitly as JSON.
best_model.save_model('NYC_model.bin')
best_model.save_model('NYC_model.json', format='json')

## Calculate predictions of the income levels in NYC

In [None]:
# Drop the identifier column so only model features remain, wrap the
# result in a Pool, and predict with the final model.
X_test = preds.drop(columns='id')
test_pool = Pool(data=X_test)
contest_predictions = best_model.predict(test_pool)

## Prepare the final file

In [None]:
# Write one "id,income" line per prediction.
# The header now ends with a newline (previously the first data row was
# appended to the header line), and the file is opened in a `with` block
# so it is closed even if a write fails.
with open('NYC_pred_income.csv', 'w') as f:
    f.write('id,income\n')
    # Pair ids and predictions positionally, in row order.
    for row_id, income in zip(preds['id'], contest_predictions):
        f.write(str(row_id) + ',' + str(income) + '\n')