# Machine Learning - CatBoost

In [None]:
# Echo the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Autoreload mode 2: re-import modules before each cell runs so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# (import name, pip distribution name) pairs. The first element is probed with
# is_lib_exists(); the second is what gets passed to `pip install`.
required_libs = [ ("numpy", "numpy"),
                 ("pandas", "pandas"),
                 ("seaborn", "seaborn"),
                 ("matplotlib", "matplotlib"),
                 ("catboost", "catboost"),
                 # The importable module is `sklearn`, but the distribution on PyPI
                 # is `scikit-learn`; the `sklearn` alias is deprecated and its
                 # install fails.
                 ("sklearn", "scikit-learn"),
                 ("ipywidgets", "ipywidgets"),
                 ("shap", "shap"),
                 ("colorama", "colorama"),
                 ("emoji", "emoji")
                ]

In [None]:
def is_lib_exists(name):
    """Return True when the module `name` can be located for import.

    Args:
        name: Absolute module name, e.g. ``"numpy"`` or ``"scipy.stats"``.

    Returns:
        bool: True if an import spec is found (or the module is already
        loaded), False otherwise. Never raises for a missing module.
    """
    # `import importlib` alone does not guarantee the `util` submodule is
    # loaded, so import it explicitly.
    import importlib.util
    import sys

    try:
        return importlib.util.find_spec(name) is not None
    except ModuleNotFoundError:
        # Raised (instead of returning None) when a parent package of a
        # dotted name is itself missing.
        return False
    except ValueError:
        # Raised for an empty name, or for an already-imported module whose
        # __spec__ is None; in the latter case the module does exist.
        return name in sys.modules

In [None]:
# Install every library from required_libs whose import name cannot be found.
# clz is the importable module name; lib is the name handed to pip.
for (clz,lib) in required_libs:
    if not is_lib_exists(clz):
        print(f"Installing {lib}")
        !pip install {lib}
    else:
        print(f"{lib} exists")

In [None]:
# Run-time switch and dataset identifiers consumed by the setup cell below.
COLAB = True  # True: clone the project repo and mount Google Drive

DATASET_NAME = "4D.zip"  # file name joined onto ORIGIN_DATASET_PATH

FEATURE_DATASET_PREFIX = "feature_matrix_d2_v3"  # stem of the .ft file under DATASET_PATH

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import calendar
import traceback
import catboost
from pathlib import Path
from dateutil.relativedelta import *
from datetime import *
from catboost import *
from catboost import datasets
from catboost import CatBoostClassifier
from scipy import stats
from scipy.stats.stats import pearsonr
np.set_printoptions(precision=4)
pd.options.display.max_columns = None

In [None]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# With no arguments, %aimport lists the modules autoreload is tracking.
%aimport

In [None]:
from IPython.display import display

In [None]:
# check catboost version
print(catboost.__version__)
# Version of the `python` executable found on the shell PATH.
!python --version

In [None]:
# colab setup
if COLAB:
  # Start from a fresh clone of the project repository.
  !rm -rf dl-projects
  !git clone https://github.com/mengwangk/dl-projects
  
  # Copy the helper modules into the working directory so that
  # `utils`, `preprocess` and `plot` can be imported directly.
  !cp dl-projects/utils* .
  !cp dl-projects/preprocess* .
  !cp dl-projects/plot* .
  
  from google.colab import drive
  drive.mount('/content/gdrive')
  # NOTE(review): relative path while the mount point is absolute; presumably
  # works because the working directory is /content — confirm.
  GDRIVE_DATASET_FOLDER = Path('gdrive/My Drive/datasets/')
  DATASET_PATH = GDRIVE_DATASET_FOLDER
  ORIGIN_DATASET_PATH = Path('dl-projects/datasets')
  !ls -l gdrive/"My Drive"/datasets/ --block-size=M

# NOTE(review): DATASET_PATH and ORIGIN_DATASET_PATH are assigned only inside
# the COLAB branch above, so these two lines raise NameError when COLAB is False.
DATASET = DATASET_PATH/f"{FEATURE_DATASET_PREFIX}.ft"
ORIGIN_DATASET = ORIGIN_DATASET_PATH/DATASET_NAME

## EDA

In [None]:
from preprocess import *
from utils import feature_selection, plot_feature_importances
from plot import plot_correlation_matrix, plot_labeled_scatter

In [None]:
# Base feature matrix, stored as a feather file at DATASET.
data = pd.read_feather(DATASET)
# format_tabular is presumably provided by the `preprocess` star import — TODO confirm.
origin_data = format_tabular(ORIGIN_DATASET)

In [None]:
# Monthly 2020 feature matrices (January to July), one feather file per month
# under DATASET_PATH, bound to the same names the later cells expect.
_MONTHS_2020 = ("jan", "feb", "mar", "apr", "may", "jun", "jul")
(jan_2020, feb_2020, mar_2020, apr_2020,
 may_2020, jun_2020, jul_2020) = (
    pd.read_feather(DATASET_PATH / f"feature_matrix_2020_{month}.ft")
    for month in _MONTHS_2020
)

In [None]:
# Stack the 2020 monthly matrices under the base matrix, keeping only the base
# columns (in base column order). DataFrame.append was removed in pandas 2.0,
# so one pd.concat is used; it also avoids re-copying the frame once per month.
monthly_frames = [jan_2020, feb_2020, mar_2020, apr_2020, may_2020, jun_2020, jul_2020]
new_data = pd.concat(
    [data] + [frame[data.columns] for frame in monthly_frames],
    ignore_index=True,
)
data.shape, new_data.shape

In [None]:
# From here on `data` refers to the combined matrix.
# NOTE(review): because `data` is rebound, re-running the previous cell
# afterwards stacks the monthly frames onto the already-combined matrix.
data = new_data

In [None]:
# Missing-value count per column, largest first.
data.isna().sum().sort_values(ascending=False)

In [None]:
# Column dtypes with non-null counts for up to 500 columns. The `null_counts`
# keyword was deprecated in pandas 1.2 and removed in 2.0; `show_counts` is
# its replacement.
data.info(max_cols=500, show_counts=True)
#data.columns.tolist()

In [None]:
# Zero-fill missing values, then order rows by time and MAX(Results.LuckyNo).
# The original row index is kept (no reset).
feature_matrix = (
    data
    .fillna(0)
    .sort_values(by=['time', 'MAX(Results.LuckyNo)'])
)
feature_matrix.head(20)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2, f_regression

# Columns removed from the candidate inputs before select_features scores them.
other_features = [
    'Label',
    'NumberId',
    'time',
    'TotalStrike',
    'month',
    'year',
]
#feature_matrix = feature_selection(data)

def select_features(df, func=f_regression, k=50):
  """Return the names of the `k` best-scoring feature columns of `df`.

  The columns listed in the module-level `other_features` are removed first;
  the remaining columns are scored against `df.Label` with SelectKBest.

  Args:
    df: DataFrame containing a `Label` column and every column named in
      `other_features`.
    func: Scoring function handed to SelectKBest (default `f_regression`).
    k: Number of columns to keep (default 50, the previously hard-coded value).

  Returns:
    pandas Index holding the names of the selected columns.
  """
  X = df.drop(columns=other_features)
  y = df.Label
  # Clamp k to the column count so a narrow frame keeps every column rather
  # than handing SelectKBest an out-of-range k.
  fs = SelectKBest(score_func=func, k=min(k, X.shape[1]))
  # Only the fitted support mask is needed; the transformed matrix the
  # original computed was never used.
  fs.fit(X, y)
  return X.columns[fs.get_support()]

def reduce_features(df, lower_threshold=10, upper_threshold=10000):
  """Drop the columns whose number of distinct values is out of range.

  A column is dropped when its distinct-value count is greater than
  `upper_threshold` or smaller than `lower_threshold`.

  Args:
    df: Input DataFrame (not modified).
    lower_threshold: Minimum distinct-value count a column needs to be kept.
    upper_threshold: Maximum distinct-value count a column may have to be kept.

  Returns:
    Tuple `(cols_to_drop, reduced_df)`: the list of dropped column names and
    a new DataFrame without them. The list is also printed.
  """
  # One nunique() pass over the frame instead of two per column.
  distinct_counts = df.nunique()
  out_of_range = (distinct_counts > upper_threshold) | (distinct_counts < lower_threshold)
  cols_to_drop = distinct_counts.index[out_of_range].tolist()
  print(cols_to_drop)
  # The original called the undefined name `dfdrop`, raising NameError on every call.
  return cols_to_drop, df.drop(columns=cols_to_drop)

def show_variations(df):
  """Placeholder with no implementation yet; ignores `df` and returns None."""


In [None]:
# import gc

#features = select_features(feature_matrix)
#numerical_features = features.to_list()

#gc.collect()

#features = select_features(feature_matrix, func=f_classif)
#numerical_features =  features.to_list()

#features = feature_selection(feature_matrix.drop(columns=other_features))
#numerical_features = features.columns.to_list()  

#features = feature_selection(feature_matrix.drop(columns=other_features))
#numerical_features = numerical_features + features.columns.to_list()  

#numerical_features = ['MAX(Results.CUM_SUM(DrawNo))', 'LAST(Results.CUM_SUM(DrawNo))', 'CUM_MEAN(MEAN(Results.DrawNo))', 'CUM_MEAN(LAST(Results.DrawNo))', 'CUM_MEAN(MAX(Results.DrawNo))', 'CUM_MEAN(COUNT(Results))', 'CUM_MEAN(SUM(Results.LuckyNo))', 'CUM_MEAN(STD(Results.DrawNo))', 'CUM_MEAN(SUM(Results.TotalStrike))', 'CUM_MEAN(SUM(Results.DrawNo))']

In [None]:
#selected_features = other_features + numerical_features
# Feature selection is currently bypassed: every column of feature_matrix is kept.
selected_features = feature_matrix.columns.to_list()
display(len(selected_features), selected_features)

In [None]:
# Restrict the feature matrix to the selected columns.
df_selected_features = feature_matrix[selected_features]

In [None]:
def split_data(df, dt):
  """Split `df` into training and validation sets around the timestamp `dt`.

  Rows with `time` earlier than `dt` become the training set; rows with
  `time` equal to `dt` become the validation set. Identifier and target
  columns are removed from the inputs, and any input column whose standard
  deviation over the training rows is not greater than zero is discarded
  from both sets.

  Returns:
    Tuple `(X_train, X_validation, y_train, y_validation)`.
  """
  non_feature_cols = ['NumberId', 'time', 'Label', 'TotalStrike', 'year']

  train_rows = df[df['time'] < dt]
  validation_rows = df[df['time'] == dt]

  y_train = train_rows['Label']
  y_validation = validation_rows['Label']

  # Keep only the columns that actually vary within the training rows.
  candidate_inputs = train_rows.drop(columns=non_feature_cols)
  varies = candidate_inputs.std() > .0
  X_train = candidate_inputs.loc[:, varies]

  # The validation inputs use exactly the columns that survived above.
  X_validation = validation_rows.drop(columns=non_feature_cols)[X_train.columns]

  # Reduce by number of distinct values
  #cols, X_train = reduce_features(X_train)
  #X_validation = X_validation.drop(columns=cols)

  return X_train, X_validation, y_train, y_validation
 

In [None]:
# Rows dated exactly TARGET_MONTH form the validation set; earlier rows are
# the training set (see split_data).
TARGET_MONTH = datetime(2020,1,1)

X_train, X_validation, y_train, y_validation = split_data(df_selected_features, TARGET_MONTH)
X_train.shape, X_validation.shape, y_train.shape, y_validation.shape

In [None]:
#df_test = X_train.loc[:, X_train.std() > .0]
#df_test.columns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

def select_by_variance(threshold=0, train_features=None):
  """Return the columns of `train_features` rejected by a variance filter.

  The original body read two undefined names (`train_features`,
  `constant_filter`), never fitted the filter and returned nothing. The frame
  is now an explicit argument and the filter is fitted before use.

  Args:
    threshold: Value passed to VarianceThreshold as `threshold`.
    train_features: DataFrame of numeric feature columns to examine.

  Returns:
    List of the column names VarianceThreshold does not keep. Its length is
    also printed, as before.

  Raises:
    ValueError: If `train_features` is not supplied. VarianceThreshold.fit may
      also raise ValueError itself (e.g. when no column passes the threshold).
  """
  if train_features is None:
    raise ValueError("select_by_variance requires a `train_features` DataFrame")
  # Named `variance_filter` so the builtin `filter` is not shadowed.
  variance_filter = VarianceThreshold(threshold=threshold)
  variance_filter.fit(train_features)
  # get_support() is True for kept columns; invert it to list the rejected ones.
  cols_to_drop = train_features.columns[~variance_filter.get_support()].tolist()
  print(len(cols_to_drop))
  return cols_to_drop


In [None]:
# https://stackabuse.com/applying-filter-methods-in-python-for-feature-selection/


In [None]:
# Collect the positional index (within X_train) of every int64 column that has
# at most 200 distinct values; these are passed to CatBoost as categorical.
CAT_FEATURES = []
int64_columns = X_train.select_dtypes(include=['int64']).columns
for col in int64_columns:
  position = X_train.columns.get_loc(col)
  distinct = X_train[col].nunique()
  print(f"{col}: Index - {position}, Unique values - {distinct}")
  if distinct <= 200:
    CAT_FEATURES.append(position)

#CAT_FEATURES = [0, 2, 3, 4, 8, 9, 10, 12, 13, 24, 26, 27, 34, 36, 46, 47, 48]
#CAT_FEATURES = [2, 3, 4, 8, 10, 27, 34, 46, 47]
CAT_FEATURES
#X_train.select_dtypes(include=['int64']).columns

In [None]:
# CatBoost Pool over the training split; CAT_FEATURES holds positional column indices.
pool_train = Pool(data=X_train, label=y_train, cat_features=CAT_FEATURES)

In [None]:
# class weights: 1 for label 0, and (label-0 count / label-1 count) for label 1
label_counts = y_train.value_counts()
class_weights = [1, label_counts[0] / label_counts[1]]
class_weights

In [None]:
#help(CatBoostClassifier)

In [None]:
# Logloss classifier; class_weights scales label 1 by the label-0/label-1
# count ratio computed from y_train.
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    loss_function="Logloss",
    class_weights=class_weights
)
# Fit on the training split and evaluate against the TARGET_MONTH validation
# split while training; plot=True requests CatBoost's training plot.
model.fit(
    X_train, y_train,
    cat_features=CAT_FEATURES,
    eval_set=(X_validation, y_validation),
    plot=True
)

# - 0.6920349 (81) - selectkbest 50
# - 0.6933487 (3)  - use all features
# - 0.6924407 (90) - reduce_features
# - 0.6929845 (3)  - std > 0.3

In [None]:
from catboost.utils import get_roc_curve
import sklearn
from sklearn import metrics

# NOTE(review): eval_pool is the *training* pool, so the ROC curve, its AUC and
# the thresholds later derived from eval_pool describe the fit on training
# data rather than held-out performance — confirm this is intended.
eval_pool = pool_train
curve = get_roc_curve(model, eval_pool)
# The curve unpacks into false-positive rates, true-positive rates and thresholds.
(fpr, tpr, thresholds) = curve
roc_auc = sklearn.metrics.auc(fpr, tpr)

In [None]:
import matplotlib.pyplot as plt

# ROC curve for eval_pool, with the computed AUC shown in the legend.
plt.figure(figsize=(16, 8))
lw = 2

plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc, alpha=0.5)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', alpha=0.5)

# The y-limit goes slightly above 1 so the top of the curve is not clipped.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('Receiver operating characteristic', fontsize=20)
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
from catboost.utils import get_fpr_curve
from catboost.utils import get_fnr_curve

# Derive threshold-indexed FPR and FNR curves from the ROC curve computed earlier.
# NOTE(review): this rebinds `fpr` and `thresholds`, names also assigned by the
# ROC cell; re-running the ROC plot afterwards would use these values instead.
(thresholds, fpr) = get_fpr_curve(curve=curve)
(thresholds, fnr) = get_fnr_curve(curve=curve)

In [None]:
# False-positive and false-negative rates plotted against the decision threshold.
plt.figure(figsize=(16, 8))
lw = 2

plt.plot(thresholds, fpr, color='blue', lw=lw, label='FPR', alpha=0.5)
plt.plot(thresholds, fnr, color='green', lw=lw, label='FNR', alpha=0.5)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('Threshold', fontsize=16)
plt.ylabel('Error Rate', fontsize=16)
plt.title('FPR-FNR curves', fontsize=20)
plt.legend(loc="lower left", fontsize=16)
plt.show()

In [None]:
from catboost.utils import select_threshold

# Decision thresholds on eval_pool for a target false-negative rate of 0.005
# and for a target false-positive rate of 0.005.
fnr_threshold = select_threshold(model=model, data=eval_pool, FNR=0.005)
fpr_threshold = select_threshold(model=model, data=eval_pool, FPR=0.005)
print(fnr_threshold)
print(fpr_threshold)

In [None]:
# Column 1 of predict_proba — presumably the probability of Label == 1; confirm
# against model.classes_.
probas = model.predict_proba(data=X_validation)[:,1]
preds = model.predict(data=X_validation)
# Positions (within the validation set) predicted as class 1.
positive = np.where((preds==1))
print(f'Total predicted to be positive: {len(positive[0])} \n')

print("Matched draws\n")
# Positions where the prediction and the true label are both 1.
matched_numbers = np.where((preds==1) & (y_validation ==1))
print(f"Count: {len(matched_numbers[0])},  {matched_numbers}")

print('\n\nAll matched')
# NOTE(review): the positional lookup below assumes month_data rows are in the
# same order as X_validation (both are feature_matrix filtered on TARGET_MONTH) — confirm.
month_data = feature_matrix.loc[feature_matrix['time'] == TARGET_MONTH]
numbers = month_data.iloc[matched_numbers[0]][['NumberId']]

# Original draw records in the target month whose LuckyNo is one of the matched NumberIds.
print(origin_data[(origin_data['DrawDate'].dt.year == TARGET_MONTH.year) & 
                    (origin_data['DrawDate'].dt.month == TARGET_MONTH.month) &  
                    (origin_data['LuckyNo'].isin(numbers['NumberId']))].head(100))   
print(f"\nAverage: {np.average(probas[positive])}\n")
print(f"Probas: {probas[positive]}\n")
print(f"Matched probas: {probas[matched_numbers]}\n")


In [None]:
# Positions in the validation set whose class-1 probability exceeds
# fpr_threshold (np.where returns them as a 1-tuple of index arrays).
probas_fpr = np.where(probas > fpr_threshold)

# Rows of the original draw data that fall in the target month; computed once
# and reused for both the draw count and the final lookup.
draw_dates = origin_data['DrawDate']
in_target_month = ((draw_dates.dt.year == TARGET_MONTH.year) &
                   (draw_dates.dt.month == TARGET_MONTH.month))

total_draws = origin_data[in_target_month]['DrawDate'].nunique()
print(f"Total draws: {total_draws}")
print(f" Total predicted: {len(probas_fpr[0])}")
print(probas_fpr)

# NumberIds at the selected positions, then the target-month draws that hit them.
numbers = month_data.iloc[probas_fpr][['NumberId']]
is_predicted_number = origin_data['LuckyNo'].isin(numbers['NumberId'])
print(origin_data[in_target_month & is_predicted_number].head(100))

In [None]:
# Feature importances of the fitted model, returned in prettified form.
model.get_feature_importance(prettified=True)

In [None]:
"""
Class weight

from sklearn.utils import class_weight
cw = list(class_weight.compute_class_weight('balanced',
                                             np.unique(df_train['Target']),
                                             df_train['Target']))
"""

"""
imbalanced-learn package
"""