# Machine Learning - CatBoost

In [23]:
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
required_libs = [ ("numpy", "numpy"),
                 ("pandas", "pandas"),
                 ("seaborn", "seaborn"),
                 ("matplotlib", "matplotlib"),
                 ("catboost", "catboost"),
                 ("sklearn", "sklearn"),
                 ("ipywidgets", "ipywidgets"),
                 ("shap", "shap"),
                 ("colorama", "colorama"),
                 ("emoji", "emoji")
                ]

In [25]:
def is_lib_exists(name):
    import importlib
    lib = importlib.util.find_spec(name)
    return lib is not None

In [26]:
for (clz,lib) in required_libs:
    if not is_lib_exists(clz):
        print(f"Installing {lib}")
        !pip install {lib}
    else:
        print(f"{lib} exists")

numpy exists
pandas exists
seaborn exists
matplotlib exists
catboost exists
sklearn exists
ipywidgets exists
shap exists
colorama exists
emoji exists


In [27]:
COLAB = True

DATASET_NAME = '4D.zip'

FEATURE_DATASET_PREFIX = 'feature_matrix_d2_v3'

In [28]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import calendar
import traceback
import catboost
from pathlib import Path
from dateutil.relativedelta import *
from datetime import *
from catboost import *
from catboost import datasets
np.set_printoptions(precision=4)

In [29]:
%matplotlib inline
%aimport

Modules to reload:
all-except-skipped

Modules to skip:



In [30]:
from IPython.display import display

In [31]:
# check catboost version
print(catboost.__version__)
!python --version

0.24
Python 3.6.9


In [32]:
# colab setup
if COLAB:
  !rm -rf dl-projects
  !git clone https://github.com/mengwangk/dl-projects
  
  !cp dl-projects/utils* .
  !cp dl-projects/preprocess* .
  !cp dl-projects/plot* .
  
  from google.colab import drive
  drive.mount('/content/gdrive')
  GDRIVE_DATASET_FOLDER = Path('gdrive/My Drive/datasets/')
  DATASET_PATH = GDRIVE_DATASET_FOLDER
  ORIGIN_DATASET_PATH = Path('dl-projects/datasets')
  !ls -l gdrive/"My Drive"/datasets/ --block-size=M

DATASET = DATASET_PATH/f"{FEATURE_DATASET_PREFIX}.ft"
ORIGIN_DATASET = ORIGIN_DATASET_PATH/DATASET_NAME

Cloning into 'dl-projects'...
remote: Enumerating objects: 158, done.[K
remote: Counting objects: 100% (158/158), done.[K
remote: Compressing objects: 100% (144/144), done.[K
remote: Total 2135 (delta 97), reused 32 (delta 14), pack-reused 1977[K
Receiving objects: 100% (2135/2135), 79.29 MiB | 28.05 MiB/s, done.
Resolving deltas: 100% (1322/1322), done.
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
total 4948M
-rw------- 1 root root   17M Jul  6 14:06 feature_matrix_2020_apr.ft
-rw------- 1 root root   17M Jul  6 14:06 feature_matrix_2020_apr_orig.pkl
-rw------- 1 root root   17M Mar  1 05:47 feature_matrix_2020_feb.ft
-rw------- 1 root root   17M Mar  1 05:39 feature_matrix_2020_jan.ft
-rw------- 1 root root   17M Jul  7 13:52 feature_matrix_2020_jul.ft
-rw------- 1 root root   17M Jul  7 13:52 feature_matrix_2020_jul_orig.pkl
-rw------- 1 root root   17M Jul  5 07:30 feature_matrix_2020_jun.ft
-

## EDA

In [33]:
from preprocess import *
from utils import feature_selection, plot_feature_importances
from plot import plot_correlation_matrix, plot_labeled_scatter

In [34]:
data = pd.read_feather(DATASET)
origin_data = format_tabular(ORIGIN_DATASET)

In [35]:
jan_2020 = pd.read_feather(DATASET_PATH/f"feature_matrix_2020_jan.ft")
feb_2020 = pd.read_feather(DATASET_PATH/f"feature_matrix_2020_feb.ft")
mar_2020 = pd.read_feather(DATASET_PATH/f"feature_matrix_2020_mar.ft")
apr_2020 = pd.read_feather(DATASET_PATH/f"feature_matrix_2020_apr.ft")
may_2020 = pd.read_feather(DATASET_PATH/f"feature_matrix_2020_may.ft")
jun_2020 = pd.read_feather(DATASET_PATH/f"feature_matrix_2020_jun.ft")
jul_2020 = pd.read_feather(DATASET_PATH/f"feature_matrix_2020_jul.ft")

In [36]:
new_data = data.append(jan_2020[data.columns],ignore_index=True)
new_data = new_data.append(feb_2020[data.columns],ignore_index=True)
new_data = new_data.append(mar_2020[data.columns],ignore_index=True)
new_data = new_data.append(apr_2020[data.columns],ignore_index=True)
new_data = new_data.append(may_2020[data.columns],ignore_index=True)
new_data = new_data.append(jun_2020[data.columns],ignore_index=True)
new_data = new_data.append(jul_2020[data.columns],ignore_index=True)
data.shape, new_data.shape 

((959893, 217), (1029893, 217))

In [37]:
data = new_data

In [38]:
data.isna().sum().sort_values(ascending=False)

CUM_SUM(SKEW(Results.TotalStrike))           7685
CUM_MEAN(TREND(Results.DrawNo, DrawDate))    7685
TREND(Results.CUM_SUM(DrawNo), DrawDate)     7685
TREND(Results.CUM_SUM(LuckyNo), DrawDate)    7685
CUM_SUM(SKEW(Results.LuckyNo))               7685
                                             ... 
CUM_SUM(MIN(Results.DrawNo))                    0
NUM_UNIQUE(Results.DAY(DrawDate))               0
NUM_UNIQUE(Results.MONTH(DrawDate))             0
SUM(Results.PERCENTILE(LuckyNo))                0
NumberId                                        0
Length: 217, dtype: int64

In [39]:
data.info(max_cols=500, null_counts=True)
#data.columns.tolist()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1029893 entries, 0 to 1029892
Data columns (total 217 columns):
 #   Column                                                  Non-Null Count    Dtype         
---  ------                                                  --------------    -----         
 0   NumberId                                                1029893 non-null  int64         
 1   time                                                    1029893 non-null  datetime64[ns]
 2   STD(Results.DrawNo)                                     1028948 non-null  float64       
 3   STD(Results.TotalStrike)                                1028948 non-null  float64       
 4   STD(Results.LuckyNo)                                    1028948 non-null  float64       
 5   MAX(Results.DrawNo)                                     1029893 non-null  int64         
 6   MAX(Results.TotalStrike)                                1029893 non-null  int64         
 7   MAX(Results.LuckyNo)               

In [40]:
feature_matrix = data

In [44]:
#columns = ['Number', 'time', 'TotalStrike',  'month', 'year', 'Label']
#cols = ['Number', 'time', 'TotalStrike',  'month', 'year', 'Label']
#feature_matrix = feature_selection(data)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

feature_matrix = data.fillna(0)
X = feature_matrix.drop(columns=['year', 'time', 'Label', 'TotalStrike'])
y = feature_matrix.Label
fs = SelectKBest(score_func=f_classif)
X_selected = fs.fit_transform(X, y)
print(X_selected.shape)

  88  89 101 122 127 130 133 146 148 149 154 157 159 160 161 162 169 170
 173 175 176 210 211] are constant.
  f = msb / msw


(1029893, 10)


In [50]:
feature_names = list(feature_matrix.columns.values)
mask = fs.get_support() #list of booleans
new_features = [] # The list of your K best features
for bool, feature in zip(mask, feature_names):
    if bool:
        new_features.append(feature)
print(new_features)

# Get columns to keep and create new dataframe with those only
cols = fs.get_support(indices=True)
feature_matrix_selected = feature_matrix.iloc[:,cols]
feature_matrix_selected.shape, feature_matrix_selected

['MAX(Results.PERCENTILE(TotalStrike))', 'LAST(Results.MONTH(DrawDate))', 'CUM_MEAN(MIN(Results.DrawNo))', 'CUM_MEAN(STD(Results.LuckyNo))', 'CUM_MEAN(MEAN(Results.LuckyNo))', 'CUM_MEAN(MAX(Results.DrawNo))', 'CUM_MEAN(MIN(Results.LuckyNo))', 'CUM_MEAN(LAST(Results.TotalStrike))', 'CUM_MEAN(TREND(Results.LuckyNo, DrawDate))', 'CUM_MEAN(SKEW(Results.TotalStrike))']


((1029893, 10),
          MAX(Results.PERCENTILE(TotalStrike))  ...  CUM_MEAN(SKEW(Results.TotalStrike))
 0                                    0.500007  ...                                  0.0
 1                                    0.500007  ...                                  0.0
 2                                    0.500007  ...                                  0.0
 3                                    0.500007  ...                                  0.0
 4                                    0.500007  ...                                  0.0
 ...                                       ...  ...                                  ...
 1029888                              0.500005  ...                                  0.0
 1029889                              0.500005  ...                                  0.0
 1029890                              0.500005  ...                                  0.0
 1029891                              0.500005  ...                                  0.0
 1029