# Machine Learning - CatBoost

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the
# last one.
InteractiveShell.ast_node_interactivity = "all"

# Reload imported modules automatically before code is executed, so edits to
# helper .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# (import name, pip distribution name) pairs for every third-party package the
# notebook needs. The import name is what gets probed for; the distribution
# name is what gets handed to pip when the probe fails.
required_libs = [
    ("numpy", "numpy"),
    ("pandas", "pandas"),
    ("seaborn", "seaborn"),
    ("matplotlib", "matplotlib"),
    ("catboost", "catboost"),
    # The module is imported as `sklearn` but published on PyPI as
    # `scikit-learn`; the `sklearn` PyPI name is a deprecated placeholder.
    ("sklearn", "scikit-learn"),
    ("ipywidgets", "ipywidgets"),
    ("shap", "shap"),
    ("colorama", "colorama"),
    ("emoji", "emoji"),
]

In [4]:
def is_lib_exists(name):
    """Tell whether the module ``name`` can be found by the import system.

    Args:
        name: Absolute module name, e.g. ``"numpy"`` or ``"sklearn.svm"``.

    Returns:
        True when a module spec is found for ``name``, False otherwise
        (including when a parent package of a dotted name is missing).
    """
    # `import importlib` alone does not guarantee that the `importlib.util`
    # submodule is loaded, so import the submodule explicitly.
    import importlib.util
    try:
        return importlib.util.find_spec(name) is not None
    except ModuleNotFoundError:
        # For dotted names find_spec imports the parent package first and
        # raises when that parent is missing; that simply means "not there".
        return False

In [5]:
for (clz,lib) in required_libs:
    if not is_lib_exists(clz):
        print(f"Installing {lib}")
        !pip install {lib}
    else:
        print(f"{lib} exists")

numpy exists
pandas exists
seaborn exists
matplotlib exists
Installing catboost
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/96/6c/6608210b29649267de52001b09e369777ee2a5cfe1c71fa75eba82a4f2dc/catboost-0.24-cp36-none-manylinux1_x86_64.whl (65.9MB)
[K     |████████████████████████████████| 65.9MB 57kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24
sklearn exists
ipywidgets exists
Installing shap
Collecting shap
[?25l  Downloading https://files.pythonhosted.org/packages/a8/77/b504e43e21a2ba543a1ac4696718beb500cfa708af2fb57cb54ce299045c/shap-0.35.0.tar.gz (273kB)
[K     |████████████████████████████████| 276kB 2.7MB/s 
Building wheels for collected packages: shap
  Building wheel for shap (setup.py) ... [?25l[?25hdone
  Created wheel for shap: filename=shap-0.35.0-cp36-cp36m-linux_x86_64.whl size=394127 sha256=e0253413fb78878e7aa76507f9d5468c0e86415206b8f7a162398589315462f4
  Stored in directory: /root/.cache/pip/whee

In [6]:
# Switches the Colab-specific setup (repository clone, Drive mount) on or off.
COLAB = True

# File name of the original dataset archive; joined to ORIGIN_DATASET_PATH.
DATASET_NAME = "4D.zip"

# Stem of the feature-matrix file; ".ft" is appended when the path is built.
FEATURE_DATASET_PREFIX = "feature_matrix_d2_v3"

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import calendar
import traceback
import catboost
from pathlib import Path
from dateutil.relativedelta import *
from datetime import *
from catboost import *
from catboost import datasets
# Print NumPy floating-point values with 4 digits of precision.
np.set_printoptions(precision=4)

  import pandas.util.testing as tm


In [8]:
# Render matplotlib figures inline, below the cell that produced them.
%matplotlib inline
# With no arguments, %aimport only lists which modules autoreload will reload
# and which it will skip.
%aimport

Modules to reload:
all-except-skipped

Modules to skip:



In [9]:
from IPython.display import display

In [10]:
# Record the CatBoost and Python versions used for this run.
print(catboost.__version__)
# Reports whichever `python` is first on the shell PATH.
# NOTE(review): that is not necessarily the interpreter running this kernel.
!python --version

0.24
Python 3.6.9


In [11]:
# colab setup
if COLAB:
  !rm -rf dl-projects
  !git clone https://github.com/mengwangk/dl-projects
  
  !cp dl-projects/utils* .
  !cp dl-projects/preprocess* .
  !cp dl-projects/plot* .
  
  from google.colab import drive
  drive.mount('/content/gdrive')
  GDRIVE_DATASET_FOLDER = Path('gdrive/My Drive/datasets/')
  DATASET_PATH = GDRIVE_DATASET_FOLDER
  ORIGIN_DATASET_PATH = Path('dl-projects/datasets')
  !ls -l gdrive/"My Drive"/datasets/ --block-size=M

DATASET = DATASET_PATH/f"{FEATURE_DATASET_PREFIX}.ft"
ORIGIN_DATASET = ORIGIN_DATASET_PATH/DATASET_NAME

Cloning into 'dl-projects'...
remote: Enumerating objects: 169, done.[K
remote: Counting objects: 100% (169/169), done.[K
remote: Compressing objects: 100% (153/153), done.[K
remote: Total 2146 (delta 104), reused 36 (delta 16), pack-reused 1977[K
Receiving objects: 100% (2146/2146), 79.30 MiB | 12.89 MiB/s, done.
Resolving deltas: 100% (1329/1329), done.
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
total 4948M
-rw------- 1 root root   17M Jul  6 14:06 feature_matrix_2020_apr.ft
-rw------- 1 root ro

## EDA

In [12]:
from preprocess import *
from utils import feature_selection, plot_feature_importances
from plot import plot_correlation_matrix, plot_labeled_scatter

In [13]:
# Load the pre-computed feature matrix from its Feather file.
data = pd.read_feather(DATASET)
# Load the original dataset archive through the project helper.
# NOTE(review): format_tabular is presumably provided by
# `from preprocess import *`; its return type is not visible here -- confirm.
origin_data = format_tabular(ORIGIN_DATASET)

In [14]:
# Read the seven monthly 2020 feature matrices (January through July) from
# DATASET_PATH. The generator yields one frame per month in order, and the
# tuple unpacking binds each frame to its own name.
jan_2020, feb_2020, mar_2020, apr_2020, may_2020, jun_2020, jul_2020 = (
    pd.read_feather(DATASET_PATH/f"feature_matrix_2020_{month}.ft")
    for month in ("jan", "feb", "mar", "apr", "may", "jun", "jul")
)

In [15]:
# Stack the monthly 2020 matrices under the base matrix in a single pass.
# DataFrame.append is deprecated (removed in pandas 2.0) and copies the whole
# frame on every call; one pd.concat builds the same result with one copy.
# Each monthly frame is first restricted/reordered to the base columns.
new_data = pd.concat(
    [data] + [
        monthly[data.columns]
        for monthly in (jan_2020, feb_2020, mar_2020, apr_2020,
                        may_2020, jun_2020, jul_2020)
    ],
    ignore_index=True,
)
# Row/column counts before and after, as a quick sanity check.
data.shape, new_data.shape

((959893, 217), (1029893, 217))

In [16]:
# From here on `data` refers to the combined matrix.
# NOTE(review): because `data` is rebound, re-running the cell that builds
# `new_data` after this one would add the monthly frames a second time.
data = new_data

In [17]:
# Number of missing values per column, largest count first.
data.isna().sum().sort_values(ascending=False)

CUM_SUM(SKEW(Results.TotalStrike))           7685
CUM_MEAN(TREND(Results.DrawNo, DrawDate))    7685
TREND(Results.CUM_SUM(DrawNo), DrawDate)     7685
TREND(Results.CUM_SUM(LuckyNo), DrawDate)    7685
CUM_SUM(SKEW(Results.LuckyNo))               7685
                                             ... 
CUM_SUM(MIN(Results.DrawNo))                    0
NUM_UNIQUE(Results.DAY(DrawDate))               0
NUM_UNIQUE(Results.MONTH(DrawDate))             0
SUM(Results.PERCENTILE(LuckyNo))                0
NumberId                                        0
Length: 217, dtype: int64

In [18]:
# Print dtype and non-null count for every column; max_cols is raised so the
# per-column listing is not replaced by a summary.
# NOTE(review): `null_counts` was deprecated in pandas 1.2 and later removed
# in favour of `show_counts`; switch when the pandas version is upgraded.
data.info(max_cols=500, null_counts=True)
#data.columns.tolist()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1029893 entries, 0 to 1029892
Data columns (total 217 columns):
 #   Column                                                  Non-Null Count    Dtype         
---  ------                                                  --------------    -----         
 0   NumberId                                                1029893 non-null  int64         
 1   time                                                    1029893 non-null  datetime64[ns]
 2   STD(Results.DrawNo)                                     1028948 non-null  float64       
 3   STD(Results.TotalStrike)                                1028948 non-null  float64       
 4   STD(Results.LuckyNo)                                    1028948 non-null  float64       
 5   MAX(Results.DrawNo)                                     1029893 non-null  int64         
 6   MAX(Results.TotalStrike)                                1029893 non-null  int64         
 7   MAX(Results.LuckyNo)               

In [19]:
# Replace every missing value with 0. `fillna` returns a new frame, so `data`
# itself keeps its NaNs. (The earlier `feature_matrix = data` assignment was
# overwritten immediately and has been dropped.)
feature_matrix = data.fillna(0)

In [23]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2

#columns = ['Number', 'time', 'TotalStrike',  'month', 'year', 'Label']
#cols = ['Number', 'time', 'TotalStrike',  'month', 'year', 'Label']
#feature_matrix = feature_selection(data)
def select_features(df, k=10):
  """Pick the ``k`` columns that score best against ``df.Label``.

  Candidate features are all columns of ``df`` except 'year', 'time', 'Label'
  and 'TotalStrike'. They are scored with ``f_classif`` through
  ``SelectKBest``.

  Args:
    df: DataFrame holding the candidate feature columns plus the 'year',
      'time', 'Label' and 'TotalStrike' columns (all four must be present).
    k: Number of features to keep. Defaults to 10, the value that used to be
      hard-coded.

  Returns:
    The column labels of the selected features (a slice of ``X.columns``).
  """
  X = df.drop(columns=['year', 'time', 'Label', 'TotalStrike'])
  y = df.Label
  fs = SelectKBest(score_func=f_classif, k=k)
  # Only the support mask is needed, so fit without also building the
  # transformed matrix (it used to be computed and thrown away).
  fs.fit(X, y)
  return X.columns[fs.get_support()]

In [35]:
# Hard-coded feature subset used for modelling.
# NOTE(review): presumably the result of an earlier
# `select_features(feature_matrix)` run with 'NumberId' and 'month'
# prepended -- confirm before regenerating.
selected_features = [
    'NumberId',
    'month',
    'MAX(Results.CUM_SUM(DrawNo))',
    'LAST(Results.CUM_SUM(DrawNo))',
    'CUM_MEAN(MEAN(Results.DrawNo))',
    'CUM_MEAN(LAST(Results.DrawNo))',
    'CUM_MEAN(MAX(Results.DrawNo))',
    'CUM_MEAN(COUNT(Results))',
    'CUM_MEAN(SUM(Results.LuckyNo))',
    'CUM_MEAN(STD(Results.DrawNo))',
    'CUM_MEAN(SUM(Results.TotalStrike))',
    'CUM_MEAN(SUM(Results.DrawNo))',
]
# Show how many features were chosen, then the list itself.
display(len(selected_features), selected_features)

12

['NumberId',
 'month',
 'MAX(Results.CUM_SUM(DrawNo))',
 'LAST(Results.CUM_SUM(DrawNo))',
 'CUM_MEAN(MEAN(Results.DrawNo))',
 'CUM_MEAN(LAST(Results.DrawNo))',
 'CUM_MEAN(MAX(Results.DrawNo))',
 'CUM_MEAN(COUNT(Results))',
 'CUM_MEAN(SUM(Results.LuckyNo))',
 'CUM_MEAN(STD(Results.DrawNo))',
 'CUM_MEAN(SUM(Results.TotalStrike))',
 'CUM_MEAN(SUM(Results.DrawNo))']

In [36]:
# Keep only the chosen feature columns, in the order given by
# `selected_features`.
df_selected_features = feature_matrix[selected_features]
# Preview the first 10 rows of the reduced frame.
df_selected_features.head(10)

Unnamed: 0,NumberId,month,MAX(Results.CUM_SUM(DrawNo)),LAST(Results.CUM_SUM(DrawNo)),CUM_MEAN(MEAN(Results.DrawNo)),CUM_MEAN(LAST(Results.DrawNo)),CUM_MEAN(MAX(Results.DrawNo)),CUM_MEAN(COUNT(Results)),CUM_MEAN(SUM(Results.LuckyNo)),CUM_MEAN(STD(Results.DrawNo)),CUM_MEAN(SUM(Results.TotalStrike)),CUM_MEAN(SUM(Results.DrawNo))
0,72,1,11495781906,11495781906,196252.105443,320235.911096,320235.911096,7.685925,38273.046355,93810.296906,7.685925,1523592.0
1,98,1,12188302975,12188302975,182088.440653,320338.948478,320338.948478,8.519906,42294.318501,102972.123933,8.519906,1571202.0
2,121,1,9379634343,9379634343,196627.205652,320292.638369,320292.638369,7.673501,38234.993904,93602.289102,7.673501,1523614.0
3,166,1,14907006484,14907006484,183613.75982,321389.714521,321389.714521,8.556106,42439.10231,102655.335926,8.556106,1593655.0
4,192,1,12964606141,12964606141,196466.83765,320315.234187,320315.234187,7.681403,38266.548886,93709.527267,7.681403,1524125.0
5,249,1,12957842041,12957842041,191492.449552,320040.548749,320040.548749,7.893204,39295.996445,96511.740193,7.893204,1531456.0
6,276,1,13117595851,13117595851,186753.297918,319273.143586,319273.143586,8.083562,40381.989015,98940.339748,8.083562,1532265.0
7,285,1,14846192836,14846192836,184064.938551,319428.858538,319428.858538,8.239761,41164.584121,100787.269205,8.239761,1540060.0
8,290,1,6158342426,6158342426,181794.599518,319458.461121,319458.461121,8.396926,41547.275769,102160.1097,8.396926,1550424.0
9,346,1,14392219921,14392219921,182129.500256,319332.489083,319332.489083,8.357351,41327.684134,102146.522019,8.357351,1545596.0


In [37]:
def split_data(df):
  # Split the dataset
  pass