<a href="https://colab.research.google.com/github/mengwangk/dl-projects/blob/master/04_04_auto_ml_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automated ML - Modeling

In [0]:
# Run-environment switch: when True the notebook clones the project repo,
# mounts Google Drive and reads the feature matrix from Drive.
COLAB = True

# Zip archive with the original draw results (joined onto ORIGIN_DATASET_PATH below).
DATASET_NAME = '4D.zip'

# File stem of the pickled feature matrix; ".pkl" is appended when DATASET is built.
FEATURE_DATASET_PREFIX = 'feature_matrix_d2_v1'

In [0]:
# On Colab, always start from a fresh checkout of the project repository:
# any previous clone is deleted before cloning again.
if COLAB:
  # !sudo apt-get install git-lfs && git lfs install
  !rm -rf dl-projects
  !git clone https://github.com/mengwangk/dl-projects
  #!cd dl-projects && ls -l --block-size=M

In [0]:
# Copy the repo's utils* and preprocess* files into the working directory so
# that `from utils import *` / `from preprocess import *` resolve on Colab.
if COLAB:
  !cp dl-projects/utils* .
  !cp dl-projects/preprocess* .

In [0]:
# (Re)load the autoreload extension; mode 2 re-imports all modules before each
# cell runs, so edits to the copied helper modules are picked up automatically.
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
import math 
import matplotlib
import sys

from scipy import stats
from collections import Counter
from pathlib import Path

# Matplotlib style sheet applied to all figures.
plt.style.use('fivethirtyeight')

# NOTE(review): sns.set() also rewrites matplotlib rcParams, so it may override
# parts of the style sheet selected above — confirm which look is intended.
sns.set(style="ticks")

# Automated feature engineering
import featuretools as ft

import warnings
warnings.filterwarnings('ignore')

# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, precision_recall_curve, roc_curve, mean_squared_error, accuracy_score, average_precision_score
from sklearn.model_selection import train_test_split, cross_val_score
#from sklearn.ensemble import RandomForestClassifier
#from scikitplot.plotters import plot_precision_recall_curve
from dateutil.relativedelta import relativedelta

from IPython.display import display

from utils import *
from preprocess import *

# Print NumPy arrays in full instead of summarising long ones with "...".
np.set_printoptions(threshold=sys.maxsize)

# The Answer to the Ultimate Question of Life, the Universe, and Everything.
# NOTE(review): this seeds only NumPy's global generator; the XGBoost model
# created later is not given an explicit seed here.
np.random.seed(42)

In [0]:
# With no arguments, %aimport only reports which modules autoreload will
# (or will not) re-import; it changes nothing.
%aimport

## Preparation

In [0]:
# Mount Google Drive (interactive authorisation on Colab) and point at the
# datasets folder stored there.
if COLAB:
  from google.colab import drive
  drive.mount('/content/gdrive')
  # NOTE(review): the mount point is absolute ('/content/gdrive') but this path
  # is relative, so it only resolves while the working directory is /content.
  GDRIVE_DATASET_FOLDER = Path('gdrive/My Drive/datasets/')

In [0]:
if COLAB:
  DATASET_PATH = GDRIVE_DATASET_FOLDER
  ORIGIN_DATASET_PATH = Path('dl-projects/datasets')
else:
  DATASET_PATH = Path("datasets")
  ORIGIN_DATASET_PATH = Path('datasets')

DATASET = DATASET_PATH/f"{FEATURE_DATASET_PREFIX}.pkl"
ORIGIN_DATASET = ORIGIN_DATASET_PATH/DATASET_NAME

if COLAB:
  !ls -l gdrive/"My Drive"/datasets/ --block-size=M
  !ls -l dl-projects/datasets --block-size=M

In [0]:
#data = pd.read_csv(DATASET, header=0, sep=',', quotechar='"', parse_dates=['time'])
#data = pd.read_csv(DATASET_PATH/"feature_matrix_2.csv", header=0, sep=',', quotechar='"', parse_dates=['time'])
# Load the pre-computed feature matrix.
# NOTE(review): unpickling executes arbitrary code — only load pickles you produced yourself.
data = pd.read_pickle(DATASET)
# Original draw results; format_tabular is presumably supplied by the
# `from preprocess import *` star import — confirm.
origin_data = format_tabular(ORIGIN_DATASET)

In [0]:
# Column dtypes, non-null counts and memory footprint of the feature matrix.
data.info()

## Exploratory Data Analysis

In [0]:
# Alias, not a copy: `feature_matrix` and `data` refer to the same DataFrame object.
feature_matrix = data

In [0]:
# List every column available in the feature matrix.
display(feature_matrix.columns)

In [0]:
# First four rows, transposed so each feature is a row (easier to read for wide frames).
display(feature_matrix.head(4).T)

In [0]:
# Spot check: first ten original draw rows where the lucky number is 911.
origin_data[origin_data['LuckyNo']==911].head(10)

In [0]:
# feature_matrix.groupby('time')['COUNT(Results)'].mean().plot()
# plt.title('Average Monthly Count of Results')
# plt.ylabel('Strike Per Number')

## Feature Selection

In [0]:
from utils import feature_selection

%load_ext autoreload
%autoreload 2

In [0]:
# Run the project's feature_selection helper on everything except the time/id
# columns; `drop` returns a new frame, so `feature_matrix` itself is unchanged.
feature_matrix_selection = feature_selection(feature_matrix.drop(columns = ['time', 'NumberId']))

In [0]:
# Put back the time/id columns removed before selection, and (re)set the
# target column from the full matrix.
# NOTE(review): column assignment aligns on the index — this assumes
# feature_selection preserved the original row index; confirm.
feature_matrix_selection['time'] = feature_matrix['time']
feature_matrix_selection['NumberId'] = feature_matrix['NumberId']
feature_matrix_selection['Label'] = feature_matrix['Label']

In [0]:
# Number of columns kept after selection, followed by their names.
len(feature_matrix_selection.columns), feature_matrix_selection.columns

## Correlations

In [0]:
# feature_matrix_selection = feature_matrix

In [0]:
# (rows, columns) of the selected feature matrix.
feature_matrix_selection.shape

In [0]:
# Pairwise correlations, rows ordered by their correlation with 'TotalStrike'.
# Numeric/bool columns are selected explicitly because the frame still holds the
# datetime 'time' column: older pandas dropped such columns silently inside
# corr(), while pandas >= 2.0 (numeric_only defaults to False) raises on them.
corrs = (
    feature_matrix_selection
    .select_dtypes(include=['number', 'bool'])
    .corr()
    .sort_values('TotalStrike')
)
# Features most negatively correlated with 'TotalStrike'.
corrs['TotalStrike'].head()

In [0]:
# Correlation with 'Label' for the last 15 rows; rows are still ordered by
# their 'TotalStrike' correlation (the sort applied when `corrs` was built).
corrs['Label'].dropna().tail(15)

In [0]:
# The eight strongest positive correlations with 'TotalStrike'
# (`corrs` is sorted ascending on that column).
corrs['TotalStrike'].dropna().tail(8)

## Visualization

In [0]:
#pip install autoviz

In [0]:
#from autoviz.AutoViz_Class import AutoViz_Class

### XGBoost

In [0]:
import xgboost as xgb

In [0]:
# https://xgboost.readthedocs.io/en/latest/parameter.html
model = xgb.XGBClassifier( max_delta_step=8)
# model = xgb.XGBClassifier(max_depth=50, min_child_weight=1,  n_estimators=200,
#                           n_jobs=-1 , verbose=1,learning_rate=0.16, max_delta_step=5)

In [0]:
def predict_dt(dt, feature_matrix, return_probs = False):
    """Train on every month before ``dt`` and evaluate on the month ``dt``.

    Rows are split on the ``time`` column: ``time < dt`` is the training set,
    ``time == dt`` is the test set.  Features are median-imputed and min-max
    scaled (both fitted on the training rows only), then the notebook-level
    ``model`` is refitted and scored.  Metrics, the confusion matrix and the
    matching rows of the notebook-level ``origin_data`` are printed/displayed
    as a side effect.

    Parameters
    ----------
    dt : datetime-like
        Month to predict.  Compared against ``feature_matrix['time']`` and
        must expose ``.year`` and ``.month``.
    feature_matrix : pandas.DataFrame
        Must contain the columns ``time``, ``Label`` and ``NumberId``.
        The frame is not modified.
    return_probs : bool, default False
        When True, also return the predicted positive-class probabilities.

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, numpy.ndarray)
        Feature importances with columns ``feature`` and ``importance``;
        followed by the test-set probabilities when ``return_probs`` is True.

    Raises
    ------
    ValueError
        If there are no rows before ``dt`` to train on, or no rows for ``dt``.
    """
    # Identifier, target and calendar columns that must not be fed to the model.
    # They are dropped with errors='ignore', so absent ones are simply skipped.
    non_feature_cols = ['NumberId', 'time', 'date', 'Label', 'TotalStrike',
                        'month', 'year', 'index']

    # Split on 'time' directly instead of writing a helper 'date' column into
    # the caller's DataFrame.
    train_rows = feature_matrix.loc[feature_matrix['time'] < dt]
    month_rows = feature_matrix.loc[feature_matrix['time'] == dt]

    # Subset labels
    test_labels = month_rows['Label']
    train_labels = train_rows['Label']

    print(f"Size of test labels {len(test_labels)}")
    print(f"Size of train labels {len(train_labels)}")

    # Features
    X_train = train_rows.drop(columns=non_feature_cols, errors='ignore')
    X_test = month_rows.drop(columns=non_feature_cols, errors='ignore')
    print(f"Size of X train {len(X_train)}")
    print(f"Size of X test  {len(X_test)}")

    # Fail early with a clear message rather than deep inside scikit-learn.
    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            f"Cannot predict {dt}: {len(X_train)} training rows before that "
            f"date and {len(X_test)} test rows on it."
        )

    # NOTE(review): SimpleImputer drops columns that are entirely NaN in the
    # training rows; if that happens these names no longer line up with
    # model.feature_importances_ below — confirm no such column exists.
    feature_names = list(X_train.columns)

    # Impute and scale features
    pipeline = Pipeline([('imputer', SimpleImputer(strategy = 'median')),
                         ('scaler', MinMaxScaler())])

    # Fit on the training rows only, then apply the same transform to the test rows
    X_train = pipeline.fit_transform(X_train)
    X_test = pipeline.transform(X_test)

    # Labels
    y_train = np.array(train_labels).reshape((-1, ))
    y_test = np.array(test_labels).reshape((-1, ))

    print('Training on {} observations.'.format(len(X_train)))
    print('Testing on {} observations.\n'.format(len(X_test)))

    # Train (refits the notebook-level `model`)
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    # Total positive
    positive = np.where(predictions == 1)
    n_positive = len(positive[0])
    print('Total predicted to be positive: ', n_positive)

    # Calculate metrics
    p_score = precision_score(y_test, predictions)
    r_score = recall_score(y_test, predictions)
    f_score = f1_score(y_test, predictions)
    # ROC AUC is undefined when the test month contains a single class.
    if len(np.unique(y_test)) < 2:
        auc_score = float('nan')
    else:
        auc_score = roc_auc_score(y_test, probs)
    a_score = accuracy_score(y_test, predictions)
    # Average precision summarises the precision-recall curve, so it needs the
    # probability scores rather than the thresholded 0/1 predictions.
    avg_p_score = average_precision_score(y_test, probs)
    cm = confusion_matrix(y_test, predictions)

    print(f'Precision: {round(p_score, 5)}')
    print(f'Recall: {round(r_score, 5)}')
    print(f'F1 Score: {round(f_score, 5)}')
    print(f'ROC AUC: {round(auc_score, 5)}')
    print(f'Accuracy: {round(a_score, 5)}')
    print(f'Average precision: {round(avg_p_score, 5)}')

    print('Confusion matrix')
    print(cm)

    # Total predicted matches (positions within the test month)
    print('Predicted matches')
    print(n_positive, positive)

    if n_positive > 0:
        # Matching draws: predicted positive and actually positive
        print('Matched draws')
        matched = np.where((predictions == 1) & (y_test == 1))
        print(len(matched[0]), matched)
        matched_numbers = month_rows['NumberId'].iloc[matched[0]]
        draw_dates = origin_data['DrawDate']
        in_month = (draw_dates.dt.year == dt.year) & (draw_dates.dt.month == dt.month)
        display(origin_data[in_month & origin_data['LuckyNo'].isin(matched_numbers)].head(n_positive))
    else:
        print('No luck this month')

    # Feature importances
    fi = pd.DataFrame({'feature': feature_names, 'importance': model.feature_importances_})

    if return_probs:
        return fi, probs

    return fi
    

In [0]:
# All the months
len(feature_matrix_selection['time'].unique()), feature_matrix_selection['time'].unique()

### Prediction by months

In [0]:
from utils import plot_feature_importances

In [0]:
%time june_2019 = predict_dt(pd.datetime(2019,6,1), feature_matrix_selection)

In [0]:
# Plot the June 2019 feature importances with the project helper and keep its return value.
norm_june_2019_fi = plot_feature_importances(june_2019)

In [0]:
## Loop through from June to Dec
start_mt = pd.datetime(2019,6,1)
how_many_mt = 7
for i in range(how_many_mt):
  month_to_predict = start_mt + relativedelta(months=i)
  print(f"\n{month_to_predict}\n-------------------\n")
  %time predict_dt(month_to_predict, feature_matrix_selection)

In [0]:
 # %time oct_2019 = predict_dt(pd.datetime(2019,10,1), feature_matrix_selection)

In [0]:
#month_data = feature_matrix_selection.loc[feature_matrix_selection['time'] == pd.datetime(2019,6,1)]
# aa = np.sort(np.array([month_data.NumberId]))
# print(len(aa[0]),aa)
# print(np.array([month_data.NumberId]))
#month_data[month_data['Label'] == 1]['NumberId']

In [0]:
#dd = pd.read_csv(DATASET_PATH/"feature_matrix_2.csv", header=0, sep=',', quotechar='"', parse_dates=['time'])
#prev_data = dd.loc[dd['time'] == pd.datetime(2019,6,1)]
#prev_data[prev_data['Label'] == 1]['NumberId']
#print(np.array([prev_data.NumberId]))

## Check Raw Data

In [0]:
# number = [[128], [511]] 
# d = pd.DataFrame(number, columns = ['NumberId']) 
# d

In [0]:
#display(origin_data[(origin_data['DrawDate'].dt.year == 2019) & (origin_data['DrawDate'].dt.month == 6) & (origin_data['LuckyNo'].isin(d.NumberId))].head(10))

In [0]:
#display(origin_data[(origin_data['DrawDate'].dt.year == 2019) & (origin_data['DrawDate'].dt.month == 10)].head(30))

In [0]:
# Number of distinct draws (DrawNo) held in June 2019.
origin_data[(origin_data['DrawDate'].dt.year == 2019) & (origin_data['DrawDate'].dt.month == 6)]['DrawNo'].nunique()

In [0]:
# Number of distinct draws (DrawNo) held in October 2019.
origin_data[(origin_data['DrawDate'].dt.year == 2019) & (origin_data['DrawDate'].dt.month == 10)]['DrawNo'].nunique()

In [0]:
# NOTE(review): magic numbers — presumably the draw counts of two months (15 and
# 14) times 23 winning numbers per draw; confirm and replace with named constants.
print(15 * 23 + 14 * 23)