<a href="https://colab.research.google.com/github/mengwangk/dl-projects/blob/master/04_04_auto_ml_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automated ML - Tuning

In [0]:
# Set to True when running on Google Colab; gates the repo clone, helper-file copy and Drive mount cells.
COLAB = True

# File name of the original dataset; joined onto ORIGIN_DATASET_PATH when the paths are built.
DATASET_NAME = '4D.zip'

# Stem of the pickled feature matrix; ".pkl" is appended when the load path is built.
FEATURE_DATASET_PREFIX = 'feature_matrix_d2_v1'

In [0]:
if COLAB:
  # Remove any previous checkout so the clone starts from a clean directory.
  !rm -rf dl-projects
  !git clone https://github.com/mengwangk/dl-projects

In [0]:
if COLAB:
  # Copy the helper files (utils*, preprocess*, plot*) out of the clone into the working
  # directory so the `utils`, `preprocess` and `plot` imports in the import cell resolve.
  !cp dl-projects/utils* .
  !cp dl-projects/preprocess* .
  !cp dl-projects/plot* .

In [0]:
%load_ext autoreload
# %reload_ext autoreload
# Mode 2: reload modules before executing each cell, so edits to the helper files are picked up.
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math 
import matplotlib
import sys

from scipy import stats
from collections import Counter
from pathlib import Path

plt.style.use('fivethirtyeight')

sns.set(style="ticks")

import featuretools as ft

import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, precision_recall_curve, roc_curve, mean_squared_error, accuracy_score, average_precision_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA

# from skopt import BayesSearchCV
# from skopt.space import Real, Categorical, Integer

# from sklearn.ensemble import RandomForestClassifier

# from scikitplot.plotters import plot_precision_recall_curve

from dateutil.relativedelta import relativedelta

from IPython.display import display

from utils import *
from preprocess import *

import xgboost as xgb

np.set_printoptions(threshold=sys.maxsize)

# The Answer to the Ultimate Question of Life, the Universe, and Everything.
np.random.seed(42)

from utils import feature_selection, plot_feature_importances
from plot import plot_correlation_matrix, plot_labelled_scatter

In [0]:
# With no arguments, %aimport lists the modules that autoreload will / will not re-import.
%aimport

## Preparation

In [0]:
if COLAB:
  # Mount Google Drive and remember the datasets folder inside it.
  # NOTE(review): the mount point is absolute ('/content/gdrive') but the folder path is
  # relative ('gdrive/...'), so this assumes the working directory is /content — confirm.
  from google.colab import drive
  drive.mount('/content/gdrive')
  GDRIVE_DATASET_FOLDER = Path('gdrive/My Drive/datasets/')

In [0]:
if COLAB:
  DATASET_PATH = GDRIVE_DATASET_FOLDER
  ORIGIN_DATASET_PATH = Path('dl-projects/datasets')
else:
  DATASET_PATH = Path("datasets")
  ORIGIN_DATASET_PATH = Path('datasets')

DATASET = DATASET_PATH/f"{FEATURE_DATASET_PREFIX}.pkl"
ORIGIN_DATASET = ORIGIN_DATASET_PATH/DATASET_NAME

if COLAB:
  !ls -l gdrive/"My Drive"/datasets/ --block-size=M
  !ls -l dl-projects/datasets --block-size=M

In [0]:
# Load the pre-computed feature matrix from its pickle file.
# NOTE(security): unpickling can execute arbitrary code — only load pickles from a trusted source.
data = pd.read_pickle(DATASET)
# Load the original dataset through the project helper `format_tabular`
# (comes from the star-imports; presumably returns a DataFrame — TODO confirm).
origin_data = format_tabular(ORIGIN_DATASET)

In [0]:
# Print the column dtypes, non-null counts and memory usage of the loaded feature matrix.
data.info()

## Exploratory Data Analysis

### View data

In [0]:
# Alias only: `feature_matrix` refers to the same object as `data` (no copy is made here).
feature_matrix = data

In [0]:
# List the column names of the feature matrix.
print(feature_matrix.columns)

In [0]:
# Show the last four rows whose Label equals 1.
feature_matrix[feature_matrix['Label'].eq(1)].tail(4)

In [0]:
# Look up every row for one specific number in the original dataset.
# NOTE(review): 3294 is a hard-coded example; presumably taken from the rows displayed
# by the previous cell — confirm it is still present when the data changes.
origin_data[origin_data.LuckyNo == 3294]

In [0]:
# Summary statistics of the feature matrix, rounded to two decimals.
feature_matrix.describe().round(2)

### Check Balance of Data

In [0]:
# Class balance of the target column.
# Label == 1 is the positive class and Label == 0 the negative class; the previous version
# printed the count of 0 under "Positive" and the count of 1 under "Negative".
label_counts = feature_matrix['Label'].value_counts()
n_rows = len(feature_matrix)
# .get(..., 0) avoids a KeyError when one of the classes is absent.
n_positive = label_counts.get(1, 0)
n_negative = label_counts.get(0, 0)
print('Positive: ' + str(n_positive) + ' which is ', round(n_positive / n_rows * 100, 2), '% of the dataset')
print('Negative: ' + str(n_negative) + ' which is ', round(n_negative / n_rows * 100, 2), '% of the dataset')

In [0]:
# Bar chart of the number of rows per Label value.
plt.figure(figsize=(8, 8))
# Pass the column by keyword: in recent seaborn releases the first positional parameter is
# `data`, so the positional form collides with the `data=` keyword and raises a TypeError.
sns.countplot(x='Label', data=feature_matrix)

In [0]:
# Missing-value count per column, largest first.
feature_matrix.isna().sum().sort_values(ascending=False)

In [0]:
# Same per-column missing-value count via `isnull` (pandas documents it as an alias of `isna`).
feature_matrix.isnull().sum().sort_values(ascending=False)

In [0]:
# Distinct `time` values among the rows that contain at least one missing value.
feature_matrix.loc[feature_matrix.isnull().any(axis=1), 'time'].unique()

In [0]:
# Preview the first rows that contain at least one missing value.
feature_matrix[feature_matrix.isnull().any(axis=1)].head()

### Data Cleansing

In [0]:
# Replace every missing value with 0. `fillna` returns a new frame, so the object
# still referenced by `data` is left untouched.
feature_matrix = feature_matrix.fillna(value=0)

In [0]:
# Re-check after filling: every per-column missing-value count should now be 0.
feature_matrix.isnull().sum().sort_values(ascending=False)

In [0]:
# Same re-check via `isna` (pandas documents `isnull` as its alias).
feature_matrix.isna().sum().sort_values(ascending=False)

In [0]:
# Rows that still contain a missing value; expected to be empty after the fill.
feature_matrix[feature_matrix.isnull().any(axis=1)].head()

### Feature Correlation

In [0]:
# plot_correlation_matrix(feature_matrix)

### Visualization

In [0]:
# Build the feature set X by dropping the listed columns from the feature matrix.
X = feature_matrix.drop(
    columns=['NumberId', 'time', 'date', 'Label', 'TotalStrike', 'month', 'year', 'index'],
    errors='ignore',  # tolerate listed columns that are absent from this frame
)
# Show the resulting shape; X.info() prints the per-column summary as a side effect.
X.shape, X.info()

In [0]:
# Target vector: the Label column of the feature matrix.
y = feature_matrix['Label']
# Display its shape together with the values.
y.shape, y

#### MinMaxScaler

In [0]:
# Scale every feature with MinMaxScaler, then project onto the first two principal components.
X_normalized = MinMaxScaler().fit_transform(X)
pca = PCA(n_components=2)
pca.fit(X_normalized)
X_pca = pca.transform(X_normalized)

# Scatter plot of the two components, coloured by label.
plt.figure(figsize=(8, 8))
plot_labelled_scatter(X_pca, y, ['0', '1'])

#### StandardScaler

In [0]:
# Same projection as the MinMaxScaler cell, but with StandardScaler as the scaling step.
X_normalized = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
pca.fit(X_normalized)
X_pca = pca.transform(X_normalized)

# Scatter plot of the two components, coloured by label.
plt.figure(figsize=(8, 8))
plot_labelled_scatter(X_pca, y, ['0', '1'])

Using **StandardScaler** seems to work better!

In [0]:
# Compare the shape of the raw features with the shape of the scaled array.
X.shape, X_normalized.shape

## Balancing the data

In [0]:
# Rebuild X and y for resampling.
# Uses the same drop list and `errors='ignore'` as the visualization cell: the previous
# version omitted 'date' and 'index' and raised a KeyError if any listed column was absent,
# so the resampler could receive columns that the PCA plots never used.
X = feature_matrix.drop(
    columns=['NumberId', 'time', 'date', 'Label', 'TotalStrike', 'month', 'year', 'index'],
    errors='ignore',
)
y = feature_matrix['Label']

### Without Normalization

In [0]:
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE

# Oversampling (SMOTE) followed by under-sampling (Tomek links).
# `ratio=` and `fit_sample` were removed from imbalanced-learn; `sampling_strategy=` and
# `fit_resample` are the supported equivalents.
# A fixed random_state makes re-running this cell produce the same resampled set.
smt = SMOTETomek(sampling_strategy='auto', random_state=42)
X_smt, y_smt = smt.fit_resample(X, y)

In [0]:
# Use PCA to find the 1st 2 principal components again after resampling
X_normalized = StandardScaler().fit(X_stmt).transform(X_smt)  
pca = PCA(n_components = 2).fit(X_normalized)
X_pca = pca.transform(X_normalized)
plt.figure(figsize=(8, 8))
plot_labelled_scatter(X_pca, y_smt, ['0', '1'])

In [0]:
# Check the number of positive and negative labels after resampling.
# The counts come from y_smt (the resampled labels), not from feature_matrix, which still
# holds the original distribution. Wrapping in pd.Series makes this work whether the
# resampler returned a NumPy array or a pandas Series.
resampled_counts = pd.Series(y_smt).value_counts()
n_resampled = len(y_smt)
n_positive_resampled = resampled_counts.get(1, 0)
n_negative_resampled = resampled_counts.get(0, 0)
print('Positive: ' + str(n_positive_resampled) + ' which is ', round(n_positive_resampled / n_resampled * 100, 2), '% of the dataset')
print('Negative: ' + str(n_negative_resampled) + ' which is ', round(n_negative_resampled / n_resampled * 100, 2), '% of the dataset')