<a href="https://colab.research.google.com/github/mengwangk/dl-projects/blob/master/04_02_auto_ml_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automated ML

In [0]:
# Execution-environment switch: when True the notebook clones the project repo,
# reads datasets from the clone and saves results to Google Drive; when False it
# reads from a local ./datasets folder.
COLAB = True

In [0]:

# On Colab, fetch a fresh copy of the project repository into the working directory.
if COLAB:
  # !sudo apt-get install git-lfs && git lfs install
  # Remove any previous checkout first so the clone always starts from a clean directory.
  !rm -rf dl-projects
  !git clone https://github.com/mengwangk/dl-projects
  # List the top-level contents of the checkout as a quick sanity check.
  !cd dl-projects && ls

In [0]:
# Copy the files matching utils* and preprocess* from the checkout into the
# working directory so that `from utils import *` and `from preprocess import *`
# resolve on Colab.
if COLAB:
  !cp dl-projects/utils* .
  !cp dl-projects/preprocess* .

In [0]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline

In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
import math 
import matplotlib

from scipy import stats
from collections import Counter
from pathlib import Path

plt.style.use('fivethirtyeight')

sns.set(style="ticks")

# Automated feature engineering
import featuretools as ft

# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, MinMaxScaler, StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

from IPython.display import display

from utils import *
from preprocess import *

# The Answer to the Ultimate Question of Life, the Universe, and Everything.
np.random.seed(42)

In [0]:
%aimport

## Preparation

In [0]:
# Datasets live inside the cloned repository on Colab and in ./datasets otherwise.
DATASET_PATH = Path("dl-projects/datasets") if COLAB else Path("datasets")

# Archive that is handed to format_tabular in the next cell.
DATASET = DATASET_PATH / "4D.zip"

In [0]:
data = format_tabular(DATASET)

In [0]:
data.info()

In [0]:
data.tail(10)

In [0]:
# Copy `LuckyNo` into a `NumberId` column; `NumberId` is the key used when
# building labels and the index of the "Numbers" entity later in the notebook.
data['NumberId'] = data['LuckyNo']

In [0]:
data.tail(10)

In [0]:
data.describe()

In [0]:
# Box plot of PrizeType per NumberId, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x='NumberId', y='PrizeType', data=data, ax=ax)
plt.xticks(rotation=90)
ax.set_title('Draw')

# Plain-text dump of every result row recorded for number 1760.
number_1760_rows = data[data['NumberId'] == 1760]
print(number_1760_rows)

## Exploration

In [0]:
def ecdf(data):
    """Compute the empirical cumulative distribution function of a sample.

    Parameters
    ----------
    data : array-like
        Sample values (intended for a one-dimensional sequence).

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)`` where ``x`` holds the values sorted ascending and ``y`` holds
        the cumulative fraction ``k / n`` for the k-th smallest value
        (``k = 1..n``).
    """
    sorted_values = np.sort(data)
    n_points = len(sorted_values)
    cumulative_fraction = np.arange(1, n_points + 1) / n_points
    return sorted_values, cumulative_fraction

## Making Labels

In [0]:
# Give every result row a constant TotalStrike of 1; make_cutoffs counts this
# column per NumberId to obtain the number of strikes inside a date window.
data['TotalStrike'] = 1
data.head(10)

In [0]:
def make_cutoffs(start_date, end_date, threshold=0, results=None):
    """Build the label table for one date window.

    For every number that already appears in the results before ``start_date``,
    count how many result rows it has in the half-open window
    ``[start_date, end_date)`` and derive a binary label from that count.

    Parameters
    ----------
    start_date : datetime-like
        Inclusive start of the label window; also used as the cutoff time.
    end_date : datetime-like
        Exclusive end of the label window.
    threshold : int, default 0
        A number is labelled 1 when its strike count is strictly greater than
        this value.
    results : pandas.DataFrame, optional
        Table with ``DrawDate``, ``NumberId`` and ``TotalStrike`` columns.
        Defaults to the notebook-level ``data`` frame, which keeps existing
        three-argument calls working unchanged.

    Returns
    -------
    tuple
        ``(number_of_draws, strike_counts)`` where ``number_of_draws`` is the
        number of distinct ``DrawDate`` values inside the window and
        ``strike_counts`` is a DataFrame with the columns
        ``NumberId``, ``cutoff_time``, ``TotalStrike`` and ``Label``, one row
        per number known before ``start_date``.
    """
    if results is None:
        # Fall back to the notebook-level results table.
        results = data

    # Rows whose draw date falls inside [start_date, end_date).
    in_window = (results['DrawDate'] >= start_date) & (results['DrawDate'] < end_date)

    # Numbers that already exist before the start date.
    number_pool = results.loc[results['DrawDate'] < start_date, 'NumberId'].unique()
    pool_frame = pd.DataFrame({'NumberId': number_pool})

    # Strike count inside the window for each number in the pool.
    strike_counts = (
        results[results['NumberId'].isin(number_pool) & in_window]
        .groupby('NumberId')['TotalStrike']
        .count()
        .reset_index()
    )

    number_of_draws = results.loc[in_window, 'DrawDate'].nunique()

    # Right-merge against the pool so numbers without any strike in the window
    # are still present in the output.
    strike_counts = strike_counts.merge(pool_frame, on='NumberId', how='right')

    # Numbers that did not strike in the window get a count of 0.
    strike_counts['TotalStrike'] = strike_counts['TotalStrike'].fillna(0)

    # Label is based on the threshold.
    strike_counts['Label'] = (strike_counts['TotalStrike'] > threshold).astype(int)

    # The cutoff time is the start date.
    strike_counts['cutoff_time'] = pd.to_datetime(start_date)
    strike_counts = strike_counts[['NumberId', 'cutoff_time', 'TotalStrike', 'Label']]

    return number_of_draws, strike_counts

In [0]:
# Sanity-check the labelling on a single month (May 2015).
# pd.Timestamp is used because pd.datetime was deprecated in pandas 1.0 and
# removed in pandas 2.0; Timestamp subclasses datetime.datetime.
number_of_draws, may_2015 = make_cutoffs(pd.Timestamp(2015, 5, 1), pd.Timestamp(2015, 6, 1))

# Show a few of the numbers that struck exactly twice during the month.
may_2015[(may_2015['Label']==1) & (may_2015['TotalStrike']==2)].sort_values(by='TotalStrike', ascending=False).head()

In [0]:
# Bar chart of how many numbers fall into each label class for May 2015.
may_label_counts = may_2015['Label'].value_counts()
may_label_counts.plot.bar()
plt.title('Label Distribution for May')

In [0]:
# Start of the first month for which labels are generated.
# pd.Timestamp is used because pd.datetime was deprecated in pandas 1.0 and
# removed in pandas 2.0; Timestamp subclasses datetime.datetime.
CUT_OFF_YEAR = pd.Timestamp(2014, 1, 1)

In [0]:
# Generate one label frame per calendar month, starting at CUT_OFF_YEAR and
# stopping with the month before the month of the most recent draw.

from dateutil.relativedelta import relativedelta

one_month = relativedelta(months=1)

# Step back one month from the latest draw, then move to that month's last day
# (relativedelta(day=31) is clamped to the actual month length).
max_year_month = data['DrawDate'].max() - one_month + relativedelta(day=31)
print(f"Max month year: {max_year_month}")

months_data = []
total_draws = 0
start_year_month = CUT_OFF_YEAR
while start_year_month < max_year_month:
    # Each label window is [start_date, end_date) and spans exactly one month.
    start_date = start_year_month
    end_date = start_date + one_month
    draw_count, month_data = make_cutoffs(start_date, end_date)
    total_draws += draw_count
    months_data.append(month_data)
    start_year_month = end_date

# Cross-check: distinct draw dates over the whole labelled span, computed directly.
in_labelled_span = (data['DrawDate'] >= CUT_OFF_YEAR) & (data['DrawDate'] <= max_year_month)
month_sizes = [len(frame) for frame in months_data]

print(f"Total draws: {total_draws}")
print(f"Total draws: {data[in_labelled_span]['DrawDate'].nunique()}")
print(f"Total months:{len(months_data)}")
print(f"Total records count: {sum(month_sizes)}")
print(month_sizes)

In [0]:
# Stack the per-month label frames into a single table and write it next to the dataset.
# NOTE(review): concat keeps each month's own row index, so index labels repeat
# across months and are written out as the first (unnamed) CSV column — confirm
# that is intended before relying on labels.csv elsewhere.
labels = pd.concat(months_data)
labels.to_csv(DATASET_PATH/'labels.csv')
labels.describe()

In [0]:
# plot_labels = labels.copy()
# plot_labels['month'] = plot_labels['cutoff_time'].dt.month

# plt.figure(figsize = (12, 6))
# sns.boxplot(x = 'month', y = 'TotalStrike', 
#             data = plot_labels[(plot_labels['TotalStrike'] > 0)]);
# plt.title('Distribution by Month');

In [0]:
labels[(labels['NumberId'] == 9016)  & (labels['Label'] > 0)]

In [0]:
# Monthly strike count over time for one example number (9016).
strikes_9016 = labels.loc[labels['NumberId'] == 9016].set_index('cutoff_time')['TotalStrike']
ax = strikes_9016.plot(figsize=(6, 4), linewidth=3)
ax.set_xlabel('Date', size=16)
ax.set_ylabel('Total Strike', size=16)
ax.set_title('Draw', size=20)
plt.xticks(size=16)
plt.yticks(size=16);

## Automated Feature Engineering

In [0]:
# Create the EntitySet that will hold the results table and the entities derived from it.
es = ft.EntitySet(id="Lotto Results")

# Add the entire data table as an entity
# NOTE(review): no "results_index" column is created anywhere in this notebook —
# presumably featuretools generates it or format_tabular already provides it; confirm.
es.entity_from_dataframe("Results",
                         dataframe=data,
                         index="results_index",
                         time_index = 'DrawDate')

# Display the summary of the newly added entity.
es['Results']

In [0]:
# Derive a "Numbers" entity from "Results", keyed by NumberId; this is the
# target entity for Deep Feature Synthesis below.
es.normalize_entity(new_entity_id="Numbers",
                    base_entity_id="Results",
                    index="NumberId",
                    )

In [0]:
es

In [0]:
es['Numbers'].df.head(24)

In [0]:
es['Results'].df.head(24)

In [0]:
len(es['Results'].df)

## Deep Feature Synthesis

In [0]:
# Run Deep Feature Synthesis for the "Numbers" entity, using the rows of
# `labels` (NumberId + cutoff_time) as the cutoff times.
# - cutoff_time_in_index=True: the cutoff time is kept in the result's index
#   (later cells reset the index and read it back as the `time` column).
# - chunk_size=len(labels): presumably processes all cutoff rows in one chunk — TODO confirm.
# - max_depth=1: restricts feature stacking depth to a single level.
feature_matrix, feature_names = ft.dfs(entityset=es, target_entity='Numbers',
                                       cutoff_time = labels, verbose = 2,
                                       cutoff_time_in_index = True,
                                       chunk_size = len(labels), n_jobs = 1,
                                       max_depth = 1)

In [0]:
len(feature_matrix.columns), feature_matrix.columns

In [0]:
len(feature_matrix)

In [0]:
feature_matrix.head()

In [0]:
feature_matrix.shape

In [0]:
feature_matrix.loc[9016, :].sample(10, axis = 1)

## Correlations

In [0]:
# Convert categorical columns to indicator (dummy) columns and move the index
# levels back into ordinary columns.
# NOTE(review): this rebinds `feature_matrix`, so the cell is not idempotent —
# re-running it applies reset_index a second time. Run it once per kernel.
feature_matrix = pd.get_dummies(feature_matrix).reset_index()
feature_matrix.shape

In [0]:
feature_matrix.head()

In [0]:
# Pairwise correlations between columns, with rows ordered ascending by their
# correlation with TotalStrike; the head therefore shows the lowest values.
corrs = feature_matrix.corr().sort_values('TotalStrike')
corrs['TotalStrike'].head()

In [0]:
corrs['TotalStrike'].dropna().tail()

In [0]:
# Compare the distribution of SUM(Results.DrawNo) between the label classes,
# restricted to rows where that sum is positive.
positive_sum_rows = feature_matrix[feature_matrix['SUM(Results.DrawNo)'] > 0]

# `height` replaces FacetGrid's former `size` argument, which seaborn 0.9
# deprecated and later releases removed.
g = sns.FacetGrid(positive_sum_rows, hue='Label', height=4, aspect=3)
g.map(sns.kdeplot, 'SUM(Results.DrawNo)')
g.add_legend();
plt.title('Distribution of Results Total by Label');

In [0]:
# Split the cutoff timestamp into separate calendar month and year columns.
cutoff_parts = feature_matrix['time'].dt
feature_matrix['month'] = cutoff_parts.month
feature_matrix['year'] = cutoff_parts.year

In [0]:
feature_matrix.info()

In [0]:
feature_matrix.head()

## Save feature matrix

In [0]:
#if COLAB:
#  feature_matrix.to_csv(DATASET_PATH/'feature_matrix.csv', index=False)
#  feature_matrix.to_pickle(DATASET_PATH/'feature_matrix.pkl')

### Save the data

Reference for saving files from Colab to Google Drive: https://towardsdatascience.com/downloading-datasets-into-google-drive-via-google-colab-bcb1b30b0166

In [0]:
# On Colab, mount Google Drive and write the feature matrix there as CSV and pickle.
if COLAB:
  from google.colab import drive

  GDRIVE_MOUNT_POINT = Path('/content/gdrive')
  drive.mount(str(GDRIVE_MOUNT_POINT))

  # Build the target folder from the absolute mount point: the previous relative
  # path ('gdrive/My Drive/datasets/') only resolved when the working directory
  # happened to be /content.
  GDRIVE_DATASET_FOLDER = GDRIVE_MOUNT_POINT/'My Drive'/'datasets'
  # Create the folder if it is missing so the writes below cannot fail on a
  # non-existent directory.
  GDRIVE_DATASET_FOLDER.mkdir(parents=True, exist_ok=True)

  feature_matrix.to_csv(GDRIVE_DATASET_FOLDER/'feature_matrix.csv', index=False)
  feature_matrix.to_pickle(GDRIVE_DATASET_FOLDER/'feature_matrix.pkl')


In [0]:
#if COLAB:
#  !cd dl-projects && git remote rm origin && git remote add origin https://mengwangk:XX@github.com/mengwangk/dl-projects.git && git push -u origin master

In [0]:
# from google.colab import files
# files.download(DATASET_PATH/'feature_matrix.csv') 

In [0]:
if COLAB:
  !cd dl-projects/datasets && ls -l --block-size=M