<a href="https://colab.research.google.com/github/mengwangk/dl-projects/blob/master/04_02_automated_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automated ML

In [14]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline

In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
import math 
import matplotlib

from scipy import stats
from collections import Counter
from pathlib import Path

# Apply matplotlib's 'fivethirtyeight' style sheet to subsequent figures.
plt.style.use('fivethirtyeight')

# Then apply seaborn's "ticks" style.
# NOTE(review): sns.set() may also reset other rc parameters and so override
# part of the style sheet chosen above — confirm the intended final look.
sns.set(style="ticks")

# Automated feature engineering
import featuretools as ft

# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, MinMaxScaler, StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

from IPython.display import display

from utils import *
from preprocess import *

# The Answer to the Ultimate Question of Life, the Universe, and Everything.
# Seed NumPy's global random number generator with a fixed value so that
# code drawing from np.random is repeatable between runs.
np.random.seed(42)

In [16]:
%aimport

Modules to reload:
all-except-skipped

Modules to skip:



## Preparation

In [17]:
# Directory that holds the lotto data (relative to the working directory).
DATASET_PATH = Path("datasets/lotto")
# Zip archive that is handed to format_tabular() to build the results frame.
DATASET = DATASET_PATH.joinpath("4D.zip")

In [18]:
# Load the 4D archive into a DataFrame (one row per drawn number, see df.info()).
# format_tabular is not defined in this notebook; it presumably comes from the
# wildcard imports of `utils` / `preprocess` — TODO confirm.
df = format_tabular(DATASET)

In [19]:
# Show column names, dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106398 entries, 0 to 106397
Data columns (total 4 columns):
DrawNo       106398 non-null int64
DrawDate     106398 non-null datetime64[ns]
PrizeType    106398 non-null object
LuckyNo      106398 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 3.2+ MB


In [20]:
# Peek at the last ten rows of the loaded frame.
df.tail(10)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo
106388,503319,2019-11-17,SpecialNo1,2457
106389,503319,2019-11-17,SpecialNo10,4508
106390,503319,2019-11-17,SpecialNo2,2647
106391,503319,2019-11-17,SpecialNo3,2911
106392,503319,2019-11-17,SpecialNo4,7238
106393,503319,2019-11-17,SpecialNo5,4698
106394,503319,2019-11-17,SpecialNo6,1916
106395,503319,2019-11-17,SpecialNo7,1552
106396,503319,2019-11-17,SpecialNo8,3738
106397,503319,2019-11-17,SpecialNo9,6188


In [21]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,DrawNo,LuckyNo
count,106398.0,106398.0
mean,272049.80869,4974.075838
std,133554.096792,2885.679139
min,40792.0,0.0
25%,156399.0,2462.25
50%,272056.0,4952.0
75%,387713.0,7464.0
max,503319.0,9999.0


## Exploration

In [10]:
def ecdf(data):
    """Compute the empirical cumulative distribution function of ``data``.

    Parameters
    ----------
    data : array-like
        Observations; they are passed to ``np.sort``.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the sorted data and ``y[i]`` equals
        ``(i + 1) / len(x)``, i.e. the fraction of observations that are
        less than or equal to ``x[i]``.
    """
    ordered = np.sort(data)
    n_obs = len(ordered)
    # Ranks run from 1 to n_obs so that the last point reaches exactly 1.0.
    ranks = np.arange(1, n_obs + 1)
    return ordered, ranks / n_obs

## Making Labels

In [23]:
def make_cutoffs_strike(draw_date, data=None, date_col='DrawDate', number_col='LuckyNo'):
  """Build the label / cutoff-time table for a single draw date.

  Every distinct number drawn on or before ``draw_date`` gets one row; its
  ``label`` is 1 when the number was drawn on ``draw_date`` itself and 0
  otherwise.

  Parameters
  ----------
  draw_date : str or datetime-like
      The draw date to label; it is also used as the cutoff time.
  data : pandas.DataFrame, optional
      Results table to read from. Defaults to the notebook-level ``df``.
  date_col : str, default 'DrawDate'
      Name of the draw-date column in ``data``.
  number_col : str, default 'LuckyNo'
      Name of the drawn-number column in ``data``.

  Returns
  -------
  pandas.DataFrame
      Columns ``number_id``, ``cutoff_time``, ``lucky_no``, ``label``
      (``number_id`` and ``lucky_no`` hold the same values).
  """
  # The defaults match the columns reported by df.info() (DrawDate / LuckyNo);
  # the previous hard-coded 'draw_date' / 'number_id' do not exist in that frame.
  results = df if data is None else data
  draw_ts = pd.to_datetime(draw_date)

  draw_dates = results[date_col]
  all_numbers = results.loc[draw_dates <= draw_ts, number_col].unique()
  matched_numbers = results.loc[draw_dates == draw_ts, number_col].unique()

  df_all = pd.DataFrame({'number_id': all_numbers, 'lucky_no': all_numbers})
  df_all['label'] = df_all['number_id'].isin(matched_numbers).astype(int)

  # The cutoff time is the draw_date
  df_all['cutoff_time'] = draw_ts
  return df_all[['number_id', 'cutoff_time', 'lucky_no', 'label']]

In [24]:
def make_custoffs(start_date, end_date):
    """Placeholder for building cutoffs over a date range (not implemented).

    Parameters
    ----------
    start_date, end_date
        Currently ignored.

    Returns
    -------
    None
        The function performs no work yet.

    NOTE(review): the name looks like a typo of ``make_cutoffs``; it is kept
    unchanged because the notebook calls it under this name.
    """
    return None
    

In [None]:
# pd.datetime was deprecated in pandas 1.0 and later removed; pd.Timestamp is a
# datetime subclass, so it is a drop-in replacement for the date arguments.
make_custoffs(pd.Timestamp(2011, 5, 1), pd.Timestamp(2011, 6, 1))


## Automated Feature Engineering

In [12]:
# Start from an empty EntitySet that will hold the draw results.
es = ft.EntitySet(id="Results")

# Register the whole results frame as a single entity, time-indexed by draw date.
# NOTE(review): `results_index` is not among the columns reported by df.info(),
# yet the entity shown below lists it as the index — presumably featuretools
# generates it; confirm, and consider passing make_index=True explicitly.
es.entity_from_dataframe(
    entity_id="results",
    dataframe=df,
    index="results_index",
    time_index="DrawDate",
)

# Display the entity summary (variables and shape).
es['results']



Entity: results
  Variables:
    results_index (dtype: index)
    DrawNo (dtype: numeric)
    DrawDate (dtype: datetime_time_index)
    PrizeType (dtype: categorical)
    LuckyNo (dtype: numeric)
  Shape:
    (Rows: 106398, Columns: 5)

In [13]:
# Preview only the first rows of the entity's frame; displaying the whole
# 106,398-row table adds nothing and bloats the saved notebook.
es['results'].df.head(10)

Unnamed: 0,results_index,DrawNo,DrawDate,PrizeType,LuckyNo
0,0,40792,1992-05-06,1stPrizeNo,19
1,1,40792,1992-05-06,2ndPrizeNo,1124
2,2,40792,1992-05-06,3rdPrizeNo,592
3,3,40792,1992-05-06,ConsolationNo1,5311
4,4,40792,1992-05-06,ConsolationNo10,407
5,5,40792,1992-05-06,ConsolationNo2,1949
6,6,40792,1992-05-06,ConsolationNo3,1606
7,7,40792,1992-05-06,ConsolationNo4,3775
8,8,40792,1992-05-06,ConsolationNo5,6226
9,9,40792,1992-05-06,ConsolationNo6,1271
