In [1]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')
%matplotlib inline

# Automated feature engineering
import featuretools as ft

# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

from pathlib import Path

In [2]:
# Seed NumPy's global random number generator so repeated runs are repeatable.
np.random.seed(42)

# CSV holding the processed draw results, relative to the working directory.
DATASET = Path("datasets/lotto/data_processed_2.csv")

In [3]:
# Load the processed draw results. DrawDate is parsed into datetimes and
# PrizeType is read as text.
dataset = pd.read_csv(
    DATASET,
    header=0,
    sep=',',
    quotechar='"',
    parse_dates=['DrawDate'],
    dtype={'PrizeType': str},
)

In [22]:
# Work on a deep copy so the frame loaded from disk stays untouched.
df = dataset.copy(deep=True)

In [23]:
# Inspect the dtype of every column in the working copy.
df.dtypes

LuckyNo                             int64
1st_digit                           int64
2nd_digit                           int64
3rd_digit                           int64
4th_digit                           int64
PrizeType                          object
DrawNo                              int64
DaysSinceLastExactMatch             int64
DaysUntilNextExactMatch             int64
DaysSinceLastAnyMatch               int64
DaysUntilNextAnyMatch               int64
DrawYear                            int64
DrawMonth                           int64
DrawWeek                            int64
DrawDayofweek                       int64
DrawDayofyear                       int64
CumProbaExactMatch                float64
CumProbaAnyMatch                  float64
DrawDate                   datetime64[ns]
TotalMean                         float64
1stDigitMean                      float64
2ndDigitMean                      float64
3rdDigitMean                      float64
4thDigitMean                      

In [24]:
# Restrict the working frame to these four columns.
columns = [
    'DrawNo',
    'DrawDate',
    'PrizeType',
    'LuckyNo',
]
df = df[columns]

# Show the resulting shape together with the first ten rows.
(df.shape, df.head(10))

((104673, 4),    DrawNo   DrawDate        PrizeType  LuckyNo
 0   40792 1992-05-06       1stPrizeNo       19
 1   40792 1992-05-06       2ndPrizeNo     1124
 2   40792 1992-05-06       3rdPrizeNo      592
 3   40792 1992-05-06   ConsolationNo1     5311
 4   40792 1992-05-06  ConsolationNo10      407
 5   40792 1992-05-06   ConsolationNo2     1949
 6   40792 1992-05-06   ConsolationNo3     1606
 7   40792 1992-05-06   ConsolationNo4     3775
 8   40792 1992-05-06   ConsolationNo5     6226
 9   40792 1992-05-06   ConsolationNo6     1271)

In [25]:
# df['LuckyNo'].value_counts().plot.bar(figsize=(6,5))

In [26]:
# Per-LuckyNo maximum of every other column, first ten groups.
# Each column is maximised on its own, so a displayed row need not correspond
# to a single draw; PrizeType holds strings, so its maximum is the largest
# value in lexicographic order.
(
    df
    .groupby(['LuckyNo'])
    .max()
    .head(10)
)

Unnamed: 0_level_0,DrawNo,DrawDate,PrizeType
LuckyNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,415314,2014-11-25,SpecialNo9
1,434115,2015-12-13,SpecialNo8
2,468717,2017-11-22,SpecialNo4
3,470217,2017-12-23,SpecialNo6
4,483418,2018-09-19,SpecialNo9
5,492219,2019-03-20,SpecialNo4
6,495719,2019-06-08,SpecialNo8
7,484918,2018-10-20,SpecialNo4
8,493319,2019-04-13,SpecialNo9
9,437516,2016-02-20,SpecialNo8


In [27]:
# Number of non-null entries per column for each LuckyNo, last ten groups.
(
    df
    .groupby(['LuckyNo'])
    .count()
    .tail(10)
)

Unnamed: 0_level_0,DrawNo,DrawDate,PrizeType
LuckyNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9990,10,10,10
9991,15,15,15
9992,10,10,10
9993,11,11,11
9994,13,13,13
9995,8,8,8
9996,10,10,10
9997,13,13,13
9998,9,9,9
9999,19,19,19


## Skewness of data

In [28]:
# Reference notebook: https://github.com/mengwangk/Automated-Manual-Comparison/blob/master/Retail%20Spending/notebooks/Automated%20Retail%20Spending.ipynb

In [29]:
def ecdf(data):
    """
    Compute the empirical cumulative distribution function (ECDF) of a sample.

    Nothing is plotted here; the two returned arrays are meant to be handed to
    a plotting call such as ``plt.plot(x, y)``.

    Parameters
    ----------
    data : array-like
        Sample values. Input with more than one dimension is flattened so that
        every value counts as one observation.

    Returns
    -------
    x : numpy.ndarray
        The sample values sorted in ascending order.
    y : numpy.ndarray
        Cumulative fractions ``1/n, 2/n, ..., 1`` aligned with ``x``, where
        ``n`` is the number of observations. Empty when ``data`` is empty.
    """
    # Flatten first: np.sort works along the last axis while len() only counts
    # the first axis, so un-flattened n-D input would yield x and y that do
    # not line up.
    x = np.sort(np.ravel(data))
    n = x.size
    y = np.arange(1, n + 1) / n
    return x, y

In [30]:
# d = df['LuckyNo'].astype(str)

In [31]:
# x, y = ecdf(d)
# plt.plot(x, y, marker = '.')
# plt.xlabel('Lucky No'); plt.ylabel('Percentile'); plt.title('ECDF of Lucky No');

### Generate new features

In [32]:
# Materialise the row index as an "index" column (renamed to "instance_id" in
# a later cell). A bare reset_index() adds one more column every time the cell
# is run, so skip it when the column, or its renamed form, already exists.
if "index" not in df.columns and "instance_id" not in df.columns:
    df = df.reset_index()

In [37]:
# Give the column created from the old index its final name, then preview.
df.rename({"index": "instance_id"}, axis="columns", inplace=True)
df.head(10)

Unnamed: 0,instance_id,DrawNo,DrawDate,PrizeType,LuckyNo
0,0,40792,1992-05-06,1stPrizeNo,19
1,1,40792,1992-05-06,2ndPrizeNo,1124
2,2,40792,1992-05-06,3rdPrizeNo,592
3,3,40792,1992-05-06,ConsolationNo1,5311
4,4,40792,1992-05-06,ConsolationNo10,407
5,5,40792,1992-05-06,ConsolationNo2,1949
6,6,40792,1992-05-06,ConsolationNo3,1606
7,7,40792,1992-05-06,ConsolationNo4,3775
8,8,40792,1992-05-06,ConsolationNo5,6226
9,9,40792,1992-05-06,ConsolationNo6,1271


In [38]:
# Create a featuretools EntitySet with id "Draw Results"; no entities are
# passed in at construction time.
es = ft.EntitySet(id="Draw Results")

In [39]:
# Register the whole table as the "result" entity, using instance_id as its
# index and DrawDate as its time index.
es.entity_from_dataframe(
    entity_id="result",
    dataframe=df,
    index="instance_id",
    time_index='DrawDate',
)

# Display the entity summary.
es['result']

Entity: result
  Variables:
    instance_id (dtype: index)
    DrawNo (dtype: numeric)
    DrawDate (dtype: datetime_time_index)
    PrizeType (dtype: categorical)
    LuckyNo (dtype: numeric)
  Shape:
    (Rows: 104673, Columns: 5)

In [40]:
# Split a second entity, "draws", out of "result", indexed by LuckyNo.
# No additional variables are moved across to the new entity.
es.normalize_entity(
    base_entity_id="result",
    new_entity_id="draws",
    index="LuckyNo",
)

# Preview the new entity's underlying dataframe.
es['draws'].df.head()

Unnamed: 0,LuckyNo,first_result_time
19,19,1992-05-06
407,407,1992-05-06
592,592,1992-05-06
950,950,1992-05-06
1124,1124,1992-05-06


In [41]:
# Preview the dataframe backing the "result" entity.
es['result'].df.head()

Unnamed: 0,instance_id,DrawNo,DrawDate,PrizeType,LuckyNo
0,0,40792,1992-05-06,1stPrizeNo,19
1,1,40792,1992-05-06,2ndPrizeNo,1124
2,2,40792,1992-05-06,3rdPrizeNo,592
3,3,40792,1992-05-06,ConsolationNo1,5311
4,4,40792,1992-05-06,ConsolationNo10,407


In [42]:
# Look up LuckyNo 19 in the dataframe backing the "draws" entity.
es['draws'].df.loc[lambda frame: frame['LuckyNo'] == 19].head()

Unnamed: 0,LuckyNo,first_result_time
19,19,1992-05-06


In [43]:
# First rows of the "result" entity's dataframe where LuckyNo is 19.
es['result'].df.loc[lambda frame: frame['LuckyNo'] == 19].head()

Unnamed: 0,instance_id,DrawNo,DrawDate,PrizeType,LuckyNo
0,0,40792,1992-05-06,1stPrizeNo,19
297,297,41992,1992-06-03,SpecialNo8,19
10813,10813,87795,1995-05-06,ConsolationNo1,19
11566,11566,90995,1995-07-19,SpecialNo7,19
14844,14844,105296,1996-06-16,ConsolationNo6,19


In [44]:
# Display the EntitySet summary (its entities and relationships).
es

Entityset: Draw Results
  Entities:
    result [Rows: 104673, Columns: 5]
    draws [Rows: 10000, Columns: 2]
  Relationships:
    result.LuckyNo -> draws.LuckyNo

#### Cut off time

In [46]:
# Build the cutoff-time table: one row per (instance_id, DrawDate) pair, with
# DrawDate exposed under the name "cutoff_time".
labels = (
    df[["instance_id", "DrawDate"]]
    .groupby(["instance_id", "DrawDate"])
    .min()
    .reset_index()
    .rename(columns={"DrawDate": "cutoff_time"})
)

In [47]:
# Preview the first rows of the cutoff-time table.
labels.head(30)

Unnamed: 0,instance_id,cutoff_time
0,0,1992-05-06
1,1,1992-05-06
2,2,1992-05-06
3,3,1992-05-06
4,4,1992-05-06
5,5,1992-05-06
6,6,1992-05-06
7,7,1992-05-06
8,8,1992-05-06
9,9,1992-05-06


#### Generate features

In [48]:
# Run deep feature synthesis against the "result" entity, using the labels
# table as per-instance cutoff times (kept in the output index).
# chunk_size=len(labels) means the whole table is handled as a single chunk;
# max_depth=1 keeps the generated features at depth one.
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='result',
    cutoff_time=labels,
    cutoff_time_in_index=True,
    max_depth=1,
    chunk_size=len(labels),
    n_jobs=-1,
    verbose=2,
)

Built 7 features
EntitySet scattered to workers in 1.028 seconds
Elapsed: 01:17 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 1/1 chunks


In [49]:
# Preview the first rows of the generated feature matrix.
feature_matrix.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,DrawNo,PrizeType,LuckyNo,DAY(DrawDate),YEAR(DrawDate),MONTH(DrawDate),WEEKDAY(DrawDate)
instance_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1992-05-06,40792,1stPrizeNo,19,6,1992,5,2
1,1992-05-06,40792,2ndPrizeNo,1124,6,1992,5,2
2,1992-05-06,40792,3rdPrizeNo,592,6,1992,5,2
3,1992-05-06,40792,ConsolationNo1,5311,6,1992,5,2
4,1992-05-06,40792,ConsolationNo10,407,6,1992,5,2
5,1992-05-06,40792,ConsolationNo2,1949,6,1992,5,2
6,1992-05-06,40792,ConsolationNo3,1606,6,1992,5,2
7,1992-05-06,40792,ConsolationNo4,3775,6,1992,5,2
8,1992-05-06,40792,ConsolationNo5,6226,6,1992,5,2
9,1992-05-06,40792,ConsolationNo6,1271,6,1992,5,2
