In [1]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')
%matplotlib inline

# Automated feature engineering
import featuretools as ft

# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

from pathlib import Path

In [2]:
# Seed NumPy's global random generator so stochastic steps are repeatable.
np.random.seed(42)

# Relative path of the pre-processed lotto results CSV.
DATASET = Path("datasets/lotto/data_processed_2.csv")

In [3]:
# Load the CSV: DrawDate is parsed as a timestamp, PrizeType is kept as text.
dataset = pd.read_csv(
    DATASET,
    sep=',',
    header=0,
    quotechar='"',
    dtype={'PrizeType': str},
    parse_dates=['DrawDate'],
)

In [4]:
# Work on a deep copy so the frame read from disk stays untouched.
df = dataset.copy(deep=True)

In [5]:
# Show the dtype pandas assigned to each column of the working copy.
df.dtypes

LuckyNo                             int64
1st_digit                           int64
2nd_digit                           int64
3rd_digit                           int64
4th_digit                           int64
PrizeType                          object
DrawNo                              int64
DaysSinceLastExactMatch             int64
DaysUntilNextExactMatch             int64
DaysSinceLastAnyMatch               int64
DaysUntilNextAnyMatch               int64
DrawYear                            int64
DrawMonth                           int64
DrawWeek                            int64
DrawDayofweek                       int64
DrawDayofyear                       int64
CumProbaExactMatch                float64
CumProbaAnyMatch                  float64
DrawDate                   datetime64[ns]
TotalMean                         float64
1stDigitMean                      float64
2ndDigitMean                      float64
3rdDigitMean                      float64
4thDigitMean                      

In [6]:
# Keep only the four listed columns in the working frame.
columns = ['DrawNo', 'DrawDate', 'PrizeType', 'LuckyNo']
df = df.loc[:, columns]
# Display the resulting dimensions together with the first ten rows.
(df.shape, df.head(10))

((104673, 4),    DrawNo   DrawDate        PrizeType  LuckyNo
 0   40792 1992-05-06       1stPrizeNo       19
 1   40792 1992-05-06       2ndPrizeNo     1124
 2   40792 1992-05-06       3rdPrizeNo      592
 3   40792 1992-05-06   ConsolationNo1     5311
 4   40792 1992-05-06  ConsolationNo10      407
 5   40792 1992-05-06   ConsolationNo2     1949
 6   40792 1992-05-06   ConsolationNo3     1606
 7   40792 1992-05-06   ConsolationNo4     3775
 8   40792 1992-05-06   ConsolationNo5     6226
 9   40792 1992-05-06   ConsolationNo6     1271)

In [7]:
# df['LuckyNo'].value_counts().plot.bar(figsize=(6,5))

In [8]:
# Column-wise maximum within each LuckyNo group, first ten groups shown.
# For the text column PrizeType this is the lexicographically greatest label.
df.groupby('LuckyNo').max().head(10)

Unnamed: 0_level_0,DrawNo,DrawDate,PrizeType
LuckyNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,415314,2014-11-25,SpecialNo9
1,434115,2015-12-13,SpecialNo8
2,468717,2017-11-22,SpecialNo4
3,470217,2017-12-23,SpecialNo6
4,483418,2018-09-19,SpecialNo9
5,492219,2019-03-20,SpecialNo4
6,495719,2019-06-08,SpecialNo8
7,484918,2018-10-20,SpecialNo4
8,493319,2019-04-13,SpecialNo9
9,437516,2016-02-20,SpecialNo8


In [9]:
# Non-null entries per column within each LuckyNo group, first ten groups shown.
df.groupby('LuckyNo').count().head(10)

Unnamed: 0_level_0,DrawNo,DrawDate,PrizeType
LuckyNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,13,13,13
1,10,10,10
2,6,6,6
3,11,11,11
4,13,13,13
5,9,9,9
6,13,13,13
7,10,10,10
8,10,10,10
9,12,12,12


## Skewness of data

In [10]:
#  https://github.com/mengwangk/Automated-Manual-Comparison/blob/master/Retail%20Spending/notebooks/Automated%20Retail%20Spending.ipynb

In [11]:
def ecdf(data):
    """
    Compute the empirical cumulative distribution function (ECDF) of a sample.

    Nothing is drawn here; the two returned arrays are meant to be handed to
    a plotting call such as ``plt.plot(x, y)``.

    Parameters
    ----------
    data : array-like
        Observations. Input with more than one dimension is flattened so
        that every element counts as one observation.

    Returns
    -------
    x : numpy.ndarray
        The observations sorted in ascending order.
    y : numpy.ndarray
        Same length as ``x``; entry ``i`` (1-based) is ``i / n`` where ``n``
        is the number of observations, so the last value is ``1.0``.
        Empty when ``data`` is empty.
    """
    # Flatten first: np.sort works along the last axis while len() counts the
    # first one, so multi-dimensional input would otherwise give x and y of
    # different shapes.
    x = np.sort(np.ravel(data))
    n = x.size
    y = np.arange(1, n + 1) / n
    return x, y

In [12]:
# d = df['LuckyNo'].astype(str)

In [13]:
# x, y = ecdf(d)
# plt.plot(x, y, marker = '.')
# plt.xlabel('Lucky No'); plt.ylabel('Percentile'); plt.title('ECDF of Lucky No');

### Generate new features

In [41]:
# Start an empty featuretools EntitySet; entities and the relationship
# between them are added in the following cells.
es = ft.EntitySet(id="Draw Results")

In [42]:
# Register the draw results as the "result" entity, ordered in time by DrawDate.
# The frame has no "result_index" column, so make_index=True asks featuretools
# to create that integer index explicitly instead of relying on a fallback.
# A copy is passed so that any column featuretools adds cannot end up in df.
es.entity_from_dataframe("result",
                         dataframe=df.copy(),
                         index="result_index",
                         make_index=True,
                         time_index='DrawDate')

es['result']

Entity: result
  Variables:
    result_index (dtype: index)
    DrawNo (dtype: numeric)
    DrawDate (dtype: datetime_time_index)
    PrizeType (dtype: categorical)
    LuckyNo (dtype: numeric)
  Shape:
    (Rows: 104673, Columns: 5)

In [43]:
# Split the distinct LuckyNo values out of "result" into a second entity,
# "draws", related back to "result" through LuckyNo.
# NOTE(review): "draws" therefore holds one row per number, not one per draw;
# confirm that the entity name is intended.
es.normalize_entity(new_entity_id="draws",
                    base_entity_id="result",
                    index="LuckyNo")
                    # additional_variables is left unset (disabled draft: additional_variables=[""])
es['draws'].df.head()

Unnamed: 0,LuckyNo,first_result_time
19,19,1992-05-06
407,407,1992-05-06
592,592,1992-05-06
950,950,1992-05-06
1124,1124,1992-05-06


In [44]:
# Peek at the data frame stored behind the "result" entity.
es['result'].df.head()

Unnamed: 0,result_index,DrawNo,DrawDate,PrizeType,LuckyNo
0,0,40792,1992-05-06,1stPrizeNo,19
1,1,40792,1992-05-06,2ndPrizeNo,1124
2,2,40792,1992-05-06,3rdPrizeNo,592
3,3,40792,1992-05-06,ConsolationNo1,5311
4,4,40792,1992-05-06,ConsolationNo10,407


In [45]:
# Look up number 19 in the "draws" entity.
es['draws'].df.loc[lambda frame: frame['LuckyNo'] == 19].head()

Unnamed: 0,LuckyNo,first_result_time
19,19,1992-05-06


In [46]:
# First few "result" rows in which number 19 appears.
es['result'].df.loc[lambda frame: frame['LuckyNo'] == 19].head()

Unnamed: 0,result_index,DrawNo,DrawDate,PrizeType,LuckyNo
0,0,40792,1992-05-06,1stPrizeNo,19
297,297,41992,1992-06-03,SpecialNo8,19
10813,10813,87795,1995-05-06,ConsolationNo1,19
11566,11566,90995,1995-07-19,SpecialNo7,19
14844,14844,105296,1996-06-16,ConsolationNo6,19


In [47]:
# Summarise the EntitySet: its entities and the relationship linking them.
es

Entityset: Draw Results
  Entities:
    result [Rows: 104673, Columns: 5]
    draws [Rows: 10000, Columns: 2]
  Relationships:
    result.LuckyNo -> draws.LuckyNo

#### Cutoff times

In [78]:
# One row per (DrawNo, DrawDate) pair, with the date exposed as "cutoff_time".
labels = (
    df[["DrawNo", "DrawDate"]]
    .groupby(["DrawNo", "DrawDate"])
    .min()
    .reset_index()
    .rename(columns={"DrawDate": "cutoff_time"})
)
labels.head(10)

Unnamed: 0,DrawNo,cutoff_time
0,40792,1992-05-06
1,40892,1992-05-07
2,40992,1992-05-10
3,41092,1992-05-13
4,41192,1992-05-14
5,41292,1992-05-17
6,41392,1992-05-20
7,41492,1992-05-21
8,41592,1992-05-24
9,41692,1992-05-27


In [48]:
# Run Deep Feature Synthesis with one feature row per "result" record.
# NOTE(review): no cutoff_time is passed, so the draws.* aggregates are computed
# over every result row, including draws dated after the row they describe.
# The labels frame built earlier is keyed by DrawNo rather than result_index,
# so it presumably needs reshaping before it can be used here (to be confirmed).
feature_matrix, feature_names = ft.dfs(entityset=es, target_entity='result')
# Preview the first rows instead of rendering the whole 104,673-row matrix.
feature_matrix.head()

Unnamed: 0_level_0,DrawNo,PrizeType,LuckyNo,DAY(DrawDate),YEAR(DrawDate),MONTH(DrawDate),WEEKDAY(DrawDate),draws.SUM(result.DrawNo),draws.STD(result.DrawNo),draws.MAX(result.DrawNo),draws.SKEW(result.DrawNo),draws.MIN(result.DrawNo),draws.MEAN(result.DrawNo),draws.COUNT(result),draws.NUM_UNIQUE(result.PrizeType),draws.MODE(result.PrizeType),draws.DAY(first_result_time),draws.YEAR(first_result_time),draws.MONTH(first_result_time),draws.WEEKDAY(first_result_time)
result_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,40792,1stPrizeNo,19,6,1992,5,2,5618818,153047.954707,464417,-0.072923,40792,267562.761905,21,14,SpecialNo7,6,1992,5,2
1,40792,2ndPrizeNo,1124,6,1992,5,2,3531297,149114.50546,468517,-0.661895,40792,321027.0,11,10,SpecialNo4,6,1992,5,2
2,40792,3rdPrizeNo,592,6,1992,5,2,4432998,139261.32049,465317,-0.464576,40792,277062.375,16,13,ConsolationNo10,6,1992,5,2
3,40792,ConsolationNo1,5311,6,1992,5,2,4085680,149149.256699,494219,-0.03101,40792,255355.0,16,11,ConsolationNo5,6,1992,5,2
4,40792,ConsolationNo10,407,6,1992,5,2,1894441,170206.514732,463817,-0.027689,40792,270634.428571,7,5,ConsolationNo5,6,1992,5,2
