In [1]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')
%matplotlib inline

# Automated feature engineering
import featuretools as ft

# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

from pathlib import Path

In [2]:
# Processed lotto draw results, resolved relative to the notebook's working directory.
DATASET = Path("datasets/lotto/data_processed_2.csv")

# Seed NumPy's global random generator so repeated runs draw the same numbers.
# (42: the Answer to the Ultimate Question of Life, the Universe, and Everything.)
np.random.seed(seed=42)

In [3]:
# Load the processed CSV into a data frame.
# DrawDate is parsed as a datetime and PrizeType is forced to stay a string.
dataset = pd.read_csv(
    DATASET,
    header=0,
    sep=',',
    quotechar='"',
    parse_dates=['DrawDate'],
    dtype={'PrizeType': str},
)

In [4]:
# Work on a copy so the frame loaded from disk (`dataset`) is never modified.
df = dataset.copy()

In [5]:
# Inspect the dtype pandas assigned to every column.
df.dtypes

LuckyNo                             int64
1st_digit                           int64
2nd_digit                           int64
3rd_digit                           int64
4th_digit                           int64
PrizeType                          object
DrawNo                              int64
DaysSinceLastExactMatch             int64
DaysUntilNextExactMatch             int64
DaysSinceLastAnyMatch               int64
DaysUntilNextAnyMatch               int64
DrawYear                            int64
DrawMonth                           int64
DrawWeek                            int64
DrawDayofweek                       int64
DrawDayofyear                       int64
CumProbaExactMatch                float64
CumProbaAnyMatch                  float64
DrawDate                   datetime64[ns]
TotalMean                         float64
1stDigitMean                      float64
2ndDigitMean                      float64
3rdDigitMean                      float64
4thDigitMean                      

In [6]:
# Keep only the columns that identify a single draw result.
columns = ['DrawNo', 'DrawDate', 'PrizeType', 'LuckyNo']
# .copy() makes the narrowed frame independent of the wider one it was taken
# from, so adding columns to `df` in later cells cannot trigger pandas'
# SettingWithCopyWarning.
df = df[columns].copy()
df.shape, df.head(10)

((104673, 4),    DrawNo   DrawDate        PrizeType  LuckyNo
 0   40792 1992-05-06       1stPrizeNo       19
 1   40792 1992-05-06       2ndPrizeNo     1124
 2   40792 1992-05-06       3rdPrizeNo      592
 3   40792 1992-05-06   ConsolationNo1     5311
 4   40792 1992-05-06  ConsolationNo10      407
 5   40792 1992-05-06   ConsolationNo2     1949
 6   40792 1992-05-06   ConsolationNo3     1606
 7   40792 1992-05-06   ConsolationNo4     3775
 8   40792 1992-05-06   ConsolationNo5     6226
 9   40792 1992-05-06   ConsolationNo6     1271)

In [7]:
# df['LuckyNo'].value_counts().plot.bar(figsize=(6,5))

In [8]:
# df.groupby(['LuckyNo']).max().head(10)

In [9]:
# Non-null row counts per lucky number; show the ten largest numbers.
df.groupby('LuckyNo').count().tail(10)

Unnamed: 0_level_0,DrawNo,DrawDate,PrizeType
LuckyNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9990,10,10,10
9991,15,15,15
9992,10,10,10
9993,11,11,11
9994,13,13,13
9995,8,8,8
9996,10,10,10
9997,13,13,13
9998,9,9,9
9999,19,19,19


In [10]:
# Label every row with the ordinal number of its LuckyNo group.
# NOTE(review): in the previews result_id equals LuckyNo, which presumably
# holds only because every number from 0 upward occurs at least once — confirm
# before relying on the two being interchangeable.
df['result_id'] = df.groupby('LuckyNo').ngroup()

In [11]:
# Spot-check: every row recorded for result_id 9016.
df.loc[df['result_id'] == 9016]

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,result_id
9808,83395,1995-01-22,ConsolationNo7,9016,9016
11534,90895,1995-07-16,ConsolationNo8,9016,9016
15764,109296,1996-09-18,ConsolationNo6,9016,9016
32487,181901,2001-03-31,ConsolationNo8,9016,9016
45392,238104,2004-09-01,SpecialNo1,9016,9016
56919,288207,2007-08-12,SpecialNo4,9016,9016
67291,333310,2010-04-11,SpecialNo3,9016,9016
85833,413914,2014-10-28,SpecialNo7,9016,9016
92222,441716,2016-05-14,SpecialNo2,9016,9016


In [12]:
# Preview the first ten rows, now including the result_id column.
df.head(10)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,result_id
0,40792,1992-05-06,1stPrizeNo,19,19
1,40792,1992-05-06,2ndPrizeNo,1124,1124
2,40792,1992-05-06,3rdPrizeNo,592,592
3,40792,1992-05-06,ConsolationNo1,5311,5311
4,40792,1992-05-06,ConsolationNo10,407,407
5,40792,1992-05-06,ConsolationNo2,1949,1949
6,40792,1992-05-06,ConsolationNo3,1606,1606
7,40792,1992-05-06,ConsolationNo4,3775,3775
8,40792,1992-05-06,ConsolationNo5,6226,6226
9,40792,1992-05-06,ConsolationNo6,1271,1271


## Skewness of data

In [13]:
#  https://github.com/mengwangk/Automated-Manual-Comparison/blob/master/Retail%20Spending/notebooks/Automated%20Retail%20Spending.ipynb

In [14]:
def ecdf(data):
    """
    Compute the points of an empirical cumulative distribution function (ECDF).

    Returns a pair ``(x, y)``: ``x`` is ``data`` sorted ascending and ``y[i]``
    is ``(i + 1) / n``, the fraction of observations at or below ``x[i]``.
    Nothing is drawn here; pass the pair to a plotting call.
    """
    ordered = np.sort(data)
    count = len(ordered)
    fractions = np.arange(1, count + 1) / count
    return ordered, fractions

In [15]:
# d = df['LuckyNo'].astype(str)

In [16]:
# x, y = ecdf(d)
# plt.plot(x, y, marker = '.')
# plt.xlabel('Lucky No'); plt.ylabel('Percentile'); plt.title('ECDF of Lucky No');

### Generate new features

In [17]:
# Turn the row index into a regular column (renamed to instance_id in the next cell).
# NOTE: not idempotent — every run of this cell inserts another index column.
df = df.reset_index()

In [18]:
# Name the column created by reset_index(): it identifies each result row.
df = df.rename(columns={"index": "instance_id"})
df.head(10)

Unnamed: 0,instance_id,DrawNo,DrawDate,PrizeType,LuckyNo,result_id
0,0,40792,1992-05-06,1stPrizeNo,19,19
1,1,40792,1992-05-06,2ndPrizeNo,1124,1124
2,2,40792,1992-05-06,3rdPrizeNo,592,592
3,3,40792,1992-05-06,ConsolationNo1,5311,5311
4,4,40792,1992-05-06,ConsolationNo10,407,407
5,5,40792,1992-05-06,ConsolationNo2,1949,1949
6,6,40792,1992-05-06,ConsolationNo3,1606,1606
7,7,40792,1992-05-06,ConsolationNo4,3775,3775
8,8,40792,1992-05-06,ConsolationNo5,6226,6226
9,9,40792,1992-05-06,ConsolationNo6,1271,1271


#### Cutoff times

In [19]:
# One cutoff time per result row: the row's own DrawDate, keyed by instance_id.
# Built by plain selection instead of aggregating a groupby that has no value
# columns left; dropna / drop_duplicates / sort_values reproduce what grouping
# on both columns did (rows with missing keys dropped, unique pairs, sorted).
labels = (
    df[["instance_id", "DrawDate"]]
    .dropna()
    .drop_duplicates()
    .sort_values(["instance_id", "DrawDate"])
    .reset_index(drop=True)
    .rename(columns={"DrawDate": "cutoff_time"})
)

  stacked_values = np.vstack(map(np.asarray, values))


In [20]:
# Preview the first 30 (instance_id, cutoff_time) rows.
labels.head(30)

Unnamed: 0,instance_id,cutoff_time
0,0,1992-05-06
1,1,1992-05-06
2,2,1992-05-06
3,3,1992-05-06
4,4,1992-05-06
5,5,1992-05-06
6,6,1992-05-06
7,7,1992-05-06
8,8,1992-05-06
9,9,1992-05-06


In [21]:
# instance_id was only needed to build `labels`; remove it again.
df = df.drop(columns=['instance_id'])

In [22]:
# Check that the instance_id helper column has been removed.
df.head(10)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,result_id
0,40792,1992-05-06,1stPrizeNo,19,19
1,40792,1992-05-06,2ndPrizeNo,1124,1124
2,40792,1992-05-06,3rdPrizeNo,592,592
3,40792,1992-05-06,ConsolationNo1,5311,5311
4,40792,1992-05-06,ConsolationNo10,407,407
5,40792,1992-05-06,ConsolationNo2,1949,1949
6,40792,1992-05-06,ConsolationNo3,1606,1606
7,40792,1992-05-06,ConsolationNo4,3775,3775
8,40792,1992-05-06,ConsolationNo5,6226,6226
9,40792,1992-05-06,ConsolationNo6,1271,1271


In [23]:
# Empty entity set that will hold the results table and the entities derived from it.
es = ft.EntitySet(id="Draw Results")

In [24]:
# Add the entire data table as the "results" entity.
# `df` has no "results_index" column, so the index has to be generated:
# make_index=True asks for that explicitly instead of relying on the implicit
# fallback for a missing index column.
# A copy is handed over so that, should featuretools insert the index column
# in place, it never lands on the notebook-level `df` and the cell stays
# re-runnable.
# NOTE(review): the cutoff `labels` are keyed by instance_id; this presumably
# lines up with the generated index because both follow the same default row
# order — confirm if rows are ever filtered or re-sorted in between.
es.entity_from_dataframe("results",
                         dataframe=df.copy(),
                         index="results_index",
                         make_index=True,
                         time_index='DrawDate')

es['results']



Entity: results
  Variables:
    results_index (dtype: index)
    DrawNo (dtype: numeric)
    DrawDate (dtype: datetime_time_index)
    PrizeType (dtype: categorical)
    LuckyNo (dtype: numeric)
    result_id (dtype: numeric)
  Shape:
    (Rows: 104673, Columns: 6)

In [25]:
# Split a "draws" entity out of "results", keyed by LuckyNo.
# No additional variables are moved across.
es.normalize_entity(
    base_entity_id="results",
    new_entity_id="draws",
    index="LuckyNo",
)
es['draws'].df.head()

Unnamed: 0,LuckyNo,first_results_time
19,19,1992-05-06
407,407,1992-05-06
592,592,1992-05-06
950,950,1992-05-06
1124,1124,1992-05-06


In [34]:
# Preview the dataframe stored on the "results" entity.
es['results'].df.head(10)

Unnamed: 0,results_index,DrawNo,DrawDate,PrizeType,LuckyNo,result_id
0,0,40792,1992-05-06,1stPrizeNo,19,19
1,1,40792,1992-05-06,2ndPrizeNo,1124,1124
2,2,40792,1992-05-06,3rdPrizeNo,592,592
3,3,40792,1992-05-06,ConsolationNo1,5311,5311
4,4,40792,1992-05-06,ConsolationNo10,407,407
5,5,40792,1992-05-06,ConsolationNo2,1949,1949
6,6,40792,1992-05-06,ConsolationNo3,1606,1606
7,7,40792,1992-05-06,ConsolationNo4,3775,3775
8,8,40792,1992-05-06,ConsolationNo5,6226,6226
9,9,40792,1992-05-06,ConsolationNo6,1271,1271


In [27]:
# Spot-check the "draws" entity: the row for lucky number 19.
draws_df = es['draws'].df
draws_df.loc[draws_df['LuckyNo'] == 19].head()

Unnamed: 0,LuckyNo,first_results_time
19,19,1992-05-06


In [28]:
# Spot-check the "results" entity: the first rows in which lucky number 19 was drawn.
results_df = es['results'].df
results_df.loc[results_df['LuckyNo'] == 19].head()

Unnamed: 0,results_index,DrawNo,DrawDate,PrizeType,LuckyNo,result_id
0,0,40792,1992-05-06,1stPrizeNo,19,19
297,297,41992,1992-06-03,SpecialNo8,19,19
10813,10813,87795,1995-05-06,ConsolationNo1,19,19
11566,11566,90995,1995-07-19,SpecialNo7,19,19
14844,14844,105296,1996-06-16,ConsolationNo6,19,19


In [29]:
# Display the entity set summary (entities and their relationships).
es

Entityset: Draw Results
  Entities:
    results [Rows: 104673, Columns: 6]
    draws [Rows: 10000, Columns: 2]
  Relationships:
    results.LuckyNo -> draws.LuckyNo

#### Generate features

In [30]:
# Run deep feature synthesis on the "results" entity, using the per-row cutoff
# times in `labels`.
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='results',
    cutoff_time=labels,
    cutoff_time_in_index=True,   # keep the cutoff time as part of the result index
    max_depth=1,                 # no stacking of feature primitives
    chunk_size=len(labels),      # all cutoff rows are submitted as a single chunk
    n_jobs=-1,
    verbose=2,
)

Built 8 features
EntitySet scattered to workers in 1.006 seconds
Elapsed: 01:16 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 1/1 chunks


In [None]:
# Preview the first 50 rows of the generated feature matrix.
feature_matrix.head(50)