In [1]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
import featuretools as ft
import matplotlib as mpl
import matplotlib.pyplot as plt
import re
import datetime
from datetime import date
from pathlib import Path

In [3]:
# to make this notebook's output stable across runs
np.random.seed(42)

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from IPython.display import display
pd.options.display.max_columns = 50
pd.options.display.html.table_schema = True

## Functions

In [4]:
def add_datepart(df, fldname, drop=True, time=False):
    """Expand a date column of `df` into calendar-feature columns, in place.

    The new column names are built by stripping a trailing ``Date``/``date``
    from `fldname` and appending the feature name, e.g. ``DrawDate`` yields
    ``DrawYear``, ``DrawMonth``, ..., ``DrawElapsed``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Columns are added to (and `fldname` optionally
        dropped from) this very object.
    fldname : str
        Name of the column holding the dates. If it is not already a
        datetime column it is parsed with ``pd.to_datetime`` and the parsed
        values are written back to `df`.
    drop : bool, default True
        Remove `fldname` from `df` once the features have been added.
    time : bool, default False
        Also add ``Hour``, ``Minute`` and ``Second`` columns.

    Returns
    -------
    None
        `df` is mutated in place.
    """
    fld = df[fldname]
    # Covers naive and tz-aware datetime columns alike, and does not choke on
    # pandas extension dtypes the way np.issubdtype does.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
            'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is
            # its replacement. Cast so the column matches the other integer
            # date parts instead of being a nullable UInt32.
            values = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            values = getattr(fld.dt, n.lower())
        df[targ_pre + n] = values
    # Whole seconds since the Unix epoch. Subtracting an epoch timestamp keeps
    # this correct whatever resolution (ns/us/s) the column is stored in.
    epoch = pd.Timestamp(0, unit='s', tz=fld.dt.tz)
    df[targ_pre + 'Elapsed'] = (fld - epoch) // pd.Timedelta(seconds=1)
    if drop:
        df.drop(fldname, axis=1, inplace=True)

## Constants

In [5]:
TOTAL_NUMBERS = 10000

## Load Data

In [6]:
PATH = Path("datasets/lotto")
DATASET = PATH/'data_all.csv'

In [7]:
# `dataset` holds the original, unmodified data
dataset = pd.read_csv(DATASET, parse_dates=['DrawDate'], dtype={'PrizeType': str})
dataset.columns

Index(['DrawNo', 'DrawDate', 'PrizeType', 'LuckyNo'], dtype='object')

In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104673 entries, 0 to 104672
Data columns (total 4 columns):
DrawNo       104673 non-null int64
DrawDate     104673 non-null datetime64[ns]
PrizeType    104673 non-null object
LuckyNo      104673 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 3.2+ MB


In [9]:
dataset.dtypes

DrawNo                int64
DrawDate     datetime64[ns]
PrizeType            object
LuckyNo               int64
dtype: object

In [10]:
dataset.nunique()

DrawNo        4551
DrawDate      4551
PrizeType       23
LuckyNo      10000
dtype: int64

## Train Test Split

In [11]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(dataset, test_size=0.20, shuffle=False)
combined_df = [train_df, test_df]

display(len(train_df))
display(len(test_df))

83738

20935

## Feature generation

#### Days Since Last Exact Match

In [12]:
df = dataset.copy()

In [13]:
df['DaysSinceLastExactMatch'] = 0

result = df.loc[df.LuckyNo == 1234]
display(len(result), result)

10

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,DaysSinceLastExactMatch
231,41792,1992-05-28,2ndPrizeNo,1234,0
18696,121997,1997-07-12,SpecialNo7,1234,0
35191,193801,2001-12-26,2ndPrizeNo,1234,0
47146,245705,2005-02-13,SpecialNo6,1234,0
47753,248405,2005-04-06,ConsolationNo2,1234,0
49421,255605,2005-09-11,SpecialNo4,1234,0
71468,351511,2011-04-24,ConsolationNo4,1234,0
71528,351711,2011-04-27,SpecialNo8,1234,0
98841,470517,2017-12-27,ConsolationNo7,1234,0
101068,480218,2018-07-15,ConsolationNo3,1234,0


In [14]:
datetime.datetime.strptime('1997-07-12', '%Y-%m-%d').date() - datetime.datetime.strptime('1992-05-28', '%Y-%m-%d').date()

datetime.timedelta(days=1871)

In [15]:
# Worked example for a single number: gaps, in days, between successive
# rows of `df` in which 1234 was drawn.
is_1234 = df.LuckyNo == 1234
matched_dates = df.loc[is_1234].DrawDate.values
day_gaps = np.diff(matched_dates).astype('timedelta64[D]')
# The first occurrence has no predecessor, so it gets a gap of 0.
days = np.insert(day_gaps, 0, 0)
display(matched_dates, days)
len(days)

array(['1992-05-28T00:00:00.000000000', '1997-07-12T00:00:00.000000000',
       '2001-12-26T00:00:00.000000000', '2005-02-13T00:00:00.000000000',
       '2005-04-06T00:00:00.000000000', '2005-09-11T00:00:00.000000000',
       '2011-04-24T00:00:00.000000000', '2011-04-27T00:00:00.000000000',
       '2017-12-27T00:00:00.000000000', '2018-07-15T00:00:00.000000000'],
      dtype='datetime64[ns]')

array([   0, 1871, 1628, 1145,   52,  158, 2051,    3, 2436,  200],
      dtype='timedelta64[D]')

10

In [16]:
def get_elapsed_days(data, no, fld, func):
    """Write the day gap between consecutive matching draws into `data[fld]`.

    Rows are selected with ``func(data.LuckyNo, no)``. Each selected row
    receives the absolute number of whole days between its ``DrawDate`` and
    that of the previous selected row *in the current row order of `data`*;
    the first selected row receives 0. The caller therefore controls the
    meaning through sorting: ascending by ``DrawDate`` gives "days since the
    previous match", descending gives "days until the next match".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``LuckyNo`` and ``DrawDate`` columns; modified in place.
    no : int
        The number handed to `func` as the value to match against.
    fld : str
        Name of the column that receives the gaps.
    func : callable
        ``func(lucky_no_series, no)`` returning a boolean mask over `data`.

    Returns
    -------
    None
    """
    mask = func(data.LuckyNo, no)
    draw_dates = data.loc[mask, 'DrawDate'].values
    if len(draw_dates) == 0:
        # Nothing matched, so there is nothing to write.
        return
    gaps = np.absolute(np.diff(draw_dates).astype('timedelta64[D]')).astype('int64')
    # One vectorised assignment instead of a Python-level loop over rows.
    data.loc[mask, fld] = np.insert(gaps, 0, 0)

In [17]:
def exact_match(lucky_no, no):
    """Compare `lucky_no` with `no` for equality.

    With a pandas Series as `lucky_no` the result is an element-wise boolean
    Series, which is how `get_elapsed_days` uses it as a row mask.
    """
    is_same = (lucky_no == no)
    return is_same

In [18]:
df.sort_values(by=['DrawDate'], ascending=True, inplace=True)

In [19]:
# `df` was sorted by DrawDate ascending in the previous cell, so the gap to
# the previous matching row is the number of days since the last exact match.
for no in range(TOTAL_NUMBERS):
    if no % 1000 == 0:
        # Report against the configured constant rather than a hard-coded total.
        print('Processing %s of %s' % (no, TOTAL_NUMBERS))
    get_elapsed_days(df, no, 'DaysSinceLastExactMatch', exact_match)

Processing 0 of 10000
Processing 1000 of 10000
Processing 2000 of 10000
Processing 3000 of 10000
Processing 4000 of 10000
Processing 5000 of 10000
Processing 6000 of 10000
Processing 7000 of 10000
Processing 8000 of 10000
Processing 9000 of 10000


In [20]:
display(df[df.LuckyNo==1234])

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,DaysSinceLastExactMatch
231,41792,1992-05-28,2ndPrizeNo,1234,0
18696,121997,1997-07-12,SpecialNo7,1234,1871
35191,193801,2001-12-26,2ndPrizeNo,1234,1628
47146,245705,2005-02-13,SpecialNo6,1234,1145
47753,248405,2005-04-06,ConsolationNo2,1234,52
49421,255605,2005-09-11,SpecialNo4,1234,158
71468,351511,2011-04-24,ConsolationNo4,1234,2051
71528,351711,2011-04-27,SpecialNo8,1234,3
98841,470517,2017-12-27,ConsolationNo7,1234,2436
101068,480218,2018-07-15,ConsolationNo3,1234,200


#### Days Until Next Exact Match

In [21]:
df['DaysUntilNextExactMatch'] = 0
df.tail(20)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,DaysSinceLastExactMatch,DaysUntilNextExactMatch
104670,495819,2019-06-09,SpecialNo7,5050,120,0
104667,495819,2019-06-09,SpecialNo4,3669,14,0
104668,495819,2019-06-09,SpecialNo5,9517,603,0
104669,495819,2019-06-09,SpecialNo6,1844,1187,0
104662,495819,2019-06-09,ConsolationNo9,3596,2954,0
104666,495819,2019-06-09,SpecialNo3,1218,1628,0
104661,495819,2019-06-09,ConsolationNo8,6788,407,0
104655,495819,2019-06-09,ConsolationNo2,4622,176,0
104659,495819,2019-06-09,ConsolationNo6,3153,449,0
104658,495819,2019-06-09,ConsolationNo5,1132,189,0


In [22]:
df.sort_values(by=['DrawDate'], ascending=False, inplace=True)

In [23]:
# `df` was sorted by DrawDate descending in the previous cell, so the gap to
# the previous matching row is the number of days until the next exact match.
for no in range(TOTAL_NUMBERS):
    if no % 1000 == 0:
        # Report against the configured constant rather than a hard-coded total.
        print('Processing %s of %s' % (no, TOTAL_NUMBERS))
    get_elapsed_days(df, no, 'DaysUntilNextExactMatch', exact_match)

Processing 0 of 10000
Processing 1000 of 10000
Processing 2000 of 10000
Processing 3000 of 10000
Processing 4000 of 10000
Processing 5000 of 10000
Processing 6000 of 10000
Processing 7000 of 10000
Processing 8000 of 10000
Processing 9000 of 10000


In [30]:
display(df[df.LuckyNo==1234])

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,DaysSinceLastExactMatch,DaysUntilNextExactMatch,DaysSinceLastAnyMatch
231,41792,1992-05-28,2ndPrizeNo,1234,0,1871,0
18696,121997,1997-07-12,SpecialNo7,1234,1871,1628,0
35191,193801,2001-12-26,2ndPrizeNo,1234,1628,1145,0
47146,245705,2005-02-13,SpecialNo6,1234,1145,52,0
47753,248405,2005-04-06,ConsolationNo2,1234,52,158,0
49421,255605,2005-09-11,SpecialNo4,1234,158,2051,0
71468,351511,2011-04-24,ConsolationNo4,1234,2051,3,0
71528,351711,2011-04-27,SpecialNo8,1234,3,2436,0
98841,470517,2017-12-27,ConsolationNo7,1234,2436,200,0
101068,480218,2018-07-15,ConsolationNo3,1234,200,0,0


#### Days Since Last Any Match

In [25]:
def pad(val, width=4):
    """Return `val` as a string, left-padded with zeros to `width` characters.

    Parameters
    ----------
    val : object
        Value to render; it is converted with ``str`` first.
    width : int, default 4
        Minimum length of the result. Longer values are returned unchanged.

    Returns
    -------
    str
    """
    return str(val).zfill(width)

In [80]:
from itertools import permutations

def get_permutations(no):
    """Return every distinct number obtainable by reordering the digits of `no`.

    `no` is zero-padded with `pad` before permuting, so leading zeros take
    part in the reordering (1 -> 1, 10, 100, 1000).

    Parameters
    ----------
    no : int
        Number whose digits are permuted.

    Returns
    -------
    list of int
        The distinct permutations in ascending order. Sorting makes the
        result reproducible; iterating a set of digit tuples directly yields
        an order that can change from one interpreter run to the next.
    """
    digits = pad(no)
    return sorted({int(''.join(p)) for p in permutations(digits)})
    
def any_match(lucky_no, no):
    """Flag the entries of `lucky_no` that are some digit permutation of `no`.

    `lucky_no` must provide ``.isin`` (a pandas Series in this notebook); the
    result of that call is returned unchanged.
    """
    candidates = get_permutations(no)
    return lucky_no.isin(candidates)

In [89]:
df['DaysSinceLastAnyMatch'] = 0
df.sort_values(by=['DrawDate'], ascending=True, inplace=True)
display(df[df.LuckyNo==1234])

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,DaysSinceLastExactMatch,DaysUntilNextExactMatch,DaysSinceLastAnyMatch
231,41792,1992-05-28,2ndPrizeNo,1234,0,1871,0
18696,121997,1997-07-12,SpecialNo7,1234,1871,1628,0
35191,193801,2001-12-26,2ndPrizeNo,1234,1628,1145,0
47146,245705,2005-02-13,SpecialNo6,1234,1145,52,0
47753,248405,2005-04-06,ConsolationNo2,1234,52,158,0
49421,255605,2005-09-11,SpecialNo4,1234,158,2051,0
71468,351511,2011-04-24,ConsolationNo4,1234,2051,3,0
71528,351711,2011-04-27,SpecialNo8,1234,3,2436,0
98841,470517,2017-12-27,ConsolationNo7,1234,2436,200,0
101068,480218,2018-07-15,ConsolationNo3,1234,200,0,0


In [90]:
# for no in range(0, 1000):
#     if no % 1000 == 0:
#         print('Processing %s of 10000' % str(no))
#     get_elapsed_days(df, no, 'DaysSinceLastAnyMatch', any_match)

get_elapsed_days(df, 1234, 'DaysSinceLastAnyMatch', any_match)
display(df[df.LuckyNo.isin(get_permutations(1234))].head(10))

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,DaysSinceLastExactMatch,DaysUntilNextExactMatch,DaysSinceLastAnyMatch
231,41792,1992-05-28,2ndPrizeNo,1234,0,1871,0
1133,45692,1992-08-27,ConsolationNo3,3142,0,265,91
1199,45992,1992-09-03,ConsolationNo1,1342,0,500,7
1413,46892,1992-09-24,ConsolationNo7,2431,0,1196,21
1802,48592,1992-11-04,ConsolationNo5,3421,0,36,41
2152,50092,1992-12-09,SpecialNo1,4132,0,801,35
2169,50192,1992-12-10,ConsolationNo4,3421,36,787,1
2247,50492,1992-12-17,SpecialNo3,3214,0,528,7
2259,50592,1992-12-20,ConsolationNo2,1243,0,578,3
3729,56993,1993-05-19,ConsolationNo1,3142,265,368,150


#### Days Until Next Any Match

#### Extract digits by position

In [None]:
# LuckyNo is stored as an integer: it has no `.str` accessor and has lost its
# leading zeros (19 stands for "0019"), so render it as zero-padded text first.
lucky_no_padded = df['LuckyNo'].astype(str).str.zfill(4)
# Column names are kept exactly as the following cells reference them.
for position, column in enumerate(['1st_digit', '2st_digit', '3st_digit', '4st_digit']):
    # Keep the digits numeric so rolling means can be computed on them.
    df[column] = lucky_no_padded.str[position].astype(int)
#add_datepart(df, 'DrawDate', drop=False)
df.tail(23)

#### Digit pairings

In [45]:
# Rolling mean of each digit position over a window of 23 rows.
digit_columns = ['1st_digit', '2st_digit', '3st_digit', '4st_digit']
rolling_mean_columns = [name + '_rolling_mean' for name in digit_columns]
for digit_column, mean_column in zip(digit_columns, rolling_mean_columns):
    df[mean_column] = df[digit_column].rolling(window=23).mean()

# Average of the four per-position means, added left to right so that a NaN
# in any position still propagates to the total.
digit_mean_sum = df[rolling_mean_columns[0]]
for mean_column in rolling_mean_columns[1:]:
    digit_mean_sum = digit_mean_sum + df[mean_column]
df['total_digits_rolling_mean'] = digit_mean_sum / 4

In [46]:
df.head(46)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,1st_digit,2st_digit,3st_digit,4st_digit,1st_digit_rolling_mean,2st_digit_rolling_mean,3st_digit_rolling_mean,4st_digit_rolling_mean,total_digits_rolling_mean
0,40792,1992-05-06,1stPrizeNo,19,0,0,1,9,,,,,
1,40792,1992-05-06,2ndPrizeNo,1124,1,1,2,4,,,,,
2,40792,1992-05-06,3rdPrizeNo,592,0,5,9,2,,,,,
3,40792,1992-05-06,ConsolationNo1,5311,5,3,1,1,,,,,
4,40792,1992-05-06,ConsolationNo10,407,0,4,0,7,,,,,
5,40792,1992-05-06,ConsolationNo2,1949,1,9,4,9,,,,,
6,40792,1992-05-06,ConsolationNo3,1606,1,6,0,6,,,,,
7,40792,1992-05-06,ConsolationNo4,3775,3,7,7,5,,,,,
8,40792,1992-05-06,ConsolationNo5,6226,6,2,2,6,,,,,
9,40792,1992-05-06,ConsolationNo6,1271,1,2,7,1,,,,,


In [47]:
#add_datepart(df, 'DrawDate', drop=False)