In [1]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline

In [25]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
import featuretools as ft
import matplotlib as mpl
import matplotlib.pyplot as plt
import re

In [3]:
# to make this notebook's output stable across runs
np.random.seed(42)

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from IPython.display import display
pd.options.display.max_columns = 50
pd.options.display.html.table_schema = True

## Functions

In [4]:
def add_datepart(df, fldname, drop=True, time=False):
    """Helper function that adds columns relevant to a date."""
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)

In [5]:
def download_file(url, path, download_path):
    """Download the dataset"""
    if not os.path.isdir(path):
        os.makedirs(path)
    urllib.request.urlretrieve(url, download_path)
    if (os.path.isfile(download_path)):
        print(f'Download path: {download_path}')

## Download dataset

In [6]:
import os
import zipfile
from six.moves import urllib

DOWNLOAD_ROOT = "http://www.sportstoto.com.my/"
LOTTO_PATH = os.path.join(os.getcwd(), "datasets", "lotto")

In [7]:
_4D_URL = DOWNLOAD_ROOT + "upload/4D.zip"
_4D_ZIP = os.path.join(LOTTO_PATH, "4D.zip")

In [8]:
# Download 4D
download_file(_4D_URL, LOTTO_PATH, _4D_ZIP)

Download path: /Users/mengwangk/workspace/development/dl-projects/datasets/lotto/4D.zip


In [9]:
downloaded_file = _4D_ZIP

In [15]:
# Read into data frame
df = pd.read_csv(downloaded_file, header=0, sep=',', quotechar='"', dtype=np.str)
df.tail(10)

Unnamed: 0,DrawNo,DrawDate,1stPrizeNo,2ndPrizeNo,3rdPrizeNo,SpecialNo1,SpecialNo2,SpecialNo3,SpecialNo4,SpecialNo5,SpecialNo6,SpecialNo7,SpecialNo8,SpecialNo9,SpecialNo10,ConsolationNo1,ConsolationNo2,ConsolationNo3,ConsolationNo4,ConsolationNo5,ConsolationNo6,ConsolationNo7,ConsolationNo8,ConsolationNo9,ConsolationNo10
4538,494619,20190512,5907,2641,1839,2617,1346,4533,6470,2898,4759,7335,5065,2781,3403,9303,2085,5137,6218,4033,1097,5750,3323,3660,4733
4539,494719,20190515,7439,4747,995,693,2326,6155,1860,6555,830,844,4587,1584,4897,8959,1630,4372,7245,6446,2382,1364,8163,6523,9291
4540,494819,20190518,4599,2772,675,7822,2971,4857,2592,2666,8753,7027,7050,3127,1905,5133,141,830,2442,4281,5616,1731,1441,4717,6522
4541,494919,20190519,9533,5236,6643,2726,9971,664,6970,3607,3404,920,8625,94,3791,2023,8989,406,9780,503,182,3016,5118,39,1665
4542,495019,20190522,2107,2140,5099,6017,6539,7674,5693,9347,2556,757,7523,5538,7792,6628,3068,5040,5548,8464,1348,4195,3481,1272,4319
4543,495119,20190525,5337,1484,1311,1056,5204,6491,3950,5034,517,976,2873,9890,3967,3750,2593,8374,3788,8717,396,8161,3780,3418,8313
4544,495219,20190526,5456,5388,6294,448,9000,9237,3870,3669,7182,2766,655,6450,462,8268,7579,4873,9143,8421,3413,4499,5424,8929,4376
4545,495319,20190529,6220,147,5344,2488,7321,8056,4491,9435,5319,8189,2549,6311,9456,2207,9043,8072,6455,8531,2449,8621,4478,4027,3250
4546,495419,20190601,7882,8550,7022,511,4738,128,911,1661,2986,2885,5687,4664,5916,2969,2006,9275,2089,3649,2537,3566,6964,4626,4336
4547,495519,20190602,2646,3531,7330,4150,7579,767,437,558,4071,904,7159,4811,5545,5914,6124,6946,2270,7425,6924,1698,2663,7664,1693


In [16]:
df_transposed = df.melt(id_vars=["DrawNo", "DrawDate"], var_name="PrizeType", value_name="LuckyNo")
df_transposed = df_transposed.sort_values(["DrawNo", "DrawDate", "PrizeType"], ascending=True)
df_transposed['DrawDate'] = pd.to_datetime(df_transposed['DrawDate'], format='%Y%m%d')
df_transposed.reset_index(inplace=True, drop=True)
df_transposed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104604 entries, 0 to 104603
Data columns (total 4 columns):
DrawNo       104604 non-null object
DrawDate     104604 non-null datetime64[ns]
PrizeType    104604 non-null object
LuckyNo      104604 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 3.2+ MB


In [17]:
df_transposed.tail(23)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo
104581,495519,2019-06-02,1stPrizeNo,2646
104582,495519,2019-06-02,2ndPrizeNo,3531
104583,495519,2019-06-02,3rdPrizeNo,7330
104584,495519,2019-06-02,ConsolationNo1,5914
104585,495519,2019-06-02,ConsolationNo10,1693
104586,495519,2019-06-02,ConsolationNo2,6124
104587,495519,2019-06-02,ConsolationNo3,6946
104588,495519,2019-06-02,ConsolationNo4,2270
104589,495519,2019-06-02,ConsolationNo5,7425
104590,495519,2019-06-02,ConsolationNo6,6924


In [18]:
df_transposed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104604 entries, 0 to 104603
Data columns (total 4 columns):
DrawNo       104604 non-null object
DrawDate     104604 non-null datetime64[ns]
PrizeType    104604 non-null object
LuckyNo      104604 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 3.2+ MB


## Train Test Split

In [52]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df_transposed, test_size=0.20, shuffle=False)
combined_df = [train_df, test_df]

display(len(train_df),train_df.head(10))
display(len(test_df), test_df.head(10))

83683

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo
0,40792,1992-05-06,1stPrizeNo,19
1,40792,1992-05-06,2ndPrizeNo,1124
2,40792,1992-05-06,3rdPrizeNo,592
3,40792,1992-05-06,ConsolationNo1,5311
4,40792,1992-05-06,ConsolationNo10,407
5,40792,1992-05-06,ConsolationNo2,1949
6,40792,1992-05-06,ConsolationNo3,1606
7,40792,1992-05-06,ConsolationNo4,3775
8,40792,1992-05-06,ConsolationNo5,6226
9,40792,1992-05-06,ConsolationNo6,1271


20921

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo
83683,404614,2014-04-16,ConsolationNo6,4803
83684,404614,2014-04-16,ConsolationNo7,92
83685,404614,2014-04-16,ConsolationNo8,1823
83686,404614,2014-04-16,ConsolationNo9,1419
83687,404614,2014-04-16,SpecialNo1,5382
83688,404614,2014-04-16,SpecialNo10,5878
83689,404614,2014-04-16,SpecialNo2,761
83690,404614,2014-04-16,SpecialNo3,987
83691,404614,2014-04-16,SpecialNo4,6871
83692,404614,2014-04-16,SpecialNo5,4534


## Engineer Features

In [53]:
def extract_digit(no:str):
    pass

df = df_transposed.copy()
df['1st_digit'] = df['LuckyNo'].str[0:1]
df['2st_digit'] = df['LuckyNo'].str[1:2]
df['3st_digit'] = df['LuckyNo'].str[2:3]
df['4st_digit'] = df['LuckyNo'].str[3:4]
#add_datepart(df, 'DrawDate', drop=False)
df.tail(23)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,1st_digit,2st_digit,3st_digit,4st_digit
104581,495519,2019-06-02,1stPrizeNo,2646,2,6,4,6
104582,495519,2019-06-02,2ndPrizeNo,3531,3,5,3,1
104583,495519,2019-06-02,3rdPrizeNo,7330,7,3,3,0
104584,495519,2019-06-02,ConsolationNo1,5914,5,9,1,4
104585,495519,2019-06-02,ConsolationNo10,1693,1,6,9,3
104586,495519,2019-06-02,ConsolationNo2,6124,6,1,2,4
104587,495519,2019-06-02,ConsolationNo3,6946,6,9,4,6
104588,495519,2019-06-02,ConsolationNo4,2270,2,2,7,0
104589,495519,2019-06-02,ConsolationNo5,7425,7,4,2,5
104590,495519,2019-06-02,ConsolationNo6,6924,6,9,2,4


## Feature engineering

In [45]:
df['1st_digit_rolling_mean'] = df['1st_digit'].rolling(window=23).mean()
df['2st_digit_rolling_mean'] = df['2st_digit'].rolling(window=23).mean()
df['3st_digit_rolling_mean'] = df['3st_digit'].rolling(window=23).mean()
df['4st_digit_rolling_mean'] = df['4st_digit'].rolling(window=23).mean()
df['total_digits_rolling_mean'] = (df['1st_digit_rolling_mean'] + df['2st_digit_rolling_mean'] + df['3st_digit_rolling_mean'] + df['4st_digit_rolling_mean']) / 4

In [46]:
df.head(46)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,1st_digit,2st_digit,3st_digit,4st_digit,1st_digit_rolling_mean,2st_digit_rolling_mean,3st_digit_rolling_mean,4st_digit_rolling_mean,total_digits_rolling_mean
0,40792,1992-05-06,1stPrizeNo,19,0,0,1,9,,,,,
1,40792,1992-05-06,2ndPrizeNo,1124,1,1,2,4,,,,,
2,40792,1992-05-06,3rdPrizeNo,592,0,5,9,2,,,,,
3,40792,1992-05-06,ConsolationNo1,5311,5,3,1,1,,,,,
4,40792,1992-05-06,ConsolationNo10,407,0,4,0,7,,,,,
5,40792,1992-05-06,ConsolationNo2,1949,1,9,4,9,,,,,
6,40792,1992-05-06,ConsolationNo3,1606,1,6,0,6,,,,,
7,40792,1992-05-06,ConsolationNo4,3775,3,7,7,5,,,,,
8,40792,1992-05-06,ConsolationNo5,6226,6,2,2,6,,,,,
9,40792,1992-05-06,ConsolationNo6,1271,1,2,7,1,,,,,


In [47]:
#add_datepart(df, 'DrawDate', drop=False)