In [2]:
# Reload imported modules automatically before each cell is executed
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the cell output
%matplotlib inline

In [3]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
import featuretools as ft
import matplotlib as mpl
import matplotlib.pyplot as plt
import re

In [4]:
import warnings

from IPython.display import display

# Seed NumPy's global random generator so repeated runs give the same draws
np.random.seed(42)

# Font sizes for axis labels and tick labels on every figure
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Silence the spurious "internal gelsd" warning (see SciPy issue #5998)
warnings.filterwarnings("ignore", message="^internal gelsd")

# Show up to 50 columns and enable the table-schema representation of frames
pd.set_option("display.max_columns", 50)
pd.set_option("display.html.table_schema", True)

## Functions

In [5]:
def download_file(url, path, download_path):
    """Download ``url`` to ``download_path``, creating directories as needed.

    Args:
        url: Address of the resource to fetch (any scheme ``urllib`` supports).
        path: Directory that is created if it does not exist yet.
        download_path: Full path of the file the payload is written to.

    Raises:
        urllib.error.URLError: If the resource cannot be retrieved.
        OSError: If a directory or the target file cannot be written.
    """
    # Imported here so the function does not rely on some other cell having
    # put ``urllib`` into the notebook namespace before it is called.
    import urllib.request

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(path, exist_ok=True)
    # The target file may live outside ``path``; make sure its folder exists too.
    target_dir = os.path.dirname(download_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    urllib.request.urlretrieve(url, download_path)
    if os.path.isfile(download_path):
        print(f'Download path: {download_path}')

## Download dataset

In [6]:
import os
import zipfile
from six.moves import urllib

# Base URL that the archive addresses are built from
# NOTE(review): plain-HTTP address — confirm whether the site also serves HTTPS
DOWNLOAD_ROOT = "http://www.sportstoto.com.my/"
# Local data folder, resolved against the working directory at the time this cell runs
LOTTO_PATH = os.path.join(os.getcwd(), "datasets", "lotto")

In [7]:
# Remote address of the 4D archive and the local file it is saved to
_4D_URL = DOWNLOAD_ROOT + "upload/4D.zip"
_4D_ZIP = os.path.join(LOTTO_PATH, "4D.zip")

In [8]:
# Download the 4D archive into LOTTO_PATH (fetched again on every run of this cell)
download_file(_4D_URL, LOTTO_PATH, _4D_ZIP)

Download path: /Users/mengwangk/workspace/development/dl-projects/datasets/lotto/4D.zip


In [9]:
# Name under which the following cells refer to the archive to load
downloaded_file = _4D_ZIP

In [10]:
# Read the archive into a data frame. Every column is loaded as text, which
# keeps leading zeros in identifiers such as DrawNo intact.
# The builtin `str` is used because the `np.str` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
df = pd.read_csv(downloaded_file, header=0, sep=',', quotechar='"', dtype=str)
df.tail(10)

Unnamed: 0,DrawNo,DrawDate,1stPrizeNo,2ndPrizeNo,3rdPrizeNo,SpecialNo1,SpecialNo2,SpecialNo3,SpecialNo4,SpecialNo5,SpecialNo6,SpecialNo7,SpecialNo8,SpecialNo9,SpecialNo10,ConsolationNo1,ConsolationNo2,ConsolationNo3,ConsolationNo4,ConsolationNo5,ConsolationNo6,ConsolationNo7,ConsolationNo8,ConsolationNo9,ConsolationNo10
4541,494919,20190519,9533,5236,6643,2726,9971,664,6970,3607,3404,920,8625,94,3791,2023,8989,406,9780,503,182,3016,5118,39,1665
4542,495019,20190522,2107,2140,5099,6017,6539,7674,5693,9347,2556,757,7523,5538,7792,6628,3068,5040,5548,8464,1348,4195,3481,1272,4319
4543,495119,20190525,5337,1484,1311,1056,5204,6491,3950,5034,517,976,2873,9890,3967,3750,2593,8374,3788,8717,396,8161,3780,3418,8313
4544,495219,20190526,5456,5388,6294,448,9000,9237,3870,3669,7182,2766,655,6450,462,8268,7579,4873,9143,8421,3413,4499,5424,8929,4376
4545,495319,20190529,6220,147,5344,2488,7321,8056,4491,9435,5319,8189,2549,6311,9456,2207,9043,8072,6455,8531,2449,8621,4478,4027,3250
4546,495419,20190601,7882,8550,7022,511,4738,128,911,1661,2986,2885,5687,4664,5916,2969,2006,9275,2089,3649,2537,3566,6964,4626,4336
4547,495519,20190602,2646,3531,7330,4150,7579,767,437,558,4071,904,7159,4811,5545,5914,6124,6946,2270,7425,6924,1698,2663,7664,1693
4548,495619,20190605,7389,6576,1495,1678,1826,1359,6090,3252,1314,7001,4819,1715,2177,3610,5099,6276,4486,1904,5901,9713,977,9927,678
4549,495719,20190608,307,2971,7758,2927,7287,3709,3925,4206,4703,1144,7351,6775,5228,6780,7829,6894,6801,6,4591,6964,9531,4031,3397
4550,495819,20190609,397,1232,1219,9569,791,1218,3669,9517,1844,5050,9073,2976,6223,8611,4622,483,1808,1132,3153,8902,6788,3596,5031


In [11]:
# Reshape to long format: one row per (draw, prize slot) instead of one column per slot.
# Rows are ordered while DrawDate is still text; the date is parsed and the index
# renumbered afterwards.
df_transposed = (
    df.melt(id_vars=["DrawNo", "DrawDate"], var_name="PrizeType", value_name="LuckyNo")
    .sort_values(["DrawNo", "DrawDate", "PrizeType"], ascending=True)
    .assign(DrawDate=lambda frame: pd.to_datetime(frame['DrawDate'], format='%Y%m%d'))
    .reset_index(drop=True)
)
df_transposed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104673 entries, 0 to 104672
Data columns (total 4 columns):
DrawNo       104673 non-null object
DrawDate     104673 non-null datetime64[ns]
PrizeType    104673 non-null object
LuckyNo      104673 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 3.2+ MB


In [12]:
# 23 rows = one complete draw (3 top prizes + 10 special + 10 consolation numbers)
df_transposed.tail(23)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo
104650,495819,2019-06-09,1stPrizeNo,397
104651,495819,2019-06-09,2ndPrizeNo,1232
104652,495819,2019-06-09,3rdPrizeNo,1219
104653,495819,2019-06-09,ConsolationNo1,8611
104654,495819,2019-06-09,ConsolationNo10,5031
104655,495819,2019-06-09,ConsolationNo2,4622
104656,495819,2019-06-09,ConsolationNo3,483
104657,495819,2019-06-09,ConsolationNo4,1808
104658,495819,2019-06-09,ConsolationNo5,1132
104659,495819,2019-06-09,ConsolationNo6,3153


In [13]:
# NOTE(review): repeats the info() summary already printed by the reshaping cell
df_transposed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104673 entries, 0 to 104672
Data columns (total 4 columns):
DrawNo       104673 non-null object
DrawDate     104673 non-null datetime64[ns]
PrizeType    104673 non-null object
LuckyNo      104673 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 3.2+ MB


In [14]:
# Number of rows recorded for each draw number
df_transposed['DrawNo'].value_counts()

314209    23
488118    23
127697    23
476918    23
374812    23
093295    23
229804    23
174500    23
050892    23
203602    23
122597    23
120997    23
074994    23
416014    23
205302    23
266906    23
116197    23
367012    23
280107    23
289507    23
186201    23
437316    23
280407    23
288307    23
393013    23
470617    23
396313    23
419215    23
338510    23
470818    23
          ..
328209    23
226603    23
375912    23
154899    23
253105    23
416214    23
174200    23
454517    23
276906    23
477918    23
237804    23
145399    23
441316    23
212303    23
372212    23
377712    23
397113    23
077394    23
358411    23
487118    23
217503    23
124097    23
321309    23
060593    23
393613    23
198002    23
312709    23
267306    23
311209    23
463517    23
Name: DrawNo, Length: 4551, dtype: int64

In [25]:
# Sanity check: with 23 rows per draw, the row count divided by 23 should
# equal the number of distinct draw numbers.
print(len(df_transposed.index) / 23)
print(len(df_transposed['DrawNo'].unique()))

4551.0
4551


In [30]:
# Write the long-format table into the data folder, leaving out the row index
df_transposed.to_csv(os.path.join(LOTTO_PATH, "data_all.csv"), index=False)

In [31]:
# List the data folder, e.g. to confirm that data_all.csv was written
os.listdir(LOTTO_PATH)

['4D.zip', 'models', 'data_all.csv', '.ipynb_checkpoints']

In [32]:
import itertools

# Interactive help lookup (IPython `?` syntax) — scratch exploration;
# NOTE(review): no cell visible here uses itertools afterwards, consider removing
itertools.combinations?

Init signature: itertools.combinations(self, /, *args, **kwargs)
Docstring:     
combinations(iterable, r) --> combinations object

Return successive r-length combinations of elements in the iterable.

combinations(range(4), 3) --> (0,1,2), (0,1,3), (0,2,3), (1,2,3)
Type:           type
Subclasses:     


In [34]:
# Scratch cell: prints the integers 0 through 3, one per line
for i in range(4):
    print(i)

0
1
2
3
