# FortuneNet

## Magic

In [1]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline

## Imports

In [2]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
import featuretools as ft
import matplotlib as mpl
import matplotlib.pyplot as plt

In [3]:
from fastai.basics import *

## Setup

In [4]:
# to make this notebook's output stable across runs
np.random.seed(42)

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from IPython.display import display
pd.options.display.max_columns = 50
pd.options.display.html.table_schema = True

In [5]:
# Where to save the figures
PROJECT_ROOT_DIR = "."
ID = "fortunenet"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## Helper Functions

In [8]:
def add_datepart(df, fldname, drop=True, time=False):
    "Helper function that adds columns relevant to a date."
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)

In [19]:
def download_file(url, path, download_path):
    """Download the dataset"""
    if not os.path.isdir(path):
        os.makedirs(path)
    urllib.request.urlretrieve(url, download_path)
    if (os.path.isfile(download_path)):
        print(f'Download path: {download_path}')

## Datasets - Run the section you want to analyse

In [20]:
import os
import zipfile
from six.moves import urllib

DOWNLOAD_ROOT = "http://www.sportstoto.com.my/"
LOTTO_PATH = os.path.join(os.getcwd(), "datasets", "lotto")

### Dataset - 4D

In [21]:
_4D_URL = DOWNLOAD_ROOT + "upload/4D.zip"
_4D_ZIP = os.path.join(LOTTO_PATH, "4D.zip")

In [22]:
# Download 4D
download_file(_4D_URL, LOTTO_PATH, _4D_ZIP)

Download path: /Users/mengwangk/workspace/development/FortuneNet/datasets/lotto/4D.zip


In [23]:
downloaded_file = _4D_ZIP

### Dataset - Others

## Read the downloaded file

In [24]:
# Read into data frame
df = pd.read_csv(downloaded_file, header=0, sep=',', quotechar='"')

In [25]:
# Tail the last 10 rows
df.tail(10)

Unnamed: 0,DrawNo,DrawDate,1stPrizeNo,2ndPrizeNo,3rdPrizeNo,SpecialNo1,SpecialNo2,SpecialNo3,SpecialNo4,SpecialNo5,SpecialNo6,SpecialNo7,SpecialNo8,SpecialNo9,SpecialNo10,ConsolationNo1,ConsolationNo2,ConsolationNo3,ConsolationNo4,ConsolationNo5,ConsolationNo6,ConsolationNo7,ConsolationNo8,ConsolationNo9,ConsolationNo10
4523,493119,20190407,9513,660,6815,3003,3883,301,1393,4808,9067,3605,2139,8117,8667,4543,1160,10,2463,5289,9962,7603,7355,6040,9375
4524,493219,20190410,1002,7403,123,4950,1195,524,6065,2364,8959,5930,989,8223,1889,8534,471,999,2114,5833,6200,8957,1097,4668,7028
4525,493319,20190413,9397,1660,2187,9181,2838,4971,400,9201,4830,8293,2589,2962,6365,8,1401,7610,6501,2624,4544,4911,7295,3974,8741
4526,493419,20190414,3196,3443,1796,1957,7325,6801,2098,2517,2231,8870,1121,9679,7159,8711,4940,4441,6414,4897,5403,9498,3661,9290,1809
4527,493519,20190417,4007,8246,1493,2102,194,6853,9083,6763,2599,5192,3514,8253,8548,2385,3325,6311,1837,8312,8840,8743,5015,335,5203
4528,493619,20190420,6622,7203,4982,993,7631,8813,6902,8474,4372,5081,2037,8326,6705,5764,5979,1951,7761,9182,8027,5130,8119,2508,5710
4529,493719,20190421,3634,5822,4752,7700,1287,2736,5179,4386,997,312,918,4313,6232,6400,2898,352,3326,6903,3435,3716,5085,4527,2554
4530,493819,20190424,1615,9328,6138,6914,4874,6988,2663,3020,1152,7209,527,3310,1081,6585,4517,1314,1181,9444,6827,1871,6984,7105,9314
4531,493919,20190427,3582,1486,8937,8464,5668,4914,2182,2689,7176,3315,3413,4247,9621,955,4440,8582,6238,2945,2016,2240,1286,8726,715
4532,494019,20190428,8975,1998,8973,3112,2316,8050,4631,3584,9329,1714,1572,5405,1344,5392,1206,7955,1235,1197,4883,6971,7918,155,6495


## Preprocessing

### Transpose the data

In [38]:
df_transposed = df.melt(id_vars=["DrawNo", "DrawDate"], var_name="PrizeType", value_name="LuckyNo")
df_transposed = df_transposed.sort_values(["DrawNo", "DrawDate", "PrizeType"], ascending=False)
df_transposed['DrawDate'] = pd.to_datetime(df_transposed['DrawDate'], format='%Y%m%d')
df_transposed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104259 entries, 54395 to 0
Data columns (total 4 columns):
DrawNo       104259 non-null int64
DrawDate     104259 non-null datetime64[ns]
PrizeType    104259 non-null object
LuckyNo      104259 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 4.0+ MB


In [46]:
# Tail the data
df_transposed.head(10)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo
54395,494019,2019-04-28,SpecialNo9,5405
49862,494019,2019-04-28,SpecialNo8,1572
45329,494019,2019-04-28,SpecialNo7,1714
40796,494019,2019-04-28,SpecialNo6,9329
36263,494019,2019-04-28,SpecialNo5,3584
31730,494019,2019-04-28,SpecialNo4,4631
27197,494019,2019-04-28,SpecialNo3,8050
22664,494019,2019-04-28,SpecialNo2,2316
58928,494019,2019-04-28,SpecialNo10,1344
18131,494019,2019-04-28,SpecialNo1,3112


In [45]:
# Split the date into year, month, day, day of week
# df_transformed["DrawYear"] = pd.DatetimeIndex(df_transformed["DrawDate"]).year
# df_transformed["DrawMonth"] = pd.DatetimeIndex(df_transformed["DrawDate"]).month
# df_transformed["DrawDay"] = pd.DatetimeIndex(df_transformed["DrawDate"]).day
# df_transformed["DrawDayOfYear"] = pd.DatetimeIndex(df_transformed["DrawDate"]).dayofyear
# df_transformed["DrawDayOfWeek"] = pd.DatetimeIndex(df_transformed["DrawDate"]).dayofweek
# df_transformed["DrawWeekDay"] = pd.DatetimeIndex(df_transformed["DrawDate"]).weekday
# df_transformed["DrawWeekOfYear"] = pd.DatetimeIndex(df_transformed["DrawDate"]).weekofyear
# df_transformed["DrawQuarter"] = pd.DatetimeIndex(df_transformed["DrawDate"]).quarter
df_processed = df_transposed.copy()
add_datepart(df_processed, 'DrawDate')
df_processed.head(46)

Unnamed: 0,DrawNo,PrizeType,LuckyNo,DrawYear,DrawMonth,DrawWeek,DrawDay,DrawDayofweek,DrawDayofyear,DrawIs_month_end,DrawIs_month_start,DrawIs_quarter_end,DrawIs_quarter_start,DrawIs_year_end,DrawIs_year_start,DrawElapsed
54395,494019,SpecialNo9,5405,2019,4,17,28,6,118,False,False,False,False,False,False,1556409600
49862,494019,SpecialNo8,1572,2019,4,17,28,6,118,False,False,False,False,False,False,1556409600
45329,494019,SpecialNo7,1714,2019,4,17,28,6,118,False,False,False,False,False,False,1556409600
40796,494019,SpecialNo6,9329,2019,4,17,28,6,118,False,False,False,False,False,False,1556409600
36263,494019,SpecialNo5,3584,2019,4,17,28,6,118,False,False,False,False,False,False,1556409600
31730,494019,SpecialNo4,4631,2019,4,17,28,6,118,False,False,False,False,False,False,1556409600
27197,494019,SpecialNo3,8050,2019,4,17,28,6,118,False,False,False,False,False,False,1556409600
22664,494019,SpecialNo2,2316,2019,4,17,28,6,118,False,False,False,False,False,False,1556409600
58928,494019,SpecialNo10,1344,2019,4,17,28,6,118,False,False,False,False,False,False,1556409600
18131,494019,SpecialNo1,3112,2019,4,17,28,6,118,False,False,False,False,False,False,1556409600


In [41]:
%matplotlib inlinedd
import matplotlib.pyplot as plt
#df_transformed.hist(bins=50, figsize=(20,15))
#plt.show()

In [42]:
# Encode the prize types
from sklearn.preprocessing import LabelBinarizer
prize_type_encoder = LabelBinarizer()
prize_type_encoded =  prize_type_encoder.fit_transform(df_transformed['PrizeType'])
print(prize_type_encoder.classes_)
print(len(prize_type_encoded))
print(prize_type_encoded)

['1stPrizeNo' '2ndPrizeNo' '3rdPrizeNo' 'ConsolationNo1' 'ConsolationNo10'
 'ConsolationNo2' 'ConsolationNo3' 'ConsolationNo4' 'ConsolationNo5'
 'ConsolationNo6' 'ConsolationNo7' 'ConsolationNo8' 'ConsolationNo9'
 'SpecialNo1' 'SpecialNo10' 'SpecialNo2' 'SpecialNo3' 'SpecialNo4'
 'SpecialNo5' 'SpecialNo6' 'SpecialNo7' 'SpecialNo8' 'SpecialNo9']
104075
[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [43]:
# Generate statistics
df_transformed.describe(include=[np.number])

Unnamed: 0,DrawNo,LuckyNo,DrawYear,DrawMonth,DrawDay,DrawDayOfYear,DrawDayOfWeek,DrawWeekDay,DrawWeekOfYear,DrawQuarter
count,104075.0,104075.0,104075.0,104075.0,104075.0,104075.0,104075.0,104075.0,104075.0,104075.0
mean,266999.380331,4975.299054,2005.678674,6.510939,15.633149,182.672707,4.051934,4.051934,26.494365,2.503646
std,130638.277059,2885.777935,7.767156,3.47026,8.943265,106.093162,1.843786,1.843786,15.155788,1.123259
min,40792.0,0.0,1992.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,153899.0,2464.0,1999.0,3.0,8.0,90.0,2.0,2.0,13.0,1.0
50%,267006.0,4954.0,2006.0,7.0,16.0,183.0,5.0,5.0,27.0,3.0
75%,380112.0,7466.0,2012.0,10.0,24.0,275.0,6.0,6.0,40.0,4.0
max,493219.0,9999.0,2019.0,12.0,31.0,366.0,6.0,6.0,53.0,4.0


In [65]:
# Value counts
# df_transformed['LuckyNo'].value_counts()

## Pipeline

In [46]:
# Imports
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer

 
LSTM <br>
https://www.pyimagesearch.com/2018/05/07/multi-label-classification-with-keras/ <br>
https://www.pyimagesearch.com/2018/06/04/keras-multiple-outputs-and-multiple-losses/ <br>
<a href="http://www.crest.fr/ckfinder/userfiles/files/Pageperso/vcottet/paper%20-%20Deep%20Learning%20predicts%20Loto.pdf">Paper</a>
<br>
<a href="
https://stackoverflow.com/questions/44202627/keras-model-from-nn-schematic">StackOverflow</a>

In [47]:
# Pipeline for numerical columns
num_pipeline = Pipeline([
                    ('std_scaler', StandardScaler())
               ])


In [48]:
# Pipeline for categorical columns

In [49]:
# Full pipeline

In [50]:
# Transform the data