<a href="https://colab.research.google.com/github/mengwangk/FortuneNet/blob/master/fortunenet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FortuneNet

## Magic

In [0]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline

## Imports

In [0]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd
import featuretools as ft
import matplotlib as mpl
import matplotlib.pyplot as plt

In [0]:
from fastai.basics import *
from fastai.tabular import *

## Setup

In [0]:
# to make this notebook's output stable across runs
np.random.seed(42)

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from IPython.display import display
pd.options.display.max_columns = 50
pd.options.display.html.table_schema = True

In [0]:
# Where to save the figures
PROJECT_ROOT_DIR = "."
ID = "fortunenet"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure below ``IMAGES_PATH``.

    Args:
        fig_id: File name stem; the figure is written to
            ``<IMAGES_PATH>/<fig_id>.<fig_extension>``.
        tight_layout: When true, ``plt.tight_layout()`` is called before saving.
        fig_extension: File extension, also passed to ``savefig`` as ``format``.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    # savefig does not create missing directories, so make sure the target
    # folder exists (on a fresh checkout / Colab runtime it does not).
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## Helper Functions

In [0]:
def add_datepart(df, fldname, drop=True, time=False):
    """Expand the date column ``fldname`` of ``df`` into calendar feature columns.

    ``df`` is modified in place and nothing is returned. New columns are named
    ``<prefix><Attr>``, where ``<prefix>`` is ``fldname`` with a trailing
    ``date``/``Date`` removed (e.g. ``DrawDate`` -> ``DrawYear``).

    Args:
        df: DataFrame holding the column; mutated in place.
        fldname: Name of the date column. If it is not already datetime-like
            it is parsed with ``pd.to_datetime`` and written back to ``df``.
        drop: When true, remove ``fldname`` from ``df`` after the features
            have been added.
        time: When true, also add ``Hour``, ``Minute`` and ``Second`` columns.
    """
    # Local import: the notebook never imports `re` itself and would otherwise
    # depend on it leaking in through fastai's star imports.
    import re

    fld = df[fldname]
    # Covers both naive and timezone-aware datetime columns.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # `infer_datetime_format` is deprecated in pandas 2.0, so it is no
        # longer passed.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # `Series.dt.week` was removed in pandas 2.0; `isocalendar().week`
            # is its replacement. Cast back to int64 when there are no missing
            # values so the column keeps a plain integer dtype.
            week = fld.dt.isocalendar().week
            df[targ_pre + n] = week if week.isna().any() else week.astype(np.int64)
        else:
            df[targ_pre + n] = getattr(fld.dt, n.lower())
    # Integer representation floor-divided by 10**9: seconds since the Unix
    # epoch for nanosecond-resolution datetimes.
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop:
        df.drop(fldname, axis=1, inplace=True)

In [0]:
def download_file(url, path, download_path):
    """Fetch ``url`` into ``download_path``, creating directory ``path`` first if needed.

    Args:
        url: Address of the file to retrieve.
        path: Directory that is created when it does not exist yet.
        download_path: Full file path the retrieved content is written to.
    """
    directory_exists = os.path.isdir(path)
    if not directory_exists:
        os.makedirs(path)

    urllib.request.urlretrieve(url, download_path)

    # Only report the location once the file is actually on disk.
    file_written = os.path.isfile(download_path)
    if file_written:
        print(f'Download path: {download_path}')

## Datasets - Run the section you want to analyse

In [0]:
import os
import zipfile
from six.moves import urllib

DOWNLOAD_ROOT = "http://www.sportstoto.com.my/"
LOTTO_PATH = os.path.join(os.getcwd(), "datasets", "lotto")

### Dataset - 4D

In [0]:
_4D_URL = DOWNLOAD_ROOT + "upload/4D.zip"
_4D_ZIP = os.path.join(LOTTO_PATH, "4D.zip")

In [14]:
# Download 4D
download_file(_4D_URL, LOTTO_PATH, _4D_ZIP)

Download path: /content/datasets/lotto/4D.zip


In [0]:
downloaded_file = _4D_ZIP

### Dataset - Others

## Read the downloaded file

In [0]:
# Read into data frame
df = pd.read_csv(downloaded_file, header=0, sep=',', quotechar='"')

In [64]:
# Tail the last 10 rows
df.tail(10).T

Unnamed: 0,4524,4525,4526,4527,4528,4529,4530,4531,4532,4533
DrawNo,493219,493319,493419,493519,493619,493719,493819,493919,494019,494119
DrawDate,20190410,20190413,20190414,20190417,20190420,20190421,20190424,20190427,20190428,20190501
1stPrizeNo,1002,9397,3196,4007,6622,3634,1615,3582,8975,4633
2ndPrizeNo,7403,1660,3443,8246,7203,5822,9328,1486,1998,4154
3rdPrizeNo,123,2187,1796,1493,4982,4752,6138,8937,8973,318
SpecialNo1,4950,9181,1957,2102,993,7700,6914,8464,3112,1271
SpecialNo2,1195,2838,7325,194,7631,1287,4874,5668,2316,7440
SpecialNo3,524,4971,6801,6853,8813,2736,6988,4914,8050,8707
SpecialNo4,6065,400,2098,9083,6902,5179,2663,2182,4631,6127
SpecialNo5,2364,9201,2517,6763,8474,4386,3020,2689,3584,2690


## Preprocessing

### Transpose the data

In [65]:
# Reshape from one row per draw (one column per prize) to one row per
# (draw, prize type), order the rows by draw and prize type, and parse the
# YYYYMMDD draw date into a real datetime column.
df_transposed = (
    df
    .melt(id_vars=["DrawNo", "DrawDate"], var_name="PrizeType", value_name="LuckyNo")
    .sort_values(["DrawNo", "DrawDate", "PrizeType"], ascending=True)
    .assign(DrawDate=lambda frame: pd.to_datetime(frame['DrawDate'], format='%Y%m%d'))
)
df_transposed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104282 entries, 0 to 54407
Data columns (total 4 columns):
DrawNo       104282 non-null int64
DrawDate     104282 non-null datetime64[ns]
PrizeType    104282 non-null object
LuckyNo      104282 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 4.0+ MB


In [66]:
# Show the first 23 rows, transposed for readability
df_transposed.head(23).T

Unnamed: 0,0,4534,9068,58942,99748,63476,68010,72544,77078,81612,86146,90680,95214,13602,54408,18136,22670,27204,31738,36272,40806,45340,49874
DrawNo,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792,40792
DrawDate,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00,1992-05-06 00:00:00
PrizeType,1stPrizeNo,2ndPrizeNo,3rdPrizeNo,ConsolationNo1,ConsolationNo10,ConsolationNo2,ConsolationNo3,ConsolationNo4,ConsolationNo5,ConsolationNo6,ConsolationNo7,ConsolationNo8,ConsolationNo9,SpecialNo1,SpecialNo10,SpecialNo2,SpecialNo3,SpecialNo4,SpecialNo5,SpecialNo6,SpecialNo7,SpecialNo8,SpecialNo9
LuckyNo,19,1124,592,5311,407,1949,1606,3775,6226,1271,7455,7227,9258,950,5301,2479,7139,3114,4609,7836,8981,4465,6114


In [67]:
# Derive calendar features (year, month, week, day, ...) from DrawDate.
# Work on a copy because add_datepart modifies the frame it is given;
# drop=False keeps the original DrawDate column alongside the new ones.
df_processed = df_transposed.copy()
add_datepart(df_processed, 'DrawDate', drop=False)
df_processed.tail(10)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,DrawYear,DrawMonth,DrawWeek,DrawDay,DrawDayofweek,DrawDayofyear,DrawIs_month_end,DrawIs_month_start,DrawIs_quarter_end,DrawIs_quarter_start,DrawIs_year_end,DrawIs_year_start,DrawElapsed
18135,494119,2019-05-01,SpecialNo1,1271,2019,5,18,1,2,121,False,True,False,False,False,False,1556668800
58941,494119,2019-05-01,SpecialNo10,2106,2019,5,18,1,2,121,False,True,False,False,False,False,1556668800
22669,494119,2019-05-01,SpecialNo2,7440,2019,5,18,1,2,121,False,True,False,False,False,False,1556668800
27203,494119,2019-05-01,SpecialNo3,8707,2019,5,18,1,2,121,False,True,False,False,False,False,1556668800
31737,494119,2019-05-01,SpecialNo4,6127,2019,5,18,1,2,121,False,True,False,False,False,False,1556668800
36271,494119,2019-05-01,SpecialNo5,2690,2019,5,18,1,2,121,False,True,False,False,False,False,1556668800
40805,494119,2019-05-01,SpecialNo6,8531,2019,5,18,1,2,121,False,True,False,False,False,False,1556668800
45339,494119,2019-05-01,SpecialNo7,9627,2019,5,18,1,2,121,False,True,False,False,False,False,1556668800
49873,494119,2019-05-01,SpecialNo8,6547,2019,5,18,1,2,121,False,True,False,False,False,False,1556668800
54407,494119,2019-05-01,SpecialNo9,2222,2019,5,18,1,2,121,False,True,False,False,False,False,1556668800


### Train and Validation Split

In [31]:
n = len(df_processed); n

104282

In [0]:
train_df = df_processed

### Field Variables

In [0]:
# Outcome to predict
dep_var = "LuckyNo"

## Experiment with a sample

In [41]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104282 entries, 0 to 104281
Data columns (total 16 columns):
DrawNo                  104282 non-null int64
PrizeType               104282 non-null object
LuckyNo                 104282 non-null int64
DrawYear                104282 non-null int64
DrawMonth               104282 non-null int64
DrawWeek                104282 non-null int64
DrawDay                 104282 non-null int64
DrawDayofweek           104282 non-null int64
DrawDayofyear           104282 non-null int64
DrawIs_month_end        104282 non-null bool
DrawIs_month_start      104282 non-null bool
DrawIs_quarter_end      104282 non-null bool
DrawIs_quarter_start    104282 non-null bool
DrawIs_year_end         104282 non-null bool
DrawIs_year_start       104282 non-null bool
DrawElapsed             104282 non-null int64
dtypes: bool(6), int64(9), object(1)
memory usage: 8.6+ MB


In [40]:
# Randomly pick (up to) 6000 row positions: shuffle 0..n-1 and keep the first 6000.
idx = np.random.permutation(range(n))[:6000]
print(n)
print(len(idx))
print(idx)

# Sort the sampled positions in place, ascending.
# NOTE(review): because the sort happens before the split below, the training
# sample receives the 3000 lowest positions and the test sample the 3000
# highest, i.e. the split follows row order rather than being random.
# Confirm this is intended.
idx.sort()

small_train_df = train_df.iloc[idx[:3000]]
small_test_df = train_df.iloc[idx[3000:]]


104282
6000
[35393 56792 90165 45932 ... 29432 59465 57785  8622]


In [0]:
# Set continuous and categorical columns
small_cont_vars = ["DrawNo", "DrawElapsed"]
small_cat_vars = ["PrizeType", "DrawYear", "DrawMonth", "DrawWeek", "DrawDay", "DrawDayofweek", 
                  "DrawDayofyear","DrawIs_month_end","DrawIs_month_start", "DrawIs_quarter_end", "DrawIs_quarter_start", "DrawIs_year_end", "DrawIs_year_start"]

In [0]:
# Keep only the model columns, ordered: categorical, continuous, then the target.
# .copy() turns each selection into an independent frame; the transforms applied
# to these frames later modify them in place, which would otherwise trigger
# pandas' SettingWithCopyWarning.
small_train_df = small_train_df[small_cat_vars + small_cont_vars + [dep_var]].copy()
small_test_df = small_test_df[small_cat_vars + small_cont_vars + [dep_var]].copy()

In [48]:
small_train_df.head(5)

Unnamed: 0,PrizeType,DrawYear,DrawMonth,DrawWeek,DrawDay,DrawDayofweek,DrawDayofyear,DrawIs_month_end,DrawIs_month_start,DrawIs_quarter_end,DrawIs_quarter_start,DrawIs_year_end,DrawIs_year_start,DrawNo,DrawElapsed,LuckyNo
11,1stPrizeNo,1992,5,22,31,6,152,True,False,False,False,False,False,41892,707270400,3876
18,1stPrizeNo,1992,6,25,17,2,169,False,False,False,False,False,False,42592,708739200,6440
31,1stPrizeNo,1992,7,29,16,3,198,False,False,False,False,False,False,43892,711244800,8467
43,1stPrizeNo,1992,8,33,13,3,226,False,False,False,False,False,False,45092,713664000,5592
72,1stPrizeNo,1992,10,43,21,2,295,False,False,False,False,False,False,47992,719625600,7942


In [49]:
small_test_df.head(5)

Unnamed: 0,PrizeType,DrawYear,DrawMonth,DrawWeek,DrawDay,DrawDayofweek,DrawDayofyear,DrawIs_month_end,DrawIs_month_start,DrawIs_quarter_end,DrawIs_quarter_start,DrawIs_year_end,DrawIs_year_start,DrawNo,DrawElapsed,LuckyNo
50594,SpecialNo9,1996,12,49,8,6,343,False,False,False,False,False,False,112796,850003200,4406
50617,SpecialNo9,1997,2,5,1,5,32,False,True,False,False,False,False,115097,854755200,6167
50628,SpecialNo9,1997,2,9,26,2,57,False,False,False,False,False,False,116197,856915200,298
50638,SpecialNo9,1997,3,12,22,5,81,False,False,False,False,False,False,117197,858988800,8123
50669,SpecialNo9,1997,6,22,1,6,152,False,True,False,False,False,False,120297,865123200,7594


In [0]:
# Convert the categorical columns to pandas `category` dtype, in place.
# The training sample is processed first and the test sample is then passed
# with test=True, presumably so it reuses the categories learned from the
# training sample; confirm against the fastai Categorify documentation.
categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)

In [52]:
small_train_df.PrizeType.cat.categories

Index(['1stPrizeNo', '2ndPrizeNo', '3rdPrizeNo', 'SpecialNo1', 'SpecialNo2',
       'SpecialNo3', 'SpecialNo4', 'SpecialNo5', 'SpecialNo6', 'SpecialNo7',
       'SpecialNo8', 'SpecialNo9'],
      dtype='object')

In [54]:
small_train_df.PrizeType.cat.codes[:10]

11     0
18     0
31     0
43     0
72     0
73     0
87     0
107    0
122    0
169    0
dtype: int8

In [0]:
# Apply fastai's FillMissing transform to both samples, training sample first.
# NOTE(review): test=True presumably makes the test sample reuse the fill
# values determined on the training sample; confirm against the fastai docs.
fill_missing = FillMissing(small_cat_vars, small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)

In [57]:
doc(FillMissing)

In [58]:
small_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 11 to 50566
Data columns (total 16 columns):
PrizeType               3000 non-null category
DrawYear                3000 non-null category
DrawMonth               3000 non-null category
DrawWeek                3000 non-null category
DrawDay                 3000 non-null category
DrawDayofweek           3000 non-null category
DrawDayofyear           3000 non-null category
DrawIs_month_end        3000 non-null category
DrawIs_month_start      3000 non-null category
DrawIs_quarter_end      3000 non-null category
DrawIs_quarter_start    3000 non-null category
DrawIs_year_end         3000 non-null category
DrawIs_year_start       3000 non-null category
DrawNo                  3000 non-null int64
DrawElapsed             3000 non-null int64
LuckyNo                 3000 non-null int64
dtypes: category(13), int64(3)
memory usage: 155.1 KB


## Preparing full data set

In [0]:
train_df = df_processed
