In [50]:
from pathlib import Path
import pandas as pd
import numpy as np
import scipy as sp

import warnings
# Silence warnings whose message starts with "internal gelsd".
warnings.filterwarnings("ignore", message="^internal gelsd")

# Location of the processed lotto draw history, relative to the working directory.
DATASET = Path("datasets/lotto/data_processed.csv")

# Fix NumPy's global random state so stochastic steps are repeatable
# (42: the Answer to the Ultimate Question of Life, the Universe, and Everything).
np.random.seed(seed=42)

In [3]:
# Load the processed CSV: first row is the header, DrawDate is parsed as a
# datetime and PrizeType is kept as a string column.
dataset = pd.read_csv(
    DATASET,
    header=0,
    sep=",",
    quotechar='"',
    parse_dates=["DrawDate"],
    dtype={"PrizeType": str},
)

In [56]:
# Preview the first ten rows of the loaded frame.
dataset.head(n=10)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,DaysSinceLastExactMatch,DaysUntilNextExactMatch,DaysSinceLastAnyMatch,DaysUntilNextAnyMatch,1st_digit,2nd_digit,...,pos_4_0_freq,pos_4_1_freq,pos_4_2_freq,pos_4_3_freq,pos_4_4_freq,pos_4_5_freq,pos_4_6_freq,pos_4_7_freq,pos_4_8_freq,pos_4_9_freq
0,40792,1992-05-06,1stPrizeNo,19,0,28,0,28,0,0,...,5,8,2,2,5,5,5,6,2,6
1,40792,1992-05-06,2ndPrizeNo,1124,0,2436,0,162,1,1,...,5,8,2,2,5,5,5,6,2,6
2,40792,1992-05-06,3rdPrizeNo,592,0,25,0,11,0,5,...,5,8,2,2,5,5,5,6,2,6
3,40792,1992-05-06,ConsolationNo1,5311,0,515,0,204,5,3,...,5,8,2,2,5,5,5,6,2,6
4,40792,1992-05-06,ConsolationNo10,407,0,2114,0,22,0,4,...,5,8,2,2,5,5,5,6,2,6
5,40792,1992-05-06,ConsolationNo2,1949,0,95,0,95,1,9,...,5,8,2,2,5,5,5,6,2,6
6,40792,1992-05-06,ConsolationNo3,1606,0,351,0,35,1,6,...,5,8,2,2,5,5,5,6,2,6
7,40792,1992-05-06,ConsolationNo4,3775,0,494,0,126,3,7,...,5,8,2,2,5,5,5,6,2,6
8,40792,1992-05-06,ConsolationNo5,6226,0,494,0,232,6,2,...,5,8,2,2,5,5,5,6,2,6
9,40792,1992-05-06,ConsolationNo6,1271,0,410,0,119,1,2,...,5,8,2,2,5,5,5,6,2,6


In [18]:
# Work on a deep copy so the loaded `dataset` frame is left untouched.
df = dataset.copy(deep=True)
# Number of columns in the working frame.
df.shape[1]

115

In [19]:
# Print every column name quoted and comma-separated, five names per row, so
# the output can be pasted straight into a Python list literal.
# Counting from 1 keeps the first row full: with a 0-based index the test
# `ind % 5 == 0` fired on the very first name and left it alone on its row.
for position, column in enumerate(df.columns, start=1):
    print('"{}"'.format(column), end=', ')
    if position % 5 == 0:
        print()

"DrawNo", 
"DrawDate", "PrizeType", "LuckyNo", "DaysSinceLastExactMatch", "DaysUntilNextExactMatch", 
"DaysSinceLastAnyMatch", "DaysUntilNextAnyMatch", "1st_digit", "2nd_digit", "3rd_digit", 
"4th_digit", "DrawYear", "DrawMonth", "DrawWeek", "DrawDay", 
"DrawDayofweek", "DrawDayofyear", "CumProbaExactMatch", "CumProbaAnyMatch", "0_1", 
"0_2", "0_3", "0_4", "0_5", "0_6", 
"0_7", "0_8", "0_9", "1_2", "1_3", 
"1_4", "1_5", "1_6", "1_7", "1_8", 
"1_9", "2_3", "2_4", "2_5", "2_6", 
"2_7", "2_8", "2_9", "3_4", "3_5", 
"3_6", "3_7", "3_8", "3_9", "4_5", 
"4_6", "4_7", "4_8", "4_9", "5_6", 
"5_7", "5_8", "5_9", "6_7", "6_8", 
"6_9", "7_8", "7_9", "8_9", "0_0", 
"1_1", "2_2", "3_3", "4_4", "5_5", 
"6_6", "7_7", "8_8", "9_9", "pos_1_0_freq", 
"pos_1_1_freq", "pos_1_2_freq", "pos_1_3_freq", "pos_1_4_freq", "pos_1_5_freq", 
"pos_1_6_freq", "pos_1_7_freq", "pos_1_8_freq", "pos_1_9_freq", "pos_2_0_freq", 
"pos_2_1_freq", "pos_2_2_freq", "pos_2_3_freq", "pos_2_4_freq", "pos_2_5_freq", 
"pos_2_6_freq"

In [17]:
# Column groups used by the preprocessing steps.
#
# numeric_features is assembled from:
#   * the draw number, day-gap, calendar and cumulative-probability columns,
#   * the 45 "<a>_<b>" columns for digit pairs with a < b,
#   * the 10 "<d>_<d>" columns for repeated digits,
#   * the 40 "pos_<p>_<d>_freq" columns (positions 1-4, digits 0-9).
# The generated names appear in the same order as a hand-written listing.
numeric_features = (
    [
        "DrawNo",
        "DaysSinceLastExactMatch", "DaysUntilNextExactMatch",
        "DaysSinceLastAnyMatch", "DaysUntilNextAnyMatch",
        "DrawYear", "DrawMonth", "DrawWeek", "DrawDay", "DrawDayofweek", "DrawDayofyear",
        "CumProbaExactMatch", "CumProbaAnyMatch",
    ]
    + ["{}_{}".format(low, high) for low in range(10) for high in range(low + 1, 10)]
    + ["{0}_{0}".format(digit) for digit in range(10)]
    + ["pos_{}_{}_freq".format(position, digit) for position in range(1, 5) for digit in range(10)]
)
categorical_features = ["PrizeType"]
datetime_features = ["DrawDate"]
target_feature = ["LuckyNo"]
# Sanity check: number of numeric feature columns.
len(numeric_features)

108

### Scaling

In [42]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric feature columns; `scaler` keeps the fitted
# parameters so the transformation can be reversed.
scaler = StandardScaler()
nf_scaled = scaler.fit_transform(df[numeric_features])
# Round-trip check: inverting the scaled matrix should print the original
# values. This must use `nf_scaled`, the array produced above; the previous
# name `nf_transformed` is not defined anywhere in the notebook and only
# worked when it happened to be left over in the kernel.
print(scaler.inverse_transform(nf_scaled))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


[[4.07920e+04 0.00000e+00 2.80000e+01 ... 6.00000e+00 2.00000e+00
  6.00000e+00]
 [4.07920e+04 0.00000e+00 2.43600e+03 ... 6.00000e+00 2.00000e+00
  6.00000e+00]
 [4.07920e+04 0.00000e+00 2.50000e+01 ... 6.00000e+00 2.00000e+00
  6.00000e+00]
 ...
 [4.95819e+05 1.20000e+02 0.00000e+00 ... 1.03720e+04 1.04320e+04
  1.02760e+04]
 [4.95819e+05 5.92000e+02 0.00000e+00 ... 1.03720e+04 1.04320e+04
  1.02760e+04]
 [4.95819e+05 2.06600e+03 0.00000e+00 ... 1.03720e+04 1.04320e+04
  1.02760e+04]]


In [40]:
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

# One-hot encode the categorical columns; the result is a sparse matrix
# (it is densified below with .toarray() for display only).
onehot_encoder = OneHotEncoder()
cf_scaled = onehot_encoder.fit_transform(df[categorical_features])
# Show the learned categories, the generated column names and the encoded matrix.
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); prefer the new method when it exists and fall back
# to the old one so the cell runs on both old and new versions.
(
    onehot_encoder.categories_,
    onehot_encoder.get_feature_names_out()
    if hasattr(onehot_encoder, "get_feature_names_out")
    else onehot_encoder.get_feature_names(),
    cf_scaled.toarray(),
)

([array(['1stPrizeNo', '2ndPrizeNo', '3rdPrizeNo', 'ConsolationNo1',
         'ConsolationNo10', 'ConsolationNo2', 'ConsolationNo3',
         'ConsolationNo4', 'ConsolationNo5', 'ConsolationNo6',
         'ConsolationNo7', 'ConsolationNo8', 'ConsolationNo9', 'SpecialNo1',
         'SpecialNo10', 'SpecialNo2', 'SpecialNo3', 'SpecialNo4',
         'SpecialNo5', 'SpecialNo6', 'SpecialNo7', 'SpecialNo8',
         'SpecialNo9'], dtype=object)],
 array(['x0_1stPrizeNo', 'x0_2ndPrizeNo', 'x0_3rdPrizeNo',
        'x0_ConsolationNo1', 'x0_ConsolationNo10', 'x0_ConsolationNo2',
        'x0_ConsolationNo3', 'x0_ConsolationNo4', 'x0_ConsolationNo5',
        'x0_ConsolationNo6', 'x0_ConsolationNo7', 'x0_ConsolationNo8',
        'x0_ConsolationNo9', 'x0_SpecialNo1', 'x0_SpecialNo10',
        'x0_SpecialNo2', 'x0_SpecialNo3', 'x0_SpecialNo4', 'x0_SpecialNo5',
        'x0_SpecialNo6', 'x0_SpecialNo7', 'x0_SpecialNo8', 'x0_SpecialNo9'],
       dtype=object),
 array([[1., 0., 0., ..., 0., 0., 0.],
     

In [45]:
# Shapes of the scaled numeric block and the one-hot encoded categorical block.
tuple(matrix.shape for matrix in (nf_scaled, cf_scaled))

((104673, 108), (104673, 23))

In [51]:
# Import the sparse subpackage explicitly: a bare `import scipy as sp` does not
# guarantee that `sp.sparse` is loaded, so relying on it only works when some
# other library has already imported scipy.sparse.
from scipy import sparse

# Join the 2 arrays column-wise (scaled numeric features, then the one-hot columns).
features = sparse.hstack((nf_scaled, cf_scaled))
features.shape

(104673, 131)

In [55]:
from sklearn.preprocessing import StandardScaler

# Scale the target with its own scaler. Calling fit_transform on the shared
# `scaler` would overwrite the parameters it learned from the numeric
# features, so the feature matrix could no longer be inverse-transformed.
# Use `label_scaler` to map scaled labels/predictions back to lucky numbers.
label_scaler = StandardScaler()
labels = label_scaler.fit_transform(df[target_feature])
# Show the scaled labels next to their round-tripped original values.
labels, label_scaler.inverse_transform(labels)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


(array([[-1.71731986],
        [-1.33434554],
        [-1.51872774],
        ...,
        [ 0.0263397 ],
        [ 1.42064349],
        [-0.69247364]]), array([[  19.],
        [1124.],
        [ 592.],
        ...,
        [5050.],
        [9073.],
        [2976.]]))

In [14]:
# Pairwise correlations between the numeric columns only. The frame also holds
# a string column (PrizeType) and a datetime column (DrawDate); recent pandas
# versions raise when corr() meets non-numeric data instead of silently
# skipping it, so the numeric/bool columns are selected explicitly.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
# Rank every column by its correlation with the target, strongest first.
print(corr_matrix['LuckyNo'].sort_values(ascending=False))

LuckyNo                    1.000000
1st_digit                  0.994971
2nd_digit                  0.098613
3rd_digit                  0.010338
DaysUntilNextExactMatch    0.005985
DaysSinceLastExactMatch    0.005985
DrawDayofweek              0.004380
DaysUntilNextAnyMatch      0.003307
DaysSinceLastAnyMatch      0.002328
4th_digit                  0.001711
DrawDay                    0.000759
DrawWeek                  -0.002762
DrawDayofyear             -0.002896
DrawMonth                 -0.002985
DrawYear                  -0.003071
pos_2_8_freq              -0.003161
8_9                       -0.003164
pos_3_8_freq              -0.003164
pos_4_8_freq              -0.003165
pos_1_8_freq              -0.003165
0_8                       -0.003177
pos_2_9_freq              -0.003180
pos_4_9_freq              -0.003181
1_8                       -0.003181
pos_1_9_freq              -0.003184
pos_3_9_freq              -0.003185
3_6                       -0.003188
7_8                       -0

"DrawNo", 
"DrawDate", "PrizeType", "LuckyNo", "DaysSinceLastExactMatch", "DaysUntilNextExactMatch", "DaysSinceLastAnyMatch", "DaysUntilNextAnyMatch", "1st_digit", 
"2nd_digit", "3rd_digit", "4th_digit", "DrawYear", "DrawMonth", "DrawWeek", "DrawDay", "DrawDayofweek", 
"DrawDayofyear", "CumProbaExactMatch", "CumProbaAnyMatch", "0_1", "0_2", "0_3", "0_4", "0_5", 
"0_6", "0_7", "0_8", "0_9", "1_2", "1_3", "1_4", "1_5", 
"1_6", "1_7", "1_8", "1_9", "2_3", "2_4", "2_5", "2_6", 
"2_7", "2_8", "2_9", "3_4", "3_5", "3_6", "3_7", "3_8", 
"3_9", "4_5", "4_6", "4_7", "4_8", "4_9", "5_6", "5_7", 
"5_8", "5_9", "6_7", "6_8", "6_9", "7_8", "7_9", "8_9", 
"0_0", "1_1", "2_2", "3_3", "4_4", "5_5", "6_6", "7_7", 
"8_8", "9_9", "pos_1_0_freq", "pos_1_1_freq", "pos_1_2_freq", "pos_1_3_freq", "pos_1_4_freq", "pos_1_5_freq", 
"pos_1_6_freq", "pos_1_7_freq", "pos_1_8_freq", "pos_1_9_freq", "pos_2_0_freq", "pos_2_1_freq", "pos_2_2_freq", "pos_2_3_freq", 
"pos_2_4_freq", "pos_2_5_freq", "pos_2_6_freq", "pos_

In [6]:
# Columns removed from the working frame:
#   * two of the day-gap columns and the four per-digit columns,
#   * the 45 "<a>_<b>" digit-pair columns (a < b) and the 10 "<d>_<d>" columns,
#   * the 40 "pos_<p>_<d>_freq" columns (positions 1-4, digits 0-9).
# The names are generated in the same order as a hand-written listing.
columns_to_drop = (
    [
        "DaysSinceLastExactMatch", "DaysUntilNextAnyMatch",
        "1st_digit", "2nd_digit", "3rd_digit", "4th_digit",
    ]
    + ["{}_{}".format(low, high) for low in range(10) for high in range(low + 1, 10)]
    + ["{0}_{0}".format(digit) for digit in range(10)]
    + ["pos_{}_{}_freq".format(position, digit) for position in range(1, 5) for digit in range(10)]
)
# Drop in place; errors='ignore' skips names that are not present, so the cell
# can be re-run after the columns are already gone.
df.drop(labels=columns_to_drop, axis="columns", errors='ignore', inplace=True)
# Remaining columns.
df.columns

Index(['DrawNo', 'DrawDate', 'PrizeType', 'LuckyNo', 'DaysUntilNextExactMatch',
       'DaysSinceLastAnyMatch', 'DrawYear', 'DrawMonth', 'DrawWeek', 'DrawDay',
       'DrawDayofweek', 'DrawDayofyear', 'CumProbaExactMatch',
       'CumProbaAnyMatch'],
      dtype='object')

In [7]:
# Summary statistics for the columns that remain in the working frame.
df.describe()

Unnamed: 0,DrawNo,LuckyNo,DaysUntilNextExactMatch,DaysSinceLastAnyMatch,DrawYear,DrawMonth,DrawWeek,DrawDay,DrawDayofweek,DrawDayofyear,CumProbaExactMatch,CumProbaAnyMatch
count,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0
mean,268299.492419,4974.001777,764.409447,66.179053,2005.754779,6.501648,26.452428,15.628214,4.054054,182.380356,0.062382,0.930387
std,131388.886981,2885.324727,832.538166,93.325465,7.809743,3.462904,15.123744,8.942459,1.843042,105.868193,0.038001,0.675621
min,40792.0,0.0,0.0,0.0,1992.0,1.0,1.0,1.0,1.0,1.0,0.01,0.01
25%,154499.0,2463.0,157.0,15.0,1999.0,3.0,13.0,8.0,2.0,90.0,0.03,0.38
50%,268306.0,4952.0,504.0,39.0,2006.0,7.0,26.0,16.0,5.0,182.0,0.06,0.8
75%,382113.0,7463.0,1095.0,83.0,2013.0,10.0,40.0,24.0,6.0,274.0,0.09,1.31
max,495819.0,9999.0,8087.0,4113.0,2019.0,12.0,53.0,31.0,6.0,366.0,0.27,2.93
