In [50]:
from pathlib import Path
import pandas as pd
import numpy as np
import scipy as sp

import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# CSV file read by the loading cell below (path is relative to the working directory).
DATASET = Path("datasets/lotto/data_processed.csv")

# Fix the seed of NumPy's global random generator so random draws repeat across runs.
np.random.seed(seed=42)

In [3]:
# Load the CSV into a data frame: the first row is the header, DrawDate is parsed
# as a date and PrizeType is read as a string.
dataset = pd.read_csv(
    DATASET,
    header=0,
    sep=',',
    quotechar='"',
    parse_dates=['DrawDate'],
    dtype={'PrizeType': str},
)

In [56]:
# Preview the first ten rows of the loaded frame.
dataset.head(n=10)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,DaysSinceLastExactMatch,DaysUntilNextExactMatch,DaysSinceLastAnyMatch,DaysUntilNextAnyMatch,1st_digit,2nd_digit,...,pos_4_0_freq,pos_4_1_freq,pos_4_2_freq,pos_4_3_freq,pos_4_4_freq,pos_4_5_freq,pos_4_6_freq,pos_4_7_freq,pos_4_8_freq,pos_4_9_freq
0,40792,1992-05-06,1stPrizeNo,19,0,28,0,28,0,0,...,5,8,2,2,5,5,5,6,2,6
1,40792,1992-05-06,2ndPrizeNo,1124,0,2436,0,162,1,1,...,5,8,2,2,5,5,5,6,2,6
2,40792,1992-05-06,3rdPrizeNo,592,0,25,0,11,0,5,...,5,8,2,2,5,5,5,6,2,6
3,40792,1992-05-06,ConsolationNo1,5311,0,515,0,204,5,3,...,5,8,2,2,5,5,5,6,2,6
4,40792,1992-05-06,ConsolationNo10,407,0,2114,0,22,0,4,...,5,8,2,2,5,5,5,6,2,6
5,40792,1992-05-06,ConsolationNo2,1949,0,95,0,95,1,9,...,5,8,2,2,5,5,5,6,2,6
6,40792,1992-05-06,ConsolationNo3,1606,0,351,0,35,1,6,...,5,8,2,2,5,5,5,6,2,6
7,40792,1992-05-06,ConsolationNo4,3775,0,494,0,126,3,7,...,5,8,2,2,5,5,5,6,2,6
8,40792,1992-05-06,ConsolationNo5,6226,0,494,0,232,6,2,...,5,8,2,2,5,5,5,6,2,6
9,40792,1992-05-06,ConsolationNo6,1271,0,410,0,119,1,2,...,5,8,2,2,5,5,5,6,2,6


In [18]:
# Work on a copy so the frame loaded from disk is left untouched.
df = dataset.copy()
# Number of columns in the working frame.
df.shape[1]

115

In [19]:
# Print every column name quoted and comma-separated, five names per row.
for ind, column in enumerate(df.columns):
    print('"{}"'.format(column), end=', ')
    # Break after the 5th, 10th, ... name. Testing `ind % 5 == 0` instead is true
    # for index 0 and therefore ends the first row after a single name.
    if (ind + 1) % 5 == 0:
        print()

"DrawNo", 
"DrawDate", "PrizeType", "LuckyNo", "DaysSinceLastExactMatch", "DaysUntilNextExactMatch", 
"DaysSinceLastAnyMatch", "DaysUntilNextAnyMatch", "1st_digit", "2nd_digit", "3rd_digit", 
"4th_digit", "DrawYear", "DrawMonth", "DrawWeek", "DrawDay", 
"DrawDayofweek", "DrawDayofyear", "CumProbaExactMatch", "CumProbaAnyMatch", "0_1", 
"0_2", "0_3", "0_4", "0_5", "0_6", 
"0_7", "0_8", "0_9", "1_2", "1_3", 
"1_4", "1_5", "1_6", "1_7", "1_8", 
"1_9", "2_3", "2_4", "2_5", "2_6", 
"2_7", "2_8", "2_9", "3_4", "3_5", 
"3_6", "3_7", "3_8", "3_9", "4_5", 
"4_6", "4_7", "4_8", "4_9", "5_6", 
"5_7", "5_8", "5_9", "6_7", "6_8", 
"6_9", "7_8", "7_9", "8_9", "0_0", 
"1_1", "2_2", "3_3", "4_4", "5_5", 
"6_6", "7_7", "8_8", "9_9", "pos_1_0_freq", 
"pos_1_1_freq", "pos_1_2_freq", "pos_1_3_freq", "pos_1_4_freq", "pos_1_5_freq", 
"pos_1_6_freq", "pos_1_7_freq", "pos_1_8_freq", "pos_1_9_freq", "pos_2_0_freq", 
"pos_2_1_freq", "pos_2_2_freq", "pos_2_3_freq", "pos_2_4_freq", "pos_2_5_freq", 
"pos_2_6_freq"

In [156]:
# Columns passed to the numeric scaler, in this order. The repetitive digit-pair and
# positional-frequency names are generated instead of being typed out one by one.
numeric_features = (
    [
        "DrawNo",
        "DaysSinceLastExactMatch", "DaysUntilNextExactMatch",
        "DaysSinceLastAnyMatch", "DaysUntilNextAnyMatch",
        "DrawYear", "DrawMonth", "DrawWeek", "DrawDay", "DrawDayofweek", "DrawDayofyear",
        "CumProbaExactMatch", "CumProbaAnyMatch",
    ]
    # "0_1", "0_2", ..., "8_9": every pair a_b with a < b.
    + ["{}_{}".format(lo, hi) for lo in range(10) for hi in range(lo + 1, 10)]
    # "0_0", "1_1", ..., "9_9".
    + ["{0}_{0}".format(digit) for digit in range(10)]
    # "pos_1_0_freq", ..., "pos_4_9_freq": positions 1-4 crossed with digits 0-9.
    + ["pos_{}_{}_freq".format(pos, digit) for pos in range(1, 5) for digit in range(10)]
)
categorical_features = ["PrizeType"]
datetime_features = ["DrawDate"]
target_feature = ["LuckyNo"]
len(numeric_features)

108

### Scaling

In [157]:
from sklearn.preprocessing import StandardScaler
# Fit a standard scaler on the numeric columns, then transform those same columns.
scaler = StandardScaler().fit(df[numeric_features])
nf_scaled = scaler.transform(df[numeric_features])
# Show the per-column means learned by the fit, followed by the values recovered
# by undoing the scaling.
print(scaler.mean_, scaler.inverse_transform(nf_scaled), sep='\n')

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


[2.68299492e+05 7.64409447e+02 7.64409447e+02 6.61790529e+01
 6.61790529e+01 2.00575478e+03 6.50164799e+00 2.64524280e+01
 1.56282136e+01 4.05405405e+00 1.82380356e+02 6.23821807e-02
 9.30387206e-01 5.47741134e+03 5.42231773e+03 5.55793957e+03
 5.45725005e+03 5.41884179e+03 5.39399253e+03 5.45006834e+03
 5.40091365e+03 5.48733092e+03 5.59624874e+03 5.59605845e+03
 5.66231993e+03 5.50389519e+03 5.52639903e+03 5.57656537e+03
 5.56644847e+03 5.53951791e+03 5.56605823e+03 5.50430521e+03
 5.41334476e+03 5.44459635e+03 5.38095957e+03 5.42613052e+03
 5.39721402e+03 5.51594419e+03 5.50239354e+03 5.51964821e+03
 5.52140628e+03 5.57041903e+03 5.55934102e+03 5.39769875e+03
 5.39214217e+03 5.38072270e+03 5.50358471e+03 5.51762360e+03
 5.36897956e+03 5.34769128e+03 5.42560338e+03 5.47009383e+03
 5.51257108e+03 5.45700198e+03 5.38235377e+03 5.54290200e+03
 5.28726368e+03 5.49614568e+03 2.71367765e+03 2.79092156e+03
 2.75304614e+03 2.85032806e+03 2.73804461e+03 2.66042936e+03
 2.66422610e+03 2.750623

In [158]:
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

# One-hot encode the categorical column(s); the result is a sparse matrix.
onehot_encoder = OneHotEncoder()
cf_scaled = onehot_encoder.fit_transform(df[categorical_features])
# Display the learned categories, the matrix shape, the row count and a dense view.
# The row count is read from `.shape` so the dense array is materialised only once
# (it used to be built a second time just to take its length).
onehot_encoder.categories_, cf_scaled.shape, cf_scaled.shape[0], cf_scaled.toarray()

([array(['1stPrizeNo', '2ndPrizeNo', '3rdPrizeNo', 'ConsolationNo1',
         'ConsolationNo10', 'ConsolationNo2', 'ConsolationNo3',
         'ConsolationNo4', 'ConsolationNo5', 'ConsolationNo6',
         'ConsolationNo7', 'ConsolationNo8', 'ConsolationNo9', 'SpecialNo1',
         'SpecialNo10', 'SpecialNo2', 'SpecialNo3', 'SpecialNo4',
         'SpecialNo5', 'SpecialNo6', 'SpecialNo7', 'SpecialNo8',
         'SpecialNo9'], dtype=object)],
 (104673, 23),
 104673,
 array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]]))

In [159]:
# Shapes of the two feature blocks: scaled numeric columns and one-hot encoded columns.
(nf_scaled.shape, cf_scaled.shape)

((104673, 108), (104673, 23))

In [160]:
# Join the scaled numeric block and the one-hot block column-wise.
# `import scipy as sp` on its own does not guarantee that the `sparse` submodule is
# loaded, so import it explicitly rather than relying on another package having done so.
import scipy.sparse
features = sp.sparse.hstack((nf_scaled, cf_scaled))
features.shape

(104673, 131)

In [161]:
# Scale the target with its own scaler. Calling `scaler.fit_transform` here would refit
# the scaler that was fitted on the numeric features and throw away its statistics, so
# `scaler` could no longer transform or inverse-transform the feature columns.
label_scaler = StandardScaler()
labels = label_scaler.fit_transform(df[target_feature])
# Show the scaled target next to the values recovered by undoing the scaling.
labels, label_scaler.inverse_transform(labels)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


(array([[-1.71731986],
        [-1.33434554],
        [-1.51872774],
        ...,
        [ 0.0263397 ],
        [ 1.42064349],
        [-0.69247364]]), array([[  19.],
        [1124.],
        [ 592.],
        ...,
        [5050.],
        [9073.],
        [2976.]]))

### Correlation

In [162]:
# Row/column counts of the feature matrix and of the target array.
(features.shape, labels.shape)

((104673, 131), (104673, 1))

In [163]:
# Element types of the feature matrix and of the target array.
(features.dtype, labels.dtype)

(dtype('float64'), dtype('float64'))

In [164]:
# For each scaled numeric column, print its name and its correlation coefficient
# with the scaled target (off-diagonal entry of the 2x2 np.corrcoef matrix).
for ind in range(nf_scaled.shape[1]):
    pearson_r = np.corrcoef(nf_scaled[:, ind], labels[:, 0])[0, 1]
    print(numeric_features[ind], '\t\t', pearson_r)

DrawNo 		 -0.0032182204064142117
DaysSinceLastExactMatch 		 0.005984842985229688
DaysUntilNextExactMatch 		 0.005984842985229688
DaysSinceLastAnyMatch 		 0.0023282328869966736
DaysUntilNextAnyMatch 		 0.0033065825999482705
DrawYear 		 -0.003070982129500968
DrawMonth 		 -0.0029849057887388255
DrawWeek 		 -0.002762140942847658
DrawDay 		 0.000759331149886402
DrawDayofweek 		 0.004379659385640327
DrawDayofyear 		 -0.002895897774558666
CumProbaExactMatch 		 -0.009410044867088286
CumProbaAnyMatch 		 -0.003994047653223088
0_1 		 -0.0032493732274394465
0_2 		 -0.0032389233437030685
0_3 		 -0.003238820261335415
0_4 		 -0.0032097768297360283
0_5 		 -0.0032663500389717415
0_6 		 -0.0032136848733541486
0_7 		 -0.0032363612261468986
0_8 		 -0.003176719789956222
0_9 		 -0.003209382184239365
1_2 		 -0.003191717565033962
1_3 		 -0.0032229673411500043
1_4 		 -0.0032336841605827395
1_5 		 -0.0032086447064334396
1_6 		 -0.0032130236276318182
1_7 		 -0.003213883597526321
1_8 		 -0.0031812922504174113
1_9

#### Drop features

In [165]:
# Reduced feature lists. Compared with the earlier numeric list this one leaves out
# DrawDay, the digit-pair columns ("0_1" ... "9_9") and the "pos_*_freq" columns.
numeric_features = [
    "DrawNo",
    "DaysSinceLastExactMatch",
    "DaysUntilNextExactMatch",
    "DaysSinceLastAnyMatch",
    "DaysUntilNextAnyMatch",
    "DrawYear",
    "DrawMonth",
    "DrawWeek",
    "DrawDayofweek",
    "DrawDayofyear",
    "CumProbaExactMatch",
    "CumProbaAnyMatch",
]
categorical_features = ["PrizeType"]
datetime_features = ["DrawDate"]
target_feature = ["LuckyNo"]

In [168]:
# Refit the scaler on the current numeric feature list, then print each column's
# name and its correlation coefficient with the scaled target.
nf_scaled = scaler.fit_transform(df[numeric_features])
for ind in range(nf_scaled.shape[1]):
    pearson_r = np.corrcoef(nf_scaled[:, ind], labels[:, 0])[0, 1]
    print(numeric_features[ind], '\t\t', pearson_r)

DrawNo 		 -0.0032182204064142117
DaysSinceLastExactMatch 		 0.005984842985229688
DaysUntilNextExactMatch 		 0.005984842985229688
DaysSinceLastAnyMatch 		 0.0023282328869966736
DaysUntilNextAnyMatch 		 0.0033065825999482705
DrawYear 		 -0.003070982129500968
DrawMonth 		 -0.0029849057887388255
DrawWeek 		 -0.002762140942847658
DrawDayofweek 		 0.004379659385640327
DrawDayofyear 		 -0.002895897774558666
CumProbaExactMatch 		 -0.009410044867088286
CumProbaAnyMatch 		 -0.003994047653223088


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [171]:
# Correlation of the numeric features with the target, largest value first.
# The string-valued categorical column(s) are kept out of the selection: older pandas
# silently skipped them inside `.corr()`, whereas pandas >= 2.0 raises on non-numeric
# data. Selecting only numeric and target columns gives the same matrix on both.
corr_matrix = df[numeric_features + target_feature].corr()
print(corr_matrix['LuckyNo'].sort_values(ascending=False))

LuckyNo                    1.000000
DaysUntilNextExactMatch    0.005985
DaysSinceLastExactMatch    0.005985
DrawDayofweek              0.004380
DaysUntilNextAnyMatch      0.003307
DaysSinceLastAnyMatch      0.002328
DrawWeek                  -0.002762
DrawDayofyear             -0.002896
DrawMonth                 -0.002985
DrawYear                  -0.003071
DrawNo                    -0.003218
CumProbaAnyMatch          -0.003994
CumProbaExactMatch        -0.009410
Name: LuckyNo, dtype: float64


In [6]:
# Names of the columns to remove from the working frame. The repetitive digit-pair and
# positional-frequency names are generated instead of being typed out one by one.
columns_to_drop = (
    ["DaysSinceLastExactMatch", "DaysUntilNextAnyMatch"]
    + ["1st_digit", "2nd_digit", "3rd_digit", "4th_digit"]
    # "0_1", "0_2", ..., "8_9": every pair a_b with a < b.
    + ["{}_{}".format(lo, hi) for lo in range(10) for hi in range(lo + 1, 10)]
    # "0_0", "1_1", ..., "9_9".
    + ["{0}_{0}".format(digit) for digit in range(10)]
    # "pos_1_0_freq", ..., "pos_4_9_freq": positions 1-4 crossed with digits 0-9.
    + ["pos_{}_{}_freq".format(pos, digit) for pos in range(1, 5) for digit in range(10)]
)
# Remove the listed columns from the working frame in place. errors='ignore' skips
# names that are not present, so running the cell again does not fail.
df.drop(
    columns_to_drop,
    axis=1,
    inplace=True,
    errors='ignore',
)
# Columns that remain after the drop.
df.columns

Index(['DrawNo', 'DrawDate', 'PrizeType', 'LuckyNo', 'DaysUntilNextExactMatch',
       'DaysSinceLastAnyMatch', 'DrawYear', 'DrawMonth', 'DrawWeek', 'DrawDay',
       'DrawDayofweek', 'DrawDayofyear', 'CumProbaExactMatch',
       'CumProbaAnyMatch'],
      dtype='object')

In [7]:
# Summary statistics of the remaining numeric columns, with the quartiles spelled out.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,DrawNo,LuckyNo,DaysUntilNextExactMatch,DaysSinceLastAnyMatch,DrawYear,DrawMonth,DrawWeek,DrawDay,DrawDayofweek,DrawDayofyear,CumProbaExactMatch,CumProbaAnyMatch
count,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0,104673.0
mean,268299.492419,4974.001777,764.409447,66.179053,2005.754779,6.501648,26.452428,15.628214,4.054054,182.380356,0.062382,0.930387
std,131388.886981,2885.324727,832.538166,93.325465,7.809743,3.462904,15.123744,8.942459,1.843042,105.868193,0.038001,0.675621
min,40792.0,0.0,0.0,0.0,1992.0,1.0,1.0,1.0,1.0,1.0,0.01,0.01
25%,154499.0,2463.0,157.0,15.0,1999.0,3.0,13.0,8.0,2.0,90.0,0.03,0.38
50%,268306.0,4952.0,504.0,39.0,2006.0,7.0,26.0,16.0,5.0,182.0,0.06,0.8
75%,382113.0,7463.0,1095.0,83.0,2013.0,10.0,40.0,24.0,6.0,274.0,0.09,1.31
max,495819.0,9999.0,8087.0,4113.0,2019.0,12.0,53.0,31.0,6.0,366.0,0.27,2.93
