In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import scipy as sp

import warnings
# Suppress warnings whose message starts with "internal gelsd".
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# CSV with the pre-processed draws, relative to the working directory.
DATASET = Path("datasets") / "lotto" / "data_processed.csv"

# Seed NumPy's global random generator so repeated runs draw the same numbers.
SEED = 42
np.random.seed(SEED)

In [None]:
# Load the CSV into a DataFrame: the first row is the header, DrawDate is
# parsed as a datetime and PrizeType is kept as a string column.
dataset = pd.read_csv(
    DATASET,
    header=0,
    sep=',',
    quotechar='"',
    parse_dates=['DrawDate'],
    dtype={'PrizeType': str},
)

In [None]:
# Preview the first ten rows of the raw frame.
dataset.head(n=10)

In [None]:
# Work on a copy so the frame read from disk stays untouched.
df = dataset.copy()

# Number of columns in the working frame.
df.shape[1]

In [None]:
# Print every column name quoted and comma-separated, five names per line.
COLUMNS_PER_LINE = 5
for position, column in enumerate(df.columns, start=1):
    print('"{}"'.format(column), end=', ')
    # Count from 1: with a 0-based index the modulo test fires on the very
    # first name, leaving a single entry on the first row.
    if position % COLUMNS_PER_LINE == 0:
        print()

In [None]:
# Column groups used for modelling. The generated groups below reproduce the
# column names by their naming pattern only.

# Draw counters, calendar parts of the draw date and cumulative probabilities.
base_features = [
    "DrawNo",
    "DaysSinceLastExactMatch", "DaysUntilNextExactMatch",
    "DaysSinceLastAnyMatch", "DaysUntilNextAnyMatch",
    "DrawYear", "DrawMonth", "DrawWeek", "DrawDay", "DrawDayofweek", "DrawDayofyear",
    "CumProbaExactMatch", "CumProbaAnyMatch",
]

digits = range(10)

# "<a>_<b>" for every pair of distinct digits with a < b ("0_1" ... "8_9").
digit_pair_features = [f"{a}_{b}" for a in digits for b in digits if a < b]

# "<d>_<d>" for every digit ("0_0" ... "9_9").
repeated_digit_features = [f"{d}_{d}" for d in digits]

# "pos_<p>_<d>_freq" for positions 1-4 and digits 0-9.
position_freq_features = [
    f"pos_{position}_{d}_freq" for position in range(1, 5) for d in digits
]

numeric_features = (
    base_features
    + digit_pair_features
    + repeated_digit_features
    + position_freq_features
)
categorical_features = ["PrizeType"]
datetime_features = ["DrawDate"]
target_feature = ["LuckyNo"]
target_features_drilled = ["1st_digit", "2nd_digit", "3rd_digit", "4th_digit"]

# mean_features = ["TotalMean", "1stDigitMean", "2ndDigitMean", "3rdDigitMean", "4thDigitMean"]

len(numeric_features)

### Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric feature columns.
scaler = StandardScaler()
numeric_frame = df[numeric_features]
nf_scaled = scaler.fit_transform(numeric_frame)

# Per-column means learned by the scaler.
print(scaler.mean_)
# Round trip: undoing the scaling on the scaled array.
print(scaler.inverse_transform(nf_scaled))

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

# One-hot encode the categorical columns; the encoder returns a sparse matrix.
onehot_encoder = OneHotEncoder()
categorical_frame = df[categorical_features]
cf_scaled = onehot_encoder.fit_transform(categorical_frame)

# Show the learned categories, the encoded shape, the row count and the dense
# form of the encoding.
cf_dense = cf_scaled.toarray()
(onehot_encoder.categories_, cf_scaled.shape, len(cf_dense), cf_dense)

In [None]:
# Shapes of the scaled numeric block and the one-hot encoded block.
tuple(block.shape for block in (nf_scaled, cf_scaled))

In [None]:
# Import the submodule explicitly: `import scipy as sp` on its own does not
# guarantee that `sp.sparse` is available on every SciPy version; without this
# line the attribute only exists if some other import loaded it first.
import scipy.sparse

# Join the scaled numeric block and the one-hot block column-wise into a
# single sparse feature matrix.
features = sp.sparse.hstack((nf_scaled, cf_scaled))
features.shape

In [None]:
# Standardise the target with its own scaler. Refitting the shared `scaler`
# here would overwrite the statistics it learned from the numeric features,
# so it could no longer inverse-transform `nf_scaled`.
label_scaler = StandardScaler()
labels = label_scaler.fit_transform(df[target_feature])

# Show the scaled labels next to their round trip back to the original scale.
labels, label_scaler.inverse_transform(labels)

### Correlation

In [None]:
# Shapes of the joined feature matrix and of the scaled labels.
tuple(array.shape for array in (features, labels))

In [None]:
# Element types of the joined feature matrix and of the scaled labels.
tuple(array.dtype for array in (features, labels))

In [None]:
# Correlation coefficient of every scaled numeric column with the scaled
# target, printed next to the column name.
label_column = labels[:, 0]
for ind in range(nf_scaled.shape[1]):
    coefficient = np.corrcoef(nf_scaled[:, ind], label_column)[0, 1]
    print(numeric_features[ind], '\t\t', coefficient)

#### Drop features

In [None]:
# Reduced feature set. Compared with the earlier list this leaves out DrawDay
# and also every "<a>_<b>" and "pos_<p>_<d>_freq" column.
draw_counters = [
    "DrawNo",
    "DaysSinceLastExactMatch", "DaysUntilNextExactMatch",
    "DaysSinceLastAnyMatch", "DaysUntilNextAnyMatch",
]
calendar_parts = ["DrawYear", "DrawMonth", "DrawWeek", "DrawDayofweek", "DrawDayofyear"]
cumulative_probabilities = ["CumProbaExactMatch", "CumProbaAnyMatch"]

numeric_features = draw_counters + calendar_parts + cumulative_probabilities
categorical_features = ["PrizeType"]
datetime_features = ["DrawDate"]
target_feature = ["LuckyNo"]

In [None]:
# Refit the scaler on the current numeric feature list, then print each scaled
# column's correlation coefficient with the scaled target.
nf_scaled = scaler.fit_transform(df[numeric_features])
label_column = labels[:, 0]
for ind in range(nf_scaled.shape[1]):
    coefficient = np.corrcoef(nf_scaled[:, ind], label_column)[0, 1]
    print(numeric_features[ind], '\t\t', coefficient)

In [None]:
# Validation: cross-check the correlations above with pandas.
# PrizeType is read as a string column, so it is left out of the selection:
# depending on the pandas version, DataFrame.corr() either drops such a column
# silently or fails trying to convert it to float. Leaving it out gives the
# same numeric-only matrix on every version.
corr_matrix = df[numeric_features + target_feature].corr()
print(corr_matrix['LuckyNo'].sort_values(ascending=False))

In [None]:
# Rebuild the working frame from the raw data with only the chosen columns:
# target, per-digit targets, categorical, numeric, then datetime columns.
selected_columns = [
    *target_feature,
    *target_features_drilled,
    *categorical_features,
    *numeric_features,
    *datetime_features,
]
df = dataset[selected_columns].copy()
df.columns

In [None]:
# Summary statistics over all rows.
df.describe()

In [None]:
# Summary statistics over the first third of the rows.
first_third = df.iloc[: len(df) // 3]
first_third.describe()

In [None]:
# Summary statistics over the first half of the rows.
first_half = df.iloc[: len(df) // 2]
first_half.describe()

In [None]:
# Overall mean of the target and of each of its four digit columns.
tuple(
    df[column].mean()
    for column in ('LuckyNo', '1st_digit', '2nd_digit', '3rd_digit', '4th_digit')
)

In [None]:
# Derive the means
#
# TotalMean holds, for every row, the mean of LuckyNo over all rows whose
# DrawDate is on or before that row's DrawDate (draws of the same date are
# included in their own mean).

# Create the placeholders as floats: integer placeholders would later receive
# float means, which recent pandas versions warn about or reject.
mean_columns = ["TotalMean", "1stDigitMean", "2ndDigitMean", "3rdDigitMean", "4thDigitMean"]
for mean_column in mean_columns:
    df[mean_column] = 0.0

# NOTE(review): only TotalMean is populated in this cell; the four digit-mean
# columns keep their 0.0 placeholder - confirm whether a later step fills them.

# Aggregate once per draw date and accumulate, instead of re-filtering the
# whole frame for every date. groupby sorts the dates in ascending order, so
# the cumulative sum / cumulative count is the mean of all draws up to and
# including each date.
per_date = df.groupby("DrawDate")["LuckyNo"].agg(["sum", "count"])
running_mean = per_date["sum"].cumsum() / per_date["count"].cumsum()

# Rows without a draw date keep the placeholder value.
has_date = df["DrawDate"].notna()
df.loc[has_date, "TotalMean"] = df.loc[has_date, "DrawDate"].map(running_mean)

In [None]:
# Inspect the first 26 rows, including the newly added mean columns.
df.head(n=26)