# Training 

In [49]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import featuretools as ft
import scipy.stats as ss
import math 
import matplotlib

from scipy import stats
from collections import Counter
from pathlib import Path

%matplotlib inline

sns.set(style="ticks")

from utils import *

In [50]:
DATASET = Path("datasets/lotto/data_processed_2.csv")

# The Answer to the Ultimate Question of Life, the Universe, and Everything.
np.random.seed(42)

# Read into data frame
dataset = pd.read_csv(DATASET, header=0, sep=',', quotechar='"', parse_dates=['DrawDate'], dtype={'PrizeType': str})

dataset.columns, len(dataset.columns)

(Index(['LuckyNo', '1st_digit', '2nd_digit', '3rd_digit', '4th_digit',
        'PrizeType', 'DrawNo', 'DaysSinceLastExactMatch',
        'DaysUntilNextExactMatch', 'DaysSinceLastAnyMatch',
        'DaysUntilNextAnyMatch', 'DrawYear', 'DrawMonth', 'DrawWeek',
        'DrawDayofweek', 'DrawDayofyear', 'CumProbaExactMatch',
        'CumProbaAnyMatch', 'DrawDate', 'TotalMean', '1stDigitMean',
        '2ndDigitMean', '3rdDigitMean', '4thDigitMean'],
       dtype='object'), 24)

In [51]:
df = dataset.copy()

In [52]:
num_attrs = ["TotalMean", "1stDigitMean", "2ndDigitMean", 
            "DaysUntilNextExactMatch", "DaysSinceLastExactMatch",
            "DrawDayofweek", "DaysUntilNextAnyMatch",
            "4thDigitMean", "DaysSinceLastAnyMatch",
            "3rdDigitMean", "DrawWeek", "DrawDayofyear", "DrawMonth"
            ]
cat_attrs = ["PrizeType"]
label = ["LuckyNo"]
len(num_attrs), len(cat_attrs)

(13, 1)

### Train Test Split

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [55]:
train_set.shape, test_set.shape

((83738, 24), (20935, 24))

In [81]:
data = train_set[num_attrs + cat_attrs]
labels = train_set[label].copy()
data.shape, labels.shape

((83738, 14), (83738, 1))

### Preprocessing

In [73]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer

In [74]:
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = num_pipeline = Pipeline([
       ("cat", OneHotEncoder(categories='auto'))
    ])

In [75]:
data_num_attrs = num_pipeline.fit_transform(data[num_attrs])
print(num_pipeline.inverse_transform(data_num_attrs))

[[4.97435594e+03 4.47595109e+00 4.48958657e+00 ... 1.00000000e+00
  5.00000000e+00 1.00000000e+00]
 [4.97801962e+03 4.47898887e+00 4.49526242e+00 ... 3.50000000e+01
  2.42000000e+02 8.00000000e+00]
 [4.97647746e+03 4.47743438e+00 4.49504685e+00 ... 4.40000000e+01
  3.06000000e+02 1.10000000e+01]
 ...
 [4.97570271e+03 4.47722912e+00 4.49034299e+00 ... 1.00000000e+01
  6.50000000e+01 3.00000000e+00]
 [5.00980206e+03 4.51258581e+00 4.47482838e+00 ... 3.10000000e+01
  2.12000000e+02 7.00000000e+00]
 [4.98869382e+03 4.48395671e+00 4.54825644e+00 ... 3.80000000e+01
  2.65000000e+02 9.00000000e+00]]


In [76]:
data_cat_attrs = cat_pipeline.fit_transform(data[cat_attrs])
print(cat_pipeline.inverse_transform(data_cat_attrs))

[['ConsolationNo1']
 ['1stPrizeNo']
 ['SpecialNo2']
 ...
 ['ConsolationNo7']
 ['ConsolationNo6']
 ['SpecialNo4']]


In [77]:
data_num_attrs.shape, data_cat_attrs.shape

((83738, 32483), (83738, 23))

In [78]:
from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attrs),
        ("cat", cat_pipeline, cat_attrs),
    ])

data_prepared = full_pipeline.fit_transform(data)
data_prepared.shape

(83738, 32506)

In [79]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [82]:
# let's try the full preprocessing pipeline on a few training instances
some_data = data.iloc[:5]
some_labels = labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [[4463.51708075]
 [5130.42552388]
 [5330.17111437]
 [4667.01141844]
 [5477.38395408]]


In [84]:
print("Labels:", some_labels)

Labels:         LuckyNo
103020     6266
89263       249
85897      3339
54297      8800
996        3024


In [88]:
from sklearn.metrics import mean_squared_error

predictions = lin_reg.predict(data_prepared)
lin_mse = mean_squared_error(labels, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2625.7318516601754