# Feature Engineering

In [96]:
from pathlib import Path
import pandas as pd
import numpy as np

# Location of the processed lotto CSV, given relative to the working directory.
DATASET = Path("datasets/lotto/data_processed.csv")

In [2]:
# Load the CSV at DATASET into a DataFrame: the first row is the header,
# DrawDate is parsed as a datetime and PrizeType is read as a string.
dataset = pd.read_csv(
    DATASET,
    header=0,
    sep=',',
    quotechar='"',
    parse_dates=['DrawDate'],
    dtype={'PrizeType': str},
)

In [3]:
# Take an independent (deep) copy of the loaded frame to work on.
df = dataset.copy(deep=True)

In [84]:
# Summarise the frame: index range, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104673 entries, 0 to 104672
Columns: 115 entries, DrawNo to pos_4_9_freq
dtypes: datetime64[ns](1), float64(2), int64(111), object(1)
memory usage: 91.8+ MB


In [98]:
# Display only the first row as a sample record.
df.head(1)

Unnamed: 0,DrawNo,DrawDate,PrizeType,LuckyNo,DaysSinceLastExactMatch,DaysUntilNextExactMatch,DaysSinceLastAnyMatch,DaysUntilNextAnyMatch,1st_digit,2nd_digit,...,pos_4_0_freq,pos_4_1_freq,pos_4_2_freq,pos_4_3_freq,pos_4_4_freq,pos_4_5_freq,pos_4_6_freq,pos_4_7_freq,pos_4_8_freq,pos_4_9_freq
0,40792,1992-05-06,1stPrizeNo,19,0,28,0,28,0,0,...,5,8,2,2,5,5,5,6,2,6


In [6]:
# This notebook needs Python 3.5 or newer; stop here on anything older.
import sys

assert sys.version_info[:2] >= (3, 5)

In [None]:
# Scikit-Learn ≥0.20 is required
import re
import sklearn

# Compare the numeric (major, minor) pair instead of the raw version string:
# string comparison is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
sklearn_version_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert sklearn_version_match is not None
assert tuple(int(part) for part in sklearn_version_match.groups()) >= (0, 20)

In [93]:
# pytorch
import torch
from torchvision import transforms

In [103]:
# fastai
from fastai import *
from fastai.vision import *

In [None]:
# np.c_ joins its arguments as columns (concatenation along the second axis).
# The interactive `np.c_?` help lookup that used to sit here was removed: it is
# IPython-only syntax and pops up the help pager on every run of the notebook.

In [111]:
# Print every column name in brackets, ten names per output row.
# Positions are counted from 1 so the line break follows each 10th name;
# with a zero-based index the break fired at index 0 and left the very
# first name alone on its own row.
for position, column in enumerate(df.columns, start=1):
    print("[{}]".format(column), end=' ')
    if position % 10 == 0:
        print()

[DrawNo] 
[DrawDate] [PrizeType] [LuckyNo] [DaysSinceLastExactMatch] [DaysUntilNextExactMatch] [DaysSinceLastAnyMatch] [DaysUntilNextAnyMatch] [1st_digit] [2nd_digit] [3rd_digit] 
[4th_digit] [DrawYear] [DrawMonth] [DrawWeek] [DrawDay] [DrawDayofweek] [DrawDayofyear] [CumProbaExactMatch] [CumProbaAnyMatch] [0_1] 
[0_2] [0_3] [0_4] [0_5] [0_6] [0_7] [0_8] [0_9] [1_2] [1_3] 
[1_4] [1_5] [1_6] [1_7] [1_8] [1_9] [2_3] [2_4] [2_5] [2_6] 
[2_7] [2_8] [2_9] [3_4] [3_5] [3_6] [3_7] [3_8] [3_9] [4_5] 
[4_6] [4_7] [4_8] [4_9] [5_6] [5_7] [5_8] [5_9] [6_7] [6_8] 
[6_9] [7_8] [7_9] [8_9] [0_0] [1_1] [2_2] [3_3] [4_4] [5_5] 
[6_6] [7_7] [8_8] [9_9] [pos_1_0_freq] [pos_1_1_freq] [pos_1_2_freq] [pos_1_3_freq] [pos_1_4_freq] [pos_1_5_freq] 
[pos_1_6_freq] [pos_1_7_freq] [pos_1_8_freq] [pos_1_9_freq] [pos_2_0_freq] [pos_2_1_freq] [pos_2_2_freq] [pos_2_3_freq] [pos_2_4_freq] [pos_2_5_freq] 
[pos_2_6_freq] [pos_2_7_freq] [pos_2_8_freq] [pos_2_9_freq] [pos_3_0_freq] [pos_3_1_freq] [pos_3_2_freq] [pos_3_3_

In [108]:
# Column names grouped by feature kind; each group is kept as a list so the
# groups can be concatenated when selecting columns.
numeric_features = [
    "DaysSinceLastExactMatch",
    "DaysUntilNextExactMatch",
]
categorical_features = [
    "PrizeType",
]
datetime_features = [
    "DrawDate",
]
target_feature = [
    "DrawNo",
]

In [112]:
# Show the first ten rows of the selected columns, ordered as
# target, datetime, categorical, numeric.
df[
    target_feature
    + datetime_features
    + categorical_features
    + numeric_features
].head(10)

Unnamed: 0,DrawNo,DrawDate,PrizeType,DaysSinceLastExactMatch,DaysUntilNextExactMatch
0,40792,1992-05-06,1stPrizeNo,0,28
1,40792,1992-05-06,2ndPrizeNo,0,2436
2,40792,1992-05-06,3rdPrizeNo,0,25
3,40792,1992-05-06,ConsolationNo1,0,515
4,40792,1992-05-06,ConsolationNo10,0,2114
5,40792,1992-05-06,ConsolationNo2,0,95
6,40792,1992-05-06,ConsolationNo3,0,351
7,40792,1992-05-06,ConsolationNo4,0,494
8,40792,1992-05-06,ConsolationNo5,0,494
9,40792,1992-05-06,ConsolationNo6,0,410


## Numeric

```
1. Numeric feature preprocessing is different for tree and
non-tree models:
    a. Tree-based models don’t depend on scaling
    b. Non-tree-based models hugely depend on scaling


2. Most often used preprocessings are:
    a. MinMaxScaler - to [0,1]
    b. StandardScaler - to mean==0, std==1
    c. Rank - sets spaces between sorted values to be equal
    d. np.log(1+x) and np.sqrt(1+x)


3. Scaling and Rank for numeric features:
    a. Tree-based models don't depend on them
    b. Non-tree-based models hugely depend on them

4. Most often used preprocessings are:
    a. MinMaxScaler - to [0,1]
    b. StandardScaler - to mean==0, std==1
    c. Rank - sets spaces between sorted values to be equal
    d. np.log(1+x) and np.sqrt(1+x)

5. Feature generation is powered by:
    a. Prior knowledge
    b. Exploratory data analysis
    
To [0,1]
sklearn.preprocessing.MinMaxScaler
X = (X - X.min())/(X.max() - X.min())

To mean=0, std=1
sklearn.preprocessing.StandardScaler
X = (X - X.mean())/X.std()

LOWERBOUND, UPPERBOUND = np.percentile(x, [1,99])
y = np.clip(x, LOWERBOUND, UPPERBOUND)
pd.Series(y).hist(bins=30)

```


In [95]:
# NOTE(review): this rebinds numeric_features to an empty list, replacing the
# two-column list assigned earlier in this notebook — confirm the reset is intended.
numeric_features = []

## Categorical

### Mean Encoding

## DateTime

## Timeseries

## Coordinate