# Unit 3 - Modeling


In this notebook we will cover:
   1. How to choose a machine learning model?
       1. What to choose from?
       2. What to test for?
       


<img src="images/MLvisual2.png"/>

# Model Selection
- Many different algorithms to choose from
- First 3 factors to consider when choosing an algorithm:
    - Task (Classification, Regression, Clustering, Dimensionality Reduction)
    - Type of data (Labeled, unlabeled)
    - Amount of data
    

<img src="images/scikit_roadmap.png"/>

In [None]:
import matplotlib
import numpy as np
import pandas as pd
import random
import sklearn
import lightgbm as lgb
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
%matplotlib inline

#!pip install numerapi
from pathlib import Path
import dask.dataframe as dd
from dask.array import from_array
import numerapi
import matplotlib.pyplot as plt

from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm
)


In [None]:
# Create the NumerAPI client (no API keys are passed here); it is used below
# to download the dataset files.
napi = numerapi.NumerAPI()


In [None]:
# Local destinations for the int8 training and validation parquet files.
train_pq_path = "numerai_training_data_int8.parquet"
val_pq_path = "numerai_validation_data_int8.parquet"

# Download one dataset file per call: the first argument is the dataset name
# requested from Numerai, `dest_path` is where it is written locally.
napi.download_dataset(
    "numerai_training_data_int8.parquet",
    dest_path=train_pq_path,
)
napi.download_dataset(
    "numerai_validation_data_int8.parquet",
    dest_path=val_pq_path,
)



In [None]:
# Read the downloaded parquet files into (lazy) dask DataFrames.
# Reuse the path variables defined in the download cell so the download
# destination and the file being read cannot drift apart.
df_train = dd.read_parquet(train_pq_path)
df_val = dd.read_parquet(val_pq_path)



In [None]:
# Name of the label column used for training.
target = "target"

# Split the column names by prefix: model inputs start with "feature",
# label columns start with "target".
features = [col for col in df_train.columns if col.startswith("feature")]
targets = [col for col in df_train.columns if col.startswith("target")]

# Feature list extended with the integer era column created just below.
features_erano = [*features, "erano"]

# Integer version of the era label, kept both as a column and as a
# standalone series of per-row era numbers.
df_train["erano"] = df_train["era"].astype(int)
eras = df_train["erano"]



In [None]:
# Give the validation frame the same integer era column as the training frame.
df_val["erano"] = df_val["era"].astype(int)


In [None]:
# Build dask arrays holding only the feature columns, the feature columns
# plus the era number, and the target column. `lengths=True` asks dask to
# work out the chunk lengths so the resulting arrays have known shapes.
X_train = (
    df_train
    .reset_index()[features]
    .to_dask_array(lengths=True)
)

X_train_erano = (
    df_train
    .reset_index()[features_erano]
    .to_dask_array(lengths=True)
)

y_train = (
    df_train
    .reset_index()["target"]
    .to_dask_array(lengths=True)
)



# K-Fold Cross Validation

- K-fold cross-validation is a statistical method used to estimate the skill of machine learning models.
- Provides train/test indices to split data in train/test sets. Split dataset into k consecutive folds (without shuffling by default).

<img src="images/kfold.png" width=600/>

# Group K-Fold Cross Validation

- Group K-fold is a K-fold iterator variant with non-overlapping groups.


<img src="images/groupkfold.png" width=750/>

# Era-wise Time-series Cross Validation

- Prevents you from using any future information to predict out of sample, since your out of sample test set is always in the future.


<img src="images/tseriessplit.png" width=750/>

In [None]:
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn import model_selection, metrics 
import csv

class TimeSeriesSplitGroups(_BaseKFold):
    """Time-series cross-validator that splits on whole groups (e.g. eras).

    The distinct group labels are ordered with ``np.unique`` (i.e. sorted) and
    cut into ``n_splits + 1`` folds. For each split the training set contains
    every sample whose group comes before the test fold, and the test set
    contains the samples of the next ``test_size`` groups, so a group is never
    shared between the train and test side of a split.

    Parameters
    ----------
    n_splits : int, default=5
        Number of train/test splits to generate.
    """

    def __init__(self, n_splits=5):
        super().__init__(n_splits, shuffle=False, random_state=None)

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` pairs of positional indices.

        Parameters
        ----------
        X : array-like
            Training data; only its number of samples is used.
        y : array-like, optional
            Ignored apart from the length consistency check.
        groups : array-like
            Group label of every sample (pandas Series, NumPy array or list).
            The sorted order of the labels defines the time order.

        Raises
        ------
        ValueError
            If ``groups`` is None or if there are fewer distinct groups than
            ``n_splits + 1`` folds.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_folds = self.n_splits + 1

        # Work on a plain array so Series, ndarrays and lists are all accepted
        # (the previous `groups.isin` only existed on pandas objects).
        group_values = np.asarray(groups)
        group_list = np.unique(group_values)
        n_groups = len(group_list)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds ={0} greater"
                 " than the number of groups: {1}.").format(n_folds,
                                                            n_groups))

        indices = np.arange(n_samples)
        test_size = n_groups // n_folds
        # The first training window also absorbs the remainder groups that do
        # not divide evenly into the folds.
        test_starts = range(test_size + n_groups % n_folds,
                            n_groups, test_size)
        for test_start in test_starts:
            train_mask = np.isin(group_values, group_list[:test_start])
            test_mask = np.isin(
                group_values, group_list[test_start:test_start + test_size])
            yield indices[train_mask], indices[test_mask]

            

# Loss Function 📉
- We will be using a correlation-based loss function
- MSE looks worse than correlation out of sample



In [None]:
# The models should be scored based on the rank-correlation (spearman) with the target
def numerai_score(y_true, y_pred, eras):
    """Correlate the targets with the per-era percentile ranks of the predictions.

    Parameters
    ----------
    y_true : array-like
        Target values, in the same row order as ``y_pred``.
    y_pred : pandas.Series
        Predictions; they are ranked within each era.
    eras : pandas.Series or array-like
        Era label of every row, used as the grouping key for ``y_pred``.

    Returns
    -------
    float
        Pearson correlation between ``y_true`` and the ranked predictions.
    """
    # groupby(...).rank keeps the original index and row order. The previous
    # groupby(...).apply(rank) can return the rows regrouped by era (depending
    # on the pandas version's group_keys behaviour), which silently misaligns
    # them with y_true because np.corrcoef compares values by position.
    rank_pred = y_pred.groupby(eras).rank(pct=True, method="first")
    return np.corrcoef(y_true, rank_pred)[0, 1]

# It can also be convenient while working to evaluate based on the regular (pearson) correlation
def correlation_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        One-dimensional sequences of equal length.

    Returns
    -------
    float
        Off-diagonal entry of the 2x2 correlation matrix from ``np.corrcoef``.
    """
    # The notebook imports NumPy as `np`; the bare name `numpy` is not bound
    # and raised NameError here.
    return np.corrcoef(y_true, y_pred)[0, 1]

def spearman(y_true, y_pred):
    """Return the Spearman rank correlation between predictions and targets."""
    result = spearmanr(y_pred, y_true)
    return result.correlation



# Thank You and Good Luck!
- Like & Subscribe for more!
- [Github](https://github.com/peterling7710/NumeraiStarterPack) with the notebooks for this series
- Find my socials [here](https://linktr.ee/peterling) for more numer.ai related content

<img src="images/TAF.jpg"/>