### This notebook contains the code for:
- preliminary handling of missing values 
- engineering and selecting features 
- running the Random Forest model 
- performing a grid search over possible ways to handle missing values
- performing a grid search over random forest parameters 
- computing state of the art result with RF
- performing a grid search over decision tree, k-nearest neighbor and logistic regression parameters
- creating an ensemble of hypertuned models
- training and performing cross validation on an autosklearn model


## Importing Packages

In [11]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingClassifier
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

## Importing DATA

In [2]:
# Load the training features and labels; both files are indexed by the sample ID.
x_train = pd.read_csv('./x_train.csv', index_col='ID')
y_train = pd.read_csv('./y_train.csv', index_col='ID')
# Column-wise concat aligns the two frames on ID.
train = pd.concat([x_train, y_train], axis=1)
test = pd.read_csv('./x_test.csv', index_col='ID')
# Independent snapshot of the raw data: feature_engineering / feature_selection
# add columns to their argument in place, so a plain alias of `train` could be
# silently altered and every later missing_values() call would start from it.
train_original = train.copy()
train.head()

Unnamed: 0_level_0,DATE,STOCK,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,RET_1,VOLUME_1,RET_2,VOLUME_2,...,VOLUME_16,RET_17,VOLUME_17,RET_18,VOLUME_18,RET_19,VOLUME_19,RET_20,VOLUME_20,RET
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2,18,5,3,44,-0.015748,0.147931,-0.015504,0.179183,...,0.630899,0.003254,-0.379412,0.008752,-0.110597,-0.012959,0.174521,-0.002155,-0.000937,True
1,0,3,43,15,6,104,0.003984,,-0.09058,,...,,0.003774,,-0.018518,,-0.028777,,-0.034722,,True
2,0,4,57,20,8,142,0.00044,-0.096282,-0.058896,0.084771,...,-0.010336,-0.017612,-0.354333,-0.006562,-0.519391,-0.012101,-0.356157,-0.006867,-0.308868,False
3,0,8,1,1,1,2,0.031298,-0.42954,0.007756,-0.089919,...,0.012105,0.033824,-0.290178,-0.001468,-0.663834,-0.01352,-0.562126,-0.036745,-0.631458,False
4,0,14,36,12,5,92,0.027273,-0.847155,-0.039302,-0.943033,...,-0.277083,-0.012659,0.139086,0.004237,-0.017547,0.004256,0.57951,-0.040817,0.802806,False


## Filling Missing Values

In [3]:
def missing_values(grouping = ["INDUSTRY_GROUP"],k=8,train=train_original):
    """Impute missing RET_*/VOLUME_* values and return a new frame without NaN.

    Rows with fewer than ``k`` missing returns AND fewer than ``k`` missing
    volumes are filled "horizontally": each gap takes the mean of that row's
    own 20 return (resp. 20 volume) columns. All remaining rows are filled
    "vertically" with the mean of their ``grouping`` group, computed on the
    un-imputed ``train``. Any value still missing afterwards (in any column)
    is replaced by 0.

    Parameters
    ----------
    grouping : list of str
        Column name(s) defining the groups used for the vertical fill.
    k : int
        Per-row threshold on the number of missing values (strict ``<``).
    train : pandas.DataFrame
        Frame holding RET_1..RET_20, VOLUME_1..VOLUME_20 and the grouping
        columns. The default is bound to ``train_original`` at definition time.

    Returns
    -------
    pandas.DataFrame
        The imputed rows, sorted by "ID".
    """
    ret_cols = [f'RET_{lag}' for lag in range(1, 21)]
    vol_cols = [f'VOLUME_{lag}' for lag in range(1, 21)]

    # Rows sparse enough (in both column families) to be filled from their own mean
    few_ret_gaps = train[ret_cols].isna().sum(axis=1) < k
    few_vol_gaps = train[vol_cols].isna().sum(axis=1) < k
    row_part = train[few_ret_gaps & few_vol_gaps]

    # Horizontal fill: every column's gaps take the per-row mean (aligned on ID)
    for cols in (ret_cols, vol_cols):
        block = row_part[cols]
        row_means = block.mean(axis=1)
        row_part[cols] = block.apply(lambda column: column.fillna(row_means))

    # Vertical fill for the remaining rows, using group means of the raw frame
    group_part = train.drop(row_part.index, axis=0)
    for cols in (ret_cols, vol_cols):
        group_means = train.groupby(grouping)[cols].transform("mean").loc[group_part.index]
        current = group_part[cols]
        group_part[cols] = current.where(current.notna(), group_means)

    combined = pd.concat([row_part, group_part], axis=0).sort_values(by = ["ID"])
    return combined.fillna(0)

## Data Engineering

In [4]:
def feature_engineering(train):
    """Add absolute-return, log-return and market-beta features to ``train``.

    The frame is modified in place and the same object is returned. Columns
    are added in this order:

    * ``abs_RET_1`` .. ``abs_RET_20`` and their row sum ``accumulate_abs_returns_20``
    * ``log_RET_1`` .. ``log_RET_20`` (``log(1 + RET_i)``) and their row sum
      ``accumulate_log_returns_20``
    * ``market_beta``: for each row, the OLS slope (with intercept) of the
      row's 20 returns regressed on the 20 per-DATE mean returns.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``DATE`` and ``RET_1`` .. ``RET_20``.

    Returns
    -------
    pandas.DataFrame
        ``train`` itself, with the new columns.
    """
    returns = ['RET_%d' % (i + 1) for i in range(20)]

    #Creating absolute value returns and summing them
    for col in returns:
        train["abs_" + col] = np.abs(train[col])
    train["accumulate_abs_returns_{}".format(20)] = np.sum([train["abs_" + col] for col in returns], axis=0)

    #Creating Log RET and summing them
    for col in returns:
        train["log_" + col] = np.log(1 + train[col])
    train["accumulate_log_returns_20"] = np.sum([train["log_" + col] for col in returns], axis=0)

    #Creating BETA
    # Closed-form slope of a one-regressor OLS with intercept:
    #   beta = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x)) ** 2)
    # evaluated for all rows at once instead of fitting one regression per row.
    stock = train[returns].to_numpy(dtype=float)
    market = train.groupby("DATE")[returns].transform("mean").to_numpy(dtype=float)
    stock_centered = stock - stock.mean(axis=1, keepdims=True)
    market_centered = market - market.mean(axis=1, keepdims=True)
    numerator = (market_centered * stock_centered).sum(axis=1)
    denominator = (market_centered ** 2).sum(axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        # A market series with zero variance has no defined slope; use 0 there
        # (the minimum-norm least-squares solution). NaN inputs yield NaN.
        train["market_beta"] = np.where(denominator == 0, 0.0, numerator / denominator)

    return train

## Data Selection

In [5]:
def feature_selection(train,beta = False):
    """Add conditional-mean features and return the model's feature list.

    For ``RET_1`` and ``VOLUME_1`` the mean within each
    (``INDUSTRY_GROUP``, ``DATE``) group is added to ``train`` in place as
    ``RET_1_INDUSTRY_GROUP_DATE_mean`` / ``VOLUME_1_INDUSTRY_GROUP_DATE_mean``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``INDUSTRY_GROUP``, ``DATE``, ``RET_1`` and ``VOLUME_1``.
        The other names in the returned feature list are not checked here.
    beta : bool
        When True, ``"market_beta"`` is included in the feature list.

    Returns
    -------
    tuple
        ``(train, features, target)``: the same frame object, the list of
        feature column names, and the label column name ``'RET'``.
    """
    # Label column. Set up front so it is always defined; it was previously
    # assigned as a side effect of the feature loop below.
    target = 'RET'

    new_features = ["accumulate_abs_returns_20", "accumulate_log_returns_20"]
    if beta:
        new_features.append("market_beta")

    # Conditional mean on INDUSTRY_GROUP and DATE for RET_1 and VOLUME_1
    shifts = [1]
    statistics = ['mean']
    gb_features = ['INDUSTRY_GROUP', 'DATE']
    target_feature_list = ['RET', "VOLUME"]
    tmp_name = '_'.join(gb_features)
    for target_feature in target_feature_list:
        for shift in shifts:
            for stat in statistics:
                name = f'{target_feature}_{shift}_{tmp_name}_{stat}'
                feat = f'{target_feature}_{shift}'
                new_features.append(name)
                train[name] = train.groupby(gb_features)[feat].transform(stat)

    n_shifts_ret = 5
    n_shifts_vol = 5 # If you don't want all the shifts to reduce noise
    features = ['RET_%d' % (i + 1) for i in range(n_shifts_ret)]
    features += ['VOLUME_%d' % (i + 1) for i in range(n_shifts_vol)]
    features += new_features  # The conditional features

    return train, features, target

## Model Building & 4-Fold Cross Validation

In [6]:
def run_RF(train,test, features,target,n_estimator=300, depth=8):
    """Cross-validate a random forest with 4 folds split by DATE.

    The unique DATE values (not the rows) are shuffled into 4 folds, so every
    row of a given date lands on the same side of each split. Within each
    held-out date, a row is predicted True when its predicted probability is
    strictly above that date's median probability.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``DATE``, the ``features`` columns and the ``target`` column.
    test : pandas.DataFrame
        Not used; kept so existing calls keep working.
    features : list of str
        Feature column names.
    target : str
        Label column name.
    n_estimator : int
        Number of trees in the forest.
    depth : int
        Maximum depth of each tree.

    Returns
    -------
    float
        Mean fold accuracy, in percent. Per-fold and summary accuracies are
        also printed.
    """
    X_train = train[features]
    y_train = train[target]

    #Parameters of the Random Forest
    rf_params = {
        'n_estimators': n_estimator,
        'max_depth': depth,
        'random_state': 0,
        'n_jobs': -1
    }

    #The below is concerned with making a 4-fold cross-validation BY DATE!
    train_dates = train['DATE'].unique()

    n_splits = 4
    scores = []

    splits = KFold(n_splits=n_splits, random_state=0,
                   shuffle=True).split(train_dates)

    for i, (local_train_dates_ids, local_test_dates_ids) in enumerate(splits):
        local_train_ids = train['DATE'].isin(train_dates[local_train_dates_ids])
        local_test_ids = train['DATE'].isin(train_dates[local_test_dates_ids])

        X_local_train = X_train.loc[local_train_ids].fillna(0)
        y_local_train = y_train.loc[local_train_ids]
        X_local_test = X_train.loc[local_test_ids].fillna(0)
        y_local_test = y_train.loc[local_test_ids]

        model = RandomForestClassifier(**rf_params)
        model.fit(X_local_train, y_local_train)

        # Only DATE is needed to rank the probabilities against the per-date median
        sub = train.loc[local_test_ids, ['DATE']].copy()
        sub['pred'] = model.predict_proba(X_local_test)[:, 1]
        y_local_pred = sub.groupby('DATE')['pred'].transform(lambda x: x > x.median()).values

        score = accuracy_score(y_local_test, y_local_pred)
        scores.append(score)
        print(f"Fold {i+1} - Accuracy: {score* 100:.2f}%")

    mean = np.mean(scores)*100
    std = np.std(scores)*100
    u = (mean + std)
    l = (mean - std)
    print(f'Accuracy: {mean:.2f}% [{l:.2f} ; {u:.2f}] (+- {std:.2f})')

    return mean

## Preliminary Results

In [13]:
# Preliminary run: SECTOR/DATE group-mean imputation with k=15, market beta included.
imputed = missing_values(grouping=["SECTOR", "DATE"], k=15)
engineered = feature_engineering(imputed)
train, features, target = feature_selection(engineered, beta=True)
run_RF(train, test, features, target)

Fold 1 - Accuracy: 52.24%
Fold 2 - Accuracy: 50.40%
Fold 3 - Accuracy: 50.87%
Fold 4 - Accuracy: 52.65%
Accuracy: 51.54% [50.61 ; 52.47] (+- 0.93)


51.54114080164959

## NA Grid Search

#### Given our conditioning on SECTOR and DATE, iterating over best k

In [None]:
# Sweep the row-fill threshold k while imputing with (SECTOR, DATE) group means.
# The grouping is passed explicitly: missing_values() defaults to INDUSTRY_GROUP,
# which would not match this section's stated conditioning.
result_sector_date_k = {}
for elt in range(15):
    train = missing_values(grouping=["SECTOR", "DATE"], k=elt)
    train = feature_engineering(train)
    train,features,target = feature_selection(train)
    result_sector_date_k[elt] = run_RF(train,test, features,target)

In [None]:
# Cross-validated accuracy as a function of the threshold k
fig, ax = plt.subplots()
k_scores = pd.Series(result_sector_date_k)
k_scores.plot(ax=ax)
ax.set_title("k Grid Search ")
ax.set_xlabel("K")
ax.set_ylabel("Accuracy (%)")

#### Given optimal k, iterating over pairs of feature + DATES

In [None]:
# Compare imputation groupings that pair one classification column with DATE.
group = [["SECTOR","DATE"],["INDUSTRY","DATE"],["SUB_INDUSTRY","DATE"],["INDUSTRY_GROUP","DATE"]]
result_group_with_date = {}
for elt in group:
    train = missing_values(grouping = elt)
    train = feature_engineering(train)
    train,features,target = feature_selection(train)
    # A list is unhashable and cannot be a dict key; use a readable string label,
    # which also gives the bar chart plain tick labels.
    result_group_with_date[" + ".join(elt)] = run_RF(train,test, features,target)

In [None]:
# Horizontal bars: one per (column, DATE) grouping, zoomed on the 51.5-52 % band
fig, ax = plt.subplots()
grouping_scores = pd.Series(result_group_with_date)
grouping_scores.plot.barh(ax=ax)
ax.set_xlim(51.5, 52)
ax.set_title("Grid Search Na Values - Grouping by parameters & Dates ")
ax.set_ylabel("Group By Parameter")
ax.set_xlabel("Accuracy (%)")

#### Given optimal k, iterating over single feature

In [None]:
# Compare imputation groupings based on a single column.
group = ["SECTOR","INDUSTRY","SUB_INDUSTRY","INDUSTRY_GROUP","DATE"]
result_group_without_date = {}
for elt in group:
    imputed = missing_values(grouping=[elt])
    engineered = feature_engineering(imputed)
    train, features, target = feature_selection(engineered)
    result_group_without_date[elt] = run_RF(train, test, features, target)

In [None]:
# Horizontal bars: one per single-column grouping, zoomed on the 51.5-52 % band
fig, ax = plt.subplots()
single_column_scores = pd.Series(result_group_without_date)
single_column_scores.plot.barh(ax=ax)
ax.set_xlim(51.5, 52)
ax.set_title("Grid Search Na Values - Grouping by parameters")
ax.set_ylabel("Group By Parameter")
ax.set_xlabel("Accuracy (%)")

#### Given optimal conditioning on feature, re-search optimal k

In [None]:
# Re-run the k sweep with the INDUSTRY_GROUP grouping (the default of missing_values).
result_industry_group_k = {}
for elt in range(15):
    imputed = missing_values(grouping=["INDUSTRY_GROUP"], k=elt)
    engineered = feature_engineering(imputed)
    train, features, target = feature_selection(engineered)
    result_industry_group_k[elt] = run_RF(train, test, features, target)

In [None]:
# Cross-validated accuracy as a function of k for the second sweep
fig, ax = plt.subplots()
k_scores_second_pass = pd.Series(result_industry_group_k)
k_scores_second_pass.plot(ax=ax)
ax.set_title("New k Grid Search ")
ax.set_xlabel("K")
ax.set_ylabel("Accuracy (%)")

## Hyperparameter Tuning - Random Forest

In [None]:
#Tweaking number of trees
# The candidate list was not defined anywhere in the notebook, so this cell
# raised NameError on a fresh kernel.
# NOTE(review): the values originally searched are not recorded; adjust as needed.
n_estimator = [50, 100, 200, 300, 400, 500]
grid_RF = {}
for i in n_estimator:
    grid_RF[i] = run_RF(train,test, features,target,n_estimator = i, depth=2**3)

In [None]:
# Cross-validated accuracy as a function of the number of trees
fig, ax = plt.subplots()
tree_count_scores = pd.Series(grid_RF)
tree_count_scores.plot(ax=ax)
ax.set_title("RF - n_estimators - Grid Search")
ax.set_xlabel("Number of Trees")
ax.set_ylabel("Accuracy (%)")

In [None]:
#Tweaking depth
# The candidate list was not defined anywhere in the notebook, so this cell
# raised NameError on a fresh kernel.
# NOTE(review): the values originally searched are not recorded; adjust as needed.
depth = [2**p for p in range(1, 6)]
grid_RF_depth = {}
for i in depth:
    grid_RF_depth[i] = run_RF(train,test, features,target,depth=i,n_estimator=300)

In [None]:
# Cross-validated accuracy as a function of the maximum tree depth.
# The x axis holds the depth values (keys of grid_RF_depth), not tree counts.
fig,ax = plt.subplots()
pd.Series(grid_RF_depth).plot(ax=ax)
ax.set_title("RF - max_depth - Grid Search")
ax.set_xlabel("Max Depth")
ax.set_ylabel("Accuracy (%)")

## Final Results

In [32]:
# Final run: default imputation settings of missing_values(), no market beta.
imputed = missing_values()
engineered = feature_engineering(imputed)
train, features, target = feature_selection(engineered)
run_RF(train, test, features, target)

Fold 1 - Accuracy: 52.70%
Fold 2 - Accuracy: 50.76%
Fold 3 - Accuracy: 51.12%
Fold 4 - Accuracy: 52.94%
Accuracy: 51.88% [50.93 ; 52.84] (+- 0.95)


51.88274677375793

# Hyperparameter Estimation (Other Models)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# Default-configured instances of each model family.
# Of these, only NB is referenced again in the visible notebook (in the ensemble);
# the hyperparameter searches and tuned models construct their own estimators.
kNN = KNeighborsClassifier()
NB = GaussianNB()
LR = LogisticRegression()
DT = DecisionTreeClassifier()
RF = RandomForestClassifier()

In [None]:
# range of hyperparameters to iterate over
DT_parameter_grid = {'criterion':('gini', 'entropy'), 'min_samples_leaf':list(range(3,12)), 'max_depth':list(range(4,12)), 'max_features':list(range(3,10))}
# Only 'l2' is listed: LR_hyper_param always fits with penalty='l2', so an 'l1'
# entry would run every combination twice with identical scores and let argmax
# report 'l1' as the winner. ('l1' also needs a solver other than the default lbfgs.)
LR_parameter_grid = {'max_iter':list(range(30,50)), 'C': [0.01, 0.05, 0.1, 0.5, 1,2], 'n_jobs': [-1], 'penalty': ('l2',)}
kNN_parameter_grid = {'n_neighbors':list(range(2,15)), 'weights':('uniform', 'distance'), 'leaf_size':list(range(10,30)), 'n_jobs': [-1]}

In [None]:
from sklearn.model_selection import ParameterGrid

# Expand each dict of candidate values into a ParameterGrid: an iterable over
# every combination, which the notebook also indexes by position to look up the
# best-scoring combination after each search.
LR_grid = ParameterGrid(LR_parameter_grid)
DT_grid = ParameterGrid(DT_parameter_grid)
kNN_grid = ParameterGrid(kNN_parameter_grid)

## Decision Tree Hyperparameter Tuning

In [None]:
# Decision Tree Hyperparameter Tuning
def DT_hyper_param(param_grid):
    '''Score every decision-tree parameter combination with 4-fold CV split by DATE.

    Reads the notebook-level ``train``, ``features`` and ``target``. The folds
    are built over the unique DATE values (KFold, shuffle, random_state=0), and
    within each held-out date a row is predicted True when its predicted
    probability is strictly above that date's median.

    Parameters
    ----------
    param_grid : iterable of dict
        Each dict must provide ``criterion``, ``min_samples_leaf``,
        ``max_depth`` and ``max_features``.

    Returns
    -------
    list of float
        Mean fold accuracy in percent, one entry per combination, in
        iteration order of ``param_grid``.
    '''
    X_train = train[features]
    y_train = train[target]
    train_dates = train['DATE'].unique()
    n_splits = 4

    # The folds depend only on the dates and the fixed seed, so build the row
    # masks once instead of once per parameter combination.
    kfold = KFold(n_splits=n_splits, random_state=0, shuffle=True)
    fold_masks = [
        (train['DATE'].isin(train_dates[fit_pos]), train['DATE'].isin(train_dates[eval_pos]))
        for fit_pos, eval_pos in kfold.split(train_dates)
    ]

    all_scores = []
    for params in param_grid:
        # random_state is fixed: with max_features below the number of features
        # the tree samples candidate features at random, so an unseeded search
        # gives different scores (and possibly a different argmax) on every run.
        hyp_model = DecisionTreeClassifier(criterion = params['criterion'], min_samples_leaf = params['min_samples_leaf'],
                                            max_depth = params['max_depth'],max_features = params['max_features'],
                                            random_state = 0)

        scores = []
        for i, (local_train_ids, local_test_ids) in enumerate(fold_masks):
            X_local_train = X_train.loc[local_train_ids].fillna(0)
            y_local_train = y_train.loc[local_train_ids]
            X_local_test = X_train.loc[local_test_ids].fillna(0)
            y_local_test = y_train.loc[local_test_ids]

            hyp_model.fit(X_local_train, y_local_train)

            # Only DATE is needed to rank the probabilities against the per-date median
            sub = train.loc[local_test_ids, ['DATE']].copy()
            sub['pred'] = hyp_model.predict_proba(X_local_test)[:, 1]
            y_local_pred = sub.groupby('DATE')['pred'].transform(lambda x: x > x.median()).values

            score = accuracy_score(y_local_test, y_local_pred)
            scores.append(score)
            print(f"Fold {i+1} - Accuracy: {score* 100:.2f}%")

        all_scores.append(np.mean(scores)*100)
    return all_scores

In [None]:
# Run the search; scores come back in the iteration order of DT_grid.
DT_acc_scores = DT_hyper_param(DT_grid)

# optimal DT model
# (the position of the best score is used to index the same grid)
DT_grid[np.argmax(DT_acc_scores)]

## K-Nearest Neighbors Hyperparameter Tuning

In [None]:
# kNN Hyperparameter Tuning
def kNN_hyper_param(param_grid):
    '''Score every kNN parameter combination with 4-fold CV split by DATE.

    Reads the notebook-level ``train``, ``features`` and ``target``. The folds
    are built over the unique DATE values (KFold, shuffle, random_state=0), and
    within each held-out date a row is predicted True when its predicted
    probability is strictly above that date's median.

    Parameters
    ----------
    param_grid : iterable of dict
        Each dict must provide ``n_neighbors``, ``weights``, ``n_jobs`` and
        ``leaf_size``.

    Returns
    -------
    list of float
        Mean fold accuracy in percent, one entry per combination, in
        iteration order of ``param_grid``.
    '''
    # Select columns by name, like the other tuning functions. Positional
    # slicing (first 16 columns / last column) picked up DATE, STOCK and the
    # industry identifiers as features, and after feature engineering the last
    # column is an engineered feature rather than the label.
    X_train = train[features]
    y_train = train[target]
    train_dates = train['DATE'].unique()
    n_splits = 4

    # The folds depend only on the dates and the fixed seed, so build the row
    # masks once instead of once per parameter combination.
    kfold = KFold(n_splits=n_splits, random_state=0, shuffle=True)
    fold_masks = [
        (train['DATE'].isin(train_dates[fit_pos]), train['DATE'].isin(train_dates[eval_pos]))
        for fit_pos, eval_pos in kfold.split(train_dates)
    ]

    all_scores = []
    for params in param_grid:
        hyp_model = KNeighborsClassifier(n_neighbors = params['n_neighbors'], weights = params['weights'], n_jobs = params['n_jobs'], leaf_size = params['leaf_size'])

        scores = []
        for i, (local_train_ids, local_test_ids) in enumerate(fold_masks):
            X_local_train = X_train.loc[local_train_ids].fillna(0)
            y_local_train = y_train.loc[local_train_ids]
            X_local_test = X_train.loc[local_test_ids].fillna(0)
            y_local_test = y_train.loc[local_test_ids]

            hyp_model.fit(X_local_train, y_local_train)

            # Only DATE is needed to rank the probabilities against the per-date median
            sub = train.loc[local_test_ids, ['DATE']].copy()
            sub['pred'] = hyp_model.predict_proba(X_local_test)[:, 1]
            y_local_pred = sub.groupby('DATE')['pred'].transform(lambda x: x > x.median()).values

            score = accuracy_score(y_local_test, y_local_pred)
            scores.append(score)
            print(f"Fold {i+1} - Accuracy: {score* 100:.2f}%")

        all_scores.append(np.mean(scores)*100)
    return all_scores

In [None]:
# Run the search; scores come back in the iteration order of kNN_grid.
kNN_acc_scores = kNN_hyper_param(kNN_grid)

# optimal kNN model
# (the position of the best score is used to index the same grid)
kNN_grid[np.argmax(kNN_acc_scores)]

## Logistic Regression Hyperparameter Tuning

In [None]:
# Logistic Regression Hyperparameter Tuning
def LR_hyper_param(param_grid):
    '''Score every logistic-regression parameter combination with 4-fold CV split by DATE.

    Reads the notebook-level ``train``, ``features`` and ``target``. Only
    ``max_iter``, ``C`` and ``n_jobs`` are taken from each combination; the
    penalty is always ``'l2'``, whatever ``penalty`` value the grid carries.

    Returns a list with the mean fold accuracy (in percent) of each
    combination, in iteration order of ``param_grid``.
    '''
    fold_count = 4
    mean_accuracies = []

    for combo in param_grid:
        estimator = LogisticRegression(
            max_iter=combo['max_iter'],
            C=combo['C'],
            n_jobs=combo['n_jobs'],
            penalty='l2',
        )

        inputs = train[features]
        labels = train[target]
        unique_dates = train['DATE'].unique()

        # Folds are drawn over dates, so all rows of a date share one side of the split
        date_folds = KFold(n_splits=fold_count, random_state=0, shuffle=True).split(unique_dates)

        fold_accuracies = []
        for fold_no, (fit_positions, eval_positions) in enumerate(date_folds, start=1):
            fit_rows = train['DATE'].isin(unique_dates[fit_positions])
            eval_rows = train['DATE'].isin(unique_dates[eval_positions])

            estimator.fit(inputs.loc[fit_rows].fillna(0), labels.loc[fit_rows])
            positive_proba = estimator.predict_proba(inputs.loc[eval_rows].fillna(0))[:, 1]

            # Predict True where the probability exceeds the median of its date
            ranked = train.loc[eval_rows].copy()
            ranked['pred'] = positive_proba
            above_median = ranked.groupby('DATE')['pred'].transform(lambda x: x > x.median()).values

            score = accuracy_score(labels.loc[eval_rows], above_median)
            fold_accuracies.append(score)
            print(f"Fold {fold_no} - Accuracy: {score* 100:.2f}%")

        mean_accuracies.append(np.mean(fold_accuracies)*100)
    return mean_accuracies

In [None]:
# Run the search; scores come back in the iteration order of LR_grid.
LR_acc_scores = LR_hyper_param(LR_grid)

# optimal LR model
# (the position of the best score is used to index the same grid)
LR_grid[np.argmax(LR_acc_scores)]

## Creating Ensemble

In [None]:
# using optimal parameters from previous section to create models to feed into ensemble 
# The values are hard-coded here rather than read from the *_acc_scores results,
# so they do not update if a search is re-run.
# NOTE(review): DT_tuned and RF_tuned set no random_state, so their fits vary between runs.
DT_tuned = DecisionTreeClassifier(criterion = 'entropy', min_samples_leaf = 7, max_depth = 11, max_features = 5)
kNN_tuned = KNeighborsClassifier(weights = 'uniform', n_neighbors = 2, n_jobs = -1, leaf_size = 10)
LR_tuned = LogisticRegression(penalty = 'l2', n_jobs = -1, max_iter = 45, C = 0.1)
RF_tuned = RandomForestClassifier(max_depth = 8, n_estimators = 300, max_features = 8)

In [None]:
# creating list of models
# (name, estimator) pairs; NB is the default GaussianNB instance, the others are the tuned models
tuned_models = [ ('kNN', kNN_tuned),('RF', RF_tuned), ('Log Reg', LR_tuned), ('DT', DT_tuned), ('Naive Bayes', NB)]

#creating ensemble model based on argmax voting (voting = soft)
ensemble = VotingClassifier(tuned_models, voting='soft')

In [None]:
# 4-fold cross-validation of the ensemble, with folds split by DATE.
X_train = train[features]
y_train = train[target]

train_dates = train['DATE'].unique()
test_dates = test['DATE'].unique()  # not used below in this cell

n_splits = 4
scores = []
models = []  # nothing is appended to this list in this cell

# Folds are drawn over the unique dates, so all rows of a date share one side of each split.
# Note: split() returns a generator, which is consumed by the loop below.
splits = KFold(n_splits=n_splits, random_state=0,
               shuffle=True).split(train_dates)

for i, (local_train_dates_ids, local_test_dates_ids) in enumerate(splits):
    local_train_dates = train_dates[local_train_dates_ids]
    local_test_dates = train_dates[local_test_dates_ids]

    # Boolean row masks selecting the rows of this fold's training / held-out dates
    local_train_ids = train['DATE'].isin(local_train_dates)
    local_test_ids = train['DATE'].isin(local_test_dates)
    
    X_local_train = X_train.loc[local_train_ids]
    y_local_train = y_train.loc[local_train_ids]
    X_local_test = X_train.loc[local_test_ids]
    y_local_test = y_train.loc[local_test_ids]

    X_local_train = X_local_train.fillna(0)
    X_local_test = X_local_test.fillna(0)
    
    # The same ensemble object is re-fitted on every fold
    ensemble.fit(X_local_train, y_local_train)

    y_local_pred = ensemble.predict_proba(X_local_test)[:, 1]
    
    # Predict True where the probability is strictly above the median of its date
    sub = train.loc[local_test_ids].copy()
    sub['pred'] = y_local_pred
    y_local_pred = sub.groupby('DATE')['pred'].transform(lambda x: x > x.median()).values
    
    score = accuracy_score(y_local_test, y_local_pred)
    scores.append(score)
    print(f"Fold {i+1} - Accuracy: {score* 100:.2f}%")

# Summary: mean fold accuracy with a one-standard-deviation band, in percent
mean = np.mean(scores)*100
std = np.std(scores)*100
u = (mean + std)
l = (mean - std)
print(f'Accuracy: {mean:.2f}% [{l:.2f} ; {u:.2f}] (+- {std:.2f})')

# Auto-sklearn Model

In [None]:
# auto-sklearn is imported here rather than with the other imports so that the
# rest of the notebook still runs where the package is not installed.
from autosklearn.experimental.askl2 import AutoSklearn2Classifier

# Rebuild every input this cell needs so it does not depend on leftover state:
# a KFold.split() generator can be consumed only once, so reusing one from an
# earlier loop would skip this loop entirely, and reusing an earlier `scores`
# list would report another model's accuracy.
X_train = train[features]
y_train = train[target]
train_dates = train['DATE'].unique()

n_splits = 4
scores = []
models = []

splits = KFold(n_splits=n_splits, random_state=0,
               shuffle=True).split(train_dates)

for i, (local_train_dates_ids, local_test_dates_ids) in enumerate(splits):
    local_train_dates = train_dates[local_train_dates_ids]
    local_test_dates = train_dates[local_test_dates_ids]

    local_train_ids = train['DATE'].isin(local_train_dates)
    local_test_ids = train['DATE'].isin(local_test_dates)

    X_local_train = X_train.loc[local_train_ids].fillna(0)
    y_local_train = y_train.loc[local_train_ids]
    X_local_test = X_train.loc[local_test_ids].fillna(0)
    y_local_test = y_train.loc[local_test_ids]

    # Up to 2 hours of search per fold.
    # NOTE(review): memory_limit is presumably expressed in MB, which would make
    # 1000000 roughly 1 TB (effectively no limit) - confirm this is intended.
    model = AutoSklearn2Classifier(time_left_for_this_task=60*60*2,memory_limit= 1000000)
    print("Start to Fit")
    model.fit(X_local_train,y_local_train)
    print("Finished Fit")

    y_local_pred = model.predict_proba(X_local_test)[:, 1]

    # Predict True where the probability is strictly above the median of its date
    sub = train.loc[local_test_ids, ['DATE']].copy()
    sub['pred'] = y_local_pred
    y_local_pred = sub.groupby('DATE')['pred'].transform(lambda x: x > x.median()).values

    models.append(model)
    score = accuracy_score(y_local_test, y_local_pred)
    scores.append(score)
    print(f"Fold {i+1} - Accuracy: {score* 100:.2f}%")

mean = np.mean(scores)*100
std = np.std(scores)*100
u = (mean + std)
l = (mean - std)
print(f'Accuracy: {mean:.2f}% [{l:.2f} ; {u:.2f}] (+- {std:.2f})')