# Presets

This work acknowledges the original data creators, depositors or copyright holders, the funders of the Data Collections (if different) and the UK Data Service/UK Data Archive, and acknowledges Crown Copyright where appropriate. The original data creators, depositors or copyright holders, the funders of the Data Collections (if different) and the UK Data Service/UK Data Archive bear no responsibility for their further analysis or interpretation.

In [None]:
import pandas as pd
import numpy as np
#import plotly.express as px
#import plotly.graph_objects as go
#import plotly.figure_factory as ff
#import matplotlib.pyplot as plt
#import seaborn as sns

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Hyperparams tuning
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous, Categorical, Integer

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              ExtraTreesClassifier, VotingClassifier)
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
# Seed NumPy's global random number generator so that code drawing from
# np.random starts from the same state on every run.
# NOTE(review): estimators created without an explicit random_state and the
# parallel genetic search may still vary between runs — confirm before relying
# on exact reproducibility.
np.random.seed(42)

In [None]:
def show_model_ga_search_cv(model_grid, classifier, name, cv=10, popsize=50):
    """Tune ``classifier`` with a genetic-algorithm search and print the outcome.

    The search is fitted on the notebook-level ``X_train`` / ``y_train``
    globals, so both must exist by the time this function is called.

    Parameters
    ----------
    model_grid
        Search space handed to ``GASearchCV`` as ``param_grid``.
    classifier
        Estimator instance whose hyperparameters are searched.
    name
        Label printed in front of the results.
    cv
        Forwarded to ``GASearchCV`` as ``cv`` (default 10).
    popsize
        Forwarded to ``GASearchCV`` as ``population_size`` (default 50).

    Returns
    -------
    None
        The best score and parameters are printed; the fitted search object
        is not returned.
    """
    ga_settings = dict(
        # What is being tuned and how candidates are scored.
        estimator=classifier,
        param_grid=model_grid,
        cv=cv,
        scoring='accuracy',
        criteria='max',
        # Evolution settings.
        algorithm='eaMuPlusLambda',
        population_size=popsize,
        generations=40,
        tournament_size=3,
        elitism=True,
        crossover_probability=0.8,
        mutation_probability=0.1,
        keep_top_k=4,
        # Execution settings.
        n_jobs=-1,
        verbose=True,
    )
    search = GASearchCV(**ga_settings)
    search.fit(X_train, y_train)

    print("\nModel:", name, "\n")
    print("Accuracy:", search.best_score_, "\n")
    print("Best params", search.best_params_, "\n")

# DATA

In [None]:
# Load the dataset from a CSV path relative to the current working directory.
data = pd.read_csv('data/data_p3.csv')

In [None]:
# Separate the 'inactive' target from the features, working on a copy so that
# `data` itself is left untouched.
X = data.copy()
y = X['inactive']
X = X.drop(columns=['inactive'])

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Grids

In [None]:
# Search spaces for the genetic hyperparameter search, one per model family.
# Each maps an estimator parameter name to the range that may be sampled.

# Logistic regression: regularisation strength only.
model_grid_ga_logistic = dict(
    C=Continuous(0.1, 1000, distribution='uniform'),
)

# Elastic-net logistic regression: strength plus the L1/L2 mixing ratio.
model_grid_ga_logistic_net = dict(
    C=Continuous(0.1, 1000, distribution='uniform'),
    l1_ratio=Continuous(0, 1, distribution='uniform'),
)

# k-nearest neighbours.
# NOTE(review): leaf_size is documented by scikit-learn as a speed/memory knob
# for the tree-based neighbour search — confirm it is worth tuning for accuracy.
model_grid_ga_knn = dict(
    n_neighbors=Integer(3, 20),
    leaf_size=Integer(20, 50),
)

# Support vector machine.
model_grid_ga_svm = dict(
    C=Continuous(0.1, 1000, distribution='uniform'),
    gamma=Continuous(0.0001, 1, distribution='uniform'),
)

# Random forest.
model_grid_ga_rf = dict(
    max_depth=Integer(10, 80),
    max_features=Integer(1, 7),
    min_samples_leaf=Integer(1, 7),
    min_samples_split=Integer(2, 10),
    n_estimators=Integer(25, 500),
)

# Extra trees (same ranges as the random forest grid).
model_grid_ga_extra_trees = dict(
    max_depth=Integer(10, 80),
    max_features=Integer(1, 7),
    min_samples_leaf=Integer(1, 7),
    min_samples_split=Integer(2, 10),
    n_estimators=Integer(25, 500),
)

# AdaBoost.
model_grid_ga_adaboost = dict(
    learning_rate=Continuous(0.01, 0.50, distribution='uniform'),
    n_estimators=Integer(25, 500),
)

# XGBoost.
model_grid_ga_xgboost = dict(
    subsample=Continuous(0.75, 1, distribution='uniform'),
    colsample_bytree=Continuous(0.75, 1, distribution='uniform'),
    max_depth=Integer(2, 16),
    min_child_weight=Integer(2, 15),
    learning_rate=Continuous(0.01, 0.50, distribution='uniform'),
    n_estimators=Integer(25, 500),
)

# LightGBM search space.
# Only `n_estimators` is tuned for the number of boosting rounds:
# `num_iterations` is a LightGBM alias of the same setting, so listing both
# makes two genes compete for one parameter and leaves the effective value
# ambiguous. The range matches the other boosted-tree grids in this notebook.
model_grid_ga_lgbm = {
    "learning_rate": Continuous(0.01, 0.50, distribution='uniform'),
    "n_estimators": Integer(25, 500),
    "lambda_l2": Integer(0, 3),
    # NOTE(review): LightGBM documents that bagging is only enabled when
    # bagging_freq is non-zero; no bagging_freq is set here, so this gene may
    # have no effect — confirm and add bagging_freq if bagging is intended.
    "bagging_fraction": Continuous(0.8, 1, distribution='uniform'),
    "min_data_in_leaf": Integer(10, 40),
    "num_leaves": Integer(21, 51),
}

# Multi-layer perceptron search space. hidden_layer_sizes is sampled as a
# single integer.
model_grid_ga_nnet = dict(
    learning_rate_init=Continuous(0.01, 0.50, distribution='uniform'),
    max_iter=Integer(200, 2000),
    hidden_layer_sizes=Integer(100, 1000),
)

# Models

## Logistic regression

In [None]:
# Each run gets its own label so the printed results can be told apart.

# Unpenalised baseline. `penalty=None` (scikit-learn >= 1.2) replaces the
# string 'none', which recent scikit-learn releases reject.
# NOTE(review): C is ignored when no penalty is applied, so this search cannot
# improve on a single cross-validated fit — consider replacing it with one.
show_model_ga_search_cv(model_grid_ga_logistic,
                        LogisticRegression(penalty=None),
                        'logistic_reg_none')

# Elastic net: tunes C together with the L1/L2 mixing ratio.
show_model_ga_search_cv(model_grid_ga_logistic_net,
                        LogisticRegression(solver='saga', penalty='elasticnet'),
                        'logistic_reg_elasticnet')

# L1 penalty (needs a solver that supports it, hence 'saga').
show_model_ga_search_cv(model_grid_ga_logistic,
                        LogisticRegression(penalty='l1', solver='saga'),
                        'logistic_reg_l1')

# L2 penalty with the default solver.
show_model_ga_search_cv(model_grid_ga_logistic,
                        LogisticRegression(penalty='l2'),
                        'logistic_reg_l2')

## KNN

In [None]:
# Genetic search over the k-nearest-neighbours grid.
show_model_ga_search_cv(
    model_grid=model_grid_ga_knn,
    classifier=KNeighborsClassifier(),
    name='knn',
)

## SVM

In [None]:
# Run the same search space against a linear and an RBF kernel, in that order.
# NOTE(review): scikit-learn documents gamma as a coefficient for the 'rbf',
# 'poly' and 'sigmoid' kernels, so the gamma gene is presumably unused in the
# linear run — confirm and consider a C-only grid there.
for svm_kernel in ('linear', 'rbf'):
    show_model_ga_search_cv(
        model_grid_ga_svm,
        SVC(kernel=svm_kernel),
        f'svm_{svm_kernel}',
    )

## RF

In [None]:
# Genetic search over the random forest grid.
show_model_ga_search_cv(
    model_grid=model_grid_ga_rf,
    classifier=RandomForestClassifier(),
    name='random_forest',
)

## Extra trees

In [None]:
# Genetic search over the extra-trees grid.
show_model_ga_search_cv(
    model_grid=model_grid_ga_extra_trees,
    classifier=ExtraTreesClassifier(),
    name='extra_trees',
)

## XGBoost

In [None]:
# Genetic search over the XGBoost grid.
show_model_ga_search_cv(
    model_grid=model_grid_ga_xgboost,
    classifier=XGBClassifier(),
    name='xgboost',
)

## LightGBM

In [None]:
# Genetic search over the LightGBM grid using the 'dart' booster.
show_model_ga_search_cv(
    model_grid=model_grid_ga_lgbm,
    classifier=LGBMClassifier(boosting_type='dart'),
    name='light_gbm',
)
# Alternative kept for reference (not run): the same search with
# LGBMClassifier(boosting_type='gbdt').

## NNET

In [None]:
# Compare the three learning-rate schedules of the multi-layer perceptron.
# scikit-learn only honours `learning_rate` when solver='sgd'; with the default
# solver the three runs would tune the very same model, so the solver is set
# explicitly. Each run is labelled with its schedule so the printed results
# can be told apart.
for nnet_schedule in ('invscaling', 'constant', 'adaptive'):
    show_model_ga_search_cv(
        model_grid_ga_nnet,
        MLPClassifier(solver='sgd', learning_rate=nnet_schedule),
        f'nnet_{nnet_schedule}',
    )

## Results

# Submission

In [None]:
# train ensemble
# prepare test
# predict and write to csv