In [36]:
from numpy.lib.function_base import place
from procan_connectome.model_training.loocv_wrapper import LOOCV_Wrapper
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
import numpy as np 
import os
import pandas as pd
import logging
import datetime
from procan_connectome.config import DATA_PATH, RANDOM_STATE, LOGGER_LEVEL
from procan_connectome.utils.load_dataset import get_rf_dataset, get_svc_dataset
from sklearn.model_selection import train_test_split

In [31]:
from sklearn.datasets import load_breast_cancer

# Load scikit-learn's bundled breast-cancer dataset with pandas-backed fields.
# NOTE: despite its name, `df` is the returned Bunch container (return_X_y=False),
# not a DataFrame; the feature table and labels are read from it as attributes.
df = load_breast_cancer(
    as_frame=True,
    return_X_y=False,
)

In [42]:
# Pull the feature table and the label column out of the loaded dataset container.
X = df.data
y = df.target

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
149,13.740,17.91,88.12,585.0,0.07944,0.06376,0.02881,0.01329,0.1473,0.05580,...,15.340,22.46,97.19,725.9,0.09711,0.18240,0.15640,0.06019,0.2350,0.07014
124,13.370,16.39,86.10,553.5,0.07115,0.07325,0.08092,0.02800,0.1422,0.05823,...,14.260,22.75,91.99,632.1,0.10250,0.25310,0.33080,0.08978,0.2048,0.07628
421,14.690,13.98,98.22,656.1,0.10310,0.18360,0.14500,0.06300,0.2086,0.07406,...,16.460,18.34,114.10,809.2,0.13120,0.36350,0.32190,0.11080,0.2827,0.09208
195,12.910,16.33,82.53,516.4,0.07941,0.05366,0.03873,0.02377,0.1829,0.05667,...,13.880,22.00,90.81,600.6,0.10970,0.15060,0.17640,0.08235,0.3024,0.06949
545,13.620,23.23,87.19,573.2,0.09246,0.06747,0.02974,0.02443,0.1664,0.05801,...,15.350,29.09,97.58,729.8,0.12160,0.15170,0.10490,0.07174,0.2642,0.06953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,8.888,14.64,58.79,244.0,0.09783,0.15310,0.08606,0.02872,0.1902,0.08980,...,9.733,15.67,62.56,284.4,0.12070,0.24360,0.14340,0.04786,0.2254,0.10840
106,11.640,18.33,75.17,412.5,0.11420,0.10170,0.07070,0.03485,0.1801,0.06520,...,13.140,29.26,85.51,521.7,0.16880,0.26600,0.28730,0.12180,0.2806,0.09097
270,14.290,16.82,90.30,632.6,0.06429,0.02675,0.00725,0.00625,0.1508,0.05376,...,14.910,20.65,94.44,684.6,0.08567,0.05036,0.03866,0.03333,0.2458,0.06120
435,13.980,19.62,91.12,599.5,0.10600,0.11330,0.11260,0.06463,0.1669,0.06544,...,17.040,30.80,113.90,869.3,0.16130,0.35680,0.40690,0.18270,0.3179,0.10550


In [None]:
# Preprocessing pipeline handed to LOOCV_Wrapper: standard scaling followed by
# a power transform. scikit-learn documents `steps` as a *list* of
# (name, transformer) tuples, so a list is passed rather than a tuple; this
# keeps `pipeline.steps` mutable for any code that appends to or edits it.
pipeline = Pipeline(
    steps=[
        ('ss', StandardScaler()),
        ('pt', PowerTransformer()),
    ]
)

# Directory passed to LOOCV_Wrapper as `log_dir` (under the project data path).
log_dir = os.path.join(DATA_PATH, 'logs')

# Each experiment bundles an estimator, its hyper-parameter grid and the base
# name of its log file. The three parallel lists used by the training loop are
# derived from these bundles so that they always stay aligned.
experiments = [
    # Disabled random-forest experiment, kept for reference:
    # (
    #     RandomForestClassifier(random_state=RANDOM_STATE),
    #     {
    #         'n_estimators': [10, 50, 100, 200, 400],
    #         'criterion': ['gini', 'entropy'],
    #         'min_samples_split': [2, 3, 5],
    #         'min_samples_leaf': [1, 2, 5],
    #         'class_weight': ['balanced', None]
    #     },
    #     "RF_LOOCV",
    # ),
    (
        LinearSVC(random_state=RANDOM_STATE),
        dict(
            tol=[1e-10, 1e-9, 1e-8],
            C=[1.0],
            fit_intercept=[True],
            max_iter=[100_000_000],
        ),
        "LinearSVC_LOOCV_test_df",
    ),
]

estimators = [experiment[0] for experiment in experiments]
grids = [experiment[1] for experiment in experiments]
log_names = [experiment[2] for experiment in experiments]



# Run one LOOCV_Wrapper fit per (estimator, grid, log name) triple.

# zip() silently drops trailing items when its inputs differ in length, which
# would pair estimators with the wrong grid or skip experiments unnoticed.
if not (len(estimators) == len(grids) == len(log_names)):
    raise ValueError(
        "estimators, grids and log_names must have the same length, got "
        f"{len(estimators)}, {len(grids)} and {len(log_names)}"
    )

# The file handler created by logging.basicConfig cannot open a file inside a
# directory that does not exist yet.
os.makedirs(log_dir, exist_ok=True)

for estimator, grid, log_file_name in zip(estimators, grids, log_names):
    # Prefix the base name with a minute-resolution timestamp so runs do not collide.
    log_file_name = f"{datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')}" + "_" + log_file_name

    # logging.basicConfig does nothing when the root logger already has
    # handlers, so on a second iteration (or a re-run of this cell) records
    # would keep going to the first log file. Detach and close the existing
    # handlers first so each run gets its own file.
    root_logger = logging.getLogger()
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
        stale_handler.close()

    logging.basicConfig(
        filename=os.path.join(log_dir, log_file_name + "_LOGS"),
        filemode='a',
        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
        datefmt='%H:%M:%S',
        level=LOGGER_LEVEL
    )

    # LOOCV_Wrapper is project code not visible in this notebook; the keyword
    # arguments below are passed through unchanged from the original cell.
    loocv = LOOCV_Wrapper(
        X,
        y,
        estimator,
        pipeline=pipeline,
        param_grid=grid,
        perform_grid_search=True,
        label_col='target',
        log_file_name=log_file_name,
        log_dir=log_dir,
        balance_classes=True,
        scoring='f1_weighted',
        verbose=3,
        n_samples=None,
        single_label_upsample=None,
        cv=None,
        select_features=True,
        feature_threshold=0.01
    )
    loocv.fit(X, y)