In [1]:
from catboost import CatBoostClassifier
from scipy import stats
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from fancyimpute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.model_selection import GridSearchCV
import xlsxwriter
from random import randint
random_state = 76564
from preprocessing import stds, stats, removal, cv_preprocessing
from load_data import load_data
import os

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [6]:
class CustomAnovaFeatureSelection(BaseEstimator, TransformerMixin):
    """Rank DataFrame columns by multi-way ANOVA p-values and keep the best ones.

    For every combination of ``n_ways`` columns an OLS model
    ``y ~ <main effects> + <all interactions>`` is fitted and handed to
    ``sm.stats.anova_lm``. A column's score is the smallest ``'PR(>F)'`` value
    among all terms it takes part in, over every combination it appears in.
    The ``n_features`` columns with the lowest score are kept.

    Parameters
    ----------
    n_features : int or None, default=None
        Number of columns to keep. ``None`` resolves at fit time to
        ``max(int(sqrt(n_columns)), 3)``.
    n_ways : {2, 3}, default=3
        Number of columns combined in each ANOVA model. The same number is
        passed to ``anova_lm`` as ``typ`` (2 for two-way, 3 for three-way).

    Attributes
    ----------
    features : list
        Selected column names, best first. Empty until ``fit`` is called.
    n_features_ : int
        Number of columns actually requested during the last ``fit``.

    Notes
    -----
    * Column names are interpolated verbatim into the model formula, so they
      must be usable as formula identifiers (no spaces, dots, operators).
    * One OLS model is fitted per column combination, i.e. C(n_columns, n_ways)
      models per call to ``fit``.
    * ``fit`` also writes the selected column names to
      ``CustomAnovaFeatureSelection_<n_ways>_ways_<random int>.xlsx`` in the
      current working directory.
    """

    def __init__(self, n_features=None, n_ways=3):
        super().__init__()

        self.n_features = n_features
        self.n_ways = n_ways
        self.features = []

    def fit(self, X, y=None):
        """Score every column of ``X`` against ``y`` and remember the best ones.

        Parameters
        ----------
        X : pandas.DataFrame
            Candidate features; only ``X.columns`` and column access are used.
        y : array-like
            Target values, one per row of ``X`` (matched by position).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_ways`` is not 2 or 3, if ``y`` is missing while a ranking is
            required, or if ``X`` already has a column called ``'y'``.
        """
        if self.n_ways not in (2, 3):
            raise ValueError(f"n_ways must be 2 or 3, got {self.n_ways!r}")

        columns = list(X.columns)

        # Too few columns to build a single model: keep everything. The
        # estimator itself must be returned so fit_transform / Pipeline work.
        if len(columns) < self.n_ways:
            self.features = columns
            self.n_features_ = len(columns)
            return self

        if y is None:
            raise ValueError("y is required to rank features with ANOVA")
        if 'y' in columns:
            raise ValueError("X must not contain a column named 'y'; "
                             "that name is reserved for the target in the formula")

        # Resolve the default locally: constructor parameters are left
        # untouched so cloning / refitting on a different width stays correct.
        if self.n_features is None:
            n_keep = max(int(np.sqrt(len(columns))), 3)
        else:
            n_keep = self.n_features
        self.n_features_ = n_keep

        scores = self._min_p_values(X, y, columns)

        # sorted() is stable, so ties keep the original column order; columns
        # without any finite p-value sort last.
        ranked = sorted(columns, key=lambda name: scores.get(name, float('inf')))
        self.features = ranked[:n_keep]

        self._export_selection()
        return self

    def _min_p_values(self, X, y, columns):
        """Return ``{column: smallest finite p-value seen}`` over all combinations."""
        from itertools import combinations

        # Put the target into the frame explicitly instead of relying on the
        # formula machinery picking up a local variable from the call stack.
        data = X.copy()
        data['y'] = np.asarray(y)

        best = {}
        for combo in combinations(columns, self.n_ways):
            # Later column first, matching the term order of the formula
            # 'y ~ a + b + c + a:b + a:c + b:c + a:b:c'.
            factors = combo[::-1]
            groups = [
                group
                for size in range(1, self.n_ways + 1)
                for group in combinations(factors, size)
            ]
            formula = 'y ~ ' + ' + '.join(':'.join(group) for group in groups)

            model = ols(formula, data=data).fit()
            p_values = sm.stats.anova_lm(model, typ=self.n_ways)['PR(>F)']

            for group in groups:
                p_value = p_values.loc[':'.join(group)]
                # NaN would make min()/sorted() order-dependent; ignore it.
                if np.isnan(p_value):
                    continue
                for name in group:
                    if p_value < best.get(name, float('inf')):
                        best[name] = p_value
        return best

    def _export_selection(self):
        """Write a 'Features' header followed by the selected columns to an xlsx file."""
        suffix = randint(1, 10000)
        path = f'CustomAnovaFeatureSelection_{self.n_ways}_ways_{suffix}.xlsx'
        with xlsxwriter.Workbook(path) as workbook:
            worksheet = workbook.add_worksheet()
            worksheet.write_column(0, 0, ['Features', *self.features])

    def transform(self, X, y=None):
        """Return ``X`` restricted to the columns chosen by ``fit``."""
        return X[self.features]

In [7]:
def clean_features(X):
    """Rename the columns of ``X`` by deleting every '.' and ' ' from each label.

    Builds an ``{old_label: new_label}`` mapping from ``X.columns`` and returns
    whatever ``X.rename(mapping, axis=1)`` returns.
    """
    def _strip(label):
        return label.replace('.', '').replace(' ', '')

    mapping = {}
    for label in X.columns:
        mapping[label] = _strip(label)
    return X.rename(mapping, axis=1)

In [8]:
# load_data() supplies the frame plus the names used below to pick the feature
# columns and the target column (only target_feature[0] is used here).
df_preprocessed, features, target_feature = load_data()

# Outer split: 25% of the rows go to X_out / y_out, stratified on the target.
# X and Y are the part used for model development in the cells below.
X, X_out, Y, y_out = train_test_split(
    df_preprocessed[features],
    df_preprocessed[target_feature[0]],
    test_size=0.25,
    random_state=random_state,
    stratify=df_preprocessed[target_feature[0]],
)






T2


pcl1
PCL1_Broad
PCL1_Strict
(array([ 20, 502, 585, 688], dtype=int64),)


In [None]:

for i in [random_state]:
    # Inner split of the development data: 20% is held out for the final
    # ROC-AUC check, the rest is used for the cross-validated grid search.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, stratify=Y, random_state=i
    )
    cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=i)

    # Project-specific preprocessing of both parts (see preprocessing.cv_preprocessing).
    X_train, X_test = cv_preprocessing(X_train, X_test, i)

    # Strip '.' and ' ' from column names; the ANOVA selector interpolates
    # column names into a model formula.
    X_train = clean_features(X_train)
    X_test = clean_features(X_test)

    pipe = Pipeline(steps=[
        ('rfe', CustomAnovaFeatureSelection(n_features=15)),
        ('classifier', CatBoostClassifier(verbose=0, random_state=i)),
    ])

    # Single-point grid. Other values tried earlier:
    # class_weights [1, 15] / [1, 30], l2_leaf_reg 50, depth 9.
    grid_params = [{
        'classifier__class_weights': [[1, 10]],
        'classifier__l2_leaf_reg': [150],
        'classifier__depth': [6],
    }]

    clf = GridSearchCV(pipe, grid_params, cv=cv, scoring='roc_auc')
    clf.fit(X_train, y_train.values.astype(int))
    print(f"i = {i}, roc_auc = {clf.best_score_}, params = {clf.best_params_}")

    # Score the refitted best pipeline on the inner hold-out part.
    y_pred_target = clf.best_estimator_.predict_proba(X_test)[:, 1]
    print(f"holdout i = {i}, roc_auc = {roc_auc_score(y_test.astype(int), y_pred_target)}")

    # Persist the CatBoost feature importances of the best pipeline.
    clf.best_estimator_['classifier'].get_feature_importance(prettified=True).to_csv(f"tmp_{i}.csv")

2


  F /= J
  cond2 = cond0 & (x <= _a)








