In [28]:
from catboost import CatBoostClassifier
from scipy import stats
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from fancyimpute import IterativeImputer
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier

from sklearn.model_selection import GridSearchCV

random_state = 7656
# seed
# import os
# import random
# os.environ['PYTHONHASHSEED']=str(271828)
# random.seed(271828)
# np.random.seed(271828)


In [29]:
from model_from_last_year.tmp.load_data import load_data, stds, stats, removal

In [30]:
df_preprocessed, features, target_feature = load_data()

(array([ 20, 502, 585, 688], dtype=int64),)


KeyError: "['t2_missing', 't1_missing'] not in index"

In [None]:
df_preprocessed[target_feature].count()

## cut off the test set

In [31]:
X, X_out, Y, y_out = train_test_split(df_preprocessed[features], df_preprocessed[target_feature[0]],\
                                      test_size=0.25, random_state=random_state,\
                                      stratify=df_preprocessed[target_feature[0]])


## the Model

## CV model of roc auc 

In [6]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


In [7]:

for i in range(1):
    X_train, X_test, y_train, y_test = train_test_split(X, Y,  random_state=random_state, test_size=0.2, stratify=Y)
    cv = StratifiedKFold(4)

    mice = IterativeImputer()
    X_train = pd.DataFrame(mice.fit_transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(mice.transform(X_test), columns=X_test.columns)
                    
    X_train = stds(X_train)
    X_test = stds(X_test)
    
    X_train = stats(X_train)
    X_test = stats(X_train, X_test)

    X_train = removal(X_train)
    X_test = removal(X_test)
    
    pipe = CatBoostClassifier(verbose=0, random_state=random_state, class_weights=[1, 25], l2_leaf_reg=100)
        
    grid_params = [{
        'class_weights': [1, 25],
        'l2_leaf_reg':100}]

    clf = GridSearchCV(pipe, grid_params, cv=cv, scoring='roc_auc')
    clf.fit(X_train, y_train.values.astype(int), early_stopping_rounds=15)
    print(f"i = {i}, roc_auc = {clf.best_score_}, params = {clf.best_params_}")
    y_pred_target = clf.best_estimator_.predict_proba(X_test)[:, 1]
    print(f"holdout i = {i}, roc_auc = {roc_auc_score(y_test.astype(int), y_pred_target)}")

ValueError: Parameter grid for parameter (l2_leaf_reg) needs to be a list or numpy array, but got (<class 'int'>). Single values need to be wrapped in a list with one element.

In [None]:
roc_auc_score(y_test.astype(int), y_pred_target)

In [None]:
X.shape

In [12]:
X_out.shape

(177, 187)

In [26]:
mice = IterativeImputer()
X = pd.DataFrame(mice.fit_transform(X), columns=X.columns)
X_out = pd.DataFrame(mice.transform(X_out), columns=X_out.columns)

In [27]:
X = stds(X)
X_out = stds(X_out)

KeyError: "None of [Index(['phq1', 'lot1', 'trait1', 'state1', 'PCL1', 'PCL_Broad1', 'PCL_Strict1',\n       'active_coping1', 'planning1', 'positive_reframing1', 'acceptance1',\n       'humor1', 'religion1', 'emotional_support1', 'instrumental_support1',\n       'self_distraction1', 'denial1', 'venting1', 'substance_use1',\n       'behavioral_disengagement1', 'self_blame1', 'q6.1_INTRU_pcl1',\n       'q6.2_DREAM_pcl1', 'q6.3_FLASH_pcl1', 'q6.4_UPSET_pcl1',\n       'q6.5_PHYS_pcl1', 'q6.6_AVTHT_pcl1', 'q6.7_AVSIT_pcl1',\n       'q6.8_AMNES_pcl1', 'q6.9_DISINT_pcl1', 'q6.10_DTACH_pcl1',\n       'q6.11_NUMB_pcl1', 'q6.12_FUTRE_pcl1', 'q6.13_SLEEP_pcl1',\n       'q6.14_ANGER_pcl1', 'q6.15_CONC_pcl1', 'q6.16_HYPER_pcl1',\n       'q6.17_STRTL_pcl1'],\n      dtype='object')] are in the [columns]"

In [8]:
X = stats(X)
X_out = stats(X, X_out)

In [9]:
# X = removal(X)
# X_out = removal(X_out)

In [20]:
pipe = Pipeline(steps=[
            #('rfe', RFE(ExtraTreesClassifier(n_estimators=100), n_features_to_select = 20)),
    ('classifier', CatBoostClassifier(verbose=0, class_weights=[1, 25], l2_leaf_reg=100, random_state=random_state))
]) 
feats = ["outliers_count_t1_25", "delta_self_distraction", 
         "cooks_d_intrusion_pcl2", "delta_denial", "delta_q6.11_NUMB_pcl",
         "delta_q6.8_AMNES_pcl", "delta_trait", "standard_resid_intrusion_pcl2",
         "delta_phq", "delta_state", "delta_q6.13_SLEEP_pcl", "delta_q6.3_FLASH_pcl",
         "highschool_diploma", "outliers_count_t1_35"]
pipe = pipe.fit(X, Y.values.astype(int))

In [22]:
pipe['classifier'].get_feature_importance(prettified=True).to_csv("tmp.csv")

In [23]:
y_pred_target= pipe.predict_proba(X_out)[:, 1]  

In [24]:

print(f"test roc_auc = {roc_auc_score(y_out, y_pred_target)}")

test roc_auc = 0.7453450164293538


In [25]:

fpr, tpr, threshold = roc_curve(y_out, y_pred_target)
roc_auc = auc(fpr, tpr)

                    # method I: plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


NameError: name 'roc_curve' is not defined