In [1]:
from catboost import CatBoostClassifier
from scipy import stats
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from fancyimpute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.model_selection import GridSearchCV
import xlsxwriter
from random import randint
random_state = 7656
from preprocessing import stds, stats, cv_preprocessing
from load_data import load_data
import os


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
df_preprocessed, features, target_feature = load_data()
target_feature

['PCL_Strict3', 'target_tred', 'target_intrusion', 'phq3']

In [3]:
trans_2009_2016_features = {
    "highschool_diploma": "bagrut",
    "dyslexia":"dyslexia", 
    "ADHD":"ADHD",
    "T1Acc1t": "Accuracy_threat_T1",
    "T1Acc1n": "Accuracy_NT_T1",
    "T1bias": "Threat_Bias_T1",
    "phq1": "PHQ_T1",
    "trait1": "Trait_T1",
    "state1": "State_T1",
    "PCL1": "PCL_T1",
    "intrusion_score": "Intrusion_T1"
    }
features = [i for i in features if i in trans_2009_2016_features.keys()]

In [4]:
features

['highschool_diploma',
 'dyslexia',
 'ADHD',
 'T1Acc1t',
 'T1Acc1n',
 'T1bias',
 'phq1',
 'trait1',
 'state1',
 'PCL1',
 'intrusion_score']

In [5]:
df_2016 = pd.read_csv(r"C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\IDF_ABM_16.2.15_wide.csv")
df_2016 = df_2016[df_2016['Group']=='control']
df_2016 = df_2016[~df_2016['PCL_T4'].isna()]

## process with 2009

In [6]:
X, X_out, Y, y_out = train_test_split(df_preprocessed[features], df_preprocessed[target_feature[0]],\
                                          test_size=0.25, random_state=random_state,\
                                          stratify=df_preprocessed[target_feature[0]])

In [7]:
cv = StratifiedKFold(6, random_state=random_state, shuffle=True)

In [8]:
pipe = Pipeline(steps=[
        ('classifier', CatBoostClassifier(verbose=0, random_state=random_state))])

In [9]:
mice = IterativeImputer(max_iter=50, random_state=random_state)
X = pd.DataFrame(mice.fit_transform(X), columns=X.columns)
X_out = pd.DataFrame(mice.transform(X_out), columns=X.columns)

In [10]:
grid_params = [{
    'classifier__class_weights':[[1, 14]],#, [1, 15], [1, 30]],
    'classifier__l2_leaf_reg': [150],# 50],
    'classifier__depth': [7, 4],#, 9]
    }]
clf = GridSearchCV(pipe, grid_params, cv=cv, scoring='roc_auc')
clf.fit(X, Y.values.astype(int), classifier__early_stopping_rounds = 15)
print(f"roc_auc = {clf.best_score_}, params = {clf.best_params_}")

roc_auc = 0.7668733538191397, params = {'classifier__class_weights': [1, 14], 'classifier__depth': 4, 'classifier__l2_leaf_reg': 150}


In [11]:
y_pred_target = clf.best_estimator_.predict_proba(X_out)[:, 1]
print( f"roc_auc = {roc_auc_score(y_out.astype(int), y_pred_target)}")

roc_auc = 0.8027777777777778


## train 2009

In [12]:
X_train, y_train = df_preprocessed[features], df_preprocessed[target_feature[0]]


In [13]:
trans_2016_2009_features = {trans_2009_2016_features[i]: i for i in trans_2009_2016_features.keys()}
df_2016 = df_2016.rename(trans_2016_2009_features, axis=1)

In [14]:
X_test, y_test = df_2016[features], df_2016['PCL_T4'] > 49

In [15]:
X_test['highschool_diploma'] = X_test['highschool_diploma'] == 'yes'
X_test['dyslexia'] = X_test['dyslexia'] == 'yes'
X_test['ADHD'] = X_test['ADHD'] == 'yes'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
mice = IterativeImputer(max_iter=50, random_state=random_state)
X_train = pd.DataFrame(mice.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(mice.transform(X_test), columns=X_test.columns)

In [19]:
clf = GridSearchCV(pipe, grid_params, cv=cv, scoring='roc_auc')

clf.fit(X_train, y_train, classifier__early_stopping_rounds = 15)
print(f"roc_auc = {clf.best_score_}, params = {clf.best_params_}")

roc_auc = 0.7955097794423637, params = {'classifier__class_weights': [1, 14], 'classifier__depth': 7, 'classifier__l2_leaf_reg': 150}


In [20]:
y_pred_target = clf.best_estimator_.predict_proba(X_test)[:, 1]
print( f"roc_auc = {roc_auc_score(y_test.astype(int), y_pred_target)}")

roc_auc = 0.8064516129032258


In [1]:
for i, j in zip(X_train.columns, clf.best_estimator_['rfe'].ranking_):
    

NameError: name 'X_train' is not defined