In [1]:
#author: Ermal Toto
#edited by ML Tlachac

import pandas as pd
import numpy as np
import collections
import operator
import argparse
import random
import xgboost as xgb
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn import svm
from statistics import mean 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn import utils
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# Draw one random integer seed per repetition: every model configuration is
# evaluated 100 times, each time with a different seed taken from this list.
# Seeds are drawn from the inclusive range [0, 1000000000].
rlist = [random.randint(0, 1000000000) for _ in range(100)]
count = len(rlist)  # number of seeds generated

In [None]:
# Experiment driver.
# For every random seed in rlist and every dataset file: build a binary target
# from the last column, optionally balance the classes by downsampling,
# min-max scale the features, cross-validate the configured classifier and
# record one entry of settings and metrics per run in the lists below.

# --- per-run result accumulators (exactly one entry appended per completed run) ---
resultsDF = pd.DataFrame()
fileList = []
modelList = []
resampleList = []
nFeaturesList = []
phq9cutList = []
kernelNeighborList = []
precisionList = []
recallList = []
f1List = []
accuracyList = []
truePosList = []
trueNegList = []
falsePosList = []
falseNegList = []
randoms = []

# --- configuration ---
verbose = False #when True, print the selected feature columns and a target preview for each run
dataStart = 3 #first column position (inclusive) used as a feature
dataEnd = 51 #last column position (exclusive) used as a feature
targetData = -1 #last feature used as class variable. 
targetDataCount = 1
folds = 5
scoring = ['precision', 'recall', 'f1','accuracy']
scoref = ['f1']
missingValues = -999 #value substituted for NaN; set to "Remove" to drop instances with missing values instead
printResultHeader = "True"
parser = argparse.ArgumentParser()
modelType = "kNN" #SVC, RF, kNN, XG, LR, NB, ADA
numNeighborsList = [5]
svckernelList = ["poly"]
cutoff = 10 #phq-9 cutoff for prediction
resampleType = "down" #if anything else, no data balancing occurs
numberOfFeaturesList = [10] #if doing feature selection
doFeatureSelection = "False" #compared against the string "True" below
datasets = ["nTBdf.csv", "tTBdf.csv"]
newScores = []

rawData = {} #cache so each CSV is parsed once instead of once per seed

for r in rlist:
    random.seed(r)
    for d in datasets:
        filename = d
        if filename not in rawData:
            rawData[filename] = pd.read_csv(filename)
        for f in numberOfFeaturesList:
            # never ask for more features than the feature slice contains
            numberOfFeatures = min(f, dataEnd - dataStart)
            for i in range(0,1): #change range for methods with more values in numNeighborsList

                data = rawData[filename]

                #remove unwanted column if needed
                if "n_nChar" in data.columns:
                    unwantedColumn = "n_nChar"
                elif "t_nChar" in data.columns:
                    unwantedColumn = "t_nChar"
                else:
                    unwantedColumn = "k_nChar"
                # drop() returns a new frame, so the cached frame is not modified.
                # reset_index() without drop=True inserts the old index as a leading
                # "index" column; dataStart/dataEnd are positions in that layout.
                data = data.drop(columns = [unwantedColumn]).reset_index()

                featureSubset = data[data.columns[dataStart:dataEnd]] #Skip PHQ-9 Responses 
                target = data[data.columns[targetData]]
                if verbose:
                    print(featureSubset.columns)
                    print(target.head())
                featureSubset = featureSubset.assign(target = target)

                # Missing values: either drop incomplete rows or fill with a constant.
                # NOTE(review): the original test was `missingValues == '-999'`, which an
                # int -999 never satisfies, so -999 has always been used as a fill value.
                # That behaviour is kept; confirm whether -999 was meant to drop rows.
                if missingValues in ("Remove", "-999"):
                    featureSubset = featureSubset.dropna()
                else:
                    featureSubset = featureSubset.fillna(missingValues)

                # Binarise the target (last column): 1 if strictly above the cutoff, else 0.
                # NOTE(review): a score equal to `cutoff` is labelled 0, and a missing
                # target filled with a value below the cutoff is also labelled 0 — confirm.
                targetColumn = featureSubset.columns[-1]
                featureSubset[targetColumn] = np.where(featureSubset[targetColumn] > cutoff, 1, 0)

                if resampleType == "down":
                    # most_common() orders classes by frequency and always yields distinct
                    # keys, so a perfectly balanced target no longer selects the same class
                    # as both majority and minority.
                    classCounts = collections.Counter(featureSubset[targetColumn]).most_common()
                    if len(classCounts) < 2:
                        raise ValueError("Cannot balance " + str(filename) + ": target contains a single class")
                    majorityKey = classCounts[0][0]
                    minorityKey, minorityCount = classCounts[-1]

                    #Separate minority and majority classes
                    featureSubset_majority = featureSubset[featureSubset[targetColumn] == majorityKey]
                    featureSubset_minority = featureSubset[featureSubset[targetColumn] == minorityKey]

                    # Downsample the majority class, without replacement, to the minority size
                    featureSubset_majority_downsampled = resample(featureSubset_majority,
                                                     replace=False,
                                                     n_samples=minorityCount,
                                                     random_state=r) # reproducible results

                    # Combine downsampled majority with the minority class and shuffle;
                    # the shuffle is seeded so a recorded seed reproduces the run.
                    featureSubset = pd.concat([featureSubset_majority_downsampled, featureSubset_minority])
                    featureSubset = featureSubset.sample(frac=1, random_state=r).reset_index(drop=True)

                # separate target from features
                target = featureSubset[targetColumn]
                featureSubset = featureSubset[featureSubset.columns[:targetDataCount*-1]]

                # Scale data between 0 and 1. Several algorithms including feature selection need this.
                # NOTE(review): the scaler (and the feature selector below) are fitted on all
                # rows before cross-validation, so test folds influence preprocessing.
                min_max_scaler = preprocessing.MinMaxScaler()
                np_scaled = min_max_scaler.fit_transform(featureSubset)
                featureSubset = pd.DataFrame(np_scaled)

                # Feature Selection
                if doFeatureSelection == "True":
                    featureSubset = SelectKBest(chi2, k=numberOfFeatures).fit_transform(featureSubset, target)

                # Build the classifier; kernelNeighbor is the model-specific setting that is
                # recorded alongside the metrics.
                # NOTE(review): for RF and XG max_depth is the loop index i, which is 0 with
                # range(0,1) — confirm the intended depth before using those models.
                if modelType == "SVC":
                    kernelNeighbor = svckernelList[i]
                    clf = svm.SVC(kernel=kernelNeighbor, C=1, random_state=0)
                elif modelType == "RF":
                    kernelNeighbor = i
                    clf = RandomForestClassifier(n_estimators=100, max_depth=i,random_state=0)
                elif modelType == "kNN":
                    kernelNeighbor = numNeighborsList[i]
                    clf = KNeighborsClassifier(n_neighbors=kernelNeighbor)
                elif modelType == "XG":
                    kernelNeighbor = i
                    clf = xgb.XGBClassifier(max_depth=i)
                elif modelType == "LR":
                    kernelNeighbor = "NA"
                    clf = LogisticRegression(random_state=r)
                elif modelType == "NB":
                    kernelNeighbor = "NA"
                    clf = GaussianNB()
                elif modelType == "ADA":
                    kernelNeighbor = "NA"
                    clf = AdaBoostClassifier(n_estimators=100, random_state=r)
                else:
                    raise ValueError("Unknown modelType: " + str(modelType))

                scores = cross_validate(clf, featureSubset, target, scoring=scoring,cv=folds, return_train_score=False)
                y_pred = cross_val_predict(clf, featureSubset, target, cv=folds)

                # Rows are true labels and columns are predictions; with labels=[0, 1]
                # the flattened order is TN, FP, FN, TP (class 1 is the positive class).
                TN, FP, FN, TP = confusion_matrix(target, y_pred, labels=[0, 1]).ravel()
                precision = mean(scores['test_precision'])
                sensitivity = mean(scores['test_recall'])
                f1 = mean(scores['test_f1'])
                accuracy = mean(scores['test_accuracy'])

                # Append everything in one place so all lists stay the same length
                # even if a run fails part-way through.
                randoms.append(r)
                fileList.append(d)
                modelList.append(modelType)
                resampleList.append(resampleType)
                nFeaturesList.append(f)
                phq9cutList.append(cutoff)
                kernelNeighborList.append(kernelNeighbor)
                precisionList.append(precision)
                recallList.append(sensitivity)
                f1List.append(f1)
                accuracyList.append(accuracy)
                truePosList.append(TP)
                trueNegList.append(TN)
                falsePosList.append(FP)
                falseNegList.append(FN)

In [None]:
# Assemble the per-run lists into resultsDF, one column per recorded setting or
# metric (in the order listed), then write the table to resultsDF.csv.
resultColumns = [
    ("file", fileList),
    ("model", modelList),
    ("resample", resampleList),
    ("nFeatures", nFeaturesList),
    ("phq9cut", phq9cutList),
    ("kernelNeighbor", kernelNeighborList),
    ("precision", precisionList),
    ("recall", recallList),
    ("f1List", f1List),
    ("accuracy", accuracyList),
    ("truePos", truePosList),
    ("trueNeg", trueNegList),
    ("falsePos", falsePosList),
    ("falseNeg", falseNegList),
    ("r", randoms),
]
for columnName, columnValues in resultColumns:
    resultsDF[columnName] = columnValues
resultsDF.to_csv("resultsDF.csv")