In [None]:
#Authors: Tlachac, et al
#Paper: "Automated Construction of Lexicons to Improve Depression Screening with Text Messages"

import argparse
import collections
import operator
import os
import random
from statistics import mean

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy import stats
from sklearn import metrics
from sklearn import preprocessing
from sklearn import svm
from sklearn import utils
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA, KernelPCA, NMF
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample

In [None]:
# --- configuration -----------------------------------------------------------
name = "featuresCombined5r"   # base name of the input CSV (also reused for the results file)
label = "scores"              # column holding the raw score; overwritten below with a 0/1 label
split = 10                    # rows with a score >= split are labelled 1, all others 0
numTexts = 5                  # minimum value of the "NumTexts" column for a row to be kept

# --- load --------------------------------------------------------------------
data = pd.read_csv(name + ".csv")
print(data.shape)

# --- binary labels -----------------------------------------------------------
# Vectorised replacement for the former row-by-row loop. The score is cast to
# int first (truncation toward zero) so the comparison matches `int(score) >= split`.
data[label] = (data[label].astype(int) >= split).astype("int64")

# --- keep only rows with enough texts -----------------------------------------
data = data[data["NumTexts"] >= numTexts]
print(data.shape)

data.head()

In [None]:
# Repeated train/test experiment:
#   for each of 100 random stratified 70/30 splits
#     -> min-max scale the features (scaler fitted on the training rows only)
#     -> chi-squared selection of the top k features, k = 1..10
#     -> upsample the minority class of the training set
#     -> fit every classifier in `modelTypelist` and score it on the test rows
# One result row is stored per (run, k, model) and the table is written to
# results/<name><split>.csv.

featureEs = ["Chi"]
modelTypelist = ["NB","LR1","SVC1", "SVC2", "SVC3", "SVC4", "kNN3", "kNN5"]


def _build_classifier(modelType, seed):
    """Return a new, unfitted classifier for the given model-type code.

    Parameters:
        modelType: one of the codes listed in `modelTypelist`.
        seed: random_state passed to the estimators that accept one.

    Raises:
        ValueError: if `modelType` is not a known code (previously an unknown
            code silently reused the classifier from the preceding iteration).
    """
    builders = {
        "SVC1": lambda: svm.SVC(kernel='rbf', random_state=seed),
        "SVC2": lambda: svm.SVC(kernel='linear', random_state=seed),
        "SVC3": lambda: svm.SVC(kernel='poly', random_state=seed),
        "SVC4": lambda: svm.SVC(kernel='sigmoid', random_state=seed),
        "kNN3": lambda: KNeighborsClassifier(n_neighbors=3),
        "kNN5": lambda: KNeighborsClassifier(n_neighbors=5),
        "LR1": lambda: LogisticRegression(random_state=seed),
        "NB": lambda: GaussianNB(),
    }
    if modelType not in builders:
        raise ValueError("Unknown model type: " + str(modelType))
    return builders[modelType]()


def _upsample_minority(train_df):
    """Balance a training frame by resampling its minority class with replacement.

    `train_df` must contain a 0/1 column named "target". The class with fewer
    rows is resampled (fixed random_state=42) up to the size of the other class;
    on a tie, class 1 is treated as the majority. Returns the majority rows
    followed by the upsampled minority rows.
    """
    ones = int((train_df["target"] == 1).sum())
    zeros = int((train_df["target"] == 0).sum())
    majority, minority = (1, 0) if ones >= zeros else (0, 1)

    train_majority = train_df[train_df["target"] == majority]
    train_minority = train_df[train_df["target"] == minority]

    train_minority_upsampled = resample(train_minority,
                                        replace=True,                     # sample with replacement
                                        n_samples=len(train_majority),    # to match majority class
                                        random_state=42)                  # reproducible results
    return pd.concat([train_majority, train_minority_upsampled])


def _score_predictions(y_true, y_pred):
    """Return (f1, accuracy, TP, TN, FP, FN) for binary 0/1 labels.

    The confusion matrix is pinned to labels [0, 1] so it is always 2x2.
    F1 is computed as 2*TP / (2*TP + FP + FN), which is algebraically the same
    as 2*precision*recall / (precision + recall) but is defined (as 0) when
    there are no true positives, instead of evaluating 0/0 to NaN.
    """
    conf_mat = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP, FN, TP = conf_mat.ravel()
    f1_denominator = 2*TP + FP + FN
    f1 = (2*TP)/f1_denominator if f1_denominator > 0 else 0.0
    accuracy = (TP+TN)/(TN+TP+FP+FN)
    return f1, accuracy, TP, TN, FP, FN


#create lists to populate
flist = []
mlist = []
llist = []
featureList = []
f1List = []
accuracyList = []
truePosList = []
trueNegList = []
falsePosList = []
falseNegList = []
predictions = []
rseed = []

# Create the output folder up front so a missing directory cannot discard the
# results after the whole experiment has already run.
os.makedirs("results", exist_ok=True)

# NOTE(review): everything from the sixth column onward is used as a feature;
# this assumes the first five columns hold id/label/metadata only — confirm
# that the label column is not among the features.
featureColumns = data.columns[5:]

for run in range(0,100):
    r = random.randint(0, 100000)

    #train/test split
    df_train, df_test = train_test_split(data, test_size=0.3, stratify=data[[label]], random_state = r)
    trainids = list(df_train["id"])
    testids = list(df_test["id"])
    # re-select by id so both subsets keep the row order of `data`
    testdata = data[data['id'].isin(testids)]
    traindata = data[data['id'].isin(trainids)]
    print("run", run, "seed", r, "train", traindata.shape, "test", testdata.shape)

    #limit to features
    testContent = testdata[featureColumns]
    trainContent = traindata[featureColumns]

    #NEED TO SCALE BEFORE FEATURE SELECTION/REDUCTION
    # the scaler is fitted on the training rows only and then applied to the test rows
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(trainContent)
    featureSubset = pd.DataFrame(np_scaled)
    np_scaled2 =  min_max_scaler.transform(testContent)
    testSubset = pd.DataFrame(np_scaled2)

    target = list(traindata[label])
    y_test = list(testdata[label])   # invariant across feature counts and models

    for featureE in featureEs:

        nFeatureList = []
        featureDF = []
        testDFs = []

        if featureE == "Chi":
            nFeatureList = list(range(1, 11))

            for numberOfFeatures in nFeatureList:
                chisetup = SelectKBest(chi2, k=numberOfFeatures)
                chisetup = chisetup.fit(featureSubset, target)
                featureSubset2 = chisetup.transform(featureSubset)
                featureSubset2 = pd.DataFrame(featureSubset2).assign(target = target)
                featureDF.append(featureSubset2)
                testSubset2 = chisetup.transform(testSubset)
                testDFs.append(pd.DataFrame(testSubset2))

        # iterate with the actual k so the recorded feature count does not
        # depend on the list position happening to equal k - 1
        for numberOfFeatures, train_phq9, X_test in zip(nFeatureList, featureDF, testDFs):

            # upsample the minority class of the training set only
            train_phq9 = _upsample_minority(train_phq9)

            #separate features and target
            y_train = train_phq9["target"]
            X_train = train_phq9.drop(columns = "target")

            for modelType in modelTypelist:

                #train model and make predictions
                clf = _build_classifier(modelType, r)
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)

                #evaluate model
                f1, accuracy, TP, TN, FP, FN = _score_predictions(y_test, y_pred)

                #populate lists with results (all at once, so the lists can
                #never end up with different lengths if a model fails)
                llist.append(label)
                featureList.append(numberOfFeatures)
                flist.append(featureE)
                mlist.append(modelType)
                f1List.append(f1)
                accuracyList.append(accuracy)
                truePosList.append(TP)
                trueNegList.append(TN)
                falsePosList.append(FP)
                falseNegList.append(FN)
                predictions.append(y_pred)
                rseed.append(r)

resultsDF = pd.DataFrame()
resultsDF["label"] = llist
resultsDF["Engineering"] = flist
resultsDF["model"] = mlist
resultsDF["nFeatures"] = featureList
resultsDF["F1"] = f1List
resultsDF["Accuracy"] = accuracyList
resultsDF["truePos"] = truePosList
resultsDF["trueNeg"] = trueNegList
resultsDF["falsePos"] = falsePosList
resultsDF["falseNeg"] = falseNegList
# NOTE(review): each entry is a numpy array and is written to the CSV via its
# text form; numpy abbreviates long arrays with "..." by default, so check the
# file is complete if the test set is large.
resultsDF["predictions"] = predictions
resultsDF["randomSeed"] = rseed

resultsDF.to_csv("results/" + name + str(split) + ".csv")

In [None]:
# Preview the first five rows of the results table as a quick sanity check.
resultsDF.head(n=5)