In [1]:
#Author: ML Tlachac
#Paper: Symptom Detection with Text Message Log Distributions for Holistic Depression and Anxiety Screening
#year: 2024
#github.com/mltlachac/SLOTH

import argparse
import collections
import operator
import os
import random
from statistics import mean

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy import stats
from sklearn import metrics
from sklearn import preprocessing
from sklearn import svm
from sklearn import utils
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA, KernelPCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample

In [None]:
# Load the feature table and the questionnaire answers, join them on 'id',
# and keep only rows with at least `mp` exchanges. The shape is printed after
# every step so the effect of the join and of the filter is visible.
name = "featuresALL"  # dataset tag; also part of the input file name
mp = 7                # rows with fewer than this many exchanges are dropped

featurePath = "dataCleaned/" + name + ".csv"
data = pd.read_csv(featurePath)
qs = pd.read_csv("questionsALL.csv")
print(data.shape)

# Default (inner) join: only ids present in both tables survive.
data = pd.merge(data, qs, on="id")
print(data.shape)

# Re-number the rows 0..n-1 after filtering.
enoughExchanges = data["exchanges"] >= mp
data = data.loc[enoughExchanges].reset_index(drop=True)
print(data.shape)

print(data.columns)
data.head()

In [3]:
# Leave-one-out evaluation of several classifiers on PCA-reduced features.
#
# For every label in `labellist` the raw score is turned into a 0/1 target,
# then each row of `data` is held out once: the remaining rows are min-max
# scaled, projected onto 1..5 principal components, and every model in
# `modelTypelist` is fitted and asked to predict the held-out row. One CSV per
# label is written to `resultsPaperCV/` with one row per
# (held-out row, number of components, model) and a one-hot TP/TN/FP/FN outcome.
#
# The cell never writes to `data`: the binary target lives in its own Series,
# so the raw scores survive and the cell can be re-run without reloading.
modelTypelist = ["NB","LR","SVC1", "SVC2", "kNN", "RF", "XG", "Ada"]

labellist = ['phq', "gad", 'PHQ - Q1', 'PHQ - Q2', 'PHQ - Q3', 'PHQ - Q4', 'PHQ - Q5', 'PHQ - Q6', 'PHQ - Q7', 'PHQ - Q8', 'PHQ - Q9', 'GAD - Q1', 'GAD - Q2', 'GAD - Q3', 'GAD - Q4', 'GAD - Q5', 'GAD - Q6', 'GAD - Q7']

resultsDir = "resultsPaperCV/"
resultColumns = ["label", "model", "nFeatures", "truePos", "trueNeg", "falsePos", "falseNeg"]

# Maps (predicted, actual) to the result column that receives the 1.
outcomeColumn = {(1, 1): "truePos", (0, 0): "trueNeg", (1, 0): "falsePos", (0, 1): "falseNeg"}


def positiveThreshold(label):
    """Return the raw score at or above which `label` is coded as 1.

    'phq' and 'gad' use 10, 'PHQ - Q9' uses 1, every other label uses 2.
    """
    if label in ("phq", "gad"):
        return 10
    if label == 'PHQ - Q9':
        return 1
    return 2


def buildClassifier(modelType, seed):
    """Return a new, unfitted classifier for the given `modelType` code.

    `seed` is passed as `random_state` to every model that accepts one.
    Raises ValueError for an unknown code, so a typo in `modelTypelist` can
    never silently reuse the classifier from a previous iteration.
    """
    if modelType == "SVC1":
        return svm.SVC(random_state=seed)
    if modelType == "SVC2":
        return svm.SVC(random_state=seed, kernel = "linear")
    if modelType == "kNN":
        return KNeighborsClassifier()
    if modelType == "LR":
        return LogisticRegression(random_state=seed)
    if modelType == "NB":
        return GaussianNB()
    if modelType == "RF":
        return RandomForestClassifier(random_state=seed, max_depth = 3)
    if modelType == "XG":
        return xgb.XGBClassifier(random_state=seed, max_depth = 3)
    if modelType == "Ada":
        return AdaBoostClassifier(random_state=seed)
    raise ValueError("NOT A VALID MODEL! " + repr(modelType))


# Create the output folder up front: otherwise a missing folder is only
# noticed at to_csv, after a whole label's worth of model fits.
os.makedirs(resultsDir, exist_ok=True)

# Feature columns are chosen by position: the 5 columns that sit just before
# the last 16 columns of `data`. The slice does not depend on the label or the
# fold, so it is taken once.
featureColumns = data.columns[-16-5:-16]

nFeatureList = list(np.arange(1,6,1)) #10 for all

for label in labellist:

    #binary labels (kept out of `data` so the raw scores are not overwritten)
    binaryLabel = (data[label].astype(int) >= positiveThreshold(label)).astype(int)

    print(label)
    print(int(binaryLabel.sum())/len(binaryLabel))

    records = []

    # Leave-one-out: `data[r:r+1]` is positional while `drop([r])` goes by index
    # label, so this relies on `data` having a default 0..n-1 index.
    for r in range(0, data.shape[0]):
        testdata = data[r:r+1]
        traindata = data.drop([r])

        #limit to features
        trainContent = traindata[featureColumns]
        testContent = testdata[featureColumns]

        #scale with statistics from the training rows only
        min_max_scaler = preprocessing.MinMaxScaler()
        featureSubset = pd.DataFrame(min_max_scaler.fit_transform(trainContent))
        testSubset = pd.DataFrame(min_max_scaler.transform(testContent))

        y_train = binaryLabel.drop([r]).to_numpy()
        y_test = int(binaryLabel.iloc[r])

        for numberOfFeatures in nFeatureList:
            #pca = KernelPCA(n_components=numberOfFeatures, kernel = "rbf")
            pca = PCA(n_components=numberOfFeatures).fit(featureSubset)
            X_train = pd.DataFrame(pca.transform(featureSubset))
            X_test = pd.DataFrame(pca.transform(testSubset))

            for modelType in modelTypelist:

                #train model and make predictions; the fold index is the seed
                clf = buildClassifier(modelType, r)
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)

                #evaluate model: exactly one of the four outcome columns gets a 1
                record = {
                    "label": label,
                    "model": modelType,
                    "nFeatures": int(numberOfFeatures),
                    "truePos": 0,
                    "trueNeg": 0,
                    "falsePos": 0,
                    "falseNeg": 0,
                }
                outcome = outcomeColumn.get((int(y_pred[0]), y_test))
                if outcome is None:
                    # Prediction or target was not 0/1; the row stays all-zero.
                    print("ERROR")
                else:
                    record[outcome] = 1
                records.append(record)

    # `columns=` keeps the header even if `data` had no rows.
    resultsDF = pd.DataFrame(records, columns=resultColumns)

    resultsDF.to_csv(resultsDir + name + str(mp) + label + "_ratio.csv")

phq
0.5625
gad
0.5
PHQ - Q1
0.4
PHQ - Q2
0.4375
PHQ - Q3
0.6
PHQ - Q4
0.55
PHQ - Q5
0.4375
PHQ - Q6
0.4
PHQ - Q7
0.4875
PHQ - Q8
0.2
PHQ - Q9
0.425
GAD - Q1
0.4625
GAD - Q2
0.4125
GAD - Q3
0.525
GAD - Q4
0.5
GAD - Q5
0.4
GAD - Q6
0.425
GAD - Q7
0.4375


In [4]:
# Sanity check: show which columns the positional slice used for the model
# features actually selects. Read from `data` itself rather than from
# `testdata`, which only exists as a leftover of the last training-loop
# iteration; both have the same columns.
data.columns[-16-5:-16]

Index(['quant10_ratio', 'quant25_ratio', 'quant50_ratio', 'quant75_ratio',
       'quant90_ratio'],
      dtype='object')