In [1]:
import torch
import librosa
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import json

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [2]:
train_df = pd.read_csv('../datasets/addresso/train/meta_data.csv')
train_df['path'] = '../datasets/addresso/train/whisper_med_transcript_processed/' + train_df['ID'] + '.txt'
train_df = train_df.dropna()

train_df.head()

Unnamed: 0,ID,age,gender,mmse,class,path
1,S002,62,female,30.0,0,../datasets/addresso/train/whisper_med_transcr...
2,S003,69,female,29.0,0,../datasets/addresso/train/whisper_med_transcr...
3,S004,71,female,30.0,0,../datasets/addresso/train/whisper_med_transcr...
4,S005,74,female,30.0,0,../datasets/addresso/train/whisper_med_transcr...
5,S006,67,female,29.0,0,../datasets/addresso/train/whisper_med_transcr...


In [8]:
vocab = []

def path_to_features(path):
    with open(path, 'r') as f:
        text = f.read()
    
    words = text.split(' ')

    print(path)
    print(words)

    # bag of words
    features = np.zeros(5000)
    for word in words:
        if word in vocab:
            features[vocab.index(word)] += 1
        else:
            vocab.append(word)
            features = np.append(features, 1)

    # number of words
    features = np.append(features, len(words))

    return features

In [9]:
train_df['features'] = train_df['path'].apply(path_to_features)

X = np.stack(train_df['path'].apply(path_to_features))
y = np.array(train_df['class'])

../datasets/addresso/train/whisper_med_transcript_processed/S002.txt
['the', 'next', 'door', 'sh', 'the', 'door', 'light', 'of', 'the', 'window', 'which', 'is', 'the', 'one', 'like', 'to', 'look', 'at', 'the', 'window', 'its', 'really', 'the', 'list', 'by', 'the', 'door', 'and', 'one', 'of', 'the', 'same', 'time']
../datasets/addresso/train/whisper_med_transcript_processed/S003.txt
['okay', 'there', 'is', 'a', 'little', 'boy', 'and', 'hes', 'getting', 'hes', 'standing', 'on', 'a', 'stool', 'thats', 'upsetting', 'and', 'hes', 'getting', 'a', 'cookie', 'and', 'hes', 'sharing', 'a', 'cookie', 'with', 'the', 'little', 'girl', 'who', 'is', 'pushing', 'her', 'mother', 'with', 'her', 'finger', 'and', 'reaching', 'for', 'the', 'cookie', 'at', 'the', 'same', 'time', 'what', 'are', 'the', 'instructions', 'just', 'give', 'as', 'many', 'details', 'like', 'action', 'oh', 'okay', 'the', 'cupboard', 'door', 'is', 'open', 'the', 'stool', 'is', 'crooked', 'the', 'water', 'is', 'splashing', 'on', 'the',

In [5]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = xgb.XGBClassifier()
pred_df = pd.DataFrame()

# 3. Calculate AUC for each fold
auc_scores = []
for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]  # Probability of positive class
    auc = roc_auc_score(y_test, y_proba)
    auc_scores.append(auc)

    val_df = train_df.iloc[test_index].copy()
    val_df.loc[:, f'pred'] = y_proba
    pred_df = pd.concat([pred_df, val_df])

# 4. Summarize Results
print("Individual AUC scores:", auc_scores)
print("Mean AUC:", sum(auc_scores) / len(auc_scores))

Individual AUC scores: [0.7024793388429752, 0.7851239669421487, 0.9363636363636363, 0.9272727272727272, 0.8909090909090909]
Mean AUC: 0.8484297520661157


In [6]:
pred_df.to_csv('pred_df_xgboost.csv')

In [7]:
clf = SVC(kernel='linear')

In [8]:
scores = cross_val_score(clf, X_train, y_train, cv=5)

# Test

In [9]:
print(np.mean(scores))

0.7326797385620915
