In [6]:
# ADHD Classification Based on Activity
# Purpose: Determine whether activity can predict diagnosis of ADHD
# Author: Alexander Maksiaev

In [7]:
# Housekeeping

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tsfresh import extract_features
from tsfresh import extract_relevant_features
from tsfresh import select_features
import glob
import os

# Directories
main_dir = "c:/Users/maksi/Documents/Statistics/Projects/Movement_Mental_Health/"

adhd_dir = os.path.join(main_dir, "hyperaktiv/")
activity_dir = os.path.join(adhd_dir, "activity_data/")
hrv_dir = os.path.join(adhd_dir, "hrv_data/")
controls_dir = os.path.join(adhd_dir, "hyperaktiv_with_controls/hyperaktiv_with_controls/")
activity_dir_controls = os.path.join(controls_dir, "activity_data/")

os.chdir(adhd_dir)

# Main files
patient_info = pd.read_csv("patient_info.csv", delimiter=";")
features = pd.read_csv("features.csv", delimiter=";")
patient_info
# print(features)

os.chdir(activity_dir)
activity_data_01 = pd.read_csv("patient_activity_01.csv", delimiter=";")
activity_data_01["ID"] = 1
activity_data_01

# os.chdir(hrv_dir)
# hrv_data = pd.read_csv("patient_hr_1.csv", delimiter=";")
# hrv_data

Unnamed: 0,TIMESTAMP,ACTIVITY,ID
0,02-23-2009 16:00,0,1
1,02-23-2009 16:01,195,1
2,02-23-2009 16:02,240,1
3,02-23-2009 16:03,209,1
4,02-23-2009 16:04,202,1
...,...,...,...
8631,03-01-2009 15:51,0,1
8632,03-01-2009 15:52,0,1
8633,03-01-2009 15:53,0,1
8634,03-01-2009 15:54,131,1


In [8]:
# Data initialization

os.chdir(controls_dir)

controls_info = pd.read_csv("patient_info.csv", delimiter=";") # Includes patients as well
# print(controls_info)

adhd_patients = controls_info[controls_info["ADHD"] == 1.0] # ADHD only
# print(adhd_patients)
# print(controls_info.columns)

controls = controls_info[np.sum(controls_info.loc[:, "ADHD":"OTHER"], axis=1) == 0] # Neurotypicals only
non_adhd_controls = controls_info[controls_info["ADHD"] == 0] # Non-ADHD only
# print(controls)
# print(non_adhd_controls)


In [20]:
# Concat all activity dataframes

os.chdir(activity_dir_controls)

csv_files = glob.glob("*.csv")

# Create an empty dataframe to store the combined data
mega_activity_df = pd.DataFrame()

# Loop through each CSV file and append it to the combined dataframe
for file in csv_files:
    file_name_pieces = str.split(file, sep="_")
    pt_id = file_name_pieces[-1][:-4] # Last part of the file name is patient id, without the .csv
    df = pd.read_csv(file, delimiter=";")
    df["ID"] = int(pt_id)
    mega_activity_df = pd.concat([mega_activity_df, df], ignore_index=True)

mega_activity_df
print(len(mega_activity_df))
mega_activity_df = mega_activity_df.dropna()

print(len(mega_activity_df[mega_activity_df["ID"] <= 5]))

1494322
82323


In [25]:
# Pre-processing: Activity

# Pts with motor activity recordings
active_controls = controls_info[controls_info["ACC"] == 1]
# print(active_controls)

# Each pt has their own csv file...
# demographics_01 = active_controls[active_controls["ID"] == 1]
y = pd.Series(active_controls["ADHD"], index=active_controls["ID"])
y = y.fillna(0)
# y["ADHD"] = active_controls["ADHD"]
# y.index = active_controls["ID"]
# y["ID"] = y.index
# print(y)
# print(list(y.index))
# print(demographics_01)
# print(activity_data_01)
print(y.loc[1:10])

# activity_data_01.plot(figsize=(10,10))
# plt.show()

# Find relevant features using tsfresh
relevant_features = extract_relevant_features(mega_activity_df[mega_activity_df["ID"] <= 5], y.loc[1:5], column_id='ID', column_sort='TIMESTAMP')

relevant_features.head()

# X_full_train, X_full_test, y_train, y_test = train_test_split(relevant_features, y, test_size=.3, random_state=42)
# X_filtered_train = select_features(X_full_train, y_train)
# X_filtered_train.head()

ID
1     1.0
2     1.0
3     0.0
5     0.0
7     0.0
8     0.0
9     1.0
10    0.0
Name: ADHD, dtype: float64


Feature Extraction:   0%|          | 0/8 [00:38<?, ?it/s]


MemoryError: Unable to allocate 1.67 GiB for an array with shape (8634, 8634, 3) and data type int64

In [66]:
# Splitting into training set and testing set

# Don't let the machine know which ones are ADHD and which ones aren't
X = controls_info.drop("ADHD", axis=1)
y = controls_info["ADHD"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify = y, random_state=2025)
