In [64]:
# Kaggle competition: March/April 2023

# Background

# Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression 

from sklearn.model_selection import train_test_split, cross_validate, \
cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, silhouette_score, roc_curve, auc, \
RocCurveDisplay, classification_report, ConfusionMatrixDisplay 

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, \
StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from pathlib import Path
import joblib

import xgboost
import time

import warnings
warnings.filterwarnings('ignore')

# Data Understanding

## Downloading data files and unzipping locally

In [4]:
# source for competition information and data downloads:
# https://www.kaggle.com/competitions/tlvmc-parkinsons-freezing-gait-prediction/overview

# ! unzip tlvmc-parkinsons-freezing-gait-prediction.zip

## Reading the CSV files

In [5]:
# Task metadata for the DeFOG series (see the column notes below the table)
tasks = pd.read_csv(Path('tlvmc-parkinsons-freezing-gait-prediction') / 'tasks.csv')
tasks

Unnamed: 0,Id,Begin,End,Task
0,02ab235146,10.000,190.480,Rest1
1,02ab235146,211.240,271.560,Rest2
2,02ab235146,505.880,522.400,4MW
3,02ab235146,577.960,594.640,4MW-C
4,02ab235146,701.320,715.280,MB1
...,...,...,...,...
2812,f9fc61ce85,800.586,810.714,TUG-DT
2813,f9fc61ce85,904.416,926.181,Turning-ST
2814,f9fc61ce85,959.211,989.855,Turning-DT
2815,f9fc61ce85,1087.313,1105.086,Hotspot1


"tasks.csv Task metadata for series in the defog dataset. (Not relevant for the series in the fog or daily datasets.)

- Id The data series where the task was measured.
- Begin Time (s) the task began.
- End Time (s) the task ended.
- Task One of seven task types in the DeFOG protocol, described on this page."

In [6]:
# Per-subject metadata (see the column notes below the table)
subjects = pd.read_csv(Path('tlvmc-parkinsons-freezing-gait-prediction') / 'subjects.csv')
subjects

Unnamed: 0,Subject,Visit,Age,Sex,YearsSinceDx,UPDRSIII_On,UPDRSIII_Off,NFOGQ
0,04fcdb,1.0,63,M,3.0,30.0,,0
1,05595e,1.0,56,M,8.0,28.0,,0
2,0967b2,1.0,59,M,10.0,38.0,48.0,19
3,0967b2,2.0,59,M,10.0,37.0,44.0,13
4,097078,,70,F,10.0,27.0,50.0,20
...,...,...,...,...,...,...,...,...
168,f90887,1.0,72,M,16.0,35.0,46.0,26
169,fc1e1b,1.0,82,F,11.0,38.0,42.0,21
170,fe5d84,2.0,72,M,14.0,32.0,45.0,17
171,fe5d84,1.0,72,F,14.0,13.0,33.0,15


"Metadata for each Subject in the study, including their Age and Sex as well as:

- Visit Only available for subjects in the daily and defog datasets.
- YearsSinceDx Years since Parkinson's diagnosis.
- UPDRSIII_On/UPDRSIII_Off Unified Parkinson's Disease Rating Scale score during on/off medication respectively.
- NFOGQ Self-report FoG questionnaire score. See:
https://pubmed.ncbi.nlm.nih.gov/19660949/"

In [7]:
# Metadata for each FoG event across the data series (see notes below the table)
events = pd.read_csv(Path('tlvmc-parkinsons-freezing-gait-prediction') / 'events.csv')
events

Unnamed: 0,Id,Init,Completion,Type,Kinetic
0,003f117e14,8.61312,14.7731,Turn,1.0
1,009ee11563,11.38470,41.1847,Turn,1.0
2,009ee11563,54.66470,58.7847,Turn,1.0
3,011322847a,28.09660,30.2966,Turn,1.0
4,01d0fe7266,30.31840,31.8784,Turn,1.0
...,...,...,...,...,...
3707,f9fc61ce85,628.56000,631.6650,Walking,0.0
3708,f9fc61ce85,782.49800,782.6530,Walking,1.0
3709,f9fc61ce85,931.93900,933.4470,Turn,1.0
3710,f9fc61ce85,990.85900,991.8580,Turn,0.0


"Metadata for each FoG event in all data series. The event times agree with the labels in the data series.

- Id The data series the event occurred in.
- Init Time (s) the event began.
- Completion Time (s) the event ended.
- Type Whether StartHesitation, Turn, or Walking.
- Kinetic Whether the event was kinetic (1) and involved movement, or akinetic (0) and static"

In [10]:
# Series-level metadata for the tdcsfog dataset (see notes below the table)
tdcs_meta = pd.read_csv(
    Path('tlvmc-parkinsons-freezing-gait-prediction') / 'tdcsfog_metadata.csv'
)
tdcs_meta

Unnamed: 0,Id,Subject,Visit,Test,Medication
0,003f117e14,13abfd,3,2,on
1,009ee11563,d81e3a,4,2,on
2,011322847a,203e85,2,2,on
3,01d0fe7266,203e85,2,1,off
4,024418ba39,cecfb8,19,3,on
...,...,...,...,...,...
828,feba449e1a,47860d,19,1,on
829,ff4f844fd3,43fcae,2,3,on
830,ff53514514,a2a051,2,3,on
831,ff92d9244d,a9e866,20,2,on


"Identifies each series in the tdcsfog dataset by a unique Subject, Visit, Test, Medication condition.

- Visit Lab visits consist of a baseline assessment, two post-treatment assessments for different treatment stages, and one follow-up assessment.
- Test Which of three test types was performed, with 3 the most challenging.
- Medication Subjects may have been either off or on anti-parkinsonian medication during the recording."


In [11]:
# Series-level metadata for the defog dataset (see note below the table)
defog_meta = pd.read_csv(
    Path('tlvmc-parkinsons-freezing-gait-prediction') / 'defog_metadata.csv'
)
defog_meta

Unnamed: 0,Id,Subject,Visit,Medication
0,02ab235146,ab54e1,2,on
1,02ea782681,bf608b,2,on
2,06414383cf,c0b71e,2,off
3,092b4c1819,b6a627,1,off
4,0a900ed8a2,b7bd52,2,on
...,...,...,...,...
132,f3a921edee,ce8b0b,1,off
133,f40e8c6ebe,d9529b,1,off
134,f8ddbdd98d,fc1e1b,1,on
135,f9efef91fb,fe5d84,2,off


"Identifies each series in the defog dataset by a unique Subject, Visit, Medication condition."

In [12]:
# Series-level metadata for the daily dataset (see note below the table)
daily_meta = pd.read_csv(
    Path('tlvmc-parkinsons-freezing-gait-prediction') / 'daily_metadata.csv'
)
daily_meta

Unnamed: 0,Id,Subject,Visit,Beginning of recording [00:00-23:59]
0,00c4c9313d,3d8b73,1,10:19
1,07a96f89ec,a15b56,1,07:30
2,0d1bc672a8,21e523,2,08:30
3,0e333c9833,b068a2,1,11:30
4,164adaed7b,ffa798,1,13:00
...,...,...,...,...
60,e658b0aa3d,b7bd52,1,08:00
61,ed0a487f20,268a2e,1,12:30
62,ef1db3ca64,473568,1,08:00
63,f16c5cda55,b6a627,1,08:30


"Each series in the daily dataset is identified by the Subject id. This file also contains the time of day the recording began."

In [14]:
# Submission template -- loaded now, used at the end of the workflow
sample_submission = pd.read_csv(
    Path('tlvmc-parkinsons-freezing-gait-prediction') / 'sample_submission.csv'
)
sample_submission

Unnamed: 0,Id,StartHesitation,Turn,Walking
0,003f117e14_0,0,0,0
1,003f117e14_1,0,0,0
2,003f117e14_2,0,0,0
3,003f117e14_3,0,0,0
4,003f117e14_4,0,0,0
...,...,...,...,...
286365,02ab235146_281683,0,0,0
286366,02ab235146_281684,0,0,0
286367,02ab235146_281685,0,0,0
286368,02ab235146_281686,0,0,0


## Exploring data

### tDCS FOG data - in lab

In [23]:
# Folder holding the per-series tDCS FOG training CSVs. It is given relative to
# the notebook's working directory -- the same data root the metadata CSVs are
# read from -- rather than as a machine-specific absolute path. Replace it with
# a full local path if the data lives somewhere else.
path_tdcs = 'tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog'

# Collect the CSV files as a sorted list. Path.glob returns a one-shot generator
# in arbitrary order; a list lets the loading cell be re-run without silently
# getting zero files, and sorting keeps the concatenated row order stable.
files_tdcs = sorted(Path(path_tdcs).glob('*.csv'))

In [24]:
# Read every tDCS FOG file into its own frame and tag each row with the
# filename minus its extension (Path.stem) in a 'File' column, so individual
# recordings can still be told apart once the frames are combined.
dfs_1 = [
    pd.read_csv(csv_path).assign(File=csv_path.stem)
    for csv_path in files_tdcs
]

In [43]:
# Stack the per-file frames row-wise into one frame with a fresh running index
tdcs = pd.concat(dfs_1, axis=0, ignore_index=True)

In [44]:
# Time  - sampled at 128 Hz (128 timesteps per second)
# Acceleration columns, in m/s^2:
#   AccV  - vertical
#   AccML - mediolateral
#   AccAP - anteroposterior
tdcs.head(n=5)

Unnamed: 0,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,File
0,0,-9.533939,0.566322,-1.413525,0,0,0,003f117e14
1,1,-9.53614,0.564137,-1.440621,0,0,0,003f117e14
2,2,-9.529345,0.561765,-1.429332,0,0,0,003f117e14
3,3,-9.531239,0.564227,-1.41549,0,0,0,003f117e14
4,4,-9.540825,0.561854,-1.429471,0,0,0,003f117e14


In [45]:
# Column dtypes and memory footprint of the combined tDCS FOG frame
tdcs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7062672 entries, 0 to 7062671
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Time             int64  
 1   AccV             float64
 2   AccML            float64
 3   AccAP            float64
 4   StartHesitation  int64  
 5   Turn             int64  
 6   Walking          int64  
 7   File             object 
dtypes: float64(3), int64(4), object(1)
memory usage: 431.1+ MB


In [46]:
# Missing-value count per column
tdcs.isna().sum(axis=0)

Time               0
AccV               0
AccML              0
AccAP              0
StartHesitation    0
Turn               0
Walking            0
File               0
dtype: int64

### DeFOG data - in home

In [30]:
# Folder holding the per-series DeFOG training CSVs. It is given relative to the
# notebook's working directory -- the same data root the metadata CSVs are read
# from -- rather than as a machine-specific absolute path. Replace it with a
# full local path if the data lives somewhere else.
path_defog = 'tlvmc-parkinsons-freezing-gait-prediction/train/defog'

# Sorted list instead of the one-shot, arbitrarily ordered Path.glob generator,
# so the loading cell can be re-run and the combined row order stays stable.
files_defog = sorted(Path(path_defog).glob('*.csv'))

In [31]:
# One frame per DeFOG file, each tagged with its filename stem in a 'File' column
dfs_2 = [
    pd.read_csv(csv_path).assign(File=csv_path.stem)
    for csv_path in files_defog
]

In [47]:
# Stack the per-file frames row-wise into one frame with a fresh running index
defog = pd.concat(dfs_2, axis=0, ignore_index=True)

In [61]:
# Time - sampled at 100 Hz (100 timesteps per second)
# Acceleration columns are in units of g
defog.head(n=5)

Unnamed: 0,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,Valid,Task,File
0,0,-1.0,0.044129,-0.25,0,0,0,False,False,02ea782681
1,1,-1.0,0.034431,-0.25,0,0,0,False,False,02ea782681
2,2,-1.0,0.03125,-0.25,0,0,0,False,False,02ea782681
3,3,-1.0,0.03125,-0.25,0,0,0,False,False,02ea782681
4,4,-1.0,0.03125,-0.25,0,0,0,False,False,02ea782681


For the DeFOG (and therefore notype) datasets:
- Valid: if "True", then a true FOG event.  If not, then it's ambiguous whether user had an akinetic FOG event or stopped voluntarily
- Task: series were only annotated where this value is "True".  Portions marked "False" should be considered unannotated

In [49]:
# Column dtypes and memory footprint of the combined DeFOG frame
defog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13525702 entries, 0 to 13525701
Data columns (total 10 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Time             int64  
 1   AccV             float64
 2   AccML            float64
 3   AccAP            float64
 4   StartHesitation  int64  
 5   Turn             int64  
 6   Walking          int64  
 7   Valid            bool   
 8   Task             bool   
 9   File             object 
dtypes: bool(2), float64(3), int64(4), object(1)
memory usage: 851.3+ MB


In [50]:
# Missing-value count per column
defog.isna().sum(axis=0)

Time               0
AccV               0
AccML              0
AccAP              0
StartHesitation    0
Turn               0
Walking            0
Valid              0
Task               0
File               0
dtype: int64

### DeFOG unlabeled data

In [55]:
# Folder holding the per-series "notype" training CSVs. It is given relative to
# the notebook's working directory -- the same data root the metadata CSVs are
# read from -- rather than as a machine-specific absolute path. Replace it with
# a full local path if the data lives somewhere else.
path_notype = 'tlvmc-parkinsons-freezing-gait-prediction/train/notype'

# Sorted list instead of the one-shot, arbitrarily ordered Path.glob generator,
# so the loading cell can be re-run and the combined row order stays stable.
files_notype = sorted(Path(path_notype).glob('*.csv'))

In [56]:
# One frame per notype file, each tagged with its filename stem in a 'File' column
dfs_3 = [
    pd.read_csv(csv_path).assign(File=csv_path.stem)
    for csv_path in files_notype
]

In [57]:
# Stack the per-file frames row-wise into one frame with a fresh running index
notype = pd.concat(dfs_3, axis=0, ignore_index=True)

In [63]:
# Time - sampled at 100 Hz (100 timesteps per second)
# Acceleration columns are in units of g
notype.head(n=5)

Unnamed: 0,Time,AccV,AccML,AccAP,Event,Valid,Task,File
0,0,-0.914652,-0.300851,0.298156,0,False,False,02ab235146
1,1,-0.914367,-0.301572,0.298325,0,False,False,02ab235146
2,2,-0.77954,-0.343197,0.27569,0,False,False,02ab235146
3,3,-0.993162,-0.270281,0.315775,0,False,False,02ab235146
4,4,-0.948767,-0.307849,0.297191,0,False,False,02ab235146


Unlabeled dataset ONLY:

- Event: indicator variable for ANY FOG-type event

In [59]:
# Column dtypes and memory footprint of the combined notype frame
notype.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10251114 entries, 0 to 10251113
Data columns (total 8 columns):
 #   Column  Dtype  
---  ------  -----  
 0   Time    int64  
 1   AccV    float64
 2   AccML   float64
 3   AccAP   float64
 4   Event   int64  
 5   Valid   bool   
 6   Task    bool   
 7   File    object 
dtypes: bool(2), float64(3), int64(2), object(1)
memory usage: 488.8+ MB


In [60]:
# Missing-value count per column
notype.isna().sum(axis=0)

Time     0
AccV     0
AccML    0
AccAP    0
Event    0
Valid    0
Task     0
File     0
dtype: int64

In [None]:
# check correlations / matrix for each dataset
# set-up classification progression
# load parquet files from "unlabeled"

In [None]:
# Load test datasets later (contains one CSV each for tdcsfog and defog)