## Final Project 
## Brainster DS x Parkinson's Disease Specifications

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from functools import partial
import re
import pickle
from scipy.stats import skew, kurtosis
from scipy import stats
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, KFold, StratifiedKFold

## Path to  User files

In [2]:
user_root = "D:/Brainster/Final Project/ArchivedUsers/ArchivedUsers/" # change to your path
user_fn_list = os.listdir(user_root)

### Function for read User files
#### cleaning the data

In [3]:
def read_one_file(fn, root):
    out = dict()
    with open(root + fn) as f:
        for line in f.readlines():
            k, v = line.split(": ")
            out[k] = v.strip()
            out['ID'] = re.findall(r'_(\w+)\.', fn)[0]
    return out

## Read all user files in list

In [4]:
users_list = list(map(partial(read_one_file, root=user_root), user_fn_list))

## Convert list to dataframe
## Cleaning the data

In [5]:
users = pd.DataFrame(users_list)
users.replace('------', np.nan, inplace=True)
users.replace('', np.nan, inplace=True)
users['Levadopa'] = users['Levadopa'] == 'True'
users['MAOB'] = users['MAOB'] == 'True'
users['Parkinsons'] = users['Parkinsons'] == 'True'
users['Tremors'] = users['Tremors'] == 'True'
users['Other'] = users['Other'] == 'True'

## Function for read Tappy files

#### cleaning the data and fix corupted data

In [6]:
keys_root = "D:/Brainster/Final Project/TappyData/TappyData/" # change to your path
keys_fn_list = os.listdir(keys_root)

In [7]:
def read_one_key_file(fn, root):
    # try:
    df = pd.read_csv(root + fn, delimiter='\t', header=None, on_bad_lines='skip',
                         usecols=range(8), low_memory=False)
    df.columns = ['ID', 'Date', 'TS', 'Hand', 'HoldTime', 'Direction', 'LatencyTime', 'FlightTime']
    df = df[df['ID'].apply(lambda x: len(str(x)) == 10)
                   & df['Date'].apply(lambda x: len(str(x)) == 6)
                   & df['TS'].apply(lambda x: len(str(x)) == 12)
                   & np.in1d(df['Hand'], ["L", "R", "S"])
                   & df['HoldTime'].apply(lambda x: re.search(r"[^\d.]", str(x)) is None)
                   & np.in1d(df['Direction'], ['LL', 'LR', 'RL', 'RR', 'LS', 'SL', 'RS', 'SR', 'RR'])
                   & df['LatencyTime'].apply(lambda x: re.search(r"[^\d.]", str(x)) is None)
                   & df['FlightTime'].apply(lambda x: re.search(r"[^\d.]", str(x)) is None)]
    df['HoldTime'] = df['HoldTime'].astype(float)
    df['LatencyTime'] = df['LatencyTime'].astype(float)
    df['FlightTime'] = df['FlightTime'].astype(float)
    return df

### Read all Tappy files in list

In [8]:
keys_list = list(map(partial(read_one_key_file, root=keys_root), keys_fn_list))

### Combine all Tappy files

In [9]:
keys = pd.concat(keys_list, ignore_index=True, axis=0)

In [10]:
keys.shape

(4675453, 8)

In [11]:
keys['ID'].nunique()

113

### Find users with sufficient data

In [12]:
user_w_sufficient_data = set((keys.groupby('ID').size() >= 2000).index)
user_eligible = set(users[((users['Parkinsons']) & (users['Impact'] == 'Mild') 
                       | (~users['Parkinsons']))
                      & (~users['Levadopa'])]['ID'])
user_valid = user_w_sufficient_data.intersection(user_eligible)

In [13]:
valid_keys = keys[(keys['HoldTime'] > 0)
                   & (keys['LatencyTime'] > 0)
                   & (keys['HoldTime'] < 2000)
                   & (keys['LatencyTime'] < 2000)
                   & np.in1d(keys['ID'], list(user_valid))]

### Find and remuve duplicates rows

In [14]:
print(f'Number of duplicated row: {valid_keys.duplicated().sum()}')
valid_keys.drop_duplicates()
print(f'Records dropped.The new shape of the DataFrame is: {valid_keys.shape}')

Number of duplicated row: 26835
Records dropped.The new shape of the DataFrame is: (3662927, 8)


### Save the DataFrame to file

In [15]:
users.to_csv('df_user.csv', index=False)
valid_keys.to_csv('df_keys.csv', index=False)

users.to_pickle('df_user.pkl')
valid_keys.to_pickle('df_keys.pkl')