In [653]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import numpy as np

df = pd.read_csv('CombinedTestResultMetric.csv')
df.drop(['participantCreatedOn', 'testResultMetricCreatedOn', 'testResultMetricTimestamp1', 'testResultMetricTimestamp2', 'Unnamed: 19'], axis = 1, inplace = True)

In [654]:
df['testStartedAt'] = df['testStartedAt'].apply(lambda x: pd.to_datetime(x[0:-10]))
df['testEndedAt'] = df['testEndedAt'].apply(lambda x: pd.to_datetime(x[0:-10]))
# df['timecomplete'] = (df['testEndedAt'] - df['testStartedAt']).apply(lambda x: x.seconds)

In [655]:
df['age'] = [df.testStartedAt[i].year - df.participantBirthYear[i] 
                    for i in range(0, len(df))]
df.drop(['participantBirthYear', 'testStartedAt', 'testEndedAt'], axis = 1, inplace = True)

df['participantSex'].replace(['male', 'female'], [0, 1], inplace = True)

df['participantIsControl'].replace([True, False], [1, 0], inplace = True)

In [656]:
mean = df.groupby(['floodlightOpenId', 'testCode', 'testMetricCode']).mean()[['testResultMetricValue']]
mean.columns = ['val_mean']
mean_reset = mean.reset_index()
mean_reset['test'] = mean_reset['testCode'] + mean_reset['testMetricCode']
meantest = mean_reset.drop(['testCode', 'testMetricCode'], axis = 1)
meantest = meantest.set_index(['floodlightOpenId', 'test'])
meantest = meantest.reset_index()
meantest = meantest.pivot(index='floodlightOpenId', columns='test', values=['val_mean'])
meantest.columns = [col[0]+col[1] for col in meantest.columns]

In [657]:
var = df.groupby(['floodlightOpenId', 'testCode', 'testMetricCode']).var()[['testResultMetricValue']]
var.columns = ['val_var']
var_reset = var.reset_index()
var_reset['test'] = var_reset['testCode'] + var_reset['testMetricCode']
vartest = var_reset.drop(['testCode', 'testMetricCode'], axis = 1)
vartest = vartest.set_index(['floodlightOpenId', 'test'])
vartest = vartest.reset_index()
vartest = vartest.pivot(index='floodlightOpenId', columns='test', values=['val_var'])
vartest.columns = [col[0]+col[1] for col in vartest.columns]

In [658]:
count = df.groupby(['floodlightOpenId', 'testCode', 'testMetricCode']).count()[['testResultMetricValue']]
count.columns = ['test_count']
count_reset = count.reset_index()
count_reset['test'] = count_reset['testCode'] + count_reset['testMetricCode']
counttest = count_reset.drop(['testCode', 'testMetricCode'], axis = 1)
counttest = counttest.set_index(['floodlightOpenId', 'test'])
counttest = counttest.reset_index()
counttest = counttest.pivot(index='floodlightOpenId', columns='test', values='test_count')
counttest.columns = [col+'_count' for col in counttest.columns]

In [659]:
dfs = [meantest, vartest, counttest]
maindf = pd.concat(dfs, axis=1)

In [660]:
tests = df.groupby(['testCode', 'testMetricCode']).size().reset_index(name="Time")
subset = tests[['testCode', 'testMetricCode']]
testcombs = [tuple(x) for x in subset.values]
testcombs

[('daily_questions', 'mood_response'),
 ('draw_a_shape', 'bottom_to_top_hausdorff_distance_best'),
 ('draw_a_shape', 'circle_hausdorff_distance_best'),
 ('draw_a_shape', 'figure8_hausdorff_distance_best'),
 ('draw_a_shape', 'hand_used'),
 ('draw_a_shape', 'mean_hausdorff_distance_best'),
 ('draw_a_shape', 'number_shapes_drawn_correctly'),
 ('draw_a_shape', 'spiral_hausdorff_distance_best'),
 ('draw_a_shape', 'square_hausdorff_distance_best'),
 ('draw_a_shape', 'top_to_bottom_hausdorff_distance_best'),
 ('five_u_turn', 'turn_speed_avg'),
 ('five_u_turn', 'turns'),
 ('ips', 'correct_responses'),
 ('ips', 'response_time_avg'),
 ('ips_baseline', 'correct_responses'),
 ('ips_baseline', 'response_time_avg'),
 ('mobility', 'life_space_daily'),
 ('pinching', 'hand_used'),
 ('pinching', 'successful_pinches'),
 ('static_balance', 'sway_path'),
 ('two_min_walk', 'steps')]

In [661]:
names = ['mood', 'bottomtop', 'circle', 'figure8', 'shapehand', 'meandrawdist', 'numshapes', 'spiral', 'square',
        'topbottom', 'turnspeed', 'turns', 'ips', 'ipstime', 'bips', 'bipstime', 'mobility', 'pinchhand', 'pinches',
        'sway', 'steps']

In [662]:
def cols(names, metric):
    allcols = []
    for name in names: 
        allcols.append(name + '_' + metric)
    return allcols

In [663]:
meancols = cols(names, 'mean')
varcols = cols(names, 'var')
countcols = cols(names, 'count')

In [664]:
allcols = meancols + varcols + countcols

In [665]:
maindf.columns = allcols

In [666]:
diagnosis = df.groupby(['floodlightOpenId', 'participantIsControl']).size().reset_index()[['floodlightOpenId', 'participantIsControl']]
sex = df.groupby(['floodlightOpenId', 'participantSex']).size().reset_index()[['floodlightOpenId', 'participantSex']]
country = df.groupby(['floodlightOpenId', 'participantCountryOfResidence']).size().reset_index(name='Time')[['floodlightOpenId', 'participantCountryOfResidence']]
age = df.groupby(['floodlightOpenId']).mean().reset_index()[['floodlightOpenId', 'age']]
weight = df.groupby(['floodlightOpenId']).mean().reset_index()[['floodlightOpenId', 'participantWeightLbs']]
height = df.groupby(['floodlightOpenId']).mean().reset_index()[['floodlightOpenId', 'participantHeightCms']]


In [667]:
maindf['sex'] = list(sex['participantSex'])
maindf['country'] = list(country['participantCountryOfResidence'])
maindf['age'] = list(age['age'])
maindf['weight'] = list(weight['participantWeightLbs'])
maindf['height'] = list(height['participantHeightCms'])
maindf['diagnosis'] = list(diagnosis['participantIsControl'])

In [668]:
def fillvar(df, metric):
    col = np.where(((maindf['{}_var'.format(metric)].isna()) & (maindf['{}_count'.format(metric)] == 1)), 
             0, maindf['{}_var'.format(metric)])
    return col

In [669]:
for name in names: 
    maindf['{}_var'.format(name)] = fillvar(maindf, name)

In [646]:
# maindf = [maindf.dropna() if maindf.isna().sum() > 500]
# if maindf.isna().mean() > 0.5: 
#     print(maindf.isna())

SyntaxError: invalid syntax (<ipython-input-646-26396a18d94c>, line 1)

In [670]:
# sum(maindf.isnull().mean(axis=1) > 0.5)

thresh = len(maindf) * .5
maindf.dropna(thresh = thresh, axis = 1, inplace = True)

In [671]:
threshc = len(maindf.columns) * .5
maindf.dropna(thresh = threshc, axis = 0, inplace = True)

0

In [677]:
print(sum(maindf.isnull().mean(axis=1) > 0.5))
print(sum(maindf.isnull().mean(axis=0) > 0.5))

0
0


In [682]:
def fillna(df, metric):
    df = maindf['{}'.format(metric)] = maindf.groupby('diagnosis').transform(lambda x: x.fillna(x.median()))
    return df

for name in names: 
        maindf['{}'.format(name)] = fillna(maindf, name)

ValueError: Wrong number of items passed 64, placement implies 1

In [679]:


maindf['mood_count'] = maindf.groupby('diagnosis').transform(lambda x: x.fillna(x.median()))

In [680]:
maindf.mood_count.isna().sum()

0