In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os

In [3]:
## Load Datasets
baseline_df = pd.read_csv('Curated_Data_Cuts_Files/PPMI_Baseline_Data_02Jul2018.csv')
years_df = pd.read_csv('Curated_Data_Cuts_Files/PPMI_Year_1-3_Data_02Jul2018.csv')
mri_df = pd.read_csv('MRI.csv')

In [4]:
## Get patients that have both baseline and year progression data
baseline_patno_counts = baseline_df["PATNO"].value_counts()
baseline_ids = baseline_patno_counts.index
years_patno_counts = years_df["PATNO"].value_counts()
years_ids = years_patno_counts.index
mri_subject_counts = mri_df["Subject"].value_counts()
mri_ids = mri_subject_counts.index
stage_data_ids = pd.Series(list(set(years_ids) & set(baseline_ids) & set(mri_ids)))

baseline_df = baseline_df[baseline_df['PATNO'].isin(stage_data_ids)]
years_df = years_df[years_df['PATNO'].isin(stage_data_ids)]
mri_df = mri_df[mri_df['Subject'].isin(stage_data_ids)]

In [5]:
print(stage_data_ids.shape)

(564,)


In [6]:
years_stages_counts = years_df["hy"].value_counts()
print(years_stages_counts)

2    587
0    491
.    249
1    216
3     46
Name: hy, dtype: int64


In [7]:
def stage_row(patno):
    dur = baseline_df[baseline_df['PATNO'] == patno]['duration'].iloc[0]
    bl_hy = baseline_df[baseline_df['PATNO'] == patno]['hy'].iloc[0]
    
    years_rows = years_df[years_df['PATNO'] == patno]
    
    y1 = years_rows[years_rows['YEAR'] == 1]
    if (y1.shape[0] > 0):
        y1_hy = y1['hy'].iloc[0]
    else:
        y1_hy = '.'
        
    y2 = years_rows[years_rows['YEAR'] == 2]
    if (y2.shape[0] > 0):
        y2_hy = y2['hy'].iloc[0]
    else:
        y2_hy = '.'
        
    y3 = years_rows[years_rows['YEAR'] == 3]
    if (y3.shape[0] > 0):
        y3_hy = y3['hy'].iloc[0]
    else:
        y3_hy = '.'
    
    new_row = pd.DataFrame([[patno, dur, bl_hy, y1_hy, y2_hy, y3_hy]])
    return new_row

stage_df = pd.concat([stage_row(i) for i in stage_data_ids], ignore_index=True)
stage_df.columns = ['PATNO', 'duration', 'bl_hy', 'y1_hy', 'y2_hy', 'y3_hy']

# print(stage_df)

In [8]:
print(mri_df.columns)

Index(['Image Data ID', 'Subject', 'Group', 'Sex', 'Age', 'Visit', 'Modality',
       'Description', 'Type', 'Acq Date', 'Format', 'Downloaded'],
      dtype='object')


In [69]:
def prog_label(image_id):
    img_row = mri_df[mri_df['Image Data ID'] == image_id]
    pat_row = stage_df[stage_df['PATNO'] == img_row['Subject'].iloc[0]]
    visit = int(img_row['Visit'])
    
    bl = pat_row['bl_hy'].iloc[0]
    y1 = pat_row['y1_hy'].iloc[0]
    y2 = pat_row['y2_hy'].iloc[0]
    y3 = pat_row['y3_hy'].iloc[0]
    
    if visit < 12 and bl != '.' and y1 != '.':
        label = int(y1 > bl)
        stage = bl
    elif visit < 24 and y1 != '.' and y2 != '.':
        label = int(y2 > y1)
        stage = y1
    elif visit < 36 and y2 != '.' and y3 != '.':
        label = int(y3 > y2)
        stage = y2
    else:
        label = -1
        stage = y3
    
    return [label, stage]
    
# mri_labels = pd.DataFrame([prog_label(i) for i in mri_df['Image Data ID']], columns=['Progression', 'Stage'])
mri_df['Progression'] = [prog_label(i)[0] for i in mri_df['Image Data ID']]
mri_df['Stage'] = [prog_label(i)[1] for i in mri_df['Image Data ID']]

In [72]:
# print(mri_labels)
print(mri_df['Progression'].value_counts(dropna=False))
print(mri_df['Stage'].value_counts(dropna=False))

 0    3732
-1     975
 1     786
Name: Progression, dtype: int64
2    1850
1    1441
0    1346
.     810
3      46
Name: Stage, dtype: int64


In [73]:
no_prog = mri_df[mri_df['Progression'] == -1].index
# print(no_prog[0])
no_stage = mri_df[mri_df['Stage'] == '.'].index
missing_data = list(set(no_prog) | set(no_stage))
print(len(missing_data))
print(mri_df.shape)
# print(missing_data[0])

975
(5493, 14)


In [74]:
mri_complete = mri_df.drop(index=missing_data)
print(mri_complete.shape)

(4518, 14)


In [89]:
mri_complete.to_csv('mri_progression.csv')

In [79]:
aa = mri_complete[mri_complete['Stage'] == '0']
print(aa.shape)
aa = aa[aa['Progression'] == 1]
print(aa.shape)

(1313, 14)
(54, 14)


In [82]:
mri_t2 = mri_complete[mri_complete['Description'].str.contains('T2')]
print(mri_t2.shape)

(2681, 14)


In [86]:
aa = mri_t2[mri_t2['Stage'] == '3']
print(aa.shape)
aa = aa[aa['Progression'] == 1]
print(aa.shape)

(4, 14)
(0, 14)


In [88]:
mri_t2.to_csv('mri_t2_progression.csv')