# In [69]:
import pandas as pd
import numpy as np
import csv
import json

# Load the raw export. Its first row holds the question/description text for
# each column, so it is split off before the remaining rows are indexed.
nfb = pd.read_csv('NeuroFB_part1.csv', low_memory=False)
description_row = nfb.iloc[0]
column_descriptions = description_row.values
nfb = nfb.loc[1:].set_index('queried_ursi')

#Remove Nan keys and values without associated question/description
# Descriptions are looked up by column label rather than by position: once
# 'queried_ursi' has become the index it is no longer in nfb.columns, so
# zipping the columns against the positional description array would pair
# columns with a neighbouring column's description.
remove_keys = [key for key in nfb.columns if pd.isna(description_row[key])]
column_description_dict = {
    key: description_row[key]
    for key in nfb.columns
    if not pd.isna(description_row[key])
}

#Codebook reading
codebook = {}
# newline='' leaves line-ending handling to the csv module, so quoted fields
# that contain line breaks are parsed as a single field.
with open('CB.csv', newline='') as csvfile:
    csvreader = csv.reader(csvfile, dialect='excel')
    # Blank lines are returned as empty lists; drop them because every
    # consumer of codebook_lines indexes into the row.
    codebook_lines = [row for row in csvreader if row]

# Build {variable name: {field 5 text: field 4 text}}; convertToCodebook uses
# the inner mapping to translate raw cell values. A row with an empty variable
# field (index 3) adds another entry to the most recently named variable.
current_key = ''
for line_vals in codebook_lines:
    # Rows whose first field contains 'Instrument' are skipped
    # (presumably the header row -- confirm against CB.csv).
    if 'Instrument' in line_vals[0]:
        continue
    variable = line_vals[3]
    if variable:
        if variable in codebook:
            raise ValueError('Key {0} already exists in dictionary!'.format(variable))
        codebook[variable] = {}
        current_key = variable
    if not current_key:
        raise ValueError("Trying to set value with empty key")
    codebook[current_key][line_vals[5]] = line_vals[4]
     

#Instrument keys read in from codebook
# Build {instrument name: [variable names]}. A non-empty first field starts a
# new instrument; the variable name in field 3 of that row and of the rows
# that follow is collected under it.
instrument_dict = {}
current_instrmt = ''
for line_vals in codebook_lines:
    # Blank CSV lines parse to empty lists and carry nothing to index into.
    if not line_vals:
        continue
    if 'Instrument' in line_vals[0]:
        continue
    if line_vals[0]:
        if line_vals[0] in instrument_dict:
            raise ValueError('Instrument {0} already exists in dictionary!'.format(line_vals[0]))
        current_instrmt = line_vals[0]
        instrument_dict[current_instrmt] = []
    # Variables that appear before any instrument has been named are ignored.
    if current_instrmt and line_vals[3]:
        instrument_dict[current_instrmt].append(line_vals[3])

def drop_NanRows(df):
    """Return the positions of rows whose data columns hold no real value.

    The first two columns are never inspected (the callers in this file keep
    ``queried_ursi`` and ``visit_id`` there). A cell counts as missing when it
    is a float NaN or the text ``'nan'`` in any letter case; every other
    value, including ``None``, counts as data.

    Parameters:
        df: pandas DataFrame to scan.

    Returns:
        list of int: zero-based row positions, in ascending order. They are
        positions, not index labels, so passing them to ``DataFrame.drop`` is
        only correct when the index labels are 0..n-1.
    """
    def _is_missing(cell):
        # One-line purpose: decide whether a single cell is a NaN marker.
        if isinstance(cell, (float, np.floating)):
            return bool(np.isnan(cell))
        return isinstance(cell, str) and cell.lower() == 'nan'

    drop_list = []
    for position in range(len(df.index)):
        # .iloc keeps the access positional even when the column labels are
        # strings; plain row[2] depends on a fallback pandas is removing.
        data_cells = df.iloc[position].iloc[2:]
        if all(_is_missing(cell) for cell in data_cells):
            drop_list.append(position)
    return drop_list

#Convert values in dataframe into corresponding values in the codebook according to column name
def convertToCodebook(k, value):
    """Translate one raw cell value through the codebook entry for column ``k``.

    Parameters:
        k: column/variable name used as the key into the module-level
            ``codebook`` dictionary.
        value: raw cell value.

    Returns:
        * for text other than ``'nan'``: ``codebook[k][value]`` when both the
          column and the value are listed, otherwise ``value`` unchanged;
        * for the text ``'nan'`` (any letter case): ``numpy.nan``, so the cell
          is treated as missing like a real NaN;
        * for anything else (NaN, numbers, ``None``): ``value`` unchanged.
          Previously non-text, non-NaN values fell off the end of the function
          and came back as ``None``, which discarded numeric data.
    """
    if isinstance(value, str):
        if value.lower() == 'nan':
            return np.nan
        return codebook.get(k, {}).get(value, value)
    return value

def get_instrumentID(column_name, var_id):
    """Return the instrument name (or name prefix) associated with a column.

    Reads the module-level ``instrument_dict`` (instrument name -> list of
    variable names).

    column_name -- column/variable name to resolve.
    var_id      -- when falsy, an exact match of ``column_name`` against the
                   variable lists is tried first; when truthy that lookup is
                   skipped and the name is resolved from its prefix only.

    Returns a string: the matching instrument name when one is found,
    otherwise the prefix derived from ``column_name``.

    NOTE(review): ``variables[0]`` raises IndexError for an instrument with an
    empty variable list, and the ``[0]`` on the sliced name raises IndexError
    for an empty ``column_name``.
    """
    instr_id = ''
    found = False
    if not var_id:
        # Exact lookup. There is no break, so if several instruments list the
        # same variable the last one iterated wins.
        for inst, variables in instrument_dict.items():
            for val in variables:
                if(column_name == val):
                    instr_id = inst
                    found = True
    if (not found) or var_id:
        # Derive a prefix from the name itself. The inner rfind locates the
        # second-to-last '_'; the test looks at the character right after it.
        if((column_name[column_name.rfind('_', 0, column_name.rfind('_'))+1:][0]).isdigit()): #if the first character after the second-to-last '_' is a digit
            # Strip the last two '_'-separated segments.
            instr_id = column_name[:(column_name.rfind('_', 0, column_name.rfind('_')))]
        else:
            # Strip only the last '_'-separated segment.
            instr_id = column_name[:(column_name.rfind('_'))]
        # Compare the prefix with what each instrument's first variable
        # resolves to (recursing with the flag flipped). instr_id is
        # reassigned inside the loop, so later iterations compare against the
        # updated value.
        for inst, variables in instrument_dict.items():             
            if (instr_id == get_instrumentID(variables[0], not var_id)):
                instr_id = inst
    return instr_id

#List of possible visit_ids
# Order is irrelevant to the matching code, which re-sorts by length.
visitID_list = [
    'V1',
    'V1REP',
    'V1REP_2',
    'V2',
    'VA',
    'VA_ALG',
    'V2REP',
    'V2REP_2',
    'VA-REP',
    'V3',
    'V4',
    'V5',
]

#Split column name to return visit_id and final column name
def get_VisitID_ColName(column_name, visit_ids=None):
    """Split a wide column name into its visit id and the remaining name.

    Parameters:
        column_name: column name that contains ``<visit>_`` somewhere in it.
        visit_ids: iterable of known visit ids; defaults to the module-level
            ``visitID_list``.

    Returns:
        tuple ``(visit, ncol)``. ``visit`` is the visit id and ``ncol`` is the
        column name with every ``<visit>_`` occurrence removed. When the
        remainder starts with a digit, that leading ``<digits>_`` segment is
        treated as part of the visit (``V1REP_3_foo`` -> ``('V1REP_3',
        'foo')``). ``('', '')`` is returned when no visit id matches.

    Visit ids are tried longest first and the first hit is final. Without that
    early exit a shorter id that is a prefix of a longer one overwrote the
    result (``VA_ALG_x`` came back as visit ``VA``), which also disagreed with
    the first-match logic used to build the list of unique column names.
    """
    if visit_ids is None:
        visit_ids = visitID_list
    for v in sorted(visit_ids, key=len, reverse=True):
        marker = v + '_'
        if marker not in column_name:
            continue
        ncol = column_name.replace(marker, '')
        visit = v
        # Guard against an empty remainder before looking at its first char.
        if ncol and ncol[0].isdigit():
            ncol = ncol[ncol.find('_') + 1:]
            visit = column_name[:column_name.find(ncol) - 1]
        return visit, ncol
    return '', ''

#Returns dataframe with instrument string input
def create_instrumentDF(instrmt):
    """Build the frame for one instrument from the module-level ``nfb_processed``.

    Parameters:
        instrmt: text that a column name must contain to be included.

    Returns:
        DataFrame indexed by ``queried_ursi`` holding ``visit_id`` plus every
        ``nfb_processed`` column whose name contains ``instrmt``; rows that
        ``drop_NanRows`` reports as empty are removed.
    """
    # Identifier columns first, then the columns selected by substring match.
    df_columns = ['queried_ursi', 'visit_id'] + [
        name for name in nfb_processed.columns if instrmt in name
    ]
    df = pd.DataFrame(columns=df_columns)

    # Copy cell by cell; the identifiers are written alongside every matching
    # column, so a frame without matching columns stays empty.
    for row_label in range(len(nfb_processed.index)):
        for name in df_columns:
            if instrmt not in name:
                continue
            df.at[row_label, 'queried_ursi'] = nfb_processed.queried_ursi[row_label]
            df.at[row_label, 'visit_id'] = nfb_processed.visit_id[row_label]
            df.at[row_label, name] = nfb_processed.loc[row_label, name]

    empty_rows = drop_NanRows(df)
    df = df.drop(empty_rows)
    df.set_index('queried_ursi', inplace=True)
    return df

#Unique list using visitID_list
# Strip the visit prefix from every column name to obtain the set of target
# column names. Longest visit ids are tried first so that e.g. 'V1REP_2_' is
# matched before 'V1REP_'; the sort is done once instead of once per column.
visits_longest_first = sorted(visitID_list, key=len, reverse=True)
new_cols = ['queried_ursi', 'visit_id']
processed = []
processed_lookup = set()  # O(1) membership companion of `processed`
for c in nfb.columns:
    if c in processed_lookup:
        continue
    for visit in visits_longest_first:
        if visit + '_' in c:
            col = c.replace(visit + '_', '')
            # A leading '<digits>_' segment belongs to the visit, not the name.
            if col[0].isdigit():
                col = col[col.find('_') + 1:]
            new_cols.append(col)
            processed.append(c)
            processed_lookup.add(c)
            # Only the first (longest) matching visit id is used per column.
            break

# Order-preserving de-duplication in one pass.
unique_list = []
seen_names = set()
for name in new_cols:
    if name not in seen_names:
        seen_names.add(name)
        unique_list.append(name)

ursi_visit_df = pd.DataFrame(columns=unique_list)
columns = nfb.columns.sort_values()

#Dataframe restructuring WITH codebook converting
# Turn each wide participant row into one output row per visit. Columns are
# walked in sorted order and a new output row is started whenever the visit
# parsed from the column name differs from the previous column's visit.

# Per-column work does not depend on the participant, so it is done once here
# instead of once per row: which keys occur in the column name, and how the
# name splits into visit and target column.
# NOTE(review): keys are matched as substrings of the column name and, as
# before, the last matching key in unique_list is the one handed to
# convertToCodebook -- confirm that this is the intended codebook key.
column_plan = []
for c in columns:
    matching_keys = [key for key in unique_list if key in c]
    if not matching_keys:
        continue
    visit_col, nfb_col = get_VisitID_ColName(c)
    column_plan.append((c, visit_col, nfb_col, matching_keys[-1]))

row_counter = -1
target_index_counter = -1
previous_visit = ''

for row_counter, ursi in enumerate(nfb.index):
    # Reset per participant so the first column always opens a new output row.
    previous_visit = ''
    for c, visit_col, nfb_col, key in column_plan:
        if previous_visit != visit_col:
            target_index_counter += 1
            ursi_visit_df.at[target_index_counter, 'visit_id'] = visit_col
            ursi_visit_df.at[target_index_counter, 'queried_ursi'] = ursi
            previous_visit = visit_col
        val = nfb.loc[ursi, c]
        ursi_visit_df.at[target_index_counter, nfb_col] = convertToCodebook(key, val)


#Drop nan rows of dataframe
# Remove the rows reported as empty, renumber what is left from 0, then
# discard columns that hold no value at all.
rows_without_data = drop_NanRows(ursi_visit_df)
ursi_visit_df = ursi_visit_df.drop(rows_without_data)
nfb_processed = ursi_visit_df.reset_index(drop=True).dropna(axis=1, how='all')

#create dictionary according to instrument/assessment
# Walk the data columns in sorted order and build one frame each time the
# text before the first '_' changes.
# NOTE(review): two prefixes that resolve to the same instrument id overwrite
# each other in instrumentDF_dict.
current_instrument, previous_instrument, instrument_id = '', '', ''
instrumentDF_dict = {}

for c in nfb_processed.columns.sort_values():
    if c in ('queried_ursi', 'visit_id'):
        continue
    # partition() keeps the whole name when it has no '_'; slicing with
    # find() == -1 would silently cut off the last character instead.
    current_instrument = c.partition('_')[0]
    if previous_instrument != current_instrument:
        # Resolved only here because the id is needed once per instrument.
        instrument_id = get_instrumentID(c, False)
        instrumentDF_dict[instrument_id] = create_instrumentDF(current_instrument)
        previous_instrument = current_instrument

# In [138]:
def exportJSON(instrument):
    """Write one instrument's frame to ``<instrument>.json``.

    Parameters:
        instrument: key into the module-level ``instrumentDF_dict``. Spaces
            and '/' in it are replaced with '_' to form the file name.

    The output maps ``str(queried_ursi)`` to the list of that participant's
    rows (``DataFrame.to_dict('records')``). The file is created in the
    current working directory and overwritten if it exists.

    Raises:
        KeyError: if ``instrument`` is not in ``instrumentDF_dict``.

    NOTE(review): NaN cells are written as the bare token ``NaN``
    (json.dump's default), which strict JSON parsers reject.
    """
    organized_data = {
        str(key): df_gb.to_dict('records')
        for key, df_gb in instrumentDF_dict[instrument].groupby('queried_ursi')
    }
    file_stem = instrument.replace(' ', '_').replace('/', '_')
    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding
    # is fixed to UTF-8 rather than left to the platform default.
    with open(file_stem + '.json', 'w', encoding='utf-8') as outfile:
        json.dump(organized_data, outfile, indent=4, ensure_ascii=False)