In [1]:
import pandas as pd
import re, csv
from ast import literal_eval
from os import listdir
from os.path import isfile, join

In [2]:
def loc_list_to_ints(loc_list):
    """Flatten location strings into a list of ``(start, end)`` integer pairs.

    Each element of ``loc_list`` is a string holding one or more spans
    separated by ``;``. Every span consists of two whitespace-separated
    integers. The spans are returned in the order they appear.
    """
    def _as_pair(span_text):
        # Unpacking enforces exactly two fields per span.
        first, last = span_text.split()
        return (int(first), int(last))

    return [
        _as_pair(span_text)
        for entry in loc_list
        for span_text in entry.split(";")
    ]

In [3]:
class CharacterTokenizer():
    """Tokenizer that treats every single character as its own token."""

    def tokenize(self, text):
        """Return the characters of ``text`` as a list, in their original order."""
        return list(text)

In [4]:
# simplify_dataset returns one list of tuples (tokens, locations, labels, case_num, pn_num), one tuple per patient note.
def simplify_dataset(train_path, features_path, pn_notes_path, tokenizer):
    '''
    Build a simplified, per-note representation of an NBME-style dataset.

    Takes .csv file paths which should be in the same form as the train.csv,
    features.csv and patient_notes.csv in the NBME kaggle competition. The
    simplified data can later be used for data augmentation.

    Parameters
    ----------
    train_path : path of the train .csv; the columns ``pn_num``, ``case_num``,
        ``feature_num`` and ``location`` are read from it.
    features_path : kept for interface compatibility; the features file is not
        needed to build the simplified data and is not read.
    pn_notes_path : path of the patient notes .csv; rows are matched on
        ``pn_num`` and the note text is taken from the third column.
    tokenizer : object with a ``tokenize(text)`` method returning a sequence
        of string tokens.

    Returns
    -------
    A list with one tuple per distinct ``pn_num`` in the train file:
    ``(tokens, word_locs, labels, case_num, pn_num)``. ``tokens``, ``word_locs``
    and ``labels`` have equal length; ``word_locs[i]`` is the
    ``(start index, end index)`` of ``tokens[i]`` in the patient note and
    ``labels[i]`` is its feature number.

    Label -1 denotes that a token has no feature.
    '''
    df_train = pd.read_csv(train_path)
    df_notes = pd.read_csv(pn_notes_path)

    # The location column stores Python list literals as text; parse them once.
    df_train["location_list"] = [literal_eval(x) for x in df_train["location"]]

    simplified_data = []

    for pn_num in df_train.pn_num.unique():
        train = df_train.loc[df_train['pn_num'] == pn_num]
        case_num = train.case_num.unique()[0]
        # Positional access: the note text is expected in the third column
        # (presumably pn_history -- confirm against the notes file).
        pn_note = df_notes.loc[df_notes['pn_num'] == pn_num].values[0][2]
        tokens = tokenizer.tokenize(pn_note)

        # Token offsets are accumulated from token lengths, i.e. the tokens are
        # assumed to be contiguous and to cover the note without gaps.
        word_locs = []
        curr_loc = 0
        for word in tokens:
            word_locs.append((curr_loc, curr_loc + len(word)))
            curr_loc += len(word)

        labels = [-1] * len(tokens)
        for feat_num, location_list in zip(train['feature_num'], train['location_list']):
            for span_start, span_end in loc_list_to_ints(location_list):
                for i, (word_start, word_end) in enumerate(word_locs):
                    # A token is labelled only when it lies entirely inside the span.
                    if span_start <= word_start and span_end >= word_end:
                        labels[i] = feat_num

        simplified_data.append((tokens, word_locs, labels, case_num, pn_num))
    return simplified_data

In [5]:
def _collect_feature_spans(words, token_locs, token_labels, feature_num):
    """Return ``(locations, annotations)`` for each maximal run of tokens labelled ``feature_num``."""
    feat_locs = []
    annotation = []

    def _close_run(first_idx, stop_idx):
        # A run spans from the start of its first token to the end of its last.
        feat_locs.append((token_locs[first_idx][0], token_locs[stop_idx - 1][1]))
        annotation.append(''.join(words[first_idx:stop_idx]))

    run_start = None  # index of the first token of the run being extended, if any
    for idx, label in enumerate(token_labels):
        if label == feature_num:
            if run_start is None:
                run_start = idx
        elif run_start is not None:
            _close_run(run_start, idx)
            run_start = None
    # Flush a run that extends to the very last token; otherwise it would be lost.
    if run_start is not None:
        _close_run(run_start, len(token_labels))
    return feat_locs, annotation


def complexify_data(data_path, features_path):
    '''
    Takes path of a folder with simplified data .csv files in it. Parses all simplified data in the folder into a single dataframe,
    corresponding to train.csv from the NBME Kaggle competition.

    Loads all data in the simplified format (a list of tokens and a list of labels) and parses it into one
    common dataframe similar to the one in train.csv.

    Every file in the folder is read as a simplified-data file whose first three columns are the token,
    its location written as "[start end]", and its label. The file name (minus its 4-character extension)
    is parsed as "<pn_num>_<case_num>".

    For each file, one output row is produced per feature_num that features.csv lists for the file's case_num,
    even when that feature does not occur in the file (annotation and location are then '[]').

    Returns a dataframe with columns id, case_num, pn_num, feature_num, annotation, location. The dataframe
    is empty (but has these columns) when the folder contains no files.
    '''
    data_files = [f for f in listdir(data_path) if isfile(join(data_path, f))]
    list_out = []

    # dict whose keys are case_nums and whose values are lists of all feature_nums belonging to that case
    case_feat_dict = {}
    df_feats = pd.read_csv(features_path)
    for case_num in df_feats.case_num.unique():
        case_num_feats = df_feats[df_feats['case_num'] == case_num].feature_num.unique()
        case_feat_dict[case_num] = list(case_num_feats)

    col_names = ['id', 'case_num', 'pn_num', 'feature_num', 'annotation', 'location']
    for file in data_files:
        # join() keeps this working whether or not data_path ends with a separator.
        df = pd.read_csv(join(data_path, file))
        # NOTE(review): a commented-out line in save_simplified_data suggests the name
        # "<case_num>_<pn_num>.csv"; this parser expects pn_num first -- confirm the convention.
        pn_num, case_num = (int(i) for i in file[:-4].split('_'))

        table = df.values
        words = table[:, 0]
        # Locations are stored as text of the form "[start end]".
        token_locs = [tuple(int(n) for n in loc[1:-1].split()) for loc in table[:, 1]]
        token_labels = table[:, 2]

        for feature_num in case_feat_dict[case_num]:
            # look for the current feature among the token labels
            feat_locs, annotation = _collect_feature_spans(words, token_locs, token_labels, feature_num)

            row = [
                parse_id(pn_num, feature_num),
                parse_case_num(case_num),
                parse_pn_num(pn_num),
                parse_feature_num(feature_num),
                parse_annotation(annotation),
                parse_location(feat_locs),
            ]
            list_out.append(row)

    # Built once, after all files, so an empty folder still yields a (empty) dataframe.
    return pd.DataFrame(list_out, columns=col_names)

In [6]:
def save_simplified_data(tokens, locations, labels, path):
    """Write one note's simplified data to a .csv file at ``path``.

    The file gets a ``word,location,label`` header followed by one row per
    token. ``tokens``, ``locations`` and ``labels`` are iterated in parallel;
    each location is written as ``[start end]`` from its first two elements.
    An existing file at ``path`` is overwritten.
    """
    # newline='' is what the csv module requires so embedded newlines and line
    # endings are written correctly; the context manager closes the file even
    # if a write fails. UTF-8 matches the default used by pandas when reading.
    with open(path, 'w', newline='', encoding='utf-8') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(['word', 'location', 'label'])
        for word, loc, label in zip(tokens, locations, labels):
            csv_writer.writerow([word, '[' + str(loc[0]) + ' ' + str(loc[1]) + ']', label])

In [7]:
def parse_id(pn_num, feature_num):
    """Build the row id: the patient-note number and feature number joined by ``_``."""
    parts = (str(pn_num), str(feature_num))
    return '_'.join(parts)

def parse_case_num(case_num):
    """Coerce the case number to a built-in ``int``."""
    as_int = int(case_num)
    return as_int

def parse_pn_num(pn_num):
    """Coerce the patient-note number to a built-in ``int``."""
    as_int = int(pn_num)
    return as_int

def parse_feature_num(feature_num):
    """Coerce the feature number to a built-in ``int``."""
    as_int = int(feature_num)
    return as_int

def parse_annotation(annotation):
    """Render the annotations as text: ``'[]'`` when empty, otherwise ``str(annotation)``."""
    has_items = len(annotation) != 0
    return str(annotation) if has_items else '[]'

def parse_location(location):
    """Render spans as text in the form ``['start end', 'start end', ...]``.

    Only the first two elements of each span are used. An empty ``location``
    yields ``'[]'``.
    """
    if len(location) == 0:
        return '[]'
    quoted_spans = ["'" + str(span[0]) + ' ' + str(span[1]) + "'" for span in location]
    return "[" + ", ".join(quoted_spans) + "]"