In [37]:
import pandas as pd
import re, csv
from ast import literal_eval
from os import listdir
from os.path import isfile, join

In [38]:
# Input CSV files, given as relative paths; each one is loaded with pd.read_csv.
FEATS_PATH = 'data/features.csv'
NOTES_PATH = 'data/patient_notes.csv'
TRAIN_PATH = 'data/train.csv'
# Prefix for the per-note CSV files written by simplify_dataset().
OUTPUT_PATH = 'data/simplified/'

def loc_list_to_ints(loc_list):
    '''
    Flatten a list of location strings into a list of (start, end) int tuples.

    Each string holds one or more "start end" pairs separated by ";", e.g.
    ['1 2', '3 4;5 6'] -> [(1, 2), (3, 4), (5, 6)].

    Raises ValueError when a piece does not consist of exactly two
    whitespace-separated integers.
    '''
    # Split every entry on ";" first, then each piece on whitespace.
    pieces = (piece.split() for entry in loc_list for piece in entry.split(";"))
    return [(int(start), int(end)) for start, end in pieces]


def simplify_dataset():
    '''
    Build a simplified, per-note representation of the Kaggle train.csv data.

    For every pn_num found in train.csv one CSV file is written below
    OUTPUT_PATH, named <case_num>_<pn_num>.csv, with the header
    (word, location, label).

    The note text is currently tokenized character by character (the simplest
    option), which makes the "location" column redundant for now.

    A label of -1 means that no feature is annotated on the token. When several
    annotations cover the same token, the one processed last wins.
    '''
    # Local import: the notebook's import cell only pulls `listdir` from os.
    from os import makedirs

    df_train = pd.read_csv(TRAIN_PATH)
    df_notes = pd.read_csv(NOTES_PATH)

    # Each "location" cell is parsed from its literal form into a list of
    # location strings that loc_list_to_ints() understands.
    df_train["location_list"] = [literal_eval(x) for x in df_train["location"]]

    # Make sure the target directory exists before the first file is opened.
    makedirs(OUTPUT_PATH, exist_ok=True)

    unique_pn_nums = df_train.pn_num.unique()
    print((unique_pn_nums.shape))
    for pn_num in unique_pn_nums:
        train = df_train.loc[df_train['pn_num'] == pn_num]
        case_num = train.case_num.unique()[0]
        # NOTE(review): positional column 2 of patient_notes.csv is taken as the
        # note text — confirm against the file's header.
        pn_note = df_notes.loc[df_notes['pn_num'] == pn_num].values[0][2]
        pn_note_tokens = list(pn_note)

        # (start, end) offsets of every token inside the note.
        word_locs = []
        curr_loc = 0
        for word in pn_note_tokens:
            word_locs.append((curr_loc, curr_loc + len(word)))
            curr_loc += len(word)

        labels = [-1] * len(pn_note_tokens)
        for _, row in train.iterrows():
            feat_num = row['feature_num']
            for span_start, span_end in loc_list_to_ints(row['location_list']):
                for i, (tok_start, tok_end) in enumerate(word_locs):
                    # A token gets the label only when it lies entirely inside the span.
                    if span_start <= tok_start and span_end >= tok_end:
                        labels[i] = feat_num

        out_name = str(case_num) + '_' + str(pn_num) + '.csv'
        # newline='' is required by the csv module so that newline characters
        # inside quoted fields (tokens can be '\n' or '\r') are written intact;
        # utf-8 matches the default encoding pd.read_csv uses to read the file back.
        with open(join(OUTPUT_PATH, out_name), 'w', newline='', encoding='utf-8') as out_file:
            csv_writer = csv.writer(out_file)
            csv_writer.writerow(['word', 'location', 'label'])
            for word, loc, label in zip(pn_note_tokens, word_locs, labels):
                csv_writer.writerow([word, '[' + str(loc[0]) + ' ' + str(loc[1]) + ']', label])
                
# Write the simplified per-note CSV files; prints the shape of the unique pn_num array.
simplify_dataset() 

(1000,)


In [39]:

def parse_id(pn_num, feature_num):
    '''Build the row id string "<pn_num>_<feature_num>".'''
    # NOTE(review): no zero-padding is applied — confirm this matches the id format of train.csv.
    return '_'.join((str(pn_num), str(feature_num)))
def parse_case_num(case_num) -> int:
    '''Return case_num converted to a built-in int.'''
    return int(case_num)
def parse_pn_num(pn_num) -> int:
    '''Return pn_num converted to a built-in int.'''
    return int(pn_num)
def parse_feature_num(feature_num) -> int:
    '''Return feature_num converted to a built-in int.'''
    return int(feature_num)
def parse_annotation(annotation):
    '''
    Render the collected annotation texts as a string.

    An empty collection becomes '[]'; anything else is passed through str().
    '''
    return str(annotation) if len(annotation) else '[]'
def parse_location(location):
    '''
    Render a sequence of (start, end) pairs as a list-literal string.

    [(1, 2), (3, 4)] -> "['1 2', '3 4']"; an empty sequence -> '[]'.
    '''
    if len(location) == 0:
        return '[]'
    # Quote each "start end" pair, then let join() place the separators.
    quoted = ["'" + str(loc[0]) + ' ' + str(loc[1]) + "'" for loc in location]
    return '[' + ', '.join(quoted) + ']'
            
# Directory complexify_data() reads the simplified per-note CSV files from.
# NOTE(review): same value as OUTPUT_PATH — presumably the two are meant to stay in sync; confirm.
DATA_PATH = 'data/simplified/'

def _extract_feature_spans(tokens, locs, labels, feature_num):
    '''
    Collect every maximal run of consecutive tokens labelled feature_num.

    Returns (feat_locs, annotation): feat_locs holds one (start, end) pair per
    run, taken from the first token's start and the last token's end;
    annotation holds the concatenated token text of each run.
    '''
    feat_locs = []
    annotation = []
    run_start = None  # token index where the current run began; None outside a run

    def close_run(stop):
        # Record the run covering tokens[run_start:stop].
        feat_locs.append((locs[run_start][0], locs[stop - 1][1]))
        annotation.append(''.join(tokens[run_start:stop]))

    for idx, label in enumerate(labels):
        if label == feature_num:
            if run_start is None:
                run_start = idx
        elif run_start is not None:
            close_run(idx)
            run_start = None

    # A run that reaches the very last token has no following token to close
    # it, so it must be flushed here or the span would be lost.
    if run_start is not None:
        close_run(len(labels))

    return feat_locs, annotation


def complexify_data():
    '''
    Load all data in the simplified format (a list of tokens and a list of
    labels per note) and parse it into one common DataFrame resembling the
    one in train.csv.

    Every CSV file in DATA_PATH is expected to be named <case_num>_<pn_num>.csv.
    For each file, one output row is produced per feature_num that
    features.csv lists for that case, including features with no labelled
    token (their annotation and location are '[]').

    Returns a DataFrame with the columns
    id, case_num, pn_num, feature_num, annotation, location.
    '''
    col_names = ['id', 'case_num', 'pn_num', 'feature_num', 'annotation', 'location']

    # Dict whose keys are case_nums and whose values are the lists of all
    # feature_nums belonging to that case. Read once, not once per file.
    df_feats = pd.read_csv(FEATS_PATH)
    case_feat_dict = {}
    for case_num in df_feats.case_num.unique():
        case_num_feats = df_feats[df_feats['case_num'] == case_num].feature_num.unique()
        case_feat_dict[case_num] = list(case_num_feats)

    # Only CSV files are considered, and they are sorted so that the row order
    # of the result does not depend on the arbitrary order listdir() returns.
    data_files = sorted(
        f for f in listdir(DATA_PATH)
        if f.endswith('.csv') and isfile(join(DATA_PATH, f))
    )

    list_out = []
    for file in data_files:
        case_num, pn_num = (int(part) for part in file[:-4].split('_'))
        df = pd.read_csv(join(DATA_PATH, file))

        # Columns are taken by position: token text, '[start end]' string, label.
        tokens = df.iloc[:, 0].tolist()
        locs = [tuple(int(n) for n in loc[1:-1].split()) for loc in df.iloc[:, 1]]
        labels = df.iloc[:, 2].tolist()

        for feature_num in case_feat_dict[case_num]:
            # Search the token labels for occurrences of the current feature.
            feat_locs, annotation = _extract_feature_spans(tokens, locs, labels, feature_num)
            list_out.append([
                parse_id(pn_num, feature_num),
                parse_case_num(case_num),
                parse_pn_num(pn_num),
                parse_feature_num(feature_num),
                parse_annotation(annotation),
                parse_location(feat_locs),
            ])

    # Built once after the loop, so an empty directory still yields an
    # (empty) frame instead of an unbound variable.
    return pd.DataFrame(list_out, columns=col_names)


In [None]:
# Destination for the train.csv-like frame rebuilt from the simplified files.
RESTORED_PATH = 'data/train_restored.csv'
df_out = complexify_data()
# index=False keeps the DataFrame index out of the written file.
df_out.to_csv(RESTORED_PATH, index=False)

In [None]:
# Display the restored frame.
df_out

In [None]:
# Load the original training annotations from TRAIN_PATH.
df_test = pd.read_csv(TRAIN_PATH)


In [None]:
# Only the last expression of a cell is rendered automatically, so the dtypes
# must be displayed explicitly or they are evaluated and silently dropped.
display(df_test.dtypes)
df_test

In [None]:
# Spot-check one restored location against the original annotation.
CHECK_PN_NUM = 71432
CHECK_ROW = 8

df_test = df_test.loc[df_test['pn_num'] == CHECK_PN_NUM]
expected_row = df_test.iloc[CHECK_ROW]

# df_out holds every note in directory-listing order, so its row 8 is not the
# same (pn_num, feature_num) pair as row 8 of the filtered train frame.
# Look the matching row up by its keys instead of by position.
restored_row = df_out.loc[
    (df_out['pn_num'] == CHECK_PN_NUM)
    & (df_out['feature_num'] == expected_row['feature_num'])
].iloc[0]

expected_row['location'] == restored_row['location']