In [1]:
import re
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
def read_sample(fn="../../data/test_sample.xlsx"):
    """Load an Excel workbook into a DataFrame.

    Parameters
    ----------
    fn : str
        Path passed to ``pd.read_excel``; defaults to
        ``../../data/test_sample.xlsx``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_excel(fn)`` with default options.
    """
    return pd.read_excel(fn)

def read_csv(fn):
    """Read a CSV file into a DataFrame, using its first column as the index.

    Parameters
    ----------
    fn : str or file-like
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(fn, index_col=0)

def reorder_text(x):
    """Join text pieces into one string, ordered by their sort keys.

    Parameters
    ----------
    x : sequence
        ``x[0]`` holds the sort keys and ``x[1]`` the text pieces, position
        by position.

    Returns
    -------
    str
        The pieces of ``x[1]`` arranged by ascending ``x[0]`` and joined with
        single spaces.
    """
    sort_keys, pieces = x[0], x[1]
    ascending_positions = np.argsort(sort_keys)
    return ' '.join(np.array(pieces)[ascending_positions])

def merge_aspect(x):
    """Flatten a collection of label lists into one list of unique labels.

    Parameters
    ----------
    x : iterable of iterables
        One inner iterable of (hashable) labels per sentence.

    Returns
    -------
    list
        Every distinct label, in the order it is first encountered. Using an
        insertion-ordered dict instead of a set keeps the result identical
        from run to run (set iteration order of strings depends on hash
        randomisation).
    """
    return list(dict.fromkeys(each for aspects in x for each in aspects))

def merge_sentiment(x):
    """Return the largest sentiment value found in a nested collection.

    Parameters
    ----------
    x : iterable of iterables
        Inner items are converted with ``int()`` before comparison.

    Returns
    -------
    int or float
        The maximum converted value, or ``float("-inf")`` when there is no
        value at all.
    """
    as_ints = (int(value) for sentence_values in x for value in sentence_values)
    return max(as_ints, default=float("-inf"))

def _merge_sentences_into_posts(data, top_k=None, with_aspects=False, with_sentiment=False):
    """Shared implementation of the ``merge_post_level*`` functions.

    Each row of ``data`` is one sentence whose ``id`` looks like
    ``<merge_id>-<sentence_id>``; rows sharing a ``merge_id`` are collapsed
    into one row of the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``id`` and ``text``; additionally ``group`` when
        ``top_k`` is given, ``ground_truth_aspect`` when ``with_aspects`` and
        ``sentiment`` when ``with_sentiment``. It is never modified.
    top_k : int or None
        When not None, only rows of the ``top_k`` most frequent ``group``
        values are kept before merging.
    with_aspects : bool
        Carry ``ground_truth_aspect`` along and add ``merged_aspects``.
    with_sentiment : bool
        Carry ``sentiment`` along and add ``merged_sentiment``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``merge_id``. Columns, in order: ``sentence_id`` (list),
        ``text``, the optional carried columns (lists), ``reordered_text``,
        ``group``, then the optional ``merged_*`` columns.
    """
    merge_id = "merge_id"
    sentence_id = "sentence_id"
    text = 'text'
    group = 'group'
    ground_truth_aspect = "ground_truth_aspect"
    sentiment = "sentiment"

    if top_k is not None:
        # Keep only the sentences belonging to the top_k most frequent groups.
        top_groups = data[group].value_counts().index[:top_k]
        data = data[data[group].isin(top_groups)]

    columns = [merge_id, sentence_id, text]
    if with_aspects:
        columns.append(ground_truth_aspect)
    if with_sentiment:
        columns.append(sentiment)

    # id = merge_id + '-' + sentence_id: everything before the last '-' names
    # the post, the last piece is the sentence's position inside it.
    # ``assign`` returns a new frame, so neither the caller's DataFrame nor a
    # filtered slice of it is written to.
    splitted_id = data['id'].str.split('-')
    sentences = data.assign(**{
        merge_id: splitted_id.apply(lambda parts: '-'.join(parts[:-1])),
        sentence_id: splitted_id.apply(lambda parts: parts[-1]).astype(int),
    })[columns]

    print("group")
    # One row per merge_id; every remaining column becomes a list of the
    # per-sentence values.
    agg_data = sentences.groupby([merge_id]).agg(lambda x: list(x))

    print("reorder")
    # The 'text' column is overwritten with (sentence_ids, texts) tuples, the
    # input shape reorder_text expects; it stays in the returned frame.
    agg_data[text] = list(zip(agg_data[sentence_id], agg_data[text]))
    agg_data['reordered_text'] = agg_data[text].apply(reorder_text)
    # The group is taken to be whatever precedes the first '-' of the merge id.
    agg_data[group] = agg_data.index.to_series().apply(lambda x: x.split('-')[0])

    if with_aspects:
        agg_data['merged_aspects'] = agg_data[ground_truth_aspect].apply(merge_aspect)
    if with_sentiment:
        agg_data['merged_sentiment'] = agg_data[sentiment].apply(merge_sentiment)

    return agg_data


def merge_post_level_top_k(data, top_k):
    """Merge sentences into posts, restricted to the ``top_k`` most frequent groups.

    Returns a DataFrame indexed by ``merge_id`` with the columns
    ``sentence_id``, ``text``, ``reordered_text`` and ``group``.
    """
    return _merge_sentences_into_posts(data, top_k=top_k)


def merge_post_level(data):
    """Merge sentences into posts for every group.

    Returns a DataFrame indexed by ``merge_id`` with the columns
    ``sentence_id``, ``text``, ``reordered_text`` and ``group``.
    """
    return _merge_sentences_into_posts(data)


def merge_post_level_and_aspect_top_k(data, top_k):
    """Merge sentences and their aspects into posts for the ``top_k`` most frequent groups.

    Same as ``merge_post_level_top_k`` plus the per-sentence
    ``ground_truth_aspect`` lists and their union in ``merged_aspects``.
    (This function used to be defined twice with identical bodies; only one
    definition is kept.)
    """
    return _merge_sentences_into_posts(data, top_k=top_k, with_aspects=True)


def merge_post_level_aspect_sentiment_top_k(data, top_k):
    """Merge sentences, aspects and sentiment into posts for the ``top_k`` most frequent groups.

    Adds ``merged_aspects`` (union of the sentence aspects) and
    ``merged_sentiment`` (largest sentence sentiment) to the post-level frame.
    """
    return _merge_sentences_into_posts(
        data, top_k=top_k, with_aspects=True, with_sentiment=True
    )


def merge_post_level_aspect_sentiment(data):
    """Merge sentences, aspects and sentiment into posts for every group.

    Adds ``merged_aspects`` (union of the sentence aspects) and
    ``merged_sentiment`` (largest sentence sentiment) to the post-level frame.
    """
    return _merge_sentences_into_posts(data, with_aspects=True, with_sentiment=True)

def process(x):
    """Parse a stringified list of labels into a list of unique labels.

    The first and last characters of ``x`` (the brackets) are dropped, the
    rest is split on commas, and surrounding whitespace and the characters
    ``'``, ``|`` and ``"`` are stripped from each piece. A piece that starts
    with word characters followed by ``-`` is reduced to those leading word
    characters; any other piece is kept as it is.

    Parameters
    ----------
    x : str
        Text such as ``"['not about']"`` or ``"['0']"``.

    Returns
    -------
    list of str
        Distinct labels in first-seen order. Empty pieces are skipped, so
        ``"[]"`` gives ``[]`` rather than ``['']``.
    """
    ans = {}
    for each in x[1:-1].split(','):
        words = each.strip().strip("'|\"")
        if not words:
            # An empty list ("[]") or a stray comma yields no label.
            continue
        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        m = re.match(r'(\w+)-', words)
        ans[m[1] if m is not None else words] = None
    return list(ans)

In [3]:
# train_fn = "../../data/lower/medical_sieve_training_set_merged.xlsx"
# test_fn = "../../data/lower/medical_sieve_test_set_merged.xlsx"


# train_data = read_sample(train_fn)
# test_data = read_sample(test_fn)

# cols = ['ground_truth_aspect', 'group', 'text', 'id']

# train_data = train_data[cols]
# test_data = test_data[cols]

# train_data = pd.concat([train_data, test_data], axis=0)

In [4]:
def read_from_bert_prediction(fn="../../data/bert_prediction/whole_5label+sentiment.csv"):
    """Read a prediction CSV and give its two label columns readable names.

    Parameters
    ----------
    fn : str or file-like
        CSV to read; its first column is used as the index. Defaults to the
        whole-corpus aspect + sentiment prediction file.

    Returns
    -------
    pandas.DataFrame
        The table with the column ``['pred']`` renamed to
        ``ground_truth_aspect`` and ``['1']`` renamed to ``sentiment``.
        Columns with other names are left untouched.
    """
    data = pd.read_csv(fn, index_col=0)
    return data.rename(columns={"['pred']": "ground_truth_aspect", "['1']": "sentiment"})

# Load the sentence-level prediction table that the rest of the notebook merges into posts.
train_data = read_from_bert_prediction()

  mask |= (ar1 == a)


In [5]:
# Preview the table as loaded; the label columns still hold stringified lists here.
train_data.head(2)

Unnamed: 0,aspect,course_of_problem,group,id,test,text,trainOrtest,treatment,ground_truth_aspect,sentiment
0,[],[],Ankle_Problems,Ankle_Problems-656172-3-1,[],"What I find weird is , even with no pain I am ...",test,[],['not about'],['0']
1,[],[],Ankle_Problems,Ankle_Problems-656172-3-2,[],I guess it will take awhile and more practice ...,test,[],['not about'],['0']


In [6]:
# Turn the stringified label columns into real Python lists.
for column in ("ground_truth_aspect", "sentiment"):
    train_data[column] = train_data[column].apply(process)

In [7]:
# Preview again: the label columns should now contain lists instead of strings.
train_data.head(2)

Unnamed: 0,aspect,course_of_problem,group,id,test,text,trainOrtest,treatment,ground_truth_aspect,sentiment
0,[],[],Ankle_Problems,Ankle_Problems-656172-3-1,[],"What I find weird is , even with no pain I am ...",test,[],[not about],[0]
1,[],[],Ankle_Problems,Ankle_Problems-656172-3-2,[],I guess it will take awhile and more practice ...,test,[],[not about],[0]


In [8]:
# Merge sentences into posts and report how long it took (about 610 s in the
# recorded run). perf_counter is a monotonic clock, so the measurement cannot
# be distorted by system clock adjustments during the run.
# To restrict the merge to the 10 most frequent groups, call
# merge_post_level_aspect_sentiment_top_k(train_data, 10) instead.
start = time.perf_counter()
merged_data = merge_post_level_aspect_sentiment(train_data)
print("elapsed: ", time.perf_counter() - start)

group
reorder
elapsed:  610.92222905159


In [9]:
# Preview the post-level table: one row per merge_id.
merged_data.head(2)

Unnamed: 0_level_0,sentence_id,text,ground_truth_aspect,sentiment,reordered_text,group,merged_aspects,merged_sentiment
merge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ACE_Inhibitors--11362-0,"[1, 2, 3, 4, 5, 6]","([1, 2, 3, 4, 5, 6], [Hi - I have just this mo...","[[not about], [not about], [not about], [not a...","[[0], [1], [0], [0], [0], [0]]",Hi - I have just this morning taken my first d...,ACE_Inhibitors,[not about],1
ACE_Inhibitors--11362-1,"[1, 2, 3, 4]","([1, 2, 3, 4], [I am in the same position as y...","[[not about], [not about], [not about], [not a...","[[0], [0], [0], [0]]","I am in the same position as you , I have Atri...",ACE_Inhibitors,[not about],0


# Encode Labels

In [10]:
def encode_label(multilabels, classes):
    """Binarize label collections against a fixed list of classes.

    Parameters
    ----------
    multilabels : iterable of iterables
        One collection of labels per sample.
    classes : list
        Passed to ``MultiLabelBinarizer(classes=...)``; fixes which labels
        get a column and in which order.

    Returns
    -------
    tuple
        ``(indicator_matrix, fitted_classes, fitted_binarizer)`` where
        ``indicator_matrix`` is the result of ``fit_transform`` and
        ``fitted_classes`` is the binarizer's ``classes_`` attribute.
    """
    mlb = MultiLabelBinarizer(classes=classes)
    indicator_matrix = mlb.fit_transform(multilabels)
    return indicator_matrix, mlb.classes_, mlb

# Only these five aspects get an indicator column; any other label found in
# merged_aspects (e.g. 'not about') is not among the binarizer's classes.
labels = ['access', 'costs', 'delays', 'errors', 'trusts']
encoded_labels, classes, binarizer = encode_label(merged_data['merged_aspects'], labels)

# Replace merged_aspects with the labels recovered from the encoding, as lists.
merged_data['merged_aspects'] = binarizer.inverse_transform(encoded_labels)
merged_data['merged_aspects'] = merged_data['merged_aspects'].apply(lambda x: list(x))

# Build the indicator frame directly on merged_data's index so the two tables
# are row-aligned from the start, instead of depending on a later cell to
# replace a default 0..n-1 index before they are concatenated.
encoded_labels = pd.DataFrame(encoded_labels, columns=labels, index=merged_data.index)

  .format(sorted(unknown, key=str)))


In [11]:
# Key the indicator columns by merged_data's index so both tables align row by row.
encoded_labels = encoded_labels.set_index(merged_data.index)

In [12]:
# Count the posts whose indicator row has more than one aspect set.
aspects_per_post = encoded_labels.values.sum(axis=1)
print("# of multi aspect: ", np.count_nonzero(aspects_per_post > 1))

# of multi aspect:  29372


In [13]:
# Append the indicator columns to the post-level table, side by side.
output_data = pd.concat([merged_data, encoded_labels], axis="columns")
output_data.head(2)

Unnamed: 0_level_0,sentence_id,text,ground_truth_aspect,sentiment,reordered_text,group,merged_aspects,merged_sentiment,access,costs,delays,errors,trusts
merge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ACE_Inhibitors--11362-0,"[1, 2, 3, 4, 5, 6]","([1, 2, 3, 4, 5, 6], [Hi - I have just this mo...","[[not about], [not about], [not about], [not a...","[[0], [1], [0], [0], [0], [0]]",Hi - I have just this morning taken my first d...,ACE_Inhibitors,[],1,0,0,0,0,0
ACE_Inhibitors--11362-1,"[1, 2, 3, 4]","([1, 2, 3, 4], [I am in the same position as y...","[[not about], [not about], [not about], [not a...","[[0], [0], [0], [0]]","I am in the same position as you , I have Atri...",ACE_Inhibitors,[],0,0,0,0,0,0


In [14]:
# Share of posts per aspect combination; the empty key stands for posts with no aspect.
a = (
    output_data['merged_aspects']
    .map(lambda aspects: ','.join(sorted(aspects)))
    .value_counts()
)
a / a.sum()

                                     0.849702
trusts                               0.045536
delays                               0.030032
errors                               0.027767
costs                                0.012561
access                               0.012077
errors,trusts                        0.004808
delays,trusts                        0.003392
access,trusts                        0.002279
delays,errors                        0.002240
costs,trusts                         0.001969
access,delays                        0.001201
access,errors                        0.000929
costs,errors                         0.000876
costs,delays                         0.000784
access,costs                         0.000781
delays,errors,trusts                 0.000610
access,errors,trusts                 0.000413
costs,errors,trusts                  0.000356
access,delays,trusts                 0.000350
access,costs,trusts                  0.000290
costs,delays,trusts               

In [15]:
def write_in_format_1(merged_data_fn, merged_data, index):
    """Write the lower-cased ``reordered_text`` column as a tab-separated file.

    Only the one column that is written gets copied (the input frame is left
    untouched), and the vectorised ``.str.lower()`` passes missing values
    through instead of failing on them.

    Parameters
    ----------
    merged_data_fn : str or file-like
        Destination handed to ``DataFrame.to_csv``.
    merged_data : pandas.DataFrame
        Must contain a ``reordered_text`` column of strings.
    index : bool
        Whether to write the frame's index as the first field.
    """
    lowered_text = merged_data["reordered_text"].str.lower().to_frame()
    lowered_text.to_csv(merged_data_fn, sep="\t", header=False, index=index)
    
def write_in_format_meta(merged_data_fn, merged_data, index):
    """Write the metadata columns of the post-level table as a tab-separated file.

    Parameters
    ----------
    merged_data_fn : str or file-like
        Destination handed to ``DataFrame.to_csv``.
    merged_data : pandas.DataFrame
        Must contain every column listed in ``meta_columns`` below.
    index : bool
        Whether to write the frame's index as the first field.
    """
    meta_columns = [
        "group",
        "reordered_text",
        "merged_aspects",
        "access",
        "costs",
        "delays",
        "errors",
        "trusts",
        "merged_sentiment",
    ]
    # A header row is written; only the listed columns are emitted, in this order.
    merged_data.to_csv(merged_data_fn, sep="\t", columns=meta_columns, index=index)

In [16]:
print("writing in format meta")
# Metadata table for the whole corpus: tab-separated, with header and index.
write_in_format_meta("../../data/bert_prediction/whole_corpus_meta.csv", output_data, True)
# Lower-cased post text only: no header, no index.
write_in_format_1("../../data/bert_prediction/whole_corpus.csv", output_data, False)

writing in format meta


# Sample data

Draw samples of the corpus that are stratified by group, so that each group keeps its share of the posts.

In [17]:
# Write a 10% sample of the whole corpus, stratified by group, in both output
# formats. train_test_split is imported in the notebook's import cell.
meta = pd.read_csv("../../data/bert_prediction/whole_corpus_meta.csv", sep='\t')

# Only the 10% "test" side of the split is written; random_state pins the sample.
train_meta, test_meta = train_test_split(meta, test_size=0.1, stratify=meta['group'], random_state=42)
test_meta = test_meta.set_index('merge_id')
write_in_format_1("../../data/bert_prediction/whole_corpus_tiny.csv", test_meta, False)
write_in_format_meta("../../data/bert_prediction/whole_corpus_meta_tiny.csv", test_meta, True)

In [29]:
# Visualisation subset: keep five named groups, then write a stratified half of their posts.
meta = pd.read_csv("../../data/bert_prediction/whole_corpus_meta.csv", sep='\t')

# NOTE: `cols` holds group names, not column names.
cols = [
    'Abdominal_Disorders',
    'Cataract',
    'Gallbladder_Problems',
    'Irritable_Bowel_Syndrome',
    'Prostate_Problems',
]
meta = meta[meta['group'].isin(cols)]

train_meta, test_meta = train_test_split(
    meta,
    test_size=0.5,
    stratify=meta['group'],
    random_state=42,
)
test_meta = test_meta.set_index('merge_id')

write_in_format_1("../../data/bert_prediction/whole_corpus_top_5_viz.csv", test_meta, False)
write_in_format_meta("../../data/bert_prediction/whole_corpus_meta_top_5_viz.csv", test_meta, True)


In [30]:
# (rows, columns) of the visualisation sample produced by the previous cell.
test_meta.shape

(52951, 9)