# Import Data from Parser Output

In [220]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)
pd.set_option("display.max_rows", 1000)

import sys
sys.path.append('../src')
from utils import save_as_pickle, load_from_pickle
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [202]:
df = pd.read_csv('../../data/parser_output/parser-output-pride-and-prejudice-jane-austen-2019-05-05.csv')

In [203]:
from ast import literal_eval
df['tokenized_sent'] = df['tokenized_sent'].apply(lambda x: literal_eval(x))
df['raw_utter_list'] = df['raw_utter_list'].apply(lambda x: literal_eval(x))

In [204]:
df.head(100)

Unnamed: 0.1,Unnamed: 0,para_index,tag,para,num_utterances,raw_utter_list,tokenized_sent,chapter_tag
0,0,0,narrative,By Jane Austen,0,[],[By Jane Austen],
1,1,1,narrative,CONTENTS,0,[],[CONTENTS],
2,2,2,narrative,Chapter 1,0,[],[Chapter 1],Chapter 1
3,3,3,narrative,Chapter 2,0,[],[Chapter 2],Chapter 2
4,4,4,narrative,Chapter 3,0,[],[Chapter 3],Chapter 3
5,5,5,narrative,Chapter 4,0,[],[Chapter 4],Chapter 4
6,6,6,narrative,Chapter 5,0,[],[Chapter 5],Chapter 5
7,7,7,narrative,Chapter 6,0,[],[Chapter 6],Chapter 6
8,8,8,narrative,Chapter 7,0,[],[Chapter 7],Chapter 7
9,9,9,narrative,Chapter 8,0,[],[Chapter 8],Chapter 8


# Heuristic Rule-Based Conversation Miner

In [205]:
def is_utterance(sent):
    return ((get_num_quotes(sent)%2==0) & (get_num_quotes(sent)!=0))

def get_num_quotes(sent):
    return sent.count('"')

def iob_tag_df(df):
    iob_format_list = []
    for i in df.index:
        for j, sent in enumerate(df.loc[i]['tokenized_sent']):
            para_index = df.loc[i]['para_index']
            chapter_tag = df.loc[i]['chapter_tag']
            if (is_utterance(sent)) & (j==0):
                iob_format_list.append((para_index, sent, 'B-OTHER', chapter_tag))
            elif (is_utterance(sent)) & (j>0):
                iob_format_list.append((para_index, sent, 'I-OTHER', chapter_tag))
            else:
                iob_format_list.append((para_index, sent, 'O', chapter_tag))
                
    df_final = pd.DataFrame()
    df_final['para_index'] = [x[0] for x in iob_format_list]
    df_final['sent'] = [x[1] for x in iob_format_list]
    df_final['label'] = [x[2] for x in iob_format_list]
    df_final['chapter_tag'] = [x[3] for x in iob_format_list]

    return df_final


def tag_b_start(df_final):
    some_dict = dict()

    count_consec_o = 0
    o_index = -1
    first_convo_index = -1
    utterindex_chapter = tuple()

    for i in df_final.index:
        if df_final.loc[i]['label'] == 'O':
            if o_index == i-1:
                count_consec_o += 1
                o_index = i
            else:
                o_index = i
                count_consec_o = 1
        elif df_final.loc[i]['label'] == 'B-OTHER':
            curr_utterindex_chapter = (i, df_final.loc[i]['chapter_tag'])
            # if number of consecutive narratives before utterance is at least 3
            if count_consec_o >= 3:
                utterindex_chapter = curr_utterindex_chapter
                some_dict[i] = 'B-START'
                first_convo_index = i
                count_consec_o = 0
            # if utterance is first utterance of a chapter
            elif curr_utterindex_chapter[1] != utterindex_chapter[1]:
                utterindex_chapter = curr_utterindex_chapter
                some_dict[i] = 'B-START'
                first_convo_index = i
                count_consec_o = 0
    
    for index in list(some_dict.keys()):
        df_final.loc[index, 'label'] = 'B-START'
    
    return df_final

def tag_i_start(df_final):
    some_dict = dict()
    prev_label = ''

    for i in df_final.index:
        curr_label = df_final.loc[i]['label']
        if curr_label == 'I-OTHER':
            if prev_label == 'B-START':
                some_dict[i] = 'I-START'
                prev_label = curr_label
            elif prev_label == 'I-START':
                some_dict[i] = 'I-START'
                prev_label = curr_label
            else:
                prev_label = curr_label
        else:
            prev_label = curr_label
    
    for index in list(some_dict.keys()):
        df_final.loc[index, 'label'] = 'I-START'
    
    return df_final

def convo_miner_heuristic(df):
    df_final = iob_tag_df(df)
    df_final = tag_b_start(df_final)
    df_final = tag_i_start(df_final)
    return df_final



In [206]:
# df_final = iob_tag_df(df)
# df_final = convo_miner_rb_heuristic(df_final)

In [207]:
df_final = convo_miner_heuristic(df)

In [208]:
somedict = dict()
prev_label = ''

for i in df_final.index:
    curr_label = df_final.loc[i]['label']
    if curr_label == 'I':
        if prev_label == 'B-START':
            somedict[i] = 'I-START'
            prev_label = curr_label
        elif prev_label == 'I-START':
            somedict[i] = 'I-START'
            prev_label = curr_label
        else:
            prev_label = curr_label
    else:
        prev_label = curr_label




In [209]:
df_final.loc[84:95]

Unnamed: 0,para_index,sent,label,chapter_tag
84,80,"""Is that his design in settling here?""",B-OTHER,Chapter 1
85,81,"""Design! Nonsense, how can you talk so! But it is very likely that he may fall in love with one of them, and therefore you must visit him as soon as he comes.""",B-OTHER,Chapter 1
86,82,"""I see no occasion for that. You and the girls may go, or you may send them by themslves, which perhaps will be still better, for as you are as handsome as any of them, Mr Bingley may like you the best of the party.""",B-OTHER,Chapter 1
87,83,"""My dear, you flatter me. I certainly have had my share of beauty, but I do not pretend to be anything extraordinary now. When a woman has five grown-up daughters, she ought to give over thinking of her own beauty.""",B-OTHER,Chapter 1
88,84,"""In such cases, a woman has not often much beauty to think of.""",B-OTHER,Chapter 1
89,85,"""But, my dear, you must indeed go and see Mr Bingley when he comes into the neighbourhood.""",B-OTHER,Chapter 1
90,86,"""It is more than I engage for, I assure you.""",B-OTHER,Chapter 1
91,87,"""But consider your daughters. Only think what an establishment it would be for one of them. Sir William and Lady Lucas are determined to go, merely on that account, for in general, you know, they visit no newcomers. Indeed you must go, for it will be impossible for us to visit him if you do not.""",B-OTHER,Chapter 1
92,88,"""You are over-scrupulous, surely. I dare say Mr Bingley will be very glad to see you; and I will send a few lines by you to assure him of my hearty consent to his marrying whichever he chooses of the girls; though I must throw in a good word for my little Lizzy.""",B-OTHER,Chapter 1
93,89,"""I desire you will do no such thing. Lizzy is not a bit better than the others; and I am sure she is not half so handsome as Jane, nor half so good-humoured as Lydia. But you are always giving her the preference.""",B-OTHER,Chapter 1


In [217]:
df_final.to_csv('../../data/predicted/heuristic_pred.csv')

# Evaluation

In [210]:
df_labeled = pd.read_csv('../../data/parsed-n-labeled-data/iob-labeled-sent-final-020519.csv', index_col=[0])

In [211]:
df_labeled.shape

(2747, 4)

In [212]:
index_to_exclude = [i for i in range(1,63)]
df_final = df_final.drop(index_to_exclude, axis=0)
df_final = df_final.reset_index()
df_final.shape

(2747, 5)

In [213]:
df_labeled['split_tag'].value_counts()

train         2214
test          281 
validation    252 
Name: split_tag, dtype: int64

In [214]:
df_labeled.head()

Unnamed: 0,para_index,sent,label,split_tag
0,0,By Jane Austen,O,train
1,1,Chapter 1,O,train
2,2,"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",O,train
3,3,"However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.",O,train
4,4,"""My dear Mr Bennet,""",B-START,train


In [215]:
df_final.head()

Unnamed: 0,index,para_index,sent,label,chapter_tag
0,0,0,By Jane Austen,O,
1,63,63,Chapter 1,O,Chapter 1
2,64,64,"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",O,Chapter 1
3,65,65,"However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.",O,Chapter 1
4,66,66,"""My dear Mr Bennet,""",B-START,Chapter 1


In [216]:
y_true = df_labeled[df_labeled['split_tag']=='test']['label'].values

In [91]:
len(y_true)

281

In [81]:
test_indices = list(df_labeled[df_labeled['split_tag']=='test']['label'].index.values)

In [82]:
df_test = df_final.loc[test_indices]

In [83]:
y_pred = df_test['label'].values

In [52]:
# y_true = df_labeled['label'].values

In [84]:
labels_sorted = sorted(set(df_labeled['label'].values))
labels_sorted

['B-OTHER', 'B-START', 'I-OTHER', 'I-START', 'O']

In [85]:
from sklearn.metrics import classification_report as classification_report_all

print(classification_report_all(y_true, y_pred,
                                labels=labels_sorted,
                                digits=3))

              precision    recall  f1-score   support

     B-OTHER      0.903     0.989     0.944        94
     B-START      0.909     0.500     0.645        20
     I-OTHER      0.745     1.000     0.854        41
     I-START      1.000     0.300     0.462        20
           O      1.000     1.000     1.000       106

   micro avg      0.911     0.911     0.911       281
   macro avg      0.911     0.758     0.781       281
weighted avg      0.924     0.911     0.896       281



In [86]:
labels_sorted.remove('O')

In [87]:
labels_sorted

['B-OTHER', 'B-START', 'I-OTHER', 'I-START']

In [88]:
from sklearn.metrics import classification_report as classification_report_all

print(classification_report_all(y_true, y_pred,
                                labels=labels_sorted,
                                digits=3))

              precision    recall  f1-score   support

     B-OTHER      0.903     0.989     0.944        94
     B-START      0.909     0.500     0.645        20
     I-OTHER      0.745     1.000     0.854        41
     I-START      1.000     0.300     0.462        20

   micro avg      0.857     0.857     0.857       175
   macro avg      0.889     0.697     0.726       175
weighted avg      0.878     0.857     0.834       175



In [90]:
len(y_true)

281

In [95]:
save_as_pickle(y_pred, 'heuristic_y_pred')

heuristic_y_pred saved!
