## Overview

* This file read in result of previous pipeline (pipe 0) which provides all dialog1, 2 in a single csv file.
* This file output sentence features.

In [2]:
import pandas as pd
from time import time
import csv
import os

In [2]:
import spacy

# spacy.prefer_gpu()
nlp = spacy.load("en_core_web_lg")

In [3]:
# doc = nlp(u'What I need is...somebody who listens to me.')
doc = nlp(u'What are you talking about?')

for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

What pobj about ADP []
are aux talking VERB []
you nsubj talking VERB []
talking ROOT talking VERB [are, you, about, ?]
about prep talking VERB [What]
? punct talking VERB []


In [1]:
print('|', list(nlp('Both a bit fast, are they? Yeah no kidding.').sents)[-1], '|')

NameError: name 'nlp' is not defined

In [23]:
DEPS = set(['nsubj', 'dobj', 'amod', 'advmod'])
SPEC = '-PRON-'

def _lemma(token, ent_map):
    if token.lemma_ == SPEC:
        return token.text.lower()
    l = token.lemma_
    if l in ent_map:
        return ent_map[l]
    return l

def _entities(doc):
    res = {}  # key: name, value: type of entity
    for ent in doc.ents:
        for e in ent.text.split():
            res[e] = ent.label_
    return res
#         print(ent.text, ent.start_char, ent.end_char, ent.label_)

def _get_tree(doc):
    res = []
    ents = _entities(doc)
    if len(list(doc)) < 4:
        return []
    for token in doc:
#         if token.dep_ in DEPS:
#         if token.tag_ == Noun:
        res.append(('POS', token.tag_, _lemma(token, ents), _lemma(token, ents), token.idx, token.is_stop,))
        res.append(('DEP', token.dep_, _lemma(token, ents), _lemma(token.head, ents), (token.head.idx - token.idx), token.is_stop,))
    return res

def parse(sent):
    # tuple format: (dep relation, word1, word2, relative pos)
    # break multiple sentences, question mark have priority, and later sentence have lower priority. 
    sents = list(nlp(sent).sents)
    res = []
    for s in sents[::-1]:
        if s.text.strip()[-1] == '?':
            q_res = _get_tree(s)
            if len(q_res) < 3:
                continue
            return q_res
        if not res:
            res = _get_tree(s)
        if len(res) < 3:
            res = []
    if len(res) < 3:
        return []
    return res

def parse_all(sent):
    # same as last parse function, except not break the sentence to pieces.
    res = _get_tree(nlp(sent))
    if len(res) < 3:
        return []
    return res

def test_sent_parser(sent, check_root):
    # sentence root is used to determine if the correct sentence is parsed.
    print('------')
    print('INPUT:|{}|\nROOT:|{}|'.format(sent, check_root))
    _res = parse(sent)
    print('RESULT:{}'.format(_res))
    if not check_root:  # indicate result has to be empty
        assert len(_res) == 0
        return  # end check
    for e in _res:
        if e[1] == 'ROOT':
            assert e[2] == check_root
            print()  # nice view format
            return
    if not len(_res):
        raise Exception('No return results.')
    raise Exception('Can not find root, function return result is broken: |{}|'.format(sent))

# # question sentence has priority (due to it is dialog1, so dialog 2 most likely response to question sentence).
test_sent_parser('What are you talking about? Kill Kevin if you can.', 'talk')

# question sentence has priority, however the sentence lack of structure and content
# Therefore second sentence has priority.
test_sent_parser('What? Kill Kevin if you can.', 'kill')

# Same as last one, should parse the first sentence.
test_sent_parser(u'Kill Kevin if you can. What?', 'kill')

# Both first and second sentence lacks of structure and content, empty is returned.
test_sent_parser(u'What? Kill Kevin.', check_root=None)

# Only the last sentence has structure.
test_sent_parser(u'What. Kill Kevin if you can.', check_root='kill')

# The question sentence has priority.
test_sent_parser(u'What. Kill Kevin if you can. What are you doing?', check_root='do')

# Only the first sentence has structure.
test_sent_parser(u'Kill Kevin if you can. What.', check_root='kill')

# both of the sentence has complete structure, the later sentence has priority
test_sent_parser(u'Kill Kevin if you can. What are you doing.', check_root='do')

------
INPUT:|What are you talking about? Kill Kevin if you can.|
ROOT:|talk|
RESULT:[('POS', 'WP', 'what', 'what', 0, True), ('DEP', 'pobj', 'what', 'about', 21, True), ('POS', 'VBP', 'be', 'be', 5, True), ('DEP', 'aux', 'be', 'talk', 8, True), ('POS', 'PRP', 'you', 'you', 9, True), ('DEP', 'nsubj', 'you', 'talk', 4, True), ('POS', 'VBG', 'talk', 'talk', 13, False), ('DEP', 'ROOT', 'talk', 'talk', 0, False), ('POS', 'IN', 'about', 'about', 21, True), ('DEP', 'prep', 'about', 'talk', -8, True), ('POS', '.', '?', '?', 26, False), ('DEP', 'punct', '?', 'talk', -13, False)]

------
INPUT:|What? Kill Kevin if you can.|
ROOT:|kill|
RESULT:[('POS', 'VB', 'kill', 'kill', 6, False), ('DEP', 'ROOT', 'kill', 'kill', 0, False), ('POS', 'NNP', 'PERSON', 'PERSON', 11, False), ('DEP', 'dobj', 'PERSON', 'kill', -5, False), ('POS', 'IN', 'if', 'if', 17, True), ('DEP', 'mark', 'if', 'can', 7, True), ('POS', 'PRP', 'you', 'you', 20, True), ('DEP', 'nsubj', 'you', 'can', 4, True), ('POS', 'MD', 'can', 'c

In [14]:
parse_all("Surely you don't expect me to believe that?")

[('POS', 'RB', 'surely', 'surely', 0, False),
 ('DEP', 'advmod', 'surely', 'expect', 17, False),
 ('POS', 'PRP', 'you', 'you', 7, True),
 ('DEP', 'nsubj', 'you', 'expect', 10, True),
 ('POS', 'VBP', 'do', 'do', 11, True),
 ('DEP', 'aux', 'do', 'expect', 6, True),
 ('POS', 'RB', 'not', 'not', 13, True),
 ('DEP', 'neg', 'not', 'expect', 4, True),
 ('POS', 'VB', 'expect', 'expect', 17, False),
 ('DEP', 'ROOT', 'expect', 'expect', 0, False),
 ('POS', 'PRP', 'me', 'me', 24, True),
 ('DEP', 'nsubj', 'me', 'believe', 6, True),
 ('POS', 'TO', 'to', 'to', 27, True),
 ('DEP', 'aux', 'to', 'believe', 3, True),
 ('POS', 'VB', 'believe', 'believe', 30, False),
 ('DEP', 'ccomp', 'believe', 'expect', -13, False),
 ('POS', 'DT', 'that', 'that', 38, True),
 ('DEP', 'dobj', 'that', 'believe', -8, True),
 ('POS', '.', '?', '?', 42, False),
 ('DEP', 'punct', '?', 'expect', -25, False)]

In [7]:
parse_all("I want to eat the apple.")

[('PRP', 'i', 'i', 0),
 ('nsubj', 'i', 'want', 2),
 ('VBP', 'want', 'want', 2),
 ('ROOT', 'want', 'want', 0),
 ('TO', 'to', 'to', 7),
 ('aux', 'to', 'eat', 3),
 ('VB', 'eat', 'eat', 10),
 ('xcomp', 'eat', 'want', -8),
 ('DT', 'the', 'the', 14),
 ('det', 'the', 'apple', 4),
 ('NN', 'apple', 'apple', 18),
 ('dobj', 'apple', 'eat', -8),
 ('.', '.', '.', 23),
 ('punct', '.', 'want', -21)]

In [15]:
dialogs = pd.read_csv('all_dialogs.csv')
dialogs

Unnamed: 0,show_id,char_name,char_id,dia1,dia2
0,DoctorWho,FOURTH DOCTOR,l8653,Get me the medical officer. Lieutenant Sulliva...,Human history.
1,DoctorWho,FOURTH DOCTOR,l8653,It's something that happened when we first met.,"I tell you, Brigadier, there's nothing to worr..."
2,DoctorWho,FOURTH DOCTOR,l8653,"This the patient, sir?",And stupid. If the square on the hypotenuse eq...
3,DoctorWho,FOURTH DOCTOR,l8653,"There you are. Now come along, Doctor, you're ...",Am I? Don't you mean the infirmary?
4,DoctorWho,FOURTH DOCTOR,l8653,"No, I do not mean the infirmary. I mean the si...",Not fit? I'm the Doctor.
5,DoctorWho,FOURTH DOCTOR,l8653,"No, Doctor, I'm the doctor and I say that you'...","You may be a doctor, but I'm the Doctor. The d..."
6,DoctorWho,FOURTH DOCTOR,l8653,"Look here, Doctor. You're not fit",Not fit? Not fit? Of course I'm fit. All syste...
7,DoctorWho,FOURTH DOCTOR,l8653,"I say, I don't think that can be right.","Both a bit fast, are they?"
8,DoctorWho,FOURTH DOCTOR,l8653,"Well, I","Still, must be patient. A new body's like a ne..."
9,DoctorWho,FOURTH DOCTOR,l8653,"Well, I really don't know.","Well, of course you don't. Why should you? You..."


In [16]:
PRE_PRINT = 100000
WORKLOAD = 1073044

def write_all_syntactic(df_dialogs, output_file, rm_sent_file, clip_sent=True):
    assert not os.path.exists(output_file)
    assert not os.path.exists(rm_sent_file)
    with open(output_file, 'a+') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow('sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop'.split(','))
    with open(rm_sent_file, 'a+') as csvfile2:
        writer = csv.writer(csvfile2)
        writer.writerow('show_id,char_name,char_id,dia1,dia2'.split(','))
    _time = time()
    _last_ind = 1
    for row in df_dialogs.itertuples():
        if ((row.Index < 10000) and (row.Index % 1000 == 0)) or (row.Index % PRE_PRINT) == 0:
            print(row.Index, '{:.2f} hours left.'.format((time()-_time)/(row.Index-_last_ind)*(WORKLOAD-row.Index)/60/60))
            _time = time()
            _last_ind = row.Index
        if type(row.dia1) != str or len(row.dia1) == 0:
            continue
        with open(output_file, 'a+') as csvfile:
            writer = csv.writer(csvfile)
            if clip_sent:
                heads = parse(row.dia1)
            else:
                heads = parse_all(row.dia1)
            if len(heads) == 0:
                 with open(rm_sent_file, 'a+') as csvfile2:
                    writer2 = csv.writer(csvfile2)
                    writer2.writerow([row.show_id, row.char_name, row.char_id, row.dia1, row.dia2])
            for head in heads:
                writer.writerow([row.Index, row.show_id, row.char_name, row.char_id, head[0], head[1], head[2], head[3], head[4], head[5]])

In [None]:
raise Exception('write all (with best guess sentence), this ops costs about 5 hours on an decent CPU machine.')

In [17]:
# write all sentence with filtering to most important sentence.
write_all_syntactic(dialogs, 'all_dialogs_heads.csv', 'all_dialogs_removed.csv')

0 -1.59 hours left.
1000 5.40 hours left.
2000 5.00 hours left.
3000 5.01 hours left.
4000 5.08 hours left.
5000 4.78 hours left.
6000 4.91 hours left.
7000 4.82 hours left.
8000 4.65 hours left.
9000 4.78 hours left.
100000 4.77 hours left.
200000 4.60 hours left.
300000 3.86 hours left.
400000 3.35 hours left.
500000 2.90 hours left.
600000 2.45 hours left.
700000 1.88 hours left.
800000 1.28 hours left.
900000 0.86 hours left.
1000000 0.36 hours left.


In [None]:
raise Exception('write all, this ops costs about 5 hours on an decent CPU machine.')

In [None]:
# write all sentence without sub-sentence filtering
# write_all_syntactic(dialogs, 'all_dialogs_heads_nolimit.csv', 'all_dialogs_removed_nolimit.csv', clip_sent=False)

In [3]:
dialog_heads = pd.read_csv('all_dialogs_heads.csv')
dialog_heads

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop
0,0,DoctorWho,FOURTH DOCTOR,l8653,POS,VB,come,come,60,False
1,0,DoctorWho,FOURTH DOCTOR,l8653,DEP,ROOT,come,come,0,False
2,0,DoctorWho,FOURTH DOCTOR,l8653,POS,IN,to,to,65,True
3,0,DoctorWho,FOURTH DOCTOR,l8653,DEP,prep,to,come,-5,True
4,0,DoctorWho,FOURTH DOCTOR,l8653,POS,DT,the,the,68,True
5,0,DoctorWho,FOURTH DOCTOR,l8653,DEP,det,the,lab,4,True
6,0,DoctorWho,FOURTH DOCTOR,l8653,POS,NN,lab,lab,72,False
7,0,DoctorWho,FOURTH DOCTOR,l8653,DEP,pobj,lab,to,-7,False
8,0,DoctorWho,FOURTH DOCTOR,l8653,POS,IN,at,at,76,True
9,0,DoctorWho,FOURTH DOCTOR,l8653,DEP,prep,at,come,-16,True


In [4]:
# -- direct inspection for sentence neighbourhoood (exact match of sentence head) --
def _head_string(row):
    return '{}|{}|{}'.format(row.head_pos, row.token, row.token_par)

dialog_heads['head_text'] = dialog_heads.apply(_head_string, axis=1)
dialog_heads

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
0,0,DoctorWho,FOURTH DOCTOR,l8653,POS,VB,come,come,60,False,VB|come|come
1,0,DoctorWho,FOURTH DOCTOR,l8653,DEP,ROOT,come,come,0,False,ROOT|come|come
2,0,DoctorWho,FOURTH DOCTOR,l8653,POS,IN,to,to,65,True,IN|to|to
3,0,DoctorWho,FOURTH DOCTOR,l8653,DEP,prep,to,come,-5,True,prep|to|come
4,0,DoctorWho,FOURTH DOCTOR,l8653,POS,DT,the,the,68,True,DT|the|the
5,0,DoctorWho,FOURTH DOCTOR,l8653,DEP,det,the,lab,4,True,det|the|lab
6,0,DoctorWho,FOURTH DOCTOR,l8653,POS,NN,lab,lab,72,False,NN|lab|lab
7,0,DoctorWho,FOURTH DOCTOR,l8653,DEP,pobj,lab,to,-7,False,pobj|lab|to
8,0,DoctorWho,FOURTH DOCTOR,l8653,POS,IN,at,at,76,True,IN|at|at
9,0,DoctorWho,FOURTH DOCTOR,l8653,DEP,prep,at,come,-16,True,prep|at|come


In [5]:
dialog_heads.head_text.value_counts()

.|.|.                        576174
,|,|,                        428302
PRP|you|you                  310320
PRP|i|i                      265113
.|?|?                        252663
ROOT|be|be                   219064
DT|the|the                   205609
VBZ|be|be                    180086
NNP|PERSON|PERSON            146516
TO|to|to                     146201
VBP|be|be                    145453
RB|not|not                   141320
punct|.|be                   136676
DT|a|a                       135013
PRP|it|it                    132420
CC|and|and                    92985
punct|,|be                    85277
WP|what|what                  79302
IN|of|of                      78788
PRP|we|we                     77222
DT|that|that                  74381
punct|?|be                    61287
VBD|be|be                     60514
IN|in|in                      57542
PRP|me|me                     56727
VBP|do|do                     53599
nsubj|it|be                   53510
DT|this|this                

In [8]:
# write them back with the extra column
dialog_heads.to_csv('all_dialogs_heads.csv', index=False)

## Clean Data for Matrix Multiplication of Sentence Neighbourhood

Let us play around to get some sentence neighourhood grouping.

The Matrix Multiplication is in next pipeline (pipe3). This section only clean and write data.

In [6]:
# Filter the sentence head features that has low frequency.
# we keep at >4 freqency features.

_count = 0
_len = dialog_heads.shape[0]
for text, freq in dialog_heads.head_text.value_counts().iteritems():
    if 5 < freq:
        _count += freq

print(_count/_len)  # precentage of data still got left after filtering.

0.9127074590081874


In [None]:
# remove all low frequency features and save to file
filtered_heads = dialog_heads[dialog_heads.groupby('head_text')['head_text'].transform('count').ge(5)]
filtered_heads

In [None]:
# write to file for next pipeline
assert not os.path.exists('all_dialogs_heads_filtered.csv')
filtered_heads.to_csv('all_dialogs_heads_filtered.csv', index=False)

In [None]:
filtered_heads.loc[filtered_heads.sent_id==617921]

In [None]:
filtered_heads.loc[filtered_heads.sent_id==562875]

In [None]:
dialogs.iloc[508079].dia1

In [None]:
dialogs.iloc[159581].dia1

In [None]:
parse(dialogs.iloc[159581].dia1)