In [1]:
import pandas as pd
import nltk
import os
import csv
import numpy as np
import re
from collections import OrderedDict, Counter
from tqdm import tqdm
from IPython.core.debugger import Tracer; debug_here = Tracer()

# Load dataset

In [2]:
data = pd.read_csv('../project/dm_text.csv', dtype={'relation': str, 'dm_span': str, 'arg1_span':str, 'arg1_text':str, 
                                                    'arg2_span':str, 'arg2_text':str})
len(data)

100982

# Vectorize input

In [3]:
def simple_char_embedding(word):
    """ Embeds a word as count vector of characters
    """
    
    wdvec = [0]*127
    for c in word:
        if ord(c) < 127:
            wdvec[ord(c)] += 1
    
    return wdvec

# Mean, max length of text
def word_len(text):
    if not isinstance(text, str):
        return 0
    else:
        return len(text.split())

In [6]:
arg1_lens = data['arg1_text'].map(word_len).tolist()
arg2_lens = data['arg2_text'].map(word_len).tolist()
n_words = arg1_lens + arg2_lens
print(np.mean(n_words))
print(np.std(n_words))
print(max(n_words))

print("Rows with >1000 words: {}/{} ({:%})".format(len([n for n in n_words if n > 1000]), len(n_words), 
                                                   len([n for n in n_words if n > 1000])/len(n_words)))

21.2370868075
31.6925938275
1104
Rows with >1000 words: 34/201964 (0.016835%)


In [13]:
def vectorize(arg1, dm, arg2):
    
    if isinstance(arg1, str):
        arg1vec = np.array([simple_char_embedding(w) for w in arg1.strip().split()])
    else:
        arg1vec = np.zeros(127,1)
        
    dmvec = np.array([simple_char_embedding(w) for w in dm.strip().split()])
    
    if isinstance(arg2, str):
        arg2vec = np.array([simple_char_embedding(w) for w in arg2.strip().split()])
    else:
        arg2vec = np.zeros(127,1)
    
    ones = np.ones(127)
    
    vec = np.vstack([arg1vec, ones, dmvec, ones, arg2vec])
#     full_vec = np.pad(vec, ((0,0),(0,1000-vec.shape[1])), 'constant', constant_values=0)
#     return full_vec
    return vec

In [15]:
len(data)

100982

In [14]:
# Vectorize input
text_input = list(zip(data['arg1_text'], data['dm_text'], data['arg2_text']))
X = np.zeros((127,1000,len(data)))

for i, (a1, d, a2) in enumerate(tqdm(text_input[:100])):
    X[:,:,i] = vectorize(a1, d, a2)
    
# X = list(map(vectorize, zip(data['arg1_text'], data['dm_text'], data['arg2_text'])))
# X = np.asarray(arrs)
X.shape

MemoryError: 

In [12]:
X

array([ array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
       array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
       array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
       array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
  

# Stats on dataset

In [32]:
data

Unnamed: 0,relation,dm_span,dm_text,arg1_span,arg1_text,arg2_span,arg2_text,full_text,full_text_preprocessed,discourse_connective
0,Explicit,"[[2457, 2460]]",and,"[[2280, 2455]]",mourning for the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors,"[[2461, 2517]]",the successful son wishes his embarrassing siblings dead,Mourning for the father is overshadowed by the shame of burying him in a pauper's grave.The family moves to another house at night to conceal shabby belongings from neighbors And the successful son wishes his embarrassing siblings dead,mourning for the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors and the successful son wishes his embarrassing siblings dead,True
1,Explicit,"[[4298, 4309]]",accordingly,"[[4119, 4165]]","now , the push is on for more-distinctive shows","[[4311, 4481]]",cnn is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most `` differentiated '' programs in viewers ' minds,"Now, the push is on for more-distinctive shows accordingly CNN is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most ""differentiated"" programs in viewers' minds","now , the push is on for more-distinctive shows accordingly cnn is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most `` differentiated '' programs in viewers ' minds",True
2,Explicit,"[[535, 546]]",accordingly,"[[339, 533]]",nonperforming assets rose to slightly more than $ 700 million from $ 516 million between june and september.approximately 85 % of the total consisted of nonperforming commercial real estate assets,"[[548, 662]]",cityfed estimated that it will provide between $ 85 million and $ 110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $700 million from $516 million between June and September.Approximately 85% of the total consisted of nonperforming commercial real estate assets accordingly CityFed estimated that it will provide between $85 million and $110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $ 700 million from $ 516 million between june and september.approximately 85 % of the total consisted of nonperforming commercial real estate assets accordingly cityfed estimated that it will provide between $ 85 million and $ 110 million for credit losses in the third quarter,True
3,Explicit,"[[1298, 1309]]",accordingly,"[[1181, 1237]]",the outlook for natural gas is better than it is for oil,"[[1239, 1297]]",and have shifted their exploration and development budgets,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly,True
4,Explicit,"[[5776, 5787]]",accordingly,"[[5555, 5677]]","to hire six people who would mirror the actual jury demographically , sit in on the trial and report their reactions to him","[[5719, 5775]]",who had the chance to tilt their next day 's presentation,"to hire six people who would mirror the actual jury demographically, sit in on the trial and report their reactions to him who had the chance to tilt their next day's presentation accordingly","to hire six people who would mirror the actual jury demographically , sit in on the trial and report their reactions to him who had the chance to tilt their next day 's presentation accordingly",True
5,Explicit,"[[2133, 2144]]",accordingly,"[[1973, 2131]]","the received darwinian wisdom of the day said that animals living so long ago must be simple in design , limited in scope and ancestral to contemporary species","[[2146, 2305]]",the hidebound traditionalist reconstructed hypothetical organisms from the burgess fossils in such a way that they could be shoehorned into familiar categories,"The received Darwinian wisdom of the day said that animals living so long ago must be simple in design, limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the Burgess fossils in such a way that they could be shoehorned into familiar categories","the received darwinian wisdom of the day said that animals living so long ago must be simple in design , limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the burgess fossils in such a way that they could be shoehorned into familiar categories",True
6,Explicit,"[[925, 937]]",additionally,"[[9, 352]]","crossland savings bank 's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because crossland may not meet the new government capital criteria effective dec. 7 . in composite trading on the new york stock exchange friday , crossland closed at $ 5.25 , down $ 1.875 , a 26 % decline","[[939, 1015]]","crossland reported a third-quarter loss of $ 175.5 million , or $ 13.44 a share","Crossland Savings Bank's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because Crossland may not meet the new government capital criteria effective Dec. 7. In composite trading on the New York Stock Exchange Friday, Crossland closed at $5.25, down $1.875, a 26% decline additionally Crossland reported a third-quarter loss of $175.5 million, or $13...","crossland savings bank 's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because crossland may not meet the new government capital criteria effective dec. 7 . in composite trading on the new york stock exchange friday , crossland closed at $ 5.25 , down $ 1.875 , a 26 % decline additionally crossland reported a third-quarter loss of $ 175.5 million , or $ 13.44 a share",True
7,Explicit,"[[3242, 3254]]",additionally,"[[3085, 3240]]",its $ 110 million provision for credit losses and $ 11 million provision for other real estate owned is related to weakness in the arizona real estate market,"[[3287, 3396]]",it downgraded valley national 's senior debt and confirmed the company 's commercial paper rating of `` not prime,"its $110 million provision for credit losses and $11 million provision for other real estate owned is related to weakness in the Arizona real estate market additionally it downgraded Valley National's senior debt and confirmed the company's commercial paper rating of ""not prime",its $ 110 million provision for credit losses and $ 11 million provision for other real estate owned is related to weakness in the arizona real estate market additionally it downgraded valley national 's senior debt and confirmed the company 's commercial paper rating of `` not prime,True
8,Explicit,"[[755, 767]]",additionally,"[[390, 753]]","for the full fiscal year , varian posted a 13 % profit rise to $ 31.5 million , or $ 1.53 a share , up from $ 27.8 million , or $ 1.27 a share , last year.sales for the year rose almost 15 % to $ 1.34 billion from $ 1.17 billion last year . a profit last year in both the quarter and year included a net gain of $ 9.6 million , or 44 cents a share , from the sale of a division","[[769, 878]]","the full-year profit last year reflected an after-tax restructuring charge of $ 22.8 million , or $ 1.04 a share","For the full fiscal year, Varian posted a 13% profit rise to $31.5 million, or $1.53 a share, up from $27.8 million, or $1.27 a share, last year.Sales for the year rose almost 15% to $1.34 billion from $1.17 billion last year. A profit last year in both the quarter and year included a net gain of $9.6 million, or 44 cents a share, from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $22.8 million, or $1.04 a share","for the full fiscal year , varian posted a 13 % profit rise to $ 31.5 million , or $ 1.53 a share , up from $ 27.8 million , or $ 1.27 a share , last year.sales for the year rose almost 15 % to $ 1.34 billion from $ 1.17 billion last year . a profit last year in both the quarter and year included a net gain of $ 9.6 million , or 44 cents a share , from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $ 22.8 million , or $ 1.04 a share",True
9,Explicit,"[[5142, 5154]]",additionally,"[[4964, 5140]]","contributing to the market 's reserved stance was the release later in the day of new data on the health of the u.s. economy , in the form of the u.s. index of leading indicators","[[5156, 5230]]",the end of the month position-squaring might have also played a minor role,"Contributing to the market's reserved stance was the release later in the day of new data on the health of the U.S. economy, in the form of the U.S. index of leading indicators additionally the end of the month position-squaring might have also played a minor role","contributing to the market 's reserved stance was the release later in the day of new data on the health of the u.s. economy , in the form of the u.s. index of leading indicators additionally the end of the month position-squaring might have also played a minor role",True


In [34]:
# Pos/negative split
print("Number of positive examples: {}".format(len(data[data['discourse_connective']==True])))
print("Number of negative examples: {}".format(len(data[data['discourse_connective']==False])))

Number of positive examples: 18205
Number of negative examples: 82823


In [35]:
# Pos/neg per discourse marker
pos_counter = Counter()
neg_counter = Counter()

for dm in dms:
    n_pos = len(data[(data['dm_text']==dm) & (data['discourse_connective']==True)])
    n_neg = len(data[(data['dm_text']==dm) & (data['discourse_connective']==False)])
    
    pos_counter[dm] = n_pos
    neg_counter[dm] = n_neg
    
    print("{}: {} pos, {} neg".format(dm, n_pos, n_neg))

accordingly: 5 pos, 3 neg
additionally: 7 pos, 1 neg
after: 575 pos, 694 neg
afterward: 10 pos, 6 neg
also: 1706 pos, 575 neg
alternatively: 6 pos, 0 neg
although: 326 pos, 80 neg
and: 2946 pos, 11491 neg
as: 738 pos, 16891 neg
as a result: 75 pos, 68 neg
as an alternative: 2 pos, 4 neg
as if: 16 pos, 15 neg
as long as: 24 pos, 23 neg
as soon as: 20 pos, 16 neg
as though: 5 pos, 4 neg
as well: 6 pos, 126 neg
because: 856 pos, 629 neg
before: 325 pos, 351 neg
before and after: 1 pos, 1 neg
besides: 19 pos, 12 neg
but: 3268 pos, 1659 neg
by comparison: 11 pos, 5 neg
by contrast: 27 pos, 6 neg
by then: 6 pos, 6 neg
consequently: 10 pos, 2 neg
conversely: 2 pos, 2 neg
earlier: 15 pos, 266 neg
either or: 4 pos, 0 neg
else: 1 pos, 99 neg
except: 10 pos, 54 neg
finally: 31 pos, 53 neg
for: 3 pos, 8063 neg
for example: 196 pos, 99 neg
for instance: 98 pos, 56 neg
further: 9 pos, 205 neg
furthermore: 11 pos, 5 neg
hence: 4 pos, 6 neg
however: 485 pos, 149 neg
if: 1217 pos, 2800 neg
if and when:

# Remove multi-span discourse markers

In [36]:
data[data['dm_text']=='on the one hand on the other hand']

Unnamed: 0,relation,dm_span,dm_text,arg1_span,arg1_text,arg2_span,arg2_text,full_text,full_text_preprocessed,discourse_connective
14208,Explicit,"[[2046, 2061], [2133, 2150]]",on the one hand on the other hand,"[[2079, 2130]]",it would be misguided to sell into `` a classic panic,"[[2152, 2203]]",it 's not necessarily a good time to jump in and buy,"on the one hand it would be misguided to sell into ""a classic panic on the other hand it's not necessarily a good time to jump in and buy",on the one hand it would be misguided to sell into `` a classic panic on the other hand it 's not necessarily a good time to jump in and buy,True


In [7]:
# Identify discourse markers that have more than one span
def mult_spans(text):
    if isinstance(text, str) and len(text.split(',')) > 2:
#         return False
        return True
    else:
        return False
#         return True
    
mask = data['dm_span'].map(mult_spans)
sorted(set(data[mask]['dm_text']))

['either or', 'if then', 'neither nor', 'on the one hand on the other hand']

In [8]:
# Remove rows that have multi-span discourse markers (feature for later)

def mult_spans(text):
    if isinstance(text, str) and len(text.split(',')) > 2:
        return False
    else:
        return True
    
mask = data['dm_span'].map(mult_spans)
data = data[mask]
len(data)

100982

In [9]:
data.to_csv('../project/dm_text.csv', index=False)

In [10]:
# Load list of dms
with open('../project/pdtb_discourse_connectives.txt') as f:
    dms = f.read().splitlines()
len(dms)

100

In [11]:
multispan_dm = ['either or', 'if then', 'neither nor', 'on the one hand on the other hand']
for dm in multispan_dm:
    dms.remove(dm)
    
len(dms)

96

In [12]:
with open('../project/pdtb_discourse_connectives_no_multispan.txt', 'w') as f:
    for dm in dms:
        f.write(dm + '\n')

# Preprocess full text

In [11]:
# Preprocess
data['full_text_preprocessed'] = data['full_text'].map(preprocess)
data

Unnamed: 0,relation,dm_span,dm_text,arg1_span,arg1_text,arg2_span,arg2_text,full_text,full_text_preprocessed
0,Explicit,"[[2457, 2460]]",and,"[[2280, 2455]]",mourning for the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors,"[[2461, 2517]]",the successful son wishes his embarrassing siblings dead,Mourning for the father is overshadowed by the shame of burying him in a pauper's grave.The family moves to another house at night to conceal shabby belongings from neighbors And the successful son wishes his embarrassing siblings dead,mourning for the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors and the successful son wishes his embarrassing siblings dead
1,Explicit,"[[4298, 4309]]",accordingly,"[[4119, 4165]]","now , the push is on for more-distinctive shows","[[4311, 4481]]",cnn is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most `` differentiated '' programs in viewers ' minds,"Now, the push is on for more-distinctive shows accordingly CNN is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most ""differentiated"" programs in viewers' minds","now , the push is on for more-distinctive shows accordingly cnn is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most `` differentiated '' programs in viewers ' minds"
2,Explicit,"[[535, 546]]",accordingly,"[[339, 533]]",nonperforming assets rose to slightly more than $ 700 million from $ 516 million between june and september.approximately 85 % of the total consisted of nonperforming commercial real estate assets,"[[548, 662]]",cityfed estimated that it will provide between $ 85 million and $ 110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $700 million from $516 million between June and September.Approximately 85% of the total consisted of nonperforming commercial real estate assets accordingly CityFed estimated that it will provide between $85 million and $110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $ 700 million from $ 516 million between june and september.approximately 85 % of the total consisted of nonperforming commercial real estate assets accordingly cityfed estimated that it will provide between $ 85 million and $ 110 million for credit losses in the third quarter
3,Explicit,"[[1298, 1309]]",accordingly,"[[1181, 1237]]",the outlook for natural gas is better than it is for oil,"[[1239, 1297]]",and have shifted their exploration and development budgets,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly
4,Explicit,"[[5776, 5787]]",accordingly,"[[5555, 5677]]","to hire six people who would mirror the actual jury demographically , sit in on the trial and report their reactions to him","[[5719, 5775]]",who had the chance to tilt their next day 's presentation,"to hire six people who would mirror the actual jury demographically, sit in on the trial and report their reactions to him who had the chance to tilt their next day's presentation accordingly","to hire six people who would mirror the actual jury demographically , sit in on the trial and report their reactions to him who had the chance to tilt their next day 's presentation accordingly"
5,Explicit,"[[2133, 2144]]",accordingly,"[[1973, 2131]]","the received darwinian wisdom of the day said that animals living so long ago must be simple in design , limited in scope and ancestral to contemporary species","[[2146, 2305]]",the hidebound traditionalist reconstructed hypothetical organisms from the burgess fossils in such a way that they could be shoehorned into familiar categories,"The received Darwinian wisdom of the day said that animals living so long ago must be simple in design, limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the Burgess fossils in such a way that they could be shoehorned into familiar categories","the received darwinian wisdom of the day said that animals living so long ago must be simple in design , limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the burgess fossils in such a way that they could be shoehorned into familiar categories"
6,Explicit,"[[925, 937]]",additionally,"[[9, 352]]","crossland savings bank 's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because crossland may not meet the new government capital criteria effective dec. 7 . in composite trading on the new york stock exchange friday , crossland closed at $ 5.25 , down $ 1.875 , a 26 % decline","[[939, 1015]]","crossland reported a third-quarter loss of $ 175.5 million , or $ 13.44 a share","Crossland Savings Bank's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because Crossland may not meet the new government capital criteria effective Dec. 7. In composite trading on the New York Stock Exchange Friday, Crossland closed at $5.25, down $1.875, a 26% decline additionally Crossland reported a third-quarter loss of $175.5 million, or $13...","crossland savings bank 's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because crossland may not meet the new government capital criteria effective dec. 7 . in composite trading on the new york stock exchange friday , crossland closed at $ 5.25 , down $ 1.875 , a 26 % decline additionally crossland reported a third-quarter loss of $ 175.5 million , or $ 13.44 a share"
7,Explicit,"[[3242, 3254]]",additionally,"[[3085, 3240]]",its $ 110 million provision for credit losses and $ 11 million provision for other real estate owned is related to weakness in the arizona real estate market,"[[3287, 3396]]",it downgraded valley national 's senior debt and confirmed the company 's commercial paper rating of `` not prime,"its $110 million provision for credit losses and $11 million provision for other real estate owned is related to weakness in the Arizona real estate market additionally it downgraded Valley National's senior debt and confirmed the company's commercial paper rating of ""not prime",its $ 110 million provision for credit losses and $ 11 million provision for other real estate owned is related to weakness in the arizona real estate market additionally it downgraded valley national 's senior debt and confirmed the company 's commercial paper rating of `` not prime
8,Explicit,"[[755, 767]]",additionally,"[[390, 753]]","for the full fiscal year , varian posted a 13 % profit rise to $ 31.5 million , or $ 1.53 a share , up from $ 27.8 million , or $ 1.27 a share , last year.sales for the year rose almost 15 % to $ 1.34 billion from $ 1.17 billion last year . a profit last year in both the quarter and year included a net gain of $ 9.6 million , or 44 cents a share , from the sale of a division","[[769, 878]]","the full-year profit last year reflected an after-tax restructuring charge of $ 22.8 million , or $ 1.04 a share","For the full fiscal year, Varian posted a 13% profit rise to $31.5 million, or $1.53 a share, up from $27.8 million, or $1.27 a share, last year.Sales for the year rose almost 15% to $1.34 billion from $1.17 billion last year. A profit last year in both the quarter and year included a net gain of $9.6 million, or 44 cents a share, from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $22.8 million, or $1.04 a share","for the full fiscal year , varian posted a 13 % profit rise to $ 31.5 million , or $ 1.53 a share , up from $ 27.8 million , or $ 1.27 a share , last year.sales for the year rose almost 15 % to $ 1.34 billion from $ 1.17 billion last year . a profit last year in both the quarter and year included a net gain of $ 9.6 million , or 44 cents a share , from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $ 22.8 million , or $ 1.04 a share"
9,Explicit,"[[5142, 5154]]",additionally,"[[4964, 5140]]","contributing to the market 's reserved stance was the release later in the day of new data on the health of the u.s. economy , in the form of the u.s. index of leading indicators","[[5156, 5230]]",the end of the month position-squaring might have also played a minor role,"Contributing to the market's reserved stance was the release later in the day of new data on the health of the U.S. economy, in the form of the U.S. index of leading indicators additionally the end of the month position-squaring might have also played a minor role","contributing to the market 's reserved stance was the release later in the day of new data on the health of the u.s. economy , in the form of the u.s. index of leading indicators additionally the end of the month position-squaring might have also played a minor role"


In [12]:
data.to_csv('../project/dm_text.csv', index=False)

# Find negative examples

In [3]:
# Load list of dms
with open('../project/pdtb_discourse_connectives.txt') as f:
    dms = f.read().splitlines()
len(dms)

100

In [5]:
pd.set_option('display.max_colwidth', 999)

In [13]:
data

Unnamed: 0,relation,dm_span,dm_text,arg1_span,arg1_text,arg2_span,arg2_text,full_text,full_text_preprocessed
0,Explicit,"[[2457, 2460]]",and,"[[2280, 2455]]",mourning for the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors,"[[2461, 2517]]",the successful son wishes his embarrassing siblings dead,Mourning for the father is overshadowed by the shame of burying him in a pauper's grave.The family moves to another house at night to conceal shabby belongings from neighbors And the successful son wishes his embarrassing siblings dead,mourning for the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors and the successful son wishes his embarrassing siblings dead
1,Explicit,"[[4298, 4309]]",accordingly,"[[4119, 4165]]","now , the push is on for more-distinctive shows","[[4311, 4481]]",cnn is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most `` differentiated '' programs in viewers ' minds,"Now, the push is on for more-distinctive shows accordingly CNN is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most ""differentiated"" programs in viewers' minds","now , the push is on for more-distinctive shows accordingly cnn is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most `` differentiated '' programs in viewers ' minds"
2,Explicit,"[[535, 546]]",accordingly,"[[339, 533]]",nonperforming assets rose to slightly more than $ 700 million from $ 516 million between june and september.approximately 85 % of the total consisted of nonperforming commercial real estate assets,"[[548, 662]]",cityfed estimated that it will provide between $ 85 million and $ 110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $700 million from $516 million between June and September.Approximately 85% of the total consisted of nonperforming commercial real estate assets accordingly CityFed estimated that it will provide between $85 million and $110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $ 700 million from $ 516 million between june and september.approximately 85 % of the total consisted of nonperforming commercial real estate assets accordingly cityfed estimated that it will provide between $ 85 million and $ 110 million for credit losses in the third quarter
3,Explicit,"[[1298, 1309]]",accordingly,"[[1181, 1237]]",the outlook for natural gas is better than it is for oil,"[[1239, 1297]]",and have shifted their exploration and development budgets,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly
4,Explicit,"[[5776, 5787]]",accordingly,"[[5555, 5677]]","to hire six people who would mirror the actual jury demographically , sit in on the trial and report their reactions to him","[[5719, 5775]]",who had the chance to tilt their next day 's presentation,"to hire six people who would mirror the actual jury demographically, sit in on the trial and report their reactions to him who had the chance to tilt their next day's presentation accordingly","to hire six people who would mirror the actual jury demographically , sit in on the trial and report their reactions to him who had the chance to tilt their next day 's presentation accordingly"
5,Explicit,"[[2133, 2144]]",accordingly,"[[1973, 2131]]","the received darwinian wisdom of the day said that animals living so long ago must be simple in design , limited in scope and ancestral to contemporary species","[[2146, 2305]]",the hidebound traditionalist reconstructed hypothetical organisms from the burgess fossils in such a way that they could be shoehorned into familiar categories,"The received Darwinian wisdom of the day said that animals living so long ago must be simple in design, limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the Burgess fossils in such a way that they could be shoehorned into familiar categories","the received darwinian wisdom of the day said that animals living so long ago must be simple in design , limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the burgess fossils in such a way that they could be shoehorned into familiar categories"
6,Explicit,"[[925, 937]]",additionally,"[[9, 352]]","crossland savings bank 's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because crossland may not meet the new government capital criteria effective dec. 7 . in composite trading on the new york stock exchange friday , crossland closed at $ 5.25 , down $ 1.875 , a 26 % decline","[[939, 1015]]","crossland reported a third-quarter loss of $ 175.5 million , or $ 13.44 a share","Crossland Savings Bank's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because Crossland may not meet the new government capital criteria effective Dec. 7. In composite trading on the New York Stock Exchange Friday, Crossland closed at $5.25, down $1.875, a 26% decline additionally Crossland reported a third-quarter loss of $175.5 million, or $13...","crossland savings bank 's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because crossland may not meet the new government capital criteria effective dec. 7 . in composite trading on the new york stock exchange friday , crossland closed at $ 5.25 , down $ 1.875 , a 26 % decline additionally crossland reported a third-quarter loss of $ 175.5 million , or $ 13.44 a share"
7,Explicit,"[[3242, 3254]]",additionally,"[[3085, 3240]]",its $ 110 million provision for credit losses and $ 11 million provision for other real estate owned is related to weakness in the arizona real estate market,"[[3287, 3396]]",it downgraded valley national 's senior debt and confirmed the company 's commercial paper rating of `` not prime,"its $110 million provision for credit losses and $11 million provision for other real estate owned is related to weakness in the Arizona real estate market additionally it downgraded Valley National's senior debt and confirmed the company's commercial paper rating of ""not prime",its $ 110 million provision for credit losses and $ 11 million provision for other real estate owned is related to weakness in the arizona real estate market additionally it downgraded valley national 's senior debt and confirmed the company 's commercial paper rating of `` not prime
8,Explicit,"[[755, 767]]",additionally,"[[390, 753]]","for the full fiscal year , varian posted a 13 % profit rise to $ 31.5 million , or $ 1.53 a share , up from $ 27.8 million , or $ 1.27 a share , last year.sales for the year rose almost 15 % to $ 1.34 billion from $ 1.17 billion last year . a profit last year in both the quarter and year included a net gain of $ 9.6 million , or 44 cents a share , from the sale of a division","[[769, 878]]","the full-year profit last year reflected an after-tax restructuring charge of $ 22.8 million , or $ 1.04 a share","For the full fiscal year, Varian posted a 13% profit rise to $31.5 million, or $1.53 a share, up from $27.8 million, or $1.27 a share, last year.Sales for the year rose almost 15% to $1.34 billion from $1.17 billion last year. A profit last year in both the quarter and year included a net gain of $9.6 million, or 44 cents a share, from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $22.8 million, or $1.04 a share","for the full fiscal year , varian posted a 13 % profit rise to $ 31.5 million , or $ 1.53 a share , up from $ 27.8 million , or $ 1.27 a share , last year.sales for the year rose almost 15 % to $ 1.34 billion from $ 1.17 billion last year . a profit last year in both the quarter and year included a net gain of $ 9.6 million , or 44 cents a share , from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $ 22.8 million , or $ 1.04 a share"
9,Explicit,"[[5142, 5154]]",additionally,"[[4964, 5140]]","contributing to the market 's reserved stance was the release later in the day of new data on the health of the u.s. economy , in the form of the u.s. index of leading indicators","[[5156, 5230]]",the end of the month position-squaring might have also played a minor role,"Contributing to the market's reserved stance was the release later in the day of new data on the health of the U.S. economy, in the form of the U.S. index of leading indicators additionally the end of the month position-squaring might have also played a minor role","contributing to the market 's reserved stance was the release later in the day of new data on the health of the u.s. economy , in the form of the u.s. index of leading indicators additionally the end of the month position-squaring might have also played a minor role"


In [14]:
data['discourse_connective'] = [True] * len(data)
data

Unnamed: 0,relation,dm_span,dm_text,arg1_span,arg1_text,arg2_span,arg2_text,full_text,full_text_preprocessed,discourse_connective
0,Explicit,"[[2457, 2460]]",and,"[[2280, 2455]]",mourning for the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors,"[[2461, 2517]]",the successful son wishes his embarrassing siblings dead,Mourning for the father is overshadowed by the shame of burying him in a pauper's grave.The family moves to another house at night to conceal shabby belongings from neighbors And the successful son wishes his embarrassing siblings dead,mourning for the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors and the successful son wishes his embarrassing siblings dead,True
1,Explicit,"[[4298, 4309]]",accordingly,"[[4119, 4165]]","now , the push is on for more-distinctive shows","[[4311, 4481]]",cnn is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most `` differentiated '' programs in viewers ' minds,"Now, the push is on for more-distinctive shows accordingly CNN is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most ""differentiated"" programs in viewers' minds","now , the push is on for more-distinctive shows accordingly cnn is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most `` differentiated '' programs in viewers ' minds",True
2,Explicit,"[[535, 546]]",accordingly,"[[339, 533]]",nonperforming assets rose to slightly more than $ 700 million from $ 516 million between june and september.approximately 85 % of the total consisted of nonperforming commercial real estate assets,"[[548, 662]]",cityfed estimated that it will provide between $ 85 million and $ 110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $700 million from $516 million between June and September.Approximately 85% of the total consisted of nonperforming commercial real estate assets accordingly CityFed estimated that it will provide between $85 million and $110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $ 700 million from $ 516 million between june and september.approximately 85 % of the total consisted of nonperforming commercial real estate assets accordingly cityfed estimated that it will provide between $ 85 million and $ 110 million for credit losses in the third quarter,True
3,Explicit,"[[1298, 1309]]",accordingly,"[[1181, 1237]]",the outlook for natural gas is better than it is for oil,"[[1239, 1297]]",and have shifted their exploration and development budgets,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly,True
4,Explicit,"[[5776, 5787]]",accordingly,"[[5555, 5677]]","to hire six people who would mirror the actual jury demographically , sit in on the trial and report their reactions to him","[[5719, 5775]]",who had the chance to tilt their next day 's presentation,"to hire six people who would mirror the actual jury demographically, sit in on the trial and report their reactions to him who had the chance to tilt their next day's presentation accordingly","to hire six people who would mirror the actual jury demographically , sit in on the trial and report their reactions to him who had the chance to tilt their next day 's presentation accordingly",True
5,Explicit,"[[2133, 2144]]",accordingly,"[[1973, 2131]]","the received darwinian wisdom of the day said that animals living so long ago must be simple in design , limited in scope and ancestral to contemporary species","[[2146, 2305]]",the hidebound traditionalist reconstructed hypothetical organisms from the burgess fossils in such a way that they could be shoehorned into familiar categories,"The received Darwinian wisdom of the day said that animals living so long ago must be simple in design, limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the Burgess fossils in such a way that they could be shoehorned into familiar categories","the received darwinian wisdom of the day said that animals living so long ago must be simple in design , limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the burgess fossils in such a way that they could be shoehorned into familiar categories",True
6,Explicit,"[[925, 937]]",additionally,"[[9, 352]]","crossland savings bank 's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because crossland may not meet the new government capital criteria effective dec. 7 . in composite trading on the new york stock exchange friday , crossland closed at $ 5.25 , down $ 1.875 , a 26 % decline","[[939, 1015]]","crossland reported a third-quarter loss of $ 175.5 million , or $ 13.44 a share","Crossland Savings Bank's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because Crossland may not meet the new government capital criteria effective Dec. 7. In composite trading on the New York Stock Exchange Friday, Crossland closed at $5.25, down $1.875, a 26% decline additionally Crossland reported a third-quarter loss of $175.5 million, or $13...","crossland savings bank 's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because crossland may not meet the new government capital criteria effective dec. 7 . in composite trading on the new york stock exchange friday , crossland closed at $ 5.25 , down $ 1.875 , a 26 % decline additionally crossland reported a third-quarter loss of $ 175.5 million , or $ 13.44 a share",True
7,Explicit,"[[3242, 3254]]",additionally,"[[3085, 3240]]",its $ 110 million provision for credit losses and $ 11 million provision for other real estate owned is related to weakness in the arizona real estate market,"[[3287, 3396]]",it downgraded valley national 's senior debt and confirmed the company 's commercial paper rating of `` not prime,"its $110 million provision for credit losses and $11 million provision for other real estate owned is related to weakness in the Arizona real estate market additionally it downgraded Valley National's senior debt and confirmed the company's commercial paper rating of ""not prime",its $ 110 million provision for credit losses and $ 11 million provision for other real estate owned is related to weakness in the arizona real estate market additionally it downgraded valley national 's senior debt and confirmed the company 's commercial paper rating of `` not prime,True
8,Explicit,"[[755, 767]]",additionally,"[[390, 753]]","for the full fiscal year , varian posted a 13 % profit rise to $ 31.5 million , or $ 1.53 a share , up from $ 27.8 million , or $ 1.27 a share , last year.sales for the year rose almost 15 % to $ 1.34 billion from $ 1.17 billion last year . a profit last year in both the quarter and year included a net gain of $ 9.6 million , or 44 cents a share , from the sale of a division","[[769, 878]]","the full-year profit last year reflected an after-tax restructuring charge of $ 22.8 million , or $ 1.04 a share","For the full fiscal year, Varian posted a 13% profit rise to $31.5 million, or $1.53 a share, up from $27.8 million, or $1.27 a share, last year.Sales for the year rose almost 15% to $1.34 billion from $1.17 billion last year. A profit last year in both the quarter and year included a net gain of $9.6 million, or 44 cents a share, from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $22.8 million, or $1.04 a share","for the full fiscal year , varian posted a 13 % profit rise to $ 31.5 million , or $ 1.53 a share , up from $ 27.8 million , or $ 1.27 a share , last year.sales for the year rose almost 15 % to $ 1.34 billion from $ 1.17 billion last year . a profit last year in both the quarter and year included a net gain of $ 9.6 million , or 44 cents a share , from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $ 22.8 million , or $ 1.04 a share",True
9,Explicit,"[[5142, 5154]]",additionally,"[[4964, 5140]]","contributing to the market 's reserved stance was the release later in the day of new data on the health of the u.s. economy , in the form of the u.s. index of leading indicators","[[5156, 5230]]",the end of the month position-squaring might have also played a minor role,"Contributing to the market's reserved stance was the release later in the day of new data on the health of the U.S. economy, in the form of the U.S. index of leading indicators additionally the end of the month position-squaring might have also played a minor role","contributing to the market 's reserved stance was the release later in the day of new data on the health of the u.s. economy , in the form of the u.s. index of leading indicators additionally the end of the month position-squaring might have also played a minor role",True


In [15]:
data.to_csv('../project/dm_text.csv', index=False)

In [20]:
# Count how many negative examples in arg1_text
total_neg = 0
for col in ['arg1_text', 'arg2_text']:
    for text in tqdm(data['arg1_text'].tolist(), total=len(data)):
        for dm in dms:
            if dm in text:
                total_neg += 1
total_neg

100%|██████████| 18205/18205 [00:00<00:00, 48554.45it/s]
100%|██████████| 18205/18205 [00:00<00:00, 48274.92it/s]


69564

In [24]:
data.columns

Index(['relation', 'dm_span', 'dm_text', 'arg1_span', 'arg1_text', 'arg2_span',
       'arg2_text', 'full_text', 'full_text_preprocessed',
       'discourse_connective'],
      dtype='object')

In [22]:
neg_egs = [] # outrows
for col in ['arg1_text', 'arg2_text']:
    for i, text in tqdm(data[col].iteritems(), total=len(data)):
        text_dms = []
        for dm in dms:
            matches = [m.start() for m in re.finditer(dm, text)]
            text_dms.extend(zip(matches, [dm]*len(matches)))
        for j,w in text_dms:
            # Build negative example row
#             new_arg1 = ' '.join(data.loc[i]['full_text_preprocessed'].split()[:j])
#             new_arg2 = ' '.join(data.loc[i]['full_text_preprocessed'].split()[j+1:])
            full_text = data.loc[i]['full_text']
            full_text_preprocessed = data.loc[i]['full_text_preprocessed']
            old_argstart = full_text_preprocessed.find(text)
            new_arg1 = full_text_preprocessed[:old_argstart+j]
            new_arg2 = full_text_preprocessed[old_argstart+j+len(w):]
            new_row = [None, None, w, None, new_arg1, None, new_arg2, full_text, full_text_preprocessed, False]
            neg_egs.append(new_row)
        
len(neg_egs)

100%|██████████| 18205/18205 [00:21<00:00, 839.76it/s]
100%|██████████| 18205/18205 [00:19<00:00, 933.77it/s] 


82823

In [23]:
neg_egs[0]

[None,
 None,
 'for',
 None,
 'mourning ',
 None,
 " the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors and the successful son wishes his embarrassing siblings dead",
 "Mourning for the father is overshadowed by the shame of burying him in a pauper's grave.The family moves to another house at night to conceal shabby belongings from neighbors   And the successful son wishes his embarrassing siblings dead",
 "mourning for the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors and the successful son wishes his embarrassing siblings dead",
 False]

In [28]:
full_data = data.append(pd.DataFrame(neg_egs, columns=data.columns))
full_data

Unnamed: 0,relation,dm_span,dm_text,arg1_span,arg1_text,arg2_span,arg2_text,full_text,full_text_preprocessed,discourse_connective
0,Explicit,"[[2457, 2460]]",and,"[[2280, 2455]]",mourning for the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors,"[[2461, 2517]]",the successful son wishes his embarrassing siblings dead,Mourning for the father is overshadowed by the shame of burying him in a pauper's grave.The family moves to another house at night to conceal shabby belongings from neighbors And the successful son wishes his embarrassing siblings dead,mourning for the father is overshadowed by the shame of burying him in a pauper 's grave.the family moves to another house at night to conceal shabby belongings from neighbors and the successful son wishes his embarrassing siblings dead,True
1,Explicit,"[[4298, 4309]]",accordingly,"[[4119, 4165]]","now , the push is on for more-distinctive shows","[[4311, 4481]]",cnn is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most `` differentiated '' programs in viewers ' minds,"Now, the push is on for more-distinctive shows accordingly CNN is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most ""differentiated"" programs in viewers' minds","now , the push is on for more-distinctive shows accordingly cnn is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most `` differentiated '' programs in viewers ' minds",True
2,Explicit,"[[535, 546]]",accordingly,"[[339, 533]]",nonperforming assets rose to slightly more than $ 700 million from $ 516 million between june and september.approximately 85 % of the total consisted of nonperforming commercial real estate assets,"[[548, 662]]",cityfed estimated that it will provide between $ 85 million and $ 110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $700 million from $516 million between June and September.Approximately 85% of the total consisted of nonperforming commercial real estate assets accordingly CityFed estimated that it will provide between $85 million and $110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $ 700 million from $ 516 million between june and september.approximately 85 % of the total consisted of nonperforming commercial real estate assets accordingly cityfed estimated that it will provide between $ 85 million and $ 110 million for credit losses in the third quarter,True
3,Explicit,"[[1298, 1309]]",accordingly,"[[1181, 1237]]",the outlook for natural gas is better than it is for oil,"[[1239, 1297]]",and have shifted their exploration and development budgets,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly,True
4,Explicit,"[[5776, 5787]]",accordingly,"[[5555, 5677]]","to hire six people who would mirror the actual jury demographically , sit in on the trial and report their reactions to him","[[5719, 5775]]",who had the chance to tilt their next day 's presentation,"to hire six people who would mirror the actual jury demographically, sit in on the trial and report their reactions to him who had the chance to tilt their next day's presentation accordingly","to hire six people who would mirror the actual jury demographically , sit in on the trial and report their reactions to him who had the chance to tilt their next day 's presentation accordingly",True
5,Explicit,"[[2133, 2144]]",accordingly,"[[1973, 2131]]","the received darwinian wisdom of the day said that animals living so long ago must be simple in design , limited in scope and ancestral to contemporary species","[[2146, 2305]]",the hidebound traditionalist reconstructed hypothetical organisms from the burgess fossils in such a way that they could be shoehorned into familiar categories,"The received Darwinian wisdom of the day said that animals living so long ago must be simple in design, limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the Burgess fossils in such a way that they could be shoehorned into familiar categories","the received darwinian wisdom of the day said that animals living so long ago must be simple in design , limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the burgess fossils in such a way that they could be shoehorned into familiar categories",True
6,Explicit,"[[925, 937]]",additionally,"[[9, 352]]","crossland savings bank 's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because crossland may not meet the new government capital criteria effective dec. 7 . in composite trading on the new york stock exchange friday , crossland closed at $ 5.25 , down $ 1.875 , a 26 % decline","[[939, 1015]]","crossland reported a third-quarter loss of $ 175.5 million , or $ 13.44 a share","Crossland Savings Bank's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because Crossland may not meet the new government capital criteria effective Dec. 7. In composite trading on the New York Stock Exchange Friday, Crossland closed at $5.25, down $1.875, a 26% decline additionally Crossland reported a third-quarter loss of $175.5 million, or $13...","crossland savings bank 's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because crossland may not meet the new government capital criteria effective dec. 7 . in composite trading on the new york stock exchange friday , crossland closed at $ 5.25 , down $ 1.875 , a 26 % decline additionally crossland reported a third-quarter loss of $ 175.5 million , or $ 13.44 a share",True
7,Explicit,"[[3242, 3254]]",additionally,"[[3085, 3240]]",its $ 110 million provision for credit losses and $ 11 million provision for other real estate owned is related to weakness in the arizona real estate market,"[[3287, 3396]]",it downgraded valley national 's senior debt and confirmed the company 's commercial paper rating of `` not prime,"its $110 million provision for credit losses and $11 million provision for other real estate owned is related to weakness in the Arizona real estate market additionally it downgraded Valley National's senior debt and confirmed the company's commercial paper rating of ""not prime",its $ 110 million provision for credit losses and $ 11 million provision for other real estate owned is related to weakness in the arizona real estate market additionally it downgraded valley national 's senior debt and confirmed the company 's commercial paper rating of `` not prime,True
8,Explicit,"[[755, 767]]",additionally,"[[390, 753]]","for the full fiscal year , varian posted a 13 % profit rise to $ 31.5 million , or $ 1.53 a share , up from $ 27.8 million , or $ 1.27 a share , last year.sales for the year rose almost 15 % to $ 1.34 billion from $ 1.17 billion last year . a profit last year in both the quarter and year included a net gain of $ 9.6 million , or 44 cents a share , from the sale of a division","[[769, 878]]","the full-year profit last year reflected an after-tax restructuring charge of $ 22.8 million , or $ 1.04 a share","For the full fiscal year, Varian posted a 13% profit rise to $31.5 million, or $1.53 a share, up from $27.8 million, or $1.27 a share, last year.Sales for the year rose almost 15% to $1.34 billion from $1.17 billion last year. A profit last year in both the quarter and year included a net gain of $9.6 million, or 44 cents a share, from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $22.8 million, or $1.04 a share","for the full fiscal year , varian posted a 13 % profit rise to $ 31.5 million , or $ 1.53 a share , up from $ 27.8 million , or $ 1.27 a share , last year.sales for the year rose almost 15 % to $ 1.34 billion from $ 1.17 billion last year . a profit last year in both the quarter and year included a net gain of $ 9.6 million , or 44 cents a share , from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $ 22.8 million , or $ 1.04 a share",True
9,Explicit,"[[5142, 5154]]",additionally,"[[4964, 5140]]","contributing to the market 's reserved stance was the release later in the day of new data on the health of the u.s. economy , in the form of the u.s. index of leading indicators","[[5156, 5230]]",the end of the month position-squaring might have also played a minor role,"Contributing to the market's reserved stance was the release later in the day of new data on the health of the U.S. economy, in the form of the U.S. index of leading indicators additionally the end of the month position-squaring might have also played a minor role","contributing to the market 's reserved stance was the release later in the day of new data on the health of the u.s. economy , in the form of the u.s. index of leading indicators additionally the end of the month position-squaring might have also played a minor role",True


In [29]:
full_data.to_csv('../project/dm_text.csv', index=False)

# Remove examples with more than one discourse connector

In [2]:
data = pd.read_csv('../project/dm_text_no_preprocessing.csv')
len(data)

18453

In [4]:
# Preprocess
for t in ['dm_text', 'arg1_text', 'arg2_text']:
    data[t] = data[t].map(preprocess)

In [5]:
data

Unnamed: 0,relation,dm_span,dm_text,arg1_span,arg1_text,arg2_span,arg2_text,full_text
0,Explicit,"[[2457, 2460]]",and,"[[2280, 2455]]",mourning for the father is overshadowed by the...,"[[2461, 2517]]",the successful son wishes his embarrassing sib...,Mourning for the father is overshadowed by the...
1,Explicit,"[[4298, 4309]]",accordingly,"[[4119, 4165]]","now , the push is on for more-distinctive shows","[[4311, 4481]]",cnn is adding a world-affairs show in the morn...,"Now, the push is on for more-distinctive shows..."
2,Explicit,"[[535, 546]]",accordingly,"[[339, 533]]",nonperforming assets rose to slightly more tha...,"[[548, 662]]",cityfed estimated that it will provide between...,nonperforming assets rose to slightly more tha...
3,Explicit,"[[1298, 1309]]",accordingly,"[[1181, 1237]]",the outlook for natural gas is better than it ...,"[[1239, 1297]]",and have shifted their exploration and develop...,the outlook for natural gas is better than it ...
4,Explicit,"[[5776, 5787]]",accordingly,"[[5555, 5677]]",to hire six people who would mirror the actual...,"[[5719, 5775]]",who had the chance to tilt their next day 's p...,to hire six people who would mirror the actual...
5,Explicit,"[[2133, 2144]]",accordingly,"[[1973, 2131]]",the received darwinian wisdom of the day said ...,"[[2146, 2305]]",the hidebound traditionalist reconstructed hyp...,The received Darwinian wisdom of the day said ...
6,Explicit,"[[925, 937]]",additionally,"[[9, 352]]",crossland savings bank 's stock plummeted afte...,"[[939, 1015]]",crossland reported a third-quarter loss of $ 1...,Crossland Savings Bank's stock plummeted after...
7,Explicit,"[[3242, 3254]]",additionally,"[[3085, 3240]]",its $ 110 million provision for credit losses ...,"[[3287, 3396]]",it downgraded valley national 's senior debt a...,its $110 million provision for credit losses a...
8,Explicit,"[[755, 767]]",additionally,"[[390, 753]]","for the full fiscal year , varian posted a 13 ...","[[769, 878]]",the full-year profit last year reflected an af...,"For the full fiscal year, Varian posted a 13% ..."
9,Explicit,"[[5142, 5154]]",additionally,"[[4964, 5140]]",contributing to the market 's reserved stance ...,"[[5156, 5230]]",the end of the month position-squaring might h...,Contributing to the market's reserved stance w...


In [6]:
data_nodups = data.loc[data['full_text'].drop_duplicates(keep=False).index]
len(data_nodups)

18205

In [7]:
data_nodups.to_csv('../project/dm_text.csv', index=False)

In [36]:
# Make sure aren't multiple examples of discourse connectors AT ALL (don't just keep one of them, which defeats purpose)
print(len(data))
print(len(set(data['full_text'])))

18453
18321


# Look at sparsity

In [30]:
dms = set(data['dm_text'])
len(dms)

100

In [34]:
dm_counter = Counter()
for dm in dms:
    dm_counter[dm] = len(data[data['dm_text']==dm])
    
dm_counter.most_common()

[('but', 3305),
 ('and', 2997),
 ('also', 1746),
 ('if', 1223),
 ('when', 989),
 ('because', 858),
 ('while', 781),
 ('as', 743),
 ('after', 577),
 ('however', 485),
 ('then', 340),
 ('although', 328),
 ('before', 326),
 ('though', 320),
 ('so', 263),
 ('for example', 196),
 ('meanwhile', 193),
 ('still', 190),
 ('since', 184),
 ('in addition', 165),
 ('until', 162),
 ('instead', 112),
 ('thus', 112),
 ('indeed', 104),
 ('yet', 101),
 ('moreover', 101),
 ('for instance', 98),
 ('or', 98),
 ('unless', 95),
 ('later', 91),
 ('once', 84),
 ('in fact', 82),
 ('as a result', 78),
 ('separately', 74),
 ('previously', 49),
 ('nevertheless', 44),
 ('if then', 38),
 ('on the other hand', 37),
 ('finally', 32),
 ('so that', 31),
 ('nor', 31),
 ('in turn', 30),
 ('by contrast', 27),
 ('nonetheless', 27),
 ('therefore', 26),
 ('otherwise', 24),
 ('as long as', 24),
 ('now that', 22),
 ('as soon as', 20),
 ('besides', 19),
 ('similarly', 18),
 ('ultimately', 18),
 ('in other words', 17),
 ('rather'

# Preprocessing

In [10]:
def preprocess(text):
    return ' '.join(w.lower() for w in nltk.word_tokenize(text))

In [24]:
data['dm_text'] = data['dm_text'].map(preprocess)

In [26]:
data = data.sort_values(['dm_text'])
data

Unnamed: 0,relation,dm_span,dm_text,arg1_span,arg1_text,arg2_span,arg2_text,full_text
1,Explicit,"[[4298, 4309]]",accordingly,"[[4119, 4165]]","Now, the push is on for more-distinctive shows","[[4311, 4481]]","CNN is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most ""differentiated"" programs in viewers' minds","Now, the push is on for more-distinctive shows accordingly CNN is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most ""differentiated"" programs in viewers' minds"
2,Explicit,"[[535, 546]]",accordingly,"[[339, 533]]",nonperforming assets rose to slightly more than $700 million from $516 million between June and September.Approximately 85% of the total consisted of nonperforming commercial real estate assets,"[[548, 662]]",CityFed estimated that it will provide between $85 million and $110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $700 million from $516 million between June and September.Approximately 85% of the total consisted of nonperforming commercial real estate assets accordingly CityFed estimated that it will provide between $85 million and $110 million for credit losses in the third quarter
3,Explicit,"[[1298, 1309]]",accordingly,"[[1181, 1237]]",the outlook for natural gas is better than it is for oil,"[[1239, 1297]]",and have shifted their exploration and development budgets,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly
4,Explicit,"[[5776, 5787]]",accordingly,"[[5555, 5677]]","to hire six people who would mirror the actual jury demographically, sit in on the trial and report their reactions to him","[[5719, 5775]]",who had the chance to tilt their next day's presentation,"to hire six people who would mirror the actual jury demographically, sit in on the trial and report their reactions to him who had the chance to tilt their next day's presentation accordingly"
5,Explicit,"[[2133, 2144]]",accordingly,"[[1973, 2131]]","The received Darwinian wisdom of the day said that animals living so long ago must be simple in design, limited in scope and ancestral to contemporary species","[[2146, 2305]]",the hidebound traditionalist reconstructed hypothetical organisms from the Burgess fossils in such a way that they could be shoehorned into familiar categories,"The received Darwinian wisdom of the day said that animals living so long ago must be simple in design, limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the Burgess fossils in such a way that they could be shoehorned into familiar categories"
9,Explicit,"[[5142, 5154]]",additionally,"[[4964, 5140]]","Contributing to the market's reserved stance was the release later in the day of new data on the health of the U.S. economy, in the form of the U.S. index of leading indicators","[[5156, 5230]]",the end of the month position-squaring might have also played a minor role,"Contributing to the market's reserved stance was the release later in the day of new data on the health of the U.S. economy, in the form of the U.S. index of leading indicators additionally the end of the month position-squaring might have also played a minor role"
6,Explicit,"[[925, 937]]",additionally,"[[9, 352]]","Crossland Savings Bank's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because Crossland may not meet the new government capital criteria effective Dec. 7. In composite trading on the New York Stock Exchange Friday, Crossland closed at $5.25, down $1.875, a 26% decline","[[939, 1015]]","Crossland reported a third-quarter loss of $175.5 million, or $13.44 a share","Crossland Savings Bank's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because Crossland may not meet the new government capital criteria effective Dec. 7. In composite trading on the New York Stock Exchange Friday, Crossland closed at $5.25, down $1.875, a 26% decline additionally Crossland reported a third-quarter loss of $175.5 million, or $13..."
7,Explicit,"[[3242, 3254]]",additionally,"[[3085, 3240]]",its $110 million provision for credit losses and $11 million provision for other real estate owned is related to weakness in the Arizona real estate market,"[[3287, 3396]]","it downgraded Valley National's senior debt and confirmed the company's commercial paper rating of ""not prime","its $110 million provision for credit losses and $11 million provision for other real estate owned is related to weakness in the Arizona real estate market additionally it downgraded Valley National's senior debt and confirmed the company's commercial paper rating of ""not prime"
8,Explicit,"[[755, 767]]",additionally,"[[390, 753]]","For the full fiscal year, Varian posted a 13% profit rise to $31.5 million, or $1.53 a share, up from $27.8 million, or $1.27 a share, last year.Sales for the year rose almost 15% to $1.34 billion from $1.17 billion last year. A profit last year in both the quarter and year included a net gain of $9.6 million, or 44 cents a share, from the sale of a division","[[769, 878]]","the full-year profit last year reflected an after-tax restructuring charge of $22.8 million, or $1.04 a share","For the full fiscal year, Varian posted a 13% profit rise to $31.5 million, or $1.53 a share, up from $27.8 million, or $1.27 a share, last year.Sales for the year rose almost 15% to $1.34 billion from $1.17 billion last year. A profit last year in both the quarter and year included a net gain of $9.6 million, or 44 cents a share, from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $22.8 million, or $1.04 a share"
10,Explicit,"[[679, 691]]",additionally,"[[9, 262]]","Poughkeepsie Savings Bank said a plan to sell its South Carolina branch offices to First Citizens Bank, of Columbia, S.C., fell through. Poughkeepsie also expects to post a one-time charge of $8.3 million, resulting in a net loss for the third quarter","[[693, 789]]",the bank is increasing its loan-loss reserves for the third quarter by $8.5 million before taxes,"Poughkeepsie Savings Bank said a plan to sell its South Carolina branch offices to First Citizens Bank, of Columbia, S.C., fell through. Poughkeepsie also expects to post a one-time charge of $8.3 million, resulting in a net loss for the third quarter additionally the bank is increasing its loan-loss reserves for the third quarter by $8.5 million before taxes"


In [27]:
for t in ['dm_text', 'arg1_text', 'arg2_text']:
    data[t] = data[t].map(preprocess)
    
data

Unnamed: 0,relation,dm_span,dm_text,arg1_span,arg1_text,arg2_span,arg2_text,full_text
1,Explicit,"[[4298, 4309]]",accordingly,"[[4119, 4165]]","now , the push is on for more-distinctive shows","[[4311, 4481]]",cnn is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most `` differentiated '' programs in viewers ' minds,"Now, the push is on for more-distinctive shows accordingly CNN is adding a world-affairs show in the morning because surveys show its global-news hour in the afternoon is among its most ""differentiated"" programs in viewers' minds"
2,Explicit,"[[535, 546]]",accordingly,"[[339, 533]]",nonperforming assets rose to slightly more than $ 700 million from $ 516 million between june and september.approximately 85 % of the total consisted of nonperforming commercial real estate assets,"[[548, 662]]",cityfed estimated that it will provide between $ 85 million and $ 110 million for credit losses in the third quarter,nonperforming assets rose to slightly more than $700 million from $516 million between June and September.Approximately 85% of the total consisted of nonperforming commercial real estate assets accordingly CityFed estimated that it will provide between $85 million and $110 million for credit losses in the third quarter
3,Explicit,"[[1298, 1309]]",accordingly,"[[1181, 1237]]",the outlook for natural gas is better than it is for oil,"[[1239, 1297]]",and have shifted their exploration and development budgets,the outlook for natural gas is better than it is for oil and have shifted their exploration and development budgets accordingly
4,Explicit,"[[5776, 5787]]",accordingly,"[[5555, 5677]]","to hire six people who would mirror the actual jury demographically , sit in on the trial and report their reactions to him","[[5719, 5775]]",who had the chance to tilt their next day 's presentation,"to hire six people who would mirror the actual jury demographically, sit in on the trial and report their reactions to him who had the chance to tilt their next day's presentation accordingly"
5,Explicit,"[[2133, 2144]]",accordingly,"[[1973, 2131]]","the received darwinian wisdom of the day said that animals living so long ago must be simple in design , limited in scope and ancestral to contemporary species","[[2146, 2305]]",the hidebound traditionalist reconstructed hypothetical organisms from the burgess fossils in such a way that they could be shoehorned into familiar categories,"The received Darwinian wisdom of the day said that animals living so long ago must be simple in design, limited in scope and ancestral to contemporary species accordingly the hidebound traditionalist reconstructed hypothetical organisms from the Burgess fossils in such a way that they could be shoehorned into familiar categories"
9,Explicit,"[[5142, 5154]]",additionally,"[[4964, 5140]]","contributing to the market 's reserved stance was the release later in the day of new data on the health of the u.s. economy , in the form of the u.s. index of leading indicators","[[5156, 5230]]",the end of the month position-squaring might have also played a minor role,"Contributing to the market's reserved stance was the release later in the day of new data on the health of the U.S. economy, in the form of the U.S. index of leading indicators additionally the end of the month position-squaring might have also played a minor role"
6,Explicit,"[[925, 937]]",additionally,"[[9, 352]]","crossland savings bank 's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because crossland may not meet the new government capital criteria effective dec. 7 . in composite trading on the new york stock exchange friday , crossland closed at $ 5.25 , down $ 1.875 , a 26 % decline","[[939, 1015]]","crossland reported a third-quarter loss of $ 175.5 million , or $ 13.44 a share","Crossland Savings Bank's stock plummeted after management recommended a suspension of dividend payments on both its common and preferred stock because Crossland may not meet the new government capital criteria effective Dec. 7. In composite trading on the New York Stock Exchange Friday, Crossland closed at $5.25, down $1.875, a 26% decline additionally Crossland reported a third-quarter loss of $175.5 million, or $13..."
7,Explicit,"[[3242, 3254]]",additionally,"[[3085, 3240]]",its $ 110 million provision for credit losses and $ 11 million provision for other real estate owned is related to weakness in the arizona real estate market,"[[3287, 3396]]",it downgraded valley national 's senior debt and confirmed the company 's commercial paper rating of `` not prime,"its $110 million provision for credit losses and $11 million provision for other real estate owned is related to weakness in the Arizona real estate market additionally it downgraded Valley National's senior debt and confirmed the company's commercial paper rating of ""not prime"
8,Explicit,"[[755, 767]]",additionally,"[[390, 753]]","for the full fiscal year , varian posted a 13 % profit rise to $ 31.5 million , or $ 1.53 a share , up from $ 27.8 million , or $ 1.27 a share , last year.sales for the year rose almost 15 % to $ 1.34 billion from $ 1.17 billion last year . a profit last year in both the quarter and year included a net gain of $ 9.6 million , or 44 cents a share , from the sale of a division","[[769, 878]]","the full-year profit last year reflected an after-tax restructuring charge of $ 22.8 million , or $ 1.04 a share","For the full fiscal year, Varian posted a 13% profit rise to $31.5 million, or $1.53 a share, up from $27.8 million, or $1.27 a share, last year.Sales for the year rose almost 15% to $1.34 billion from $1.17 billion last year. A profit last year in both the quarter and year included a net gain of $9.6 million, or 44 cents a share, from the sale of a division additionally the full-year profit last year reflected an after-tax restructuring charge of $22.8 million, or $1.04 a share"
10,Explicit,"[[679, 691]]",additionally,"[[9, 262]]","poughkeepsie savings bank said a plan to sell its south carolina branch offices to first citizens bank , of columbia , s.c. , fell through . poughkeepsie also expects to post a one-time charge of $ 8.3 million , resulting in a net loss for the third quarter","[[693, 789]]",the bank is increasing its loan-loss reserves for the third quarter by $ 8.5 million before taxes,"Poughkeepsie Savings Bank said a plan to sell its South Carolina branch offices to First Citizens Bank, of Columbia, S.C., fell through. Poughkeepsie also expects to post a one-time charge of $8.3 million, resulting in a net loss for the third quarter additionally the bank is increasing its loan-loss reserves for the third quarter by $8.5 million before taxes"


In [28]:
data.to_csv('../project/dm_text.csv', index=False)

# Filter dataset

In [12]:
pd.set_option('display.max_colwidth', 999)

In [13]:
data = pd.read_csv('../project/dm_text.csv')
data

Unnamed: 0,relation,dm_span,dm_text,arg1_span,arg1_text,arg2_span,arg2_text,full_text
0,EntRel,,,"[[9, 129]]","John R. Stevens, 49 years old, was named senior executive vice president and chief operating officer, both new positions","[[131, 213]]","He will continue to report to Donald Pardus, president and chief executive officer","John R. Stevens, 49 years old, was named senior executive vice president and chief operating officer, both new positions He will continue to report to Donald Pardus, president and chief executive officer"
1,EntRel,,,"[[131, 213]]","He will continue to report to Donald Pardus, president and chief executive officer","[[215, 296]]",Mr. Stevens was executive vice president of this electric-utility holding company,"He will continue to report to Donald Pardus, president and chief executive officer Mr. Stevens was executive vice president of this electric-utility holding company"
2,EntRel,,,"[[215, 296]]",Mr. Stevens was executive vice president of this electric-utility holding company,"[[298, 368]]","Arthur A. Hatch, 59, was named executive vice president of the company","Mr. Stevens was executive vice president of this electric-utility holding company Arthur A. Hatch, 59, was named executive vice president of the company"
3,Explicit,"[[377, 387]]",previously,"[[298, 368]]","Arthur A. Hatch, 59, was named executive vice president of the company","[[370, 376], [388, 438]]",He was president of the company's Eastern Edison Co. unit,"Arthur A. Hatch, 59, was named executive vice president of the company He was previously president of the company's Eastern Edison Co. unit"
4,EntRel,,,"[[370, 438]]",He was previously president of the company's Eastern Edison Co. unit,"[[440, 521]]","John D. Carney, 45, was named to succeed Mr. Hatch as president of Eastern Edison","He was previously president of the company's Eastern Edison Co. unit John D. Carney, 45, was named to succeed Mr. Hatch as president of Eastern Edison"
5,Explicit,"[[523, 533]]",previously,"[[440, 521]]","John D. Carney, 45, was named to succeed Mr. Hatch as president of Eastern Edison","[[534, 573]]",he was vice president of Eastern Edison,"John D. Carney, 45, was named to succeed Mr. Hatch as president of Eastern Edison previously he was vice president of Eastern Edison"
6,NoRel,,,"[[523, 573]]",Previously he was vice president of Eastern Edison,"[[575, 652]]","Robert P. Tassinari, 63, was named senior vice president of Eastern Utilities","Previously he was vice president of Eastern Edison Robert P. Tassinari, 63, was named senior vice president of Eastern Utilities"
7,Explicit,"[[661, 671]]",previously,"[[575, 652]]","Robert P. Tassinari, 63, was named senior vice president of Eastern Utilities","[[654, 660], [672, 686]]",He was vice president,"Robert P. Tassinari, 63, was named senior vice president of Eastern Utilities He was previously vice president"
8,Explicit,"[[169, 174]]",after,"[[141, 167], [229, 326]]",that the attack commercial has come of age in an off-off election year with only a few contests scattered across the country,"[[175, 227]]",getting a boost in last year's presidential campaign,that the attack commercial after getting a boost in last year's presidential campaign has come of age in an off-off election year with only a few contests scattered across the country
9,Explicit,"[[330, 333]]",but,"[[128, 326]]","The irony is that the attack commercial, after getting a boost in last year's presidential campaign, has come of age in an off-off election year with only a few contests scattered across the country","[[334, 436]]","in the three leading political contests of 1989, the negative ads have reached new levels of hostility","The irony is that the attack commercial, after getting a boost in last year's presidential campaign, has come of age in an off-off election year with only a few contests scattered across the country but in the three leading political contests of 1989, the negative ads have reached new levels of hostility"


In [14]:
len(data)

40588

In [17]:
data = data[data['dm_text'].map(lambda x: isinstance(x, str))]
len(data)

18453

In [20]:
data.sort_values(['dm_text'], inplace=True)
data.to_csv('../project/dm_text.csv', index=False)

# Assemble dataset

In [2]:
pdtb_dir = '/home/michael/school/11-727/project/pdtb_v2/pipes/'
pd.set_option('display.max_colwidth', 999)

In [13]:
# Create big dataset with full text
outlines = []

for i in range(25):
    i_format = '{:02d}'.format(i)
    print(i_format)
    
    for fname in os.listdir(os.path.join(pdtb_dir, i_format)):
#         print(fname)
        if not fname.startswith('.'):
            data = pd.read_csv(os.path.join(pdtb_dir, i_format, fname), sep='|', header=None, quoting=csv.QUOTE_NONE,
                            encoding='utf8')
            
        # Assemble spans into a text
        spans = {}
        text = {}

        for rel, spans['dm'], text['dm'], spans['arg1'], text['arg1'], spans['arg2'], text['arg2'] in \
                zip(data[0],data[3],data[8],data[22],data[24],data[32],data[34]):

            if rel != 'Explicit':
                del spans['dm']
                del text['dm']

            for k in spans:
                spans[k] = spans[k].split(';') # if multiple spans
                for i,sp in enumerate(spans[k]):
                    spans[k][i] = [int(n) for n in sp.split('..')]

            # order spans
            span_vals = {tuple(sp): tname for tname, sps in spans.items() for sp in sps}
            span_vals = OrderedDict(sorted(span_vals.items()))

            minval = list(span_vals.keys())[0][0]
            maxval = list(span_vals.keys())[-1][-1]

            full_text = ' ' * (maxval - minval)

            already_sub = set()
            for (sp_beg, sp_end), k in span_vals.items():
                diff = sp_end - sp_beg
                if diff < len(text[k]):
                    if k in already_sub:
                        full_text = full_text[:sp_beg-minval] + text[k][-1*diff:] + full_text[sp_end-minval:]
                    else:
                        full_text = full_text[:sp_beg-minval] + text[k][:diff] + full_text[sp_end-minval:]

                else:
                    full_text = full_text[:sp_beg-minval] + text[k] + full_text[sp_end-minval:]

                already_sub.add(k)

#             print(full_text)
#             print()

            if rel != 'Explicit':
                spans['dm'] = np.nan
                text['dm'] = np.nan

            outlines.append([rel, spans['dm'], text['dm'], spans['arg1'], text['arg1'], spans['arg2'], text['arg2'], full_text])
                    
data_with_text = pd.DataFrame(outlines)

00
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


Unnamed: 0,0,1,2,3,4,5,6,7
0,EntRel,,,"[[9, 129]]","John R. Stevens, 49 years old, was named senior executive vice president and chief operating officer, both new positions","[[131, 213]]","He will continue to report to Donald Pardus, president and chief executive officer","John R. Stevens, 49 years old, was named senior executive vice president and chief operating officer, both new positions He will continue to report to Donald Pardus, president and chief executive officer"
1,EntRel,,,"[[131, 213]]","He will continue to report to Donald Pardus, president and chief executive officer","[[215, 296]]",Mr. Stevens was executive vice president of this electric-utility holding company,"He will continue to report to Donald Pardus, president and chief executive officer Mr. Stevens was executive vice president of this electric-utility holding company"
2,EntRel,,,"[[215, 296]]",Mr. Stevens was executive vice president of this electric-utility holding company,"[[298, 368]]","Arthur A. Hatch, 59, was named executive vice president of the company","Mr. Stevens was executive vice president of this electric-utility holding company Arthur A. Hatch, 59, was named executive vice president of the company"
3,Explicit,"[[377, 387]]",previously,"[[298, 368]]","Arthur A. Hatch, 59, was named executive vice president of the company","[[370, 376], [388, 438]]",He was president of the company's Eastern Edison Co. unit,"Arthur A. Hatch, 59, was named executive vice president of the company He was previously president of the company's Eastern Edison Co. unit"
4,EntRel,,,"[[370, 438]]",He was previously president of the company's Eastern Edison Co. unit,"[[440, 521]]","John D. Carney, 45, was named to succeed Mr. Hatch as president of Eastern Edison","He was previously president of the company's Eastern Edison Co. unit John D. Carney, 45, was named to succeed Mr. Hatch as president of Eastern Edison"
5,Explicit,"[[523, 533]]",previously,"[[440, 521]]","John D. Carney, 45, was named to succeed Mr. Hatch as president of Eastern Edison","[[534, 573]]",he was vice president of Eastern Edison,"John D. Carney, 45, was named to succeed Mr. Hatch as president of Eastern Edison previously he was vice president of Eastern Edison"
6,NoRel,,,"[[523, 573]]",Previously he was vice president of Eastern Edison,"[[575, 652]]","Robert P. Tassinari, 63, was named senior vice president of Eastern Utilities","Previously he was vice president of Eastern Edison Robert P. Tassinari, 63, was named senior vice president of Eastern Utilities"
7,Explicit,"[[661, 671]]",previously,"[[575, 652]]","Robert P. Tassinari, 63, was named senior vice president of Eastern Utilities","[[654, 660], [672, 686]]",He was vice president,"Robert P. Tassinari, 63, was named senior vice president of Eastern Utilities He was previously vice president"
8,Explicit,"[[169, 174]]",after,"[[141, 167], [229, 326]]",that the attack commercial has come of age in an off-off election year with only a few contests scattered across the country,"[[175, 227]]",getting a boost in last year's presidential campaign,that the attack commercial after getting a boost in last year's presidential campaign has come of age in an off-off election year with only a few contests scattered across the country
9,Explicit,"[[330, 333]]",but,"[[128, 326]]","The irony is that the attack commercial, after getting a boost in last year's presidential campaign, has come of age in an off-off election year with only a few contests scattered across the country","[[334, 436]]","in the three leading political contests of 1989, the negative ads have reached new levels of hostility","The irony is that the attack commercial, after getting a boost in last year's presidential campaign, has come of age in an off-off election year with only a few contests scattered across the country but in the three leading political contests of 1989, the negative ads have reached new levels of hostility"


In [14]:
data_with_text = pd.DataFrame(outlines)
data_with_text.to_csv('../project/dm_text.csv', index=False)