In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import zipfile
import gc
from tqdm import tqdm_notebook as tqdm
import re
from collections import defaultdict
import time
import matplotlib.pyplot as plt
%matplotlib inline

import spacy
from sklearn.metrics import log_loss

nlp = spacy.load('en_core_web_lg')

In [3]:
test_df  = pd.read_table('input/gap-development.tsv')
train_df = pd.read_table('input/gap-test.tsv')
val_df   = pd.read_table('input/gap-validation.tsv')

In [2]:
def bs(lens, target):
    low, high = 0, len(lens) - 1

    while low < high:
        mid = low + int((high - low) / 2)

        if target > lens[mid]:
            low = mid + 1
        elif target < lens[mid]:
            high = mid
        else:
            return mid + 1

    return low

def bin_distance(dist):
    
    buckets = [1, 2, 3, 4, 5, 8, 16, 32, 64]  
    low, high = 0, len(buckets)
    while low < high:
        mid = low + int((high-low) / 2)
        if dist > buckets[mid]:
            low = mid + 1
        elif dist < buckets[mid]:
            high = mid
        else:
            return mid

    return low

def distance_features(P, A, B, char_offsetP, char_offsetA, char_offsetB, text, URL):
    
    doc = nlp(text)
    
    lens = [token.idx for token in doc]
    mention_offsetP = bs(lens, char_offsetP) - 1
    mention_offsetA = bs(lens, char_offsetA) - 1
    mention_offsetB = bs(lens, char_offsetB) - 1
    
    mention_distA = mention_offsetP - mention_offsetA 
    mention_distB = mention_offsetP - mention_offsetB
    
    splited_A = A.split()[0].replace("*", "")
    splited_B = B.split()[0].replace("*", "")
    
    if re.search(splited_A[0], str(URL)):
        contains = 0
    elif re.search(splited_B[0], str(URL)):
        contains = 1
    else:
        contains = 2
    
    dist_binA = bin_distance(mention_distA)
    dist_binB = bin_distance(mention_distB)
    output =  [dist_binA, dist_binB, contains]
    
    return output

def extract_dist_features(df):
    
    index = df.index
    columns = ["D_PA", "D_PB", "IN_URL"]
    dist_df = pd.DataFrame(index = index, columns = columns)

    for i in tqdm(range(len(df))):
        
        text = df.loc[i, 'Text']
        P_offset = df.loc[i,'Pronoun-offset']
        A_offset = df.loc[i, 'A-offset']
        B_offset = df.loc[i, 'B-offset']
        P, A, B  = df.loc[i,'Pronoun'], df.loc[i, 'A'], df.loc[i, 'B']
        URL = df.loc[i, 'URL']
        
        dist_df.iloc[i] = distance_features(P, A, B, P_offset, A_offset, B_offset, text, URL)
        
    return dist_df

In [3]:
test2_df = pd.read_table('input/test_stage_2.tsv')
test2_dist_df = extract_dist_features(test2_df)
test2_dist_df['ID'] = test2_df['ID']
test2_dist_df.to_csv('data/test2_dist.csv', index=False)

HBox(children=(IntProgress(value=0, max=12359), HTML(value='')))




In [34]:
test_dist_df = extract_dist_features(test_df)
# test_dist_df.to_csv('test_dist_df.csv', index=False)
val_dist_df = extract_dist_features(val_df)
# val_dist_df.to_csv('val_dist_df.csv', index=False)
train_dist_df = extract_dist_features(train_df)
# train_dist_df.to_csv('train_dist_df.csv', index=False)

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=454), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




In [35]:
test_dist_df['ID'] = test_df['ID']
test_dist_df.head()

Unnamed: 0,D_PA,D_PB,IN_URL,ID
0,7,6,2,development-1
1,6,5,0,development-2
2,7,4,0,development-3
3,7,0,2,development-4
4,8,7,1,development-5


In [36]:
test_dist_df['ID'] = test_df['ID']
# test_dist_df.set_index('ID', inplace=True)
test_dist_df.to_csv('data/dev_dist.csv', index=False)

val_dist_df['ID'] = val_df['ID']
# val_dist_df.set_index('ID', inplace=True)
val_dist_df.to_csv('data/val_dist.csv', index=False)

train_dist_df['ID'] = train_df['ID']
# train_dist_df.set_index('ID', inplace=True)
train_dist_df.to_csv('data/test_dist.csv', index=False)

In [32]:
test_dist_df.head()

Unnamed: 0_level_0,D_PA,D_PB,IN_URL,ID
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,5,3,1,
,6,5,0,
,6,3,0,
,7,0,1,
,0,0,2,


In [31]:
test_dist_df['ID'] = test_df['ID']

In [23]:
train_dist_df.columns

Index(['D_PA', 'D_PB', 'IN_URL'], dtype='object')

https://www.kaggle.com/negedng/extracting-features-from-spacy-dependency/output

In [4]:
class Graph():
    def __init__(self):
        """
        self.edges is a dict of all possible next nodes
        e.g. {'X': ['A', 'B', 'C', 'E'], ...}
        self.weights has all the weights between two nodes,
        with the two nodes as a tuple as the key
        e.g. {('X', 'A'): 7, ('X', 'B'): 2, ...}
        """
        self.edges = defaultdict(list)
        self.weights = {}
    
    def add_edge(self, from_node, to_node, weight, back_penalty=1):
        # Note: assumes edges are bi-directional
        self.edges[from_node].append(to_node)
        self.edges[to_node].append(from_node)
        self.weights[(from_node, to_node)] = weight
        self.weights[(to_node, from_node)] = weight*back_penalty

def dijsktra(graph, initial, end):
    # shortest paths is a dict of nodes
    # whose value is a tuple of (previous node, weight)
    shortest_paths = {initial: (None, 0)}
    current_node = initial
    visited = set()
    
    while current_node != end:
        visited.add(current_node)
        destinations = graph.edges[current_node]
        weight_to_current_node = shortest_paths[current_node][1]

        for next_node in destinations:
            weight = graph.weights[(current_node, next_node)] + weight_to_current_node
            if next_node not in shortest_paths:
                shortest_paths[next_node] = (current_node, weight)
            else:
                current_shortest_weight = shortest_paths[next_node][1]
                if current_shortest_weight > weight:
                    shortest_paths[next_node] = (current_node, weight)
        
        next_destinations = {node: shortest_paths[node] for node in shortest_paths if node not in visited}
        if not next_destinations:
            raise Exception("Something is wrong")
        # next node is the destination with the lowest weight
        current_node = min(next_destinations, key=lambda k: next_destinations[k][1])
    
    # Work back through destinations in shortest path
    path = []
    dist = 0
    while current_node is not None:
        path.append(current_node)
        next_node = shortest_paths[current_node][0]
        dist += shortest_paths[current_node][1]
        current_node = next_node
    # Reverse path
    path = path[::-1]
    return path, dist

def get_rank(token):
    """Step up with token.head until it reaches the root. Returns with step number and root"""
    i = 0
    next_token = token
    while(next_token!=next_token.head):
        i+=1
        next_token=next_token.head
    return i, next_token

def child_count(token):
    cc = 0
    for child in token.children:
        cc+=1
    return cc

def build_features(data):
    """Generates features from input data"""
    features = []
    sum_good = 0
    for i in range(0,len(data)):
        fi = []
        dataNext = data.loc[i]
        text = dataNext["Text"]
        #print(visualise(dataNext))
        doc=nlp(text)
        Aoff = dataNext["A-offset"]
        Boff = dataNext["B-offset"]
        Poff = dataNext["Pronoun-offset"]
        lth = len(text)
        
        for token in doc:
            if(token.idx==Aoff):
                Atoken = token
            if(token.idx==Boff):
                Btoken = token
            if(token.idx==Poff):
                Ptoken=token
        Arank, Aroot = get_rank(Atoken)
        Brank, Broot = get_rank(Btoken)
        Prank, Proot = get_rank(Ptoken)
        
        graph = Graph()
        
        for token in doc:
            graph.add_edge(token, token.head, 1, 4)
        
        sent_root = []
        for sent in doc.sents:
            sent_root.append(sent.root)
        for j in range(len(sent_root)-1):
            graph.add_edge(sent_root[j], sent_root[j+1],1, 4)
        try:
            _, Alen = dijsktra(graph, Atoken, Ptoken)
        except:
            Alen = 300
        try:
            _, Blen = dijsktra(graph, Btoken, Ptoken)
        except:
            Blen = 300
        
        
        sent_num = len(sent_root)
        for i in range(len(sent_root)):
            if Aroot == sent_root[i]:
                Atop = i
            if Broot == sent_root[i]:
                Btop = i
            if Proot == sent_root[i]:
                Ptop = i
        
        fi.append(Aoff/lth)#0
        fi.append(Boff/lth)#1
        fi.append(Poff/lth)#2

        fi.append(1.0*Atop/sent_num)#3
        fi.append(1.0*Btop/sent_num)#4
        fi.append(1.0*Ptop/sent_num)#5

        fi.append(Arank/10)#6
        fi.append(Brank/10)#7
        fi.append(Prank/10)#8
        
        #fi.append(Atoken.similarity(Ptoken))#9
        #fi.append(Btoken.similarity(Ptoken))#10
        
        #fi.append(Alen/300)#9
        #fi.append(Blen/300)#10
        
        #fi.append(child_count(Aroot))#11
        #fi.append(child_count(Broot))#12
        #fi.append(child_count(Proot))#13
        
        features.append(fi)
    return np.vstack(features)

def swap_raws(data, i, j):
    """Swap the ith and jth column of the data"""
    new_data = np.copy(data)
    temp = np.copy(new_data[:, i])
    new_data[:,i] = new_data[:,j]
    new_data[:,j] = temp
    return new_data

In [14]:
print("Feature building started ", time.ctime())
test_feat = build_features(test_df)
print("Developement ready", time.ctime())
val_feat = build_features(val_df)
print("Validation ready", time.ctime())
train_feat = build_features(train_df)
print("Test ready", time.ctime())

Feature building started  Thu Apr 11 15:46:21 2019
Developement ready Thu Apr 11 15:46:43 2019
Validation ready Thu Apr 11 15:46:47 2019
Test ready Thu Apr 11 15:47:09 2019


In [5]:
test2_feat = build_features(test2_df)
test2_feat_df = pd.DataFrame(test2_feat)
test2_feat_df['ID'] = test2_df['ID']
test2_feat_df.to_csv('data/test2_feats.csv', index=False)

In [50]:
test_feat_df = pd.DataFrame(test_feat)
test_feat_df['ID'] = test_df['ID']
test_feat_df.to_csv('data/dev_feats.csv', index=False)

val_feat_df = pd.DataFrame(val_feat)
val_feat_df['ID'] = val_df['ID']
val_feat_df.to_csv('data/val_feats.csv', index=False)

train_feat_df = pd.DataFrame(train_feat)
train_feat_df['ID'] = train_df['ID']
train_feat_df.to_csv('data/test_feats.csv', index=False)

In [49]:
train_feat_df = pd.read_csv('data/test_feats.csv', index_col='ID')
train_feat_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
development-1,0.794582,0.826185,0.86456,0.5,0.5,0.75,0.3,0.2,0.2
development-2,0.640653,0.707804,0.780399,0.75,0.75,0.75,0.1,0.4,0.4
development-3,0.630542,0.726601,0.768473,0.5,0.5,0.75,0.3,0.3,0.1
development-4,0.548763,0.780204,0.765648,0.333333,0.666667,0.666667,0.1,0.1,0.3
development-5,0.71843,0.953925,0.692833,0.75,0.75,0.75,0.1,0.5,0.4


In [46]:
test_feat_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,ID
0,0.448357,0.485915,0.643192,0.4,0.4,0.8,0.2,0.3,0.2,development-1
1,0.556098,0.612195,0.692683,0.5,0.5,0.75,0.1,0.3,0.2,development-2
2,0.322761,0.458955,0.494403,0.666667,0.666667,0.666667,0.3,0.3,0.5,development-3
3,0.433915,0.837905,0.800499,0.5,0.75,0.75,0.3,0.5,0.4,development-4
4,0.331818,0.445455,0.662121,0.5,0.5,0.75,0.4,0.2,0.1,development-5


In [17]:
test_dist_df.values

(2000, 3)

In [18]:
a = np.concatenate([test_feat, test_dist_df], axis=1)
a.shape

(2000, 12)

In [37]:
dev_dist_df = pd.read_csv('data/dev_dist.csv', index_col='ID')
val_dist_df = pd.read_csv('data/val_dist.csv', index_col='ID')
test_dist_df = pd.read_csv('data/test_dist.csv', index_col='ID')
# train_dist_df = pd.concat([val_dist_df, test_dist_df])

In [38]:
test_dist_df.head()

Unnamed: 0_level_0,D_PA,D_PB,IN_URL
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test-1,5,3,1
test-2,6,5,0
test-3,6,3,0
test-4,7,0,1
test-5,0,0,2
