In [77]:
# Adjust these paths to your local copy of the GAP coreference data.
# Raw strings keep the Windows backslashes literal; in a normal string
# sequences such as '\K', '\G' or '\o' are invalid escapes that Python warns
# about (and will eventually reject).
dev_data_path = r'D:\Kaggle\GenderedPronoun\gap-coreference-master\gap-coreference-master\gap-development.tsv'
val_data_path = r'D:\Kaggle\GenderedPronoun\gap-coreference-master\gap-coreference-master\gap-validation.tsv'

output_file_path = r'D:\Kaggle\GenderedPronoun\gap-coreference-master\gap-coreference-master\output.csv'

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Load the two tab-separated GAP splits (development first, then validation).
train_set, val_set = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (dev_data_path, val_data_path)
)

# Names of the engineered feature columns that are selected as classifier
# inputs (despite the variable name, these are not the target labels).
labelNames = [
    'A-distance',
    'B-distance',
    'Pronoun-at-start',
    'A-before',
    'B-before',
    'A-after',
    'B-after',
    'dot-betweenA',
    'dot-betweenB',
    'comma-betweenA',
    'comma-betweenB',
]

def frameSubStrContains(my_set, minIdx, maxIdx, searchSubstr):
    """Check, row by row, whether a slice of the ``Text`` column holds a substring.

    For every index label of ``maxIdx`` the string ``my_set['Text'][label]``
    is sliced as ``[minIdx[label]:maxIdx[label]]`` and searched for
    ``searchSubstr``.

    Args:
        my_set: DataFrame with a ``Text`` column of strings.
        minIdx: Series of slice start offsets, looked up by the labels of
            ``maxIdx``.
        maxIdx: Series of slice end offsets (exclusive); its index drives
            the iteration order.
        searchSubstr: String to look for inside each slice.

    Returns:
        A list of bools, one per entry of ``maxIdx``, in iteration order.
    """
    texts = my_set['Text']
    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (label, value) pairs and exists in older releases as well.
    return [
        searchSubstr in texts[label][minIdx[label]:end]
        for label, end in maxIdx.items()
    ]
            
def prepareData(my_set):
    """Add positional features and Kaggle-format target columns to ``my_set``.

    The frame is modified in place; nothing is returned.

    Columns read: ``A-offset``, ``B-offset``, ``Pronoun-offset``, ``Text``
    (via ``frameSubStrContains``), ``A-coref`` and ``B-coref``.

    Columns added, in this order:
        * ``A-distance`` / ``B-distance``: absolute offset difference
          between the pronoun and each name.
        * ``Pronoun-at-start``: True when the pronoun offset is 0.
        * ``A-before`` / ``B-before``: True when the pronoun offset is
          smaller than the name's offset.
        * ``A-after`` / ``B-after``: True when the pronoun offset is larger
          than the name's offset.
        * ``dot-betweenA`` / ``comma-betweenA`` / ``dot-betweenB`` /
          ``comma-betweenB``: whether the text between the smaller and the
          larger of the two offsets contains ``'.'`` or ``','``.
        * ``A`` / ``B`` / ``NEITHER``: 0/1 indicators derived from
          ``A-coref`` and ``B-coref``.

    Args:
        my_set: DataFrame holding the columns listed above.
    """
    pronoun_offset = my_set['Pronoun-offset']

    # Distance from the pronoun to each name
    my_set['A-distance'] = abs(my_set['A-offset'] - pronoun_offset)
    my_set['B-distance'] = abs(my_set['B-offset'] - pronoun_offset)

    my_set['Pronoun-at-start'] = pronoun_offset == 0

    my_set['A-before'] = pronoun_offset < my_set['A-offset']
    my_set['B-before'] = pronoun_offset < my_set['B-offset']

    my_set['A-after'] = pronoun_offset > my_set['A-offset']
    my_set['B-after'] = pronoun_offset > my_set['B-offset']

    # Bounds of the text span lying between the pronoun and each name
    max_a_offset = my_set[['A-offset', 'Pronoun-offset']].max(axis=1)
    min_a_offset = my_set[['A-offset', 'Pronoun-offset']].min(axis=1)
    max_b_offset = my_set[['B-offset', 'Pronoun-offset']].max(axis=1)
    min_b_offset = my_set[['B-offset', 'Pronoun-offset']].min(axis=1)

    # Assign the per-row lists directly (positional assignment). Wrapping
    # them in pd.DataFrame first would align on a fresh 0..n-1 index and
    # produce NaN/misaligned values whenever my_set has a different index.
    my_set['dot-betweenA'] = frameSubStrContains(my_set, min_a_offset, max_a_offset, '.')
    my_set['comma-betweenA'] = frameSubStrContains(my_set, min_a_offset, max_a_offset, ',')

    my_set['dot-betweenB'] = frameSubStrContains(my_set, min_b_offset, max_b_offset, '.')
    my_set['comma-betweenB'] = frameSubStrContains(my_set, min_b_offset, max_b_offset, ',')

    # Convert from the provided labels in data set to the Kaggle labels format
    my_set['A'] = np.where(my_set['A-coref'] == True, 1, 0)
    my_set['B'] = np.where(my_set['B-coref'] == True, 1, 0)
    my_set['NEITHER'] = np.where(
        (my_set['A-coref'] == False) & (my_set['B-coref'] == False), 1, 0
    )

# Print the training frame's shape before and after feature engineering so
# the number of added columns is visible; both splits are augmented in place.
print(train_set.shape)
for split_frame in (train_set, val_set):
    prepareData(split_frame)
print(train_set.shape)

#Compute probability from the distance directly without ML
#train_set['A'] = train_set['A-distance']/(train_set['A-distance']+train_set['B-distance'])
#train_set['B'] = train_set['B-distance']/(train_set['A-distance']+train_set['B-distance'])
#train_set['NEITHER'] = 0
#print(rd)

# Fit a 5-nearest-neighbour classifier on the engineered feature columns,
# using the three indicator columns A / B / NEITHER as the target.
X = train_set[labelNames]
Y = train_set[['A','B','NEITHER']]
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X, Y)

# Accuracy on the validation split
P = val_set[labelNames]
Z = val_set[['A','B','NEITHER']]
res = neigh.predict(P)
acc = accuracy_score(y_true=Z, y_pred=res)
print("Val Acc = " + str(acc))

# Accuracy on the training split (P, Z, res and acc are reused from above)
P = train_set[labelNames]
Z = train_set[['A','B','NEITHER']]
res = neigh.predict(P)
acc = accuracy_score(y_true=Z, y_pred=res)
print("Train Acc = " + str(acc))

# NOTE(review): at this point `res` holds the predictions for the training
# split, so the exported file describes train_set rather than val_set --
# confirm this is intended.
# NOTE(review): some predicted rows are all zeros (no class chosen) -- check
# whether that is acceptable for the submission format.
df = pd.DataFrame(res, columns=['A','B','NEITHER'])
df.insert(0, 'ID', train_set['ID'])

print(df)

df.to_csv(output_file_path, index=False)

(2000, 11)
(2000, 23)
Val Acc = 0.4118942731277533
Train Acc = 0.5915
                    ID  A  B  NEITHER
0        development-1  1  0        0
1        development-2  1  0        0
2        development-3  1  0        0
3        development-4  0  1        0
4        development-5  0  0        0
5        development-6  0  1        0
6        development-7  1  0        0
7        development-8  0  1        0
8        development-9  0  1        0
9       development-10  1  0        0
10      development-11  1  0        0
11      development-12  1  0        0
12      development-13  1  0        0
13      development-14  1  0        0
14      development-15  0  1        0
15      development-16  0  1        0
16      development-17  1  0        0
17      development-18  0  0        0
18      development-19  0  1        0
19      development-20  0  0        0
20      development-21  0  1        0
21      development-22  1  0        0
22      development-23  0  1        0
23      developmen