In [1]:
import json
import numpy as np

from utils import read_lines

In [2]:
# Load the dataset path configuration. The context manager guarantees the
# file handle is closed instead of leaking until garbage collection.
with open('path_config.json') as config_file:
    PATH = json.load(config_file)

In [3]:
# Test split: read the source documents and their target summaries from
# the directory configured under PATH['xsum_fariseq'].
xsum_dir = PATH['xsum_fariseq']
document_path = xsum_dir + '/test.source'
target_path = xsum_dir + '/test.target'
test_source, test_target = read_lines(document_path), read_lines(target_path)
print(len(test_source))
# Every source document must have exactly one target line.
assert len(test_target) == len(test_source)

11301


In [4]:
# Train split: same layout as the test split, read from the same directory.
xsum_dir = PATH['xsum_fariseq']
document_path = xsum_dir + '/train.source'
target_path = xsum_dir + '/train.target'
train_source, train_target = read_lines(document_path), read_lines(target_path)
print(len(train_source))
# Every source document must have exactly one target line.
assert len(train_target) == len(train_source)

203575


#### Inspection

In [5]:
# Load the two entity sets. Each record carries an 'ent' surface string and
# an 'id' that is used further down to index into `test_source`.
# Context managers close the file handles deterministically.
with open('right_entities.json') as right_file:
    right_entities = json.load(right_file)
with open('left_entities.json') as left_file:
    left_entities = json.load(left_file)

In [6]:
# Peek at one entity record to see which fields are available.
left_entities[1]

{'start': 74,
 'end': 79,
 'label': 1,
 'type': 'PERSON',
 'ent': 'David',
 'prior': 0.00992584228515625,
 'posterior': 0.94140625,
 'id': 1513}

In [7]:
# Display the raw text of the test document at index 922.
test_source[922]

'The Times Educational Supplement (TES) says measures to guard against grade inflation were not appropriate for this year\'s English exams. Teaching unions have accused Ofqual of "regulatory failure" and say it is more evidence of flawed exam marking. But the regulator insists it applied its measures correctly. It comes as state and private school heads call for an independent inquiry into the problems. Last week Ofqual said it felt the way this year\'s English GCSE exams were graded was fair, despite the  grade boundaries being moved significantly part-way through the year. Teachers have complained that pupils achieving exactly the same marks would have received different grades depending on what time of year they sat the exam. And many who were predicted a grade C, obtained a grade D in their summer exams. Ofqual found that June\'s grade boundaries were correct, while January\'s were "too lenient". A key document sent to exam boards, which is on the Ofqual website, sets out how it ex

#### TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Test documents that the "left" entities come from, looked up by record id
# (one entry per entity, in entity order).
left_source = [test_source[record['id']] for record in left_entities]

In [10]:
# Test documents that the "right" entities come from, looked up by record id
# (one entry per entity, in entity order).
right_source = [test_source[record['id']] for record in right_entities]

In [11]:
# Number of query documents collected for the left entity set.
print(len(left_source))

18


In [12]:
# Number of query documents collected for the right entity set.
print(len(right_source))

18


In [13]:
# One corpus for a shared TF-IDF vocabulary. The order matters: the rows are
# sliced back apart by position later (left queries, right queries, train).
concat_corpus = [*left_source, *right_source, *train_source]
print(len(concat_corpus))

203611


In [14]:
# Fit TF-IDF on the concatenated corpus so queries and training documents
# share one vocabulary. Despite its name, the resulting matrix also holds
# the left/right query rows at the top.
tfidf_vectorizer = TfidfVectorizer()
train_source_tfidf = tfidf_vectorizer.fit_transform(concat_corpus)

In [15]:
# (number of documents in concat_corpus, vocabulary size)
train_source_tfidf.shape

(203611, 265359)

#### Calculate Similarity

In [16]:
# Undo the concatenation: rows are laid out as
# [left queries | right queries | training documents].
n_left = len(left_source)
n_queries = n_left + len(right_source)
left_vector = train_source_tfidf[:n_left, :]
right_vector = train_source_tfidf[n_left:n_queries, :]
source_vector = train_source_tfidf[n_queries:, :]

In [17]:
# Sanity-check the three row blocks, one shape per line.
print(left_vector.shape, right_vector.shape, source_vector.shape, sep='\n')

(18, 265359)
(18, 265359)
(203575, 265359)


In [18]:
# Similarity of every training document (rows) to every query document
# (columns). TfidfVectorizer L2-normalises rows by default, so this dot
# product is the cosine similarity.
# `@` is used rather than `*`: `*` is a matrix product only for the legacy
# scipy.sparse *matrix* types and is element-wise for sparse *arrays*,
# whereas `@` is a matrix product for both.
left_similarity = source_vector @ left_vector.T
right_similarity = source_vector @ right_vector.T

#### Retrieve Documents

In [19]:
import torch

In [20]:
# Each similarity matrix should be (training documents, queries).
print(left_similarity.shape, right_similarity.shape, sep='\n')

(203575, 18)
(203575, 18)


In [32]:
def count_entity(similarity_matrix, entities, k=3, documents=None):
    """Count how often each entity string occurs in its top-k retrieved documents.

    For every query (column of ``similarity_matrix``) the ``k`` highest-scoring
    documents (rows) are joined with single spaces, and the occurrences of the
    query's entity string in that text are counted with ``str.count`` (a
    case-sensitive substring count).

    Args:
        similarity_matrix: (n_documents, n_queries) similarity scores, e.g.
            (203575, 18). May be a scipy sparse matrix (anything exposing
            ``toarray()``) or a dense array-like.
        entities: sequence of dicts, one per query column, each with an
            ``'ent'`` key holding the entity surface string.
        k: number of top-scoring documents to retrieve per query. Clamped to
            the number of available documents.
        documents: sequence of document strings aligned with the rows of
            ``similarity_matrix``. Defaults to the notebook-level
            ``train_source``.

    Returns:
        list[int]: one count per query column, in column order.
    """
    if documents is None:
        # Backward-compatible default: the training corpus loaded above.
        documents = train_source

    # Sparse inputs have to be densified before torch can rank them.
    if hasattr(similarity_matrix, 'toarray'):
        similarity_matrix = similarity_matrix.toarray()
    scores = torch.as_tensor(similarity_matrix)

    # torch.topk raises if k is larger than the ranked dimension.
    top_k = min(k, scores.shape[0])
    top_indices = torch.topk(scores, top_k, dim=0)[1]  # (top_k, n_queries)

    counts = []
    for query_idx in range(top_indices.shape[1]):
        doc_ids = top_indices[:, query_idx].tolist()
        retrieved_text = ' '.join(documents[doc_id] for doc_id in doc_ids)
        counts.append(retrieved_text.count(entities[query_idx]['ent']))
    return counts

In [73]:
# Entity mention counts in the 10 most similar training documents per left query.
left_counts = count_entity(left_similarity, left_entities, k=10)

In [74]:
# Per-entity counts, in the same order as `left_entities`.
left_counts

[3, 7, 7, 13, 1, 0, 3, 8, 5, 7, 4, 0, 100, 12, 4, 0, 13, 3]

In [75]:
# Mean number of mentions per left entity.
sum(left_counts) / len(left_counts)

10.555555555555555

In [76]:
# Entity mention counts in the 10 most similar training documents per right query.
right_counts = count_entity(right_similarity, right_entities, k=10)

In [77]:
# Mean number of mentions per right entity.
sum(right_counts) / len(right_counts)

7.333333333333333

In [67]:
# Inspect the first left entity record.
left_entities[0]

{'start': 6,
 'end': 16,
 'label': 1,
 'type': 'ORG',
 'ent': 'Centurions',
 'prior': 0.004512786865234375,
 'posterior': 0.85888671875,
 'id': 6824}

In [41]:
# Display the raw text of the training document at index 17646.
train_source[17646]

'The 30-year-old moved to Leigh in November on a two-year contract after his release by Salford following a "disciplinary procedure". Chase won the 2011 Man of Steel while at Castleford, also earning his 11 England caps during a four-year stay. The New Zealand-born stand-off scored two tries in his five appearances for the Centurions this season, with his last appearance on 1 May. "He came to Leigh and wanted to fall back in love with the game," owner Derek Beaumont told BBC Radio Manchester. "He came from a difficult situation from Salford and it\'s proven hard to do that. "He\'s had a couple of things that have been made available to him as an opportunity and there are a couple of options he can explore outside of the game. "He came and spoke to me and felt it was in his best interests if he was given that opportunity and I\'ve accepted that." Leigh signed Australian former London Broncos half-back Josh Drinkwater this week from West Tigers until the end of the season.'

In [20]:
# NOTE(review): `left_max_similarity` was never assigned anywhere in this
# notebook (leftover kernel state), so this cell raised NameError on a fresh
# run. It is rebuilt here as the best similarity each left query achieves
# against any training document - presumably what the original session
# computed; confirm against the recorded output.
left_max_similarity = left_similarity.max(axis=0)  # 1 x n_queries sparse row
np.average(left_max_similarity.toarray()[0])

0.502102009741062

In [21]:
# NOTE(review): `right_max_similarity` was never assigned anywhere in this
# notebook (leftover kernel state), so this cell raised NameError on a fresh
# run. It is rebuilt here as the best similarity each right query achieves
# against any training document - presumably what the original session
# computed; confirm against the recorded output.
right_max_similarity = right_similarity.max(axis=0)  # 1 x n_queries sparse row
np.average(right_max_similarity.toarray()[0])

0.4649270696993297

In [22]:
# For each threshold, count the entries of the left/right matrices that
# exceed it: per row first, then in total.
# NOTE(review): `left_vector` / `right_vector` as defined in this notebook are
# the queries' TF-IDF rows, so this thresholds term weights. If the intent
# was to count related training documents, the similarity matrices should be
# used instead - confirm.
# Densify once up front; the conversion does not depend on the threshold.
left_dense = left_vector.toarray()
right_dense = right_vector.toarray()
for threshold in [0.1, 0.2, 0.3, 0.4, 0.5]:
    print('threshold: {}'.format(threshold))
    left_related = left_dense > threshold
    right_related = right_dense > threshold
    print(np.sum(left_related, axis=1))
    print(np.sum(left_related))
    print(np.sum(right_related, axis=1))
    print(np.sum(right_related))
    print()

threshold: 0.1
[31 14 26 16 10 35 18 23 29  8 35 29 27 12 29 13 21 18]
394
[18 24 17 29 29 17 17 12 23 15 23 15 15 14 14 18 22 18]
340

threshold: 0.2
[2 3 2 6 3 5 2 3 4 2 8 7 7 6 4 3 4 2]
73
[5 1 5 5 5 3 3 2 3 4 7 5 5 3 3 5 5 2]
71

threshold: 0.3
[0 0 1 1 1 0 1 2 1 2 0 1 2 4 1 2 1 1]
21
[1 1 2 1 1 2 2 1 2 1 1 2 2 1 1 2 0 1]
24

threshold: 0.4
[0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 2 0 1]
8
[0 1 1 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1]
9

threshold: 0.5
[0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0]
2
[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
1

