-
Notifications
You must be signed in to change notification settings - Fork 35
/
evaluate.py
110 lines (86 loc) · 3.76 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
This script evaluates a embedding model in a semantic similarity perspective.
It uses the dataset of ASSIN sentence similarity shared task and the method
of Hartmann which achieved the best results in the competition.
ASSIN shared-task website:
http://propor2016.di.fc.ul.pt/?page_id=381
Paper of Hartmann can be found at:
http://www.linguamatica.com/index.php/linguamatica/article/download/v8n2-6/365
"""
from sklearn.linear_model import LinearRegression
from sentence_similarity.utils.assin_eval import read_xml, eval_similarity
from gensim.models import KeyedVectors
from xml.dom import minidom
from numpy import array
from os import path
import pickle
import argparse
from preprocessing import clean_text
DATA_DIR = 'sentence_similarity/data/'
TEST_DIR = path.join(DATA_DIR, 'assin-test-gold/')
def gensim_embedding_difference(data, field1, field2, clean=False, model=None):
    """Compute one similarity feature per sentence pair in *data*.

    For every pair, the tokens under ``pair[field1]`` and ``pair[field2]``
    are mapped into the embedding vocabulary and the similarity between the
    two token sets is computed with the model's ``n_similarity``.

    Setting ``clean`` to False reproduces the results reported by
    Hartmann et al.; setting it to True (pre-cleaning tokens and dropping
    out-of-vocabulary ones) universally improves the evaluated scores.

    :param data: iterable of dict-like pairs holding token lists.
    :param field1: key of the first sentence's token list.
    :param field2: key of the second sentence's token list.
    :param clean: if True, pre-process tokens with ``clean_text`` and drop
        out-of-vocabulary tokens; if False, map OOV tokens to ``'unk'``.
    :param model: embedding model exposing ``__contains__`` and
        ``n_similarity``; defaults to the module-level ``embeddings``
        loaded in the ``__main__`` section (backward-compatible fallback).
    :return: list of one-element lists, one similarity value per pair.
    """
    if model is None:
        # Backward compatibility: the original implementation read the
        # global `embeddings` created in the __main__ section.
        model = embeddings
    distances = []
    for pair in data:
        if clean:
            e1 = [clean_text(i) for i in pair[field1] if clean_text(i) in model]
            e2 = [clean_text(i) for i in pair[field2] if clean_text(i) in model]
        else:
            # Out-of-vocabulary tokens are replaced by the 'unk' token,
            # which is assumed to exist in the model's vocabulary.
            e1 = [i if i in model else 'unk' for i in pair[field1]]
            e2 = [i if i in model else 'unk' for i in pair[field2]]
        distances.append([model.n_similarity(e1, e2)])
    return distances
def evaluate_testset(x, y, test):
    """Fit a linear regression on the training pairs and score the test set.

    :param x: training feature matrix (similarity features per pair).
    :param y: gold similarity scores for the training pairs.
    :param test: feature matrix of the test-set pairs.
    :return: array of predicted similarity scores for the test set.
    """
    regressor = LinearRegression()
    regressor.fit(x, y)
    return regressor.predict(test)
def write_xml(filename, pred):
    """Write predicted similarity scores back into an ASSIN XML file.

    Each ``<pair>`` element of *filename* gets its ``similarity`` attribute
    set, in document order, to the corresponding value of *pred*; the file
    is then rewritten in place.

    :param filename: path of the XML file to update.
    :param pred: sequence of predicted scores, one per ``<pair>`` element.
    """
    with open(filename) as fp:
        xml = minidom.parse(fp)
    # enumerate() yields each node with its position directly; the original
    # pairs.index(pair) performed an O(n^2) equality scan per element.
    for index, pair in enumerate(xml.getElementsByTagName('pair')):
        pair.setAttribute('similarity', str(pred[index]))
    with open(filename, 'w') as fp:
        fp.write(xml.toxml())
if __name__ == '__main__':
    # Parser descriptors: one positional argument for the embedding file
    # and one for the Portuguese variant of the test set.
    parser = argparse.ArgumentParser(
        description='''Sentence similarity evaluation for word embeddings in
        brazilian and european variants of Portuguese language. It is expected
        a word embedding model in text format.''')
    parser.add_argument('embedding',
                        type=str,
                        help='embedding model')
    parser.add_argument('lang',
                        choices=['br', 'eu'],
                        help='{br, eu} choose PT-BR or PT-EU testset')
    args = parser.parse_args()
    lang = args.lang
    emb = args.embedding
    # Loading embedding model. Text (non-binary) word2vec format is
    # expected; undecodable bytes in the file are ignored.
    embeddings = KeyedVectors.load_word2vec_format(emb,
                                                   binary=False,
                                                   unicode_errors="ignore")
    # Loading evaluation data and parsing it.
    # NOTE(review): the pickles are assumed to hold lists of dicts with
    # 'tokens_t1', 'tokens_t2' and 'result' keys — see usage below.
    with open('%sassin-pt%s-train.pkl' % (DATA_DIR, lang), 'rb') as fp:
        data = pickle.load(fp)
    with open('%sassin-pt%s-test-gold.pkl' % (DATA_DIR, lang), 'rb') as fp:
        test = pickle.load(fp)
    # Getting features: one embedding-similarity value per sentence pair.
    features = gensim_embedding_difference(data, 'tokens_t1', 'tokens_t2')
    features_test = gensim_embedding_difference(test, 'tokens_t1', 'tokens_t2')
    # Predicting similarities: fit a linear regression on the gold training
    # scores, then predict scores for the test pairs and dump them to XML.
    results = array([float(i['result']) for i in data])
    results_test = evaluate_testset(features, results, features_test)
    write_xml('%soutput.xml' % DATA_DIR, results_test)
    # Evaluating: compare the system output against the gold test set
    # using the ASSIN shared-task evaluation utilities.
    pairs_gold = read_xml('%sassin-pt%s-test.xml' % (TEST_DIR, lang), True)
    pairs_sys = read_xml('%soutput.xml' % DATA_DIR, True)
    eval_similarity(pairs_gold, pairs_sys)