In [53]:
import gzip
import numpy as np
import random
import os
import json

from collections import Counter, defaultdict, namedtuple
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, fbeta_score, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import FunctionTransformer,LabelEncoder
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [3]:
############################################################################################
# 1. LOAD DATA
############################################################################################

PairExample = namedtuple('PairExample', 'entity_1, entity_2, snippet')
Snippet = namedtuple('Snippet', 'left, mention_1, middle, mention_2, right, direction')
def load_data(file, verbose=True):
    f = open(file,'r', encoding='utf-8')
    data = []
    labels = []
    for i,line in enumerate(f):
        instance = json.loads(line)
        if i==0:
            if verbose:
                print('json example:')
                print(instance)
        #'relation, entity_1, entity_2, snippet' fileds for each example
        #'left, mention_1, middle, mention_2, right, direction' for each snippet
        instance_tuple = PairExample(instance['entity_1'],instance['entity_2'],[])
        for snippet in instance['snippet']:
            try:
                snippet_tuple = Snippet(snippet['left'],snippet['mention_1'],
                                        snippet['middle'], 
                                        snippet['mention_2'],snippet['right'],
                                        snippet['direction'])
                instance_tuple.snippet.append(snippet_tuple)
            except:
                print(instance)
        if i==0:
            if verbose:
                print('\nexample transformed as a named tuple:')
                print(instance_tuple)
        data.append(instance_tuple)
        labels.append(instance['relation'])
    return data,labels
    
train_data, train_labels = load_data('train.json.txt')

json example:
{'relation': 'has_spouse', 'entity_1': 'Judy_Garland', 'entity_2': 'David_Rose', 'snippet': [{'left': 'thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', 'mention_1': 'Judy Garland', 'middle': 'while she was engaged to composer', 'mention_2': 'David Rose', 'right': '. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', 'direction': 'fwd'}]}

example transformed as a named tuple:
PairExample(entity_1='Judy_Garland', entity_2='David_Rose', snippet=[Snippet(left='thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', mention_1='Judy Garland', middle='while she was engaged to composer', mention_2='David Rose', right='. Garland married Rose to temporarily stop the affair , but 

In [4]:
# Statistics over relations
def print_stats(labels):
    labels_counts = Counter(labels)
    print('{:20s} {:>10s} {:>10s}'.format('', '', 'rel_examples'))
    print('{:20s} {:>10s} {:>10s}'.format('relation', 'examples', '/all_examples'))
    print('{:20s} {:>10s} {:>10s}'.format('--------', '--------', '-------'))
    for k,v in labels_counts.items():
        print('{:20s} {:10d} {:10.2f}'.format(k, v, v /len(labels)))
    print('{:20s} {:>10s} {:>10s}'.format('--------', '--------', '-------'))
    print('{:20s} {:10d} {:10.2f}'.format('Total', len(labels), len(labels) /len(labels)))

print('Train set statistics:')
print_stats(train_labels)

Train set statistics:
                                rel_examples
relation               examples /all_examples
--------               --------    -------
has_spouse                 3019       0.31
author                     2653       0.27
NO_REL                     2300       0.24
capital                     510       0.05
worked_at                  1178       0.12
--------               --------    -------
Total                      9660       1.00


In [15]:
### THESE DICTS ARE NOT USED

# check that each entity pair is assigned only one relation
pair_dict={}
rel_dict={}
for example, label in zip(train_data,train_labels):
    if (example.entity_1,example.entity_2) not in pair_dict.keys():
        pair_dict[(example.entity_1,example.entity_2)] = [label]
        
    else:
        pair_dict[(example.entity_1,example.entity_2)].append(label)
        print(example.entity_1,example.entity_2,label)
    if label not in rel_dict.keys():
        rel_dict[label] = [example]
    else:
        rel_dict[label].append(example)
print("Done building dictionary")  
    
# example for each relation
#for rel in rel_dict.keys():
    #ex = rel_dict[rel][0]
    #print(rel,ex.entity_1,ex.entity_2) 

Done building dictionary


In [20]:
# how to reconstruct full context
ex = train_data[0]
print(ex)
print("\n full context:")
s = ex.snippet[0]
print(' '.join((s.left, s.mention_1, s.middle, s.mention_2, s.right)))

PairExample(entity_1='Judy_Garland', entity_2='David_Rose', snippet=[Snippet(left='thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', mention_1='Judy Garland', middle='while she was engaged to composer', mention_2='David Rose', right='. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', direction='fwd')])

 full context:
thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old Judy Garland while she was engaged to composer David Rose . Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair


In [40]:
###########################################################################################
# 2. EXTRACT FEATURES and BUILD CLASSIFIER
###########################################################################################

# Extract two simple features
def ExractSimpleFeatures(data, verbose=True):
    featurized_data = []
    for instance in data:
        featurized_instance = {'mid_words':'', 'distance':np.inf}
        for s in instance.snippet:
            if len(s.middle.split()) < featurized_instance['distance']:
                featurized_instance['mid_words'] = s.middle
                featurized_instance['distance'] = len(s.middle.split())
        featurized_data.append(featurized_instance)
    if verbose:
        print(len(data))
        print(len(featurized_data))
        print(data[0])
        print(featurized_data[0])
    return featurized_data

In [42]:
# Transform dataset to features
train_data_featurized = ExractSimpleFeatures(train_data)

9660
9660
PairExample(entity_1='Judy_Garland', entity_2='David_Rose', snippet=[Snippet(left='thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', mention_1='Judy Garland', middle='while she was engaged to composer', mention_2='David Rose', right='. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', direction='fwd')])
{'mid_words': 'while she was engaged to composer', 'distance': 6}


In [14]:
# Transform labels to nimeric values
le = LabelEncoder()
train_labels_featurized = le.fit_transform(train_labels)

# Fit model one vs rest logistic regression    
clf = make_pipeline(DictVectorizer(), LogisticRegression())

In [26]:
train_data[0].snippet

[Snippet(left='thirty and his life and career were riding high . In 1941 , shortly after the death of his father , Mercer began an intense affair with nineteen-year-old', mention_1='Judy Garland', middle='while she was engaged to composer', mention_2='David Rose', right='. Garland married Rose to temporarily stop the affair , but the effect on Mercer lingered , adding to the emotional depth of his lyrics . Their affair', direction='fwd')]

In [71]:
# Extract all middle segments
# One instance may contain multiple snippets
def getMiddleSegmentData(data, left, middle, right):
    all_segments = []
    for instance in data:
        combined_segments = ""
        for snippet in instance.snippet:
            if left:
                combined_segments += snippet.left
            if middle:
                combined_segments += snippet.middle
            if right:
                combined_segments += snippet.right
        all_segments.append(combined_segments)
    return all_segments

train_data_middle_segment = getMiddleSegmentData(train_data, False, True, False)
train_data_all_segments = getMiddleSegmentData(train_data, True, True, True)

In [64]:
clf2 = make_pipeline(TfidfVectorizer(), LogisticRegression())

In [65]:
clf3 = make_pipeline(TfidfVectorizer(ngram_range=(0,3) ,analyzer='char'), LogisticRegression())

In [38]:
##################################################################################################
# 2. TRAIN CLASSIFIER AND EVALUATE (CV)
##################################################################################################

def print_statistics_header():
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        'relation', 'precision', 'recall', 'f-score', 'support'))
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        '-' * 18, '-' * 9, '-' * 9, '-' * 9, '-' * 9))

def print_statistics_row(rel, result):
    print('{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d}'.format(rel, *result))

def print_statistics_footer(avg_result):
    print('{:20s} {:>10s} {:>10s} {:>10s} {:>10s}'.format(
        '-' * 18, '-' * 9, '-' * 9, '-' * 9, '-' * 9))
    print('{:20s} {:10.3f} {:10.3f} {:10.3f} {:10d}'.format('macro-average', *avg_result))

def macro_average_results(results):
    avg_result = [np.average([r[i] for r in results.values()]) for i in range(3)]
    avg_result.append(np.sum([r[3] for r in results.values()]))
    return avg_result

def average_results(results):
    avg_result = [np.average([r[i] for r in results]) for i in range(3)]
    avg_result.append(np.sum([r[3] for r in results]))
    return avg_result
    
def evaluateCV(classifier, label_encoder, X, y, verbose=True):
    results = {}
    for rel in le.classes_:
            results[rel] = []
    if verbose:
        print_statistics_header()
        kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=0) 
        for train_index, test_index in kfold.split(X, y):
            #print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
            y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
            classifier.fit(X_train, y_train)
            pred_labels = classifier.predict(X_test)
            stats = precision_recall_fscore_support(y_test, pred_labels, beta=0.5)
            #print(stats)
            for rel in label_encoder.classes_:
                rel_id = label_encoder.transform([rel])[0]
            #print(rel_id,rel)
                stats_rel = [stat[rel_id] for stat in stats]
                results[rel].append(stats_rel)
        for rel in label_encoder.classes_:
            results[rel] = average_results(results[rel])
            if verbose:
                print_statistics_row(rel, results[rel])
    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic

In [49]:
evaluateCV(clf,le,train_data_featurized,train_labels_featurized)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.404      0.783      0.447       2300
author                    0.737      0.636      0.713       2653
capital                   0.614      0.063      0.200        510
has_spouse                0.877      0.717      0.840       3019
worked_at                 0.758      0.239      0.525       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.678      0.487      0.545       9660


0.5449903063516611

In [67]:
evaluateCV(clf2,le,train_data_middle_segment,train_labels_featurized)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.507      0.671      0.533       2300
author                    0.856      0.745      0.831       2653
capital                   0.843      0.518      0.746        510
has_spouse                0.825      0.858      0.831       3019
worked_at                 0.814      0.587      0.755       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.769      0.676      0.739       9660


0.7393568037393644

In [72]:
evaluateCV(clf2,le,train_data_all_segments,train_labels_featurized)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.615      0.660      0.623       2300
author                    0.755      0.842      0.771       2653
capital                   0.932      0.498      0.793        510
has_spouse                0.766      0.864      0.784       3019
worked_at                 0.832      0.392      0.678       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.780      0.651      0.730       9660


0.7299269073222046

In [66]:
evaluateCV(clf3,le,train_data_middle_segment,train_labels_featurized)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.612      0.705      0.629       2300
author                    0.842      0.871      0.848       2653
capital                   0.890      0.573      0.800        510
has_spouse                0.835      0.850      0.838       3019
worked_at                 0.795      0.581      0.740       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.795      0.716      0.771       9660


0.770894711364053

In [33]:
# A check for the average F1 score

f_scorer = make_scorer(fbeta_score, beta=0.5, average='macro')

def evaluateCV_check(classifier, X, y, verbose=True):
    kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=0) 
    scores = cross_val_score(classifier, X, y, cv=kfold, scoring = f_scorer)
    print("\nCross-validation scores (StratifiedKFold): ", scores)
    print("Mean cv score (StratifiedKFold): ", scores.mean())

In [36]:
evaluateCV_check(clf,train_data_featurized,train_labels_featurized)


Cross-validation scores (StratifiedKFold):  [0.5393179  0.57372145 0.54955579 0.50265555 0.55970084]
Mean cv score (StratifiedKFold):  0.5449903063516612


In [34]:
evaluateCV_check(clf2,train_data_middle_segment,train_labels_featurized)


Cross-validation scores (StratifiedKFold):  [0.73294155 0.74584423 0.73944281 0.73395541 0.74460001]
Mean cv score (StratifiedKFold):  0.7393568037393644


In [50]:
evaluateCV_check(clf3,train_data_middle_segment,train_labels_featurized)


Cross-validation scores (StratifiedKFold):  [0.71893154 0.74422827 0.74656978 0.74079391 0.71748781]
Mean cv score (StratifiedKFold):  0.7336022616899978


In [37]:
#########################################################################################
# 4. TEST PREDICTIONS and ANALYSIS
#########################################################################################

# Fit final model on the full train data
clf.fit(train_data_featurized, train_labels_featurized)

# Predict on test set
test_data, test_labels = load_data('test.json.txt', verbose=False)
test_data_featurized = ExractSimpleFeatures(test_data, verbose=False)
test_label_predicted = clf.predict(test_data_featurized)
# Deprecation warning explained: https://stackoverflow.com/questions/49545947/sklearn-deprecationwarning-truth-value-of-an-array
test_label_predicted_decoded = le.inverse_transform(test_label_predicted)
print(test_label_predicted_decoded[:2])
f = open("test_labels.txt", 'w', encoding="utf-8")
for label in test_label_predicted_decoded:
    f.write(label+'\n')

FileNotFoundError: [Errno 2] No such file or directory: 'test.json.txt'

In [38]:
# Feature analisys - print N most informative
# !! Make changes in this function when you change the pipleine!!
def printNMostInformative(classifier,label_encoder,N):
    """Prints features with the highest coefficient values, per class"""
    feature_names = classifier.named_steps['dictvectorizer'].get_feature_names()

    coef = classifier.named_steps['logisticregression'].coef_    
    print(coef.shape)
    for rel in label_encoder.classes_:
        rel_id = label_encoder.transform([rel])[0]
        coef_rel = coef[rel_id]
        coefs_with_fns = sorted(zip(coef_rel, feature_names))
        top_features = coefs_with_fns[-N:]
        print("\nClass {} best: ".format(rel))
        for feat in top_features:
            print(feat)        
        
print("Top features used to predict: ")
# show the top features
printNMostInformative(clf,le,3)

Top features used to predict: 
(5, 3984)

Class NO_REL best: 
(1.4454404891518906, 'mid_words=or')
(1.4768824721441594, 'mid_words=and the')
(1.7478700821347326, 'mid_words=, and')

Class author best: 
(3.2329724590543107, "mid_words='s novel")
(3.396538059707806, 'mid_words=, by')
(4.900904204379451, 'mid_words=by')

Class capital best: 
(2.2261657584289756, 'mid_words=, in')
(2.816573434315352, 'mid_words=in')
(3.407334079785748, 'mid_words=after')

Class has_spouse best: 
(3.290605086968, 'mid_words=&')
(3.3575566159732424, 'mid_words=married')
(4.037952996019875, 'mid_words=and his wife')

Class worked_at best: 
(2.6555584767554246, 'mid_words=professor')
(2.789350716076006, 'mid_words=CEO')
(2.7936190235252005, 'mid_words=of the')


In [41]:
clf.named_steps['dictvectorizer'].get_feature_names()

['distance',
 'mid_words=',
 'mid_words=! It was posted in',
 'mid_words=&',
 'mid_words=& Bhs boss ,',
 'mid_words=& Geoffrey Pullum , et al . ( 2002 ) .',
 'mid_words=& John Pisano . “ Magic Trumpet ” is a',
 "mid_words='",
 "mid_words=' 1910 book",
 "mid_words=' 2001 memoir ,",
 "mid_words=' 2006 novel",
 "mid_words=' 2011 novel ,",
 "mid_words=' New York Times Bestseller",
 "mid_words=' book ,",
 "mid_words=' by",
 "mid_words=' collection of short stories",
 "mid_words=' founder Andre Harrell ,",
 "mid_words=' husband",
 "mid_words=' novel",
 "mid_words=' play",
 "mid_words=' poem '",
 "mid_words=' production company",
 "mid_words=' s",
 "mid_words=' second novel",
 "mid_words=' tragedy",
 "mid_words=''",
 "mid_words='01 (",
 "mid_words='07 ( Salk Institute /",
 "mid_words='Where the Sidewalk Ends ' and",
 "mid_words='s",
 "mid_words='s '",
 "mid_words='s ,",
 "mid_words='s , and Nick Park 's",
 "mid_words='s 1820 novel",
 "mid_words='s 1826 novel",
 "mid_words='s 1875 novel",
 "mi