In [7]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
import numpy as np

from pa4_functions import load_data, print_stats, ExractSimpleFeatures, extractSegments, evaluateCV
from pa4_functions import evaluateCV_check, printLabelsToFile, printNMostInformative, gridSearchCV

def useClassifier(clf, encoder, train_features, train_labels, test_features, do_check=False, filename=None):
    """Cross-validate ``clf``, refit it on all training data and predict the test set.

    Parameters
    ----------
    clf : estimator or pipeline exposing ``fit`` and ``predict``.
    encoder : label encoder, forwarded to ``evaluateCV`` and ``printLabelsToFile``.
    train_features, train_labels : training inputs and their encoded labels.
    test_features : inputs that are predicted after the final fit.
    do_check : if truthy, additionally run ``evaluateCV_check`` on the training data.
        Defaults to ``False`` so that the shorter call form used in several cells
        (six positional arguments) keeps working.
    filename : if truthy, the test predictions are written with ``printLabelsToFile``
        and the file name is echoed. Defaults to ``None`` (nothing is written).

    Returns
    -------
    None. ``clf`` is left fitted on the full training set as a side effect.
    """
    avg = evaluateCV(clf, encoder, train_features, train_labels)
    print("Average: %f" % avg)

    if do_check:
        evaluateCV_check(clf, train_features, train_labels)

    # Final fit on the complete training set; later cells inspect the fitted clf.
    clf.fit(train_features, train_labels)
    test_label_predicted = clf.predict(test_features)

    if filename:
        printLabelsToFile(encoder, test_label_predicted, filename)
        print("printed to " + '"' + filename + '"')
###########################################################################################
# 1. LOAD DATA
###########################################################################################
# Read the training split (examples + labels) and the test split; the second value
# returned for the test file is discarded.
train_data, train_labels = load_data('train.json.txt', verbose=False)
test_data, _ = load_data('test.json.txt', verbose=False)

In [2]:
###########################################################################################
# 2. EXTRACT FEATURES and LABELS
###########################################################################################
# LABELS
# NOTE: despite the "_onehot" suffix, LabelEncoder.fit_transform returns one integer
# class index per sample, not a one-hot matrix.
le = LabelEncoder()
train_labels_onehot =le.fit_transform(train_labels)

# FEATURES - TRAIN & TEST
train_data_featurized = ExractSimpleFeatures(train_data, verbose=False)
test_data_featurized = ExractSimpleFeatures(test_data, verbose=False)

# NOTE(review): the three boolean flags presumably select the left / middle / right
# segment of each example (inferred from the variable names) -- confirm in pa4_functions.
train_data_middle_segment = extractSegments(train_data, False, True, False)
test_data_middle_segment = extractSegments(test_data, False, True, False)

train_data_all_segments = extractSegments(train_data, True, True, True)
test_data_all_segments = extractSegments(test_data, True, True, True)

In [3]:
###########################################################################################
# 3. BUILD PIPELINES
###########################################################################################
# best old pipeline
# clf3 = make_pipeline(TfidfVectorizer(ngram_range=(0, 3), analyzer='char'), LogisticRegression())

# clf1 (raw token counts + logistic regression) gives the same results as Tatyana's pipe
clf1 = make_pipeline(CountVectorizer(), LogisticRegression())
# clf2 replaces the raw counts with TF-IDF weights
clf2 = make_pipeline(TfidfVectorizer(), LogisticRegression())

### Optimization with GridSearch
In this section, the hyperparameters of each pipeline (vectorizer and classifier) are tuned with grid search.

- CountVectorizer and Logistic Regression

In [5]:
# This is the initial performance of the classifier: default CountVectorizer +
# LogisticRegression on all segments, without the extra check and without writing
# a predictions file (do_check=False, filename=None).
clf1 = make_pipeline(CountVectorizer(), LogisticRegression())
useClassifier(clf1, le, train_data_all_segments, train_labels_onehot, test_data_all_segments, False, None)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.661      0.714      0.671       2300
author                    0.815      0.813      0.814       2653
capital                   0.883      0.639      0.820        510
has_spouse                0.860      0.902      0.868       3019
worked_at                 0.727      0.611      0.700       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.789      0.736      0.775       9660
Average: 0.774663

Cross-validation scores (StratifiedKFold):  [0.77310582 0.77412754 0.77655771 0.77941565 0.77010789]
Mean cv score (StratifiedKFold):  0.7746629212275845


In [200]:
# First grid search, over the CountVectorizer settings. The recorded results suggest
# that combining uni- and bigrams, ngram_range=(1, 2), works best for this task.
# NOTE(review): the recorded output lists only 3 candidates, so it appears to predate
# the stop_words entry below -- re-run the cell to refresh it.
parameters = {
    'countvectorizer__analyzer': ['word', ],#'char'],
    'countvectorizer__stop_words': ['english', None],
    'countvectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 
}
clf1 = make_pipeline(CountVectorizer(), LogisticRegression())
gridSearchCV(clf1, parameters, train_data_middle_segment, train_labels_onehot, 0)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 1) 
[CV]  countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 1), total=   2.0s
[CV] countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.4s remaining:    0.0s


[CV]  countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 1), total=   2.4s
[CV] countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 1) 
[CV]  countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 1), total=   2.4s
[CV] countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 1) 
[CV]  countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 1), total=   1.9s
[CV] countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 1) 
[CV]  countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 1), total=   1.7s
[CV] countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 2) 
[CV]  countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 2), total=   3.6s
[CV] countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 2) 
[CV]  countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 2), total=   4.6s
[CV] countvectorizer__analyzer=word, countvectorizer__ngram_range=(1, 2) 
[CV]  countvectorizer_

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   55.3s finished


Best: 
0.762 (+/-0.014) for {'countvectorizer__analyzer': 'word', 'countvectorizer__ngram_range': (1, 2)}

Grid: 
0.757 (+/-0.013) for {'countvectorizer__analyzer': 'word', 'countvectorizer__ngram_range': (1, 1)}
0.762 (+/-0.014) for {'countvectorizer__analyzer': 'word', 'countvectorizer__ngram_range': (1, 2)}
0.736 (+/-0.022) for {'countvectorizer__analyzer': 'word', 'countvectorizer__ngram_range': (2, 2)}


{'countvectorizer__analyzer': 'word', 'countvectorizer__ngram_range': (1, 2)}

In [203]:
# Second grid search: compare solvers and multi-class strategies of the logistic
# regression while the vectorizer is fixed to uni- plus bigrams.
parameters2 = {
    #'logisticregression__C': [1, 10, 100, 1000],
    'logisticregression__penalty':['l2'],
    'logisticregression__solver':['lbfgs', 'newton-cg', 'sag'], # only with l2 penalty
    'logisticregression__multi_class':['ovr', 'multinomial'], # multinomial not for solver='liblinear'
    # n_jobs can be set for multi_class = ovr, but then solver cannot be liblinear
}
clf1 = make_pipeline(CountVectorizer(ngram_range=(1, 2)), LogisticRegression())
gridSearchCV(clf1, parameters2, train_data_middle_segment, train_labels_onehot, 2)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=  10.2s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.9s remaining:    0.0s


[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=  10.0s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=  10.1s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=   8.9s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=   8.3s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=newton-cg 
[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l



[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=sag, total=  16.7s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=sag 




[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=sag, total=  19.6s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=sag 




[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=sag, total=  16.7s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=sag 




[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=sag, total=  18.9s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=sag 




[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=sag, total=  18.6s
[CV] logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=  16.2s
[CV] logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=  10.9s
[CV] logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=  12.2s
[CV] logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregressi



[CV]  logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=sag, total=   8.5s
[CV] logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=sag 




[CV]  logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=sag, total=  10.8s
[CV] logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=sag 




[CV]  logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=sag, total=   5.9s
[CV] logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=sag 




[CV]  logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=sag, total=  11.5s
[CV] logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=sag 




[CV]  logisticregression__multi_class=multinomial, logisticregression__penalty=l2, logisticregression__solver=sag, total=   9.9s


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  6.7min finished


Best: 
0.763 (+/-0.014) for {'logisticregression__multi_class': 'ovr', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'lbfgs'}

Grid: 
0.763 (+/-0.014) for {'logisticregression__multi_class': 'ovr', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'lbfgs'}
0.763 (+/-0.014) for {'logisticregression__multi_class': 'ovr', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}
0.672 (+/-0.034) for {'logisticregression__multi_class': 'ovr', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'sag'}
0.758 (+/-0.016) for {'logisticregression__multi_class': 'multinomial', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'lbfgs'}
0.758 (+/-0.016) for {'logisticregression__multi_class': 'multinomial', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}
0.691 (+/-0.043) for {'logisticregression__multi_class': 'multinomial', 'logisticregression__penalty': 'l2', 'logisticregre

{'logisticregression__multi_class': 'ovr',
 'logisticregression__penalty': 'l2',
 'logisticregression__solver': 'lbfgs'}

In [153]:
# Unigram counts with the solver settings selected by the grid search.
# useClassifier expects do_check and filename after test_features; both are passed
# explicitly (no extra check, no predictions file).
clf2mod = make_pipeline(CountVectorizer(), LogisticRegression(multi_class='ovr', solver='lbfgs'))
useClassifier(clf2mod, le, train_data_all_segments, train_labels_onehot, test_data_all_segments, False, None)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.665      0.719      0.675       2300
author                    0.817      0.815      0.816       2653
capital                   0.879      0.645      0.819        510
has_spouse                0.862      0.902      0.869       3019
worked_at                 0.729      0.610      0.701       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.790      0.738      0.776       9660
Average: 0.776135

Cross-validation scores (StratifiedKFold):  [0.76938566 0.77412185 0.78397105 0.78192044 0.77127563]
Mean cv score (StratifiedKFold):  0.7761349285160714


In [204]:
# Same classifier settings, now with uni- plus bigram counts.
# do_check and filename are passed explicitly (no extra check, no predictions file).
clf2mod = make_pipeline(CountVectorizer(ngram_range=(1, 2)), LogisticRegression(multi_class='ovr', solver='lbfgs'))
useClassifier(clf2mod, le, train_data_all_segments, train_labels_onehot, test_data_all_segments, False, None)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.656      0.780      0.677       2300
author                    0.836      0.826      0.834       2653
capital                   0.932      0.657      0.859        510
has_spouse                0.893      0.909      0.896       3019
worked_at                 0.790      0.587      0.739       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.822      0.751      0.801       9660
Average: 0.801213

Cross-validation scores (StratifiedKFold):  [0.79292147 0.80051931 0.8116933  0.80162961 0.79930227]
Mean cv score (StratifiedKFold):  0.801213192485917


- TfidfVectorizer and Logistic Regression

In [151]:
# Grid over the TF-IDF vectorizer settings.
# NOTE: scikit-learn applies stop_words only when analyzer == 'word'; with the
# character analyzer used here both stop_words values behave the same, which matches
# the pairwise identical scores in the recorded grid output.
parameters0 = {
    'tfidfvectorizer__analyzer': ['char'], #['word', 'char'],
    'tfidfvectorizer__stop_words': ['english', None],
    'tfidfvectorizer__ngram_range': [(1, 2), (1, 3), (2, 3)],
    #'tfidfvectorizer__use_idf': [True, False],
    #'tfidfvectorizer__max_df': [0.1, 0.5, 1.0],
    #'tfidfvectorizer__min_df': [1, 5, 10],    
}
# Grid over the regularisation strength C with the liblinear solver.
parameters1 = {
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__penalty':['l2'], # 'l1',  always seems to be worse
    'logisticregression__solver':['liblinear'], # , 'saga' does not seem to converge
    #'logisticregression__dual':[True, False], # can only be true for l2 penalty with liblinear solver
}
# Grid over solver and multi-class strategy; this repeats the parameters2 grid defined
# in the CountVectorizer section with the same contents.
parameters2 = {
    #'logisticregression__C': [1, 10, 100, 1000],
    'logisticregression__penalty':['l2'],
    'logisticregression__solver':['lbfgs', 'newton-cg', 'sag'], # only with l2 penalty
    'logisticregression__multi_class':['ovr', 'multinomial'], # multinomial not for solver='liblinear'
    # n_jobs can be set for multi_class = ovr, but then solver cannot be liblinear
}

In [24]:
# Tune the vectorizer settings of clf2 (TfidfVectorizer + LogisticRegression) on the middle segments.
# NOTE(review): the CountVectorizer cells pass a fifth positional argument to gridSearchCV
# and this call does not -- confirm the helper's default for it.
gridSearchCV(clf2, parameters0, train_data_middle_segment, train_labels_onehot)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] tfidfvectorizer__analyzer=char, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__stop_words=english 
[CV]  tfidfvectorizer__analyzer=char, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__stop_words=english, total=   4.3s
[CV] tfidfvectorizer__analyzer=char, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.1s remaining:    0.0s


[CV]  tfidfvectorizer__analyzer=char, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__stop_words=english, total=   3.3s
[CV] tfidfvectorizer__analyzer=char, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__stop_words=english 
[CV]  tfidfvectorizer__analyzer=char, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__stop_words=english, total=   4.4s
[CV] tfidfvectorizer__analyzer=char, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__stop_words=english 
[CV]  tfidfvectorizer__analyzer=char, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__stop_words=english, total=   3.0s
[CV] tfidfvectorizer__analyzer=char, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__stop_words=english 
[CV]  tfidfvectorizer__analyzer=char, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__stop_words=english, total=   2.8s
[CV] tfidfvectorizer__analyzer=char, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__stop_words=None 
[CV]  tfidfvectorizer__analyzer=char, tfidfvectorizer__

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  3.6min finished


Best: 
0.782 (+/-0.010) for {'tfidfvectorizer__analyzer': 'char', 'tfidfvectorizer__ngram_range': (2, 3), 'tfidfvectorizer__stop_words': 'english'}

Grid: 
0.748 (+/-0.009) for {'tfidfvectorizer__analyzer': 'char', 'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__stop_words': 'english'}
0.748 (+/-0.009) for {'tfidfvectorizer__analyzer': 'char', 'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__stop_words': None}
0.780 (+/-0.010) for {'tfidfvectorizer__analyzer': 'char', 'tfidfvectorizer__ngram_range': (1, 3), 'tfidfvectorizer__stop_words': 'english'}
0.780 (+/-0.010) for {'tfidfvectorizer__analyzer': 'char', 'tfidfvectorizer__ngram_range': (1, 3), 'tfidfvectorizer__stop_words': None}
0.782 (+/-0.010) for {'tfidfvectorizer__analyzer': 'char', 'tfidfvectorizer__ngram_range': (2, 3), 'tfidfvectorizer__stop_words': 'english'}
0.782 (+/-0.010) for {'tfidfvectorizer__analyzer': 'char', 'tfidfvectorizer__ngram_range': (2, 3), 'tfidfvectorizer__stop_words': None}


{'tfidfvectorizer__analyzer': 'char',
 'tfidfvectorizer__ngram_range': (2, 3),
 'tfidfvectorizer__stop_words': 'english'}

In [30]:
# TF-IDF over character 2- and 3-grams, the best vectorizer setting from the grid search.
# (stop_words is kept as selected by the search; scikit-learn ignores it for analyzer='char'.)
# do_check and filename are passed explicitly (no extra check, no predictions file).
clf2mod = make_pipeline(TfidfVectorizer(stop_words='english', ngram_range=(2, 3), analyzer='char'), LogisticRegression())
useClassifier(clf2mod, le, train_data_middle_segment, train_labels_onehot, test_data_middle_segment, False, None)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.598      0.730      0.621       2300
author                    0.874      0.850      0.869       2653
capital                   0.871      0.620      0.805        510
has_spouse                0.852      0.852      0.852       3019
worked_at                 0.814      0.609      0.762       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.802      0.732      0.782       9660
Average: 0.78

Cross-validation scores (StratifiedKFold):  [0.76872924 0.78926317 0.81156896 0.77699781 0.76176503]
Mean cv score (StratifiedKFold):  0.7816648402230364


In [42]:
# Tune C for the liblinear solver (grid parameters1) on the char n-gram TF-IDF pipeline.
# NOTE(review): the recorded output shows 2 candidates although parameters1 lists three
# C values, so the output may stem from an earlier version of the grid.
clf2mod = make_pipeline(TfidfVectorizer(stop_words='english', ngram_range=(2, 3), analyzer='char'), LogisticRegression())
gridSearchCV(clf2mod, parameters1, train_data_middle_segment, train_labels_onehot)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] logisticregression__C=0.1, logisticregression__penalty=l2, logisticregression__solver=liblinear 
[CV]  logisticregression__C=0.1, logisticregression__penalty=l2, logisticregression__solver=liblinear, total=   3.9s
[CV] logisticregression__C=0.1, logisticregression__penalty=l2, logisticregression__solver=liblinear 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.9s remaining:    0.0s


[CV]  logisticregression__C=0.1, logisticregression__penalty=l2, logisticregression__solver=liblinear, total=   3.8s
[CV] logisticregression__C=0.1, logisticregression__penalty=l2, logisticregression__solver=liblinear 
[CV]  logisticregression__C=0.1, logisticregression__penalty=l2, logisticregression__solver=liblinear, total=   3.9s
[CV] logisticregression__C=0.1, logisticregression__penalty=l2, logisticregression__solver=liblinear 
[CV]  logisticregression__C=0.1, logisticregression__penalty=l2, logisticregression__solver=liblinear, total=   5.8s
[CV] logisticregression__C=0.1, logisticregression__penalty=l2, logisticregression__solver=liblinear 
[CV]  logisticregression__C=0.1, logisticregression__penalty=l2, logisticregression__solver=liblinear, total=   4.6s
[CV] logisticregression__C=1, logisticregression__penalty=l2, logisticregression__solver=liblinear 
[CV]  logisticregression__C=1, logisticregression__penalty=l2, logisticregression__solver=liblinear, total=   5.0s
[CV] logist

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.2min finished


Best: 
0.782 (+/-0.010) for {'logisticregression__C': 1, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'liblinear'}

Grid: 
0.741 (+/-0.015) for {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'liblinear'}
0.782 (+/-0.010) for {'logisticregression__C': 1, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'liblinear'}


{'logisticregression__C': 1,
 'logisticregression__penalty': 'l2',
 'logisticregression__solver': 'liblinear'}

In [44]:
# Compare solvers and multi-class strategies (grid parameters2) on the char n-gram TF-IDF pipeline.
clf2mod = make_pipeline(TfidfVectorizer(stop_words='english', ngram_range=(2, 3), analyzer='char'), LogisticRegression())
gridSearchCV(clf2mod, parameters2, train_data_middle_segment, train_labels_onehot)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=   6.2s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.2s remaining:    0.0s


[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=   5.9s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=   7.3s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=   7.8s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs 
[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=lbfgs, total=   7.7s
[CV] logisticregression__multi_class=ovr, logisticregression__penalty=l2, logisticregression__solver=newton-cg 
[CV]  logisticregression__multi_class=ovr, logisticregression__penalty=l

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  4.9min finished


Best: 
0.784 (+/-0.012) for {'logisticregression__multi_class': 'multinomial', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'lbfgs'}

Grid: 
0.782 (+/-0.010) for {'logisticregression__multi_class': 'ovr', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'lbfgs'}
0.782 (+/-0.010) for {'logisticregression__multi_class': 'ovr', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}
0.782 (+/-0.010) for {'logisticregression__multi_class': 'ovr', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'sag'}
0.784 (+/-0.012) for {'logisticregression__multi_class': 'multinomial', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'lbfgs'}
0.784 (+/-0.011) for {'logisticregression__multi_class': 'multinomial', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}
0.784 (+/-0.011) for {'logisticregression__multi_class': 'multinomial', 'logisticregression__penalty': 'l2', 'logis

{'logisticregression__multi_class': 'multinomial',
 'logisticregression__penalty': 'l2',
 'logisticregression__solver': 'lbfgs'}

In [47]:
# With the multinomial / lbfgs configuration fixed, re-tune the regularisation strength C.
parameters4 = {
    'logisticregression__C': [0.1, 1, 10],
}
clf2mod = make_pipeline(TfidfVectorizer(stop_words='english', ngram_range=(2, 3), analyzer='char'), 
                        LogisticRegression(multi_class='multinomial', solver='lbfgs'))
gridSearchCV(clf2mod, parameters4, train_data_middle_segment, train_labels_onehot)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] logisticregression__C=0.1 .......................................
[CV] ........................ logisticregression__C=0.1, total=   5.2s
[CV] logisticregression__C=0.1 .......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.7s remaining:    0.0s


[CV] ........................ logisticregression__C=0.1, total=   6.7s
[CV] logisticregression__C=0.1 .......................................
[CV] ........................ logisticregression__C=0.1, total=   6.5s
[CV] logisticregression__C=0.1 .......................................
[CV] ........................ logisticregression__C=0.1, total=   7.2s
[CV] logisticregression__C=0.1 .......................................
[CV] ........................ logisticregression__C=0.1, total=   6.5s
[CV] logisticregression__C=1 .........................................
[CV] .......................... logisticregression__C=1, total=  10.5s
[CV] logisticregression__C=1 .........................................
[CV] .......................... logisticregression__C=1, total=  12.2s
[CV] logisticregression__C=1 .........................................
[CV] .......................... logisticregression__C=1, total=  10.8s
[CV] logisticregression__C=1 .........................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  2.7min finished


Best: 
0.784 (+/-0.012) for {'logisticregression__C': 1}

Grid: 
0.748 (+/-0.016) for {'logisticregression__C': 0.1}
0.784 (+/-0.012) for {'logisticregression__C': 1}
0.781 (+/-0.026) for {'logisticregression__C': 10}


{'logisticregression__C': 1}

In [107]:
# This classifier uses the tuned parameters and gets marginally better results.
# do_check and filename are passed explicitly (no extra check, no predictions file).
clf2mod = make_pipeline(TfidfVectorizer(stop_words='english', ngram_range=(2, 3), analyzer='char'), 
                        LogisticRegression(C=1, multi_class='multinomial', solver='lbfgs'))
useClassifier(clf2mod, le, train_data_middle_segment, train_labels_onehot, test_data_middle_segment, False, None)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.615      0.735      0.636       2300
author                    0.884      0.862      0.879       2653
capital                   0.870      0.643      0.812        510
has_spouse                0.861      0.854      0.860       3019
worked_at                 0.791      0.637      0.754       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.804      0.746      0.788       9660
Average: 0.788008

Cross-validation scores (StratifiedKFold):  [0.77130256 0.78812138 0.8176673  0.78067575 0.78227256]
Mean cv score (StratifiedKFold):  0.7880079089463325


### Best Classifier

In [8]:
# Best pipeline found: uni- plus bigram counts with one-vs-rest logistic regression (lbfgs),
# trained on all segments. do_check=True additionally runs evaluateCV_check; no predictions
# file is written (filename=None). The fitted clf is inspected in the next cell.
clf = make_pipeline(CountVectorizer(ngram_range=(1, 2)), LogisticRegression(multi_class='ovr', solver='lbfgs'))
useClassifier(clf, le, train_data_all_segments, train_labels_onehot, test_data_all_segments, True, None)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.656      0.780      0.677       2300
author                    0.836      0.826      0.834       2653
capital                   0.932      0.657      0.859        510
has_spouse                0.893      0.909      0.896       3019
worked_at                 0.790      0.587      0.739       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.822      0.751      0.801       9660
Average: 0.801213

Cross-validation scores (StratifiedKFold):  [0.79292147 0.80051931 0.8116933  0.80162961 0.79930227]
Mean cv score (StratifiedKFold):  0.801213192485917


In [9]:
# Show the highest-weighted features per class for the fitted pipeline `clf`.
# NOTE(review): 3 is presumably the number of features per class and 'countvectorizer'
# the name of the pipeline step holding the vocabulary -- confirm in pa4_functions.
print("Top features used to predict: ")
#printNMostInformative(clf, le, 3, 'dictvectorizer')
printNMostInformative(clf, le, 3, 'countvectorizer')

Top features used to predict: 
(5, 772654)

Class NO_REL best: 
(0.6592857822275802, 'india')
(0.6751229576087763, 'stars')
(0.7125847509006051, 'bill')

Class author best: 
(1.1773082744019743, 'books')
(1.7301629307234392, 'book')
(2.3289587019069633, 'novel')

Class capital best: 
(0.7251712587666335, 'airport')
(1.1386176456328614, 'capital of')
(1.3739590687226302, 'capital')

Class has_spouse best: 
(1.9053646128553086, 'married')
(2.16041766314762, 'husband')
(2.9616921783107704, 'wife')

Class worked_at best: 
(1.2986412437578314, 'ceo')
(1.3954142887266279, 'founder')
(2.1071297174143093, 'professor')


In [106]:
# plot_coefficients(clf2mod, 'countvectorizer', le)

In [10]:
# Same setup extended to trigrams, ngram_range=(1, 3).
# NOTE: the recorded run of this cell was interrupted (KeyboardInterrupt), so no scores
# are available for this configuration; it also rebinds `clf` to an unfitted pipeline.
clf = make_pipeline(CountVectorizer(ngram_range=(1, 3)), LogisticRegression(multi_class='ovr', solver='lbfgs'))
useClassifier(clf, le, train_data_all_segments, train_labels_onehot, test_data_all_segments, True, None)

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------


KeyboardInterrupt: 

### Test with Word2Vec vectors

In [206]:
from gensim.models import KeyedVectors
# at the moment, this function checks each word in the input array and replaces it with the mean of the w2v vector
def w2v_transform(input_array, word_vectors=None, max_words=120,
                  vectors_path='GoogleNews-vectors-negative300.bin'):
    """Turn each text into a fixed-width row of per-word embedding means.

    Every text is lower-cased, split on whitespace and reduced to its distinct
    words in order of first occurrence. Column ``j`` of a row holds the mean of
    the embedding vector of the ``j``-th distinct word, or 0 when the word is not
    in the vocabulary. Rows with fewer than ``max_words`` distinct words are
    padded with zeros; longer ones are cut off.

    Parameters
    ----------
    input_array : iterable of str, e.g. all middle segments.
    word_vectors : optional mapping-like object supporting ``word in word_vectors``
        and ``word_vectors[word]`` (for example a loaded gensim ``KeyedVectors``).
        Pass it in to avoid re-reading the large embedding file on every call.
        When ``None`` the vectors are loaded from ``vectors_path``.
    max_words : number of columns of the result (default 120, as before).
    vectors_path : word2vec binary file that is loaded when ``word_vectors`` is None.

    Returns
    -------
    numpy.ndarray of shape ``(len(input_array), max_words)``.
    """
    if word_vectors is None:
        # the word2vec vectors can be downloaded from (1.5GB):
        # https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
        word_vectors = KeyedVectors.load_word2vec_format(vectors_path, binary=True)

    texts = list(input_array)
    mean_matrix = np.zeros((len(texts), max_words))
    for row, text in enumerate(texts):
        # dict.fromkeys de-duplicates while keeping first-occurrence order; iterating a
        # set of strings would assign words to columns differently from run to run.
        distinct_words = list(dict.fromkeys(text.lower().split()))
        for col, word in enumerate(distinct_words[:max_words]):
            # `in` on the vectors object works across gensim versions, whereas the
            # `.vocab` attribute is not available in gensim 4.
            if word in word_vectors:
                mean_matrix[row, col] = np.mean(word_vectors[word])
    return mean_matrix

In [207]:
# Build the word2vec-based feature matrix for the middle segments of the training data.
matrix = w2v_transform(train_data_middle_segment)

In [210]:
# Cross-validate a plain logistic regression on the word2vec features.
# `matrix` is the result of w2v_transform from the previous cell; `mean_matrix` only
# exists as a local variable inside that function and is undefined on a fresh kernel.
evaluateCV_check(LogisticRegression(), matrix, train_labels_onehot)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Cross-validation scores (StratifiedKFold):  [0.08572679 0.10160061 0.09606363 0.10468581 0.08435447]
Mean cv score (StratifiedKFold):  0.09448625989628309


  'precision', 'predicted', average, warn_for)
