In [1]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
import numpy as np

from pa4_functions import load_data, print_stats, ExractSimpleFeatures, extractSegments, evaluateCV, evaluateCV_check, printLabelsToFile, printNMostInformative

def useClassifier(clf, encoder, train_features, train_labels, test_features, filename):
    avg = evaluateCV(clf, encoder, train_features, train_labels)
    print("Average: %.2f" % avg)
    evaluateCV_check(clf, train_features, train_labels)

    clf.fit(train_features, train_labels)   
    test_label_predicted = clf.predict(test_features)

    printLabelsToFile(encoder, test_label_predicted, filename)
    print("printed to " + '"' + filename + '"')
###########################################################################################
# 2. LOAD DATA
###########################################################################################
train_data, train_labels = load_data('train.json.txt', verbose=False)
test_data, _ = load_data('test.json.txt', verbose=False)

In [16]:
def gridSearchCV(pipeline, parameters, features, labels):
    f_scorer = make_scorer(fbeta_score, beta=0.5, average='macro')
    gridSearch = GridSearchCV(pipeline, parameters, scoring=f_scorer, cv=5, verbose=2)
    gridSearch.fit(features, labels)
    means = gridSearch.cv_results_['mean_test_score']
    stds = gridSearch.cv_results_['std_test_score']
    parameters = gridSearch.cv_results_['params']
    print("Best: ")
    print("%0.3f (+/-%0.03f) for %r" % (means[gridSearch.best_index_], 
                                        stds[gridSearch.best_index_] * 2, parameters[gridSearch.best_index_]))
    print("\nGrid: ")
    for mean, std, params in zip(means, stds, parameters):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    return gridSearch.best_params_

In [3]:
###########################################################################################
# 2. EXTRACT FEATURES and LABELS
###########################################################################################
# LABELS
le = LabelEncoder()
train_labels_onehot =le.fit_transform(train_labels)

# FEATURES - TRAIN & TEST
train_data_featurized = ExractSimpleFeatures(train_data, verbose=False)
test_data_featurized = ExractSimpleFeatures(test_data, verbose=False)

train_data_middle_segment = extractSegments(train_data, False, True, False)
test_data_middle_segment = extractSegments(test_data, False, True, False)

train_data_all_segments = extractSegments(train_data, True, True, True)
test_data_all_segments = extractSegments(test_data, True, True, True)

In [4]:
###########################################################################################
# 3. BUILD PIPELINES
###########################################################################################
#clf = make_pipeline(DictVectorizer(), LogisticRegression())
clf2 = make_pipeline(TfidfVectorizer(), LogisticRegression())
clf3 = make_pipeline(TfidfVectorizer(ngram_range=(0, 3), analyzer='char'), LogisticRegression())

In [61]:
# can be optimized with parameters
clf = make_pipeline(DictVectorizer(), LogisticRegression(C=1000, multi_class='multinomial', solver='newton-cg'))
evaluateCV_check(clf, train_data_featurized, train_labels_onehot) 


Cross-validation scores (StratifiedKFold):  [0.58280724 0.59586451 0.57539648 0.57802525 0.57415612]
Mean cv score (StratifiedKFold):  0.5812499186909281


In [12]:
parameters0 = {
    'tfidfvectorizer__analyzer': ['word', 'char'],
    'tfidfvectorizer__stop_words': ['english', None],
    'tfidfvectorizer__ngram_range': [(1, 1), (2, 2), (3, 3)],
    #'tfidfvectorizer__use_idf': [True, False],
    #'tfidfvectorizer__max_df': [0.1, 0.5, 1.0],
    #'tfidfvectorizer__min_df': [1, 5, 10],    
}
parameters1 = {
    'logisticregression__C': [1, 10, 100, 1000],
    'logisticregression__penalty':['l1', 'l2'],
    'logisticregression__solver':['liblinear', 'saga'],
    #'logisticregression__dual':[True, False], # can only be true for l2 penalty with liblinear solver
}
parameters2 = {
    'logisticregression__C': [1, 10, 100, 1000],
    'logisticregression__penalty':['l2'],
    'logisticregression__solver':['lbfgs', 'newton-cg', 'sag'], # only with l2 penalty
    'logisticregression__multi_class':['ovr', 'multinomial'], # multinomial not for solver=’liblinear’
}

In [None]:
gridSearchCV(clf2, parameters0, train_data_middle_segment, train_labels_onehot)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] tfidfvectorizer__analyzer=word, tfidfvectorizer__ngram_range=(1, 1), tfidfvectorizer__stop_words=english 
[CV]  tfidfvectorizer__analyzer=word, tfidfvectorizer__ngram_range=(1, 1), tfidfvectorizer__stop_words=english, total=   0.7s
[CV] tfidfvectorizer__analyzer=word, tfidfvectorizer__ngram_range=(1, 1), tfidfvectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV]  tfidfvectorizer__analyzer=word, tfidfvectorizer__ngram_range=(1, 1), tfidfvectorizer__stop_words=english, total=   0.8s
[CV] tfidfvectorizer__analyzer=word, tfidfvectorizer__ngram_range=(1, 1), tfidfvectorizer__stop_words=english 
[CV]  tfidfvectorizer__analyzer=word, tfidfvectorizer__ngram_range=(1, 1), tfidfvectorizer__stop_words=english, total=   0.8s
[CV] tfidfvectorizer__analyzer=word, tfidfvectorizer__ngram_range=(1, 1), tfidfvectorizer__stop_words=english 
[CV]  tfidfvectorizer__analyzer=word, tfidfvectorizer__ngram_range=(1, 1), tfidfvectorizer__stop_words=english, total=   0.6s
[CV] tfidfvectorizer__analyzer=word, tfidfvectorizer__ngram_range=(1, 1), tfidfvectorizer__stop_words=english 
[CV]  tfidfvectorizer__analyzer=word, tfidfvectorizer__ngram_range=(1, 1), tfidfvectorizer__stop_words=english, total=   0.8s
[CV] tfidfvectorizer__analyzer=word, tfidfvectorizer__ngram_range=(1, 1), tfidfvectorizer__stop_words=None 
[CV]  tfidfvectorizer__analyzer=word, tfidfvectorizer__

In [7]:
###########################################################################################
# 4. USE PIPELINES
###########################################################################################
useClassifier(clf3, le, train_data_middle_segment, train_labels_onehot, test_data_middle_segment, 'predictions_clf3.txt')

relation              precision     recall    f-score    support
------------------    ---------  ---------  ---------  ---------
NO_REL                    0.612      0.705      0.629       2300
author                    0.842      0.871      0.848       2653
capital                   0.890      0.573      0.800        510
has_spouse                0.835      0.850      0.838       3019
worked_at                 0.795      0.581      0.740       1178
------------------    ---------  ---------  ---------  ---------
macro-average             0.795      0.716      0.771       9660
Average: 0.77

Cross-validation scores (StratifiedKFold):  [0.76231294 0.78004128 0.79387289 0.76638445 0.751862  ]
Mean cv score (StratifiedKFold):  0.770894711364053
printed to "test.txt"


  if diff:


In [10]:
#useClassifier(clf2, le, train_data_middle_segment, train_labels_onehot, test_data_middle_segment, 'predictions_clf2.txt')

In [13]:
print("Top features used to predict: ")
#printNMostInformative(clf, le, 3, 'dictvectorizer')
printNMostInformative(clf3, le, 3, 'tfidfvectorizer')

Top features used to predict: 
(5, 13485)

Class NO_REL best: 
(2.5744258445318517, ' an')
(2.6452725899682488, '. ')
(3.849651134579289, ' ,')

Class author best: 
(3.5879475095786724, "'")
(4.029460877872109, 'y')
(4.944191387880969, 'by')

Class capital best: 
(3.138655611011565, 'in')
(3.7073144125946293, ',,')
(4.438772077639486, ',,,')

Class has_spouse best: 
(4.751422684772724, 'ife')
(4.769287527192585, 'wif')
(4.864998654459525, ' wi')

Class worked_at best: 
(3.282106103205331, 'at')
(3.478830411723698, 'und')
(4.428550603474797, ' at')
