In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# Directory holding the pickled test data and fitted models loaded below
# (relative to the current working directory).
path = './learning_data/'

# Load Test Data

In [2]:
def _load_test_artifact(filename):
    """Unpickle and return the object stored in ``path + filename``."""
    with open(path + filename, 'rb') as handle:
        return pickle.load(handle)


# Test inputs, test labels, and the topic names used to label the report rows.
X_test_raw = _load_test_artifact('X_test_raw.pkl')
y_test = _load_test_artifact('y_test.pkl')
topics = _load_test_artifact('topics.pkl')

# Load Models

In [3]:
# NOTE: pickle.load can execute arbitrary code embedded in the file being
# read; only point `path` at artifacts you trust.

# LR: vectorizer and classifier used for the logistic-regression results.
# Every artifact is resolved through `path` so that changing the data
# directory in one place moves all of the loads together.

with open(path+'tfidf_vectorizer.pkl','rb') as f:
    tfidf_vectorizer_lr = pickle.load(f)
with open(path+'ovr_clf.pkl','rb') as f:
    ovr_clf = pickle.load(f)

# MNB: vectorizer, transformer and classifier used for the naive-Bayes results.

with open(path+'tfidf_vectorizer_mnb.pkl','rb') as f:
    tfidf_vectorizer_mnb = pickle.load(f)
with open(path+'tfidf_transformer_mnb.pkl','rb') as f:
    tfidf_transformer_mnb = pickle.load(f)
with open(path+'mnb_clf.pkl','rb') as f:
    mnb_clf = pickle.load(f)

# Helper function to get the top-k results

In [4]:
def get_top_k(probs, k):
    """Flag the k highest-scoring columns of every row with 1.0.

    Args:
        probs: 2-D NumPy array of scores, one row per sample.
        k: Number of columns to flag in each row. A value larger than the
            number of columns flags every column; 0 flags none.

    Returns:
        An array with the same shape and dtype as ``probs`` that holds 1.0
        at the top-k positions of each row and 0 everywhere else.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError('k must be non-negative, got {}'.format(k))

    results = np.zeros_like(probs)
    if k == 0:
        # Explicit guard: the slice ``[-0:]`` is ``[0:]`` and would select
        # every column instead of none.
        return results

    for i in range(probs.shape[0]):
        # argsort is ascending, so the last k indices are the k largest scores.
        top_indexes = probs[i, :].argsort()[-k:]
        results[i, top_indexes] = 1.0
    return results

# Results

In [5]:
k = 10  # number of highest-probability topics kept per sample

banner = '\n' + '*' * 25


def _show_report(heading, top_k_labels):
    """Print the banner with `heading`, then the per-topic report against y_test."""
    print(banner + heading)
    report = classification_report(
        y_test,
        top_k_labels,
        target_names=topics,
        digits=4,
        zero_division=0,
    )
    print(report)


# LR: vectorize the `input` column, score every topic, keep the top-k per row.
X_test_lr = tfidf_vectorizer_lr.transform(X_test_raw.input.values)
lr_predictions = ovr_clf.predict_proba(X_test_lr)
lr_results = get_top_k(lr_predictions, k)
_show_report(' LR Results:', lr_results)

# MNB: the `readme_processed` column goes through the vectorizer and then the
# transformer before being scored and reduced to the top-k per row.
mnb_vectorized = tfidf_vectorizer_mnb.transform(X_test_raw.readme_processed.values)
X_test_mnb = tfidf_transformer_mnb.transform(mnb_vectorized)
mnb_predictions = mnb_clf.predict_proba(X_test_mnb)
mnb_results = get_top_k(mnb_predictions, k)
_show_report(' MNB Results:', mnb_results)


************************* LR Results:
                        precision    recall  f1-score   support

                    3d     0.1641    0.9036    0.2778       498
               actions     0.1458    0.8587    0.2492        92
                    ai     0.1122    0.8315    0.1977       273
                  ajax     0.0969    0.5814    0.1661        43
             algorithm     0.1514    0.8941    0.2589       614
               android     0.3720    0.8798    0.5229      2313
               angular     0.6447    0.9102    0.7547       590
               ansible     0.7107    0.9718    0.8210       177
                 antlr     0.0899    0.8366    0.1623       404
                   api     0.1750    0.8672    0.2912      1980
             archlinux     0.2089    0.8684    0.3367        38
               arduino     0.5519    0.9732    0.7044       224
                aspnet     0.3763    0.8926    0.5294       121
                  atom     0.4532    0.9394    0.6114       165
