In [11]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

import tabulate
import numpy as np

from collections import defaultdict
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import classification_report

In [5]:
from ipynb.fs.full.Preproc_pipeline_Text import *

from ipynb.fs.full.Algo_classifier_w_textnormer_and_vectorizer import *

from ipynb.fs.full.Kfold_class import *

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading wordnet: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


In [6]:
def identity(words):
    return words

def create_pipeline(estimator, reduction=False):

    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))
        
    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)

In [7]:
estimator_ls = [LogisticRegression(multi_class='ovr',solver='saga'),MultinomialNB(), SGDClassifier()]

models = []
for model in estimator_ls:
    #models.append(create_pipeline(model, True)) ## Returns a negative vector value
    models.append(create_pipeline(model, False))

In [8]:
pic_root = '/Users/nebo333/Documents/Pickled files/'
cat_pattern = r'rec[\w]+.*' ## limiting to just rec categories. Reduces # of labels and volume
pic_corpus = TextPickledCorpusReader(pic_root, cat_pattern)
model_loader = kfold_CorpusLoader(pic_corpus)

labels = pic_corpus.categories()

In [12]:
## Show summary model performance by model

for model in models:
    scores = [] # Store a list of scores for each split
    model_name = str(model.get_params().get('classifier'))
    model_name = model_name.split('(')
    model_name = model_name[0]

    for X_train, X_test, y_train, y_test in model_loader:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score  = accuracy_score(y_test, y_pred)
        scores.append(score)

    print("Accuracy of {} is {:0.3f}".format(model_name, np.mean(scores)))

Accuracy of LogisticRegression is 0.978
Accuracy of MultinomialNB is 0.980
Accuracy of SGDClassifier is 0.985


In [13]:
## Show overall accuracy by category / label

model = create_pipeline(LogisticRegression(multi_class='ovr',solver='saga'), False)
for X_train, X_test, y_train, y_test in model_loader:
    model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, labels=labels))

                  precision    recall  f1-score   support

        recautos       0.91      0.95      0.93        62
  recmotorcycles       0.95      0.94      0.94        93
recsportbaseball       0.99      0.99      0.99        85
  recsporthockey       1.00      0.98      0.99        91

        accuracy                           0.96       331
       macro avg       0.96      0.96      0.96       331
    weighted avg       0.96      0.96      0.96       331



In [14]:
## Show detailed model performance by model

fields = ['model', 'precision', 'recall', 'accuracy', 'f1']
table  = []

for model in models:
    scores = defaultdict(list) # storage for all our model metrics

    # k-fold cross-validation
    for X_train, X_test, y_train, y_test in model_loader:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Add scores to our scores
        scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
        scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
        scores['accuracy'].append(accuracy_score(y_test, y_pred))
        scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

    # Aggregate our scores and add to the table.
    model_name = str(model.get_params().get('classifier'))
    model_name = model_name.split('(')
    model_name = model_name[0]
    row = [str(model_name)]
    for field in fields[1:]:
        row.append(np.mean(scores[field]))

    table.append(row)

# Sort the models by F1 score descending
table.sort(key=lambda row: row[-1], reverse=True)
print(tabulate.tabulate(table, headers=fields))

model                 precision    recall    accuracy        f1
------------------  -----------  --------  ----------  --------
SGDClassifier          0.983238  0.983158    0.983158  0.983139
MultinomialNB          0.982132  0.9819      0.9819    0.9819
LogisticRegression     0.977728  0.977366    0.977366  0.977409
