# Preprocessing

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, roc_curve, roc_auc_score, plot_roc_curve
import numpy as np
import pandas as pd

import sys
sys.path.insert(1, '../../libs')
from utils import get_data

In [2]:
data = get_data("../../data/authors.csv")
# Restrict the data to two authors: the usernames at positions 4 and 5 of
# the distinct-username list.
data = data.loc[data["username"].isin(data["username"].unique()[4:6])]

In [3]:
class SparseToArray():
    """
    Stateless pipeline step that densifies its input by calling ``X.toarray()``.

    Placed between a step that emits a sparse matrix and a step that
    requires dense data.

    https://stackoverflow.com/questions/28384680/scikit-learns-pipeline-a-sparse-matrix-was-passed-but-dense-data-is-required
    """

    def __repr__(self):
        return "SparseToArray()"

    def __str__(self):
        # Same text as the repr; it doubles as the pipeline step name.
        return repr(self)

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; hand back the instance itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return the dense form of ``X`` as produced by ``X.toarray()``."""
        dense = X.toarray()
        return dense

In [6]:
class AuthorClassifier:
    """Assemble a vectorizer, optional scaler/PCA and a classifier into one
    scikit-learn ``Pipeline`` and report evaluation metrics for it.

    The per-author metrics in :meth:`evaluate` use scikit-learn's default
    binary averaging with each label as ``pos_label``, and the AUC uses a
    single score column, so the class targets two-author problems.

    Attributes:
        vectorizer, clf, scaler, pca: the estimators given to ``__init__``.
        pipe: the fitted ``Pipeline``, or ``None`` before :meth:`fit`.
        auc_score: last computed ROC AUC rounded to 4 decimals, ``-1`` if
            none has been computed yet.
    """

    def __init__(self,
                 vectorizer=None,
                 clf=None,
                 scaler=None,
                 pca=None):
        """Store the pipeline components.

        Args:
            vectorizer: text vectorizer; a new ``CountVectorizer()`` if None.
            clf: final classifier; a new ``MultinomialNB()`` if None.
            scaler: optional scaler applied to the densified features.
            pca: optional decomposition step applied after the scaler.
        """
        # Defaults are created per instance. Estimator objects placed in the
        # signature would be built once and shared (and refitted) by every
        # AuthorClassifier that relies on the defaults.
        self.vectorizer = CountVectorizer() if vectorizer is None else vectorizer
        self.clf = MultinomialNB() if clf is None else clf
        self.scaler = scaler
        self.pca = pca
        self.pipe = None
        self.auc_score = -1
        # Scores from the latest predict() call that are still waiting for
        # ground-truth labels so the AUC can be computed in evaluate().
        self._pending_scores = None

    def fit(self, X_train, y_train):
        """Build the pipeline from the configured steps, fit it and return it.

        Each step is named after ``str(estimator)``. A ``SparseToArray`` step
        is inserted after the vectorizer whenever a scaler or PCA follows.
        """
        steps = [(str(self.vectorizer), self.vectorizer)]

        if self.scaler or self.pca:
            steps.append(("SparseToArray()", SparseToArray()))
        if self.scaler:
            steps.append((str(self.scaler), self.scaler))
        if self.pca:
            steps.append((str(self.pca), self.pca))

        steps.append((str(self.clf), self.clf))

        pipe = Pipeline(steps)
        pipe.fit(X_train, y_train)
        self.pipe = pipe
        # Scores cached from a previously fitted pipeline are now stale.
        self._pending_scores = None
        return pipe

    def _positive_class_scores(self, X):
        """Return one ranking score per sample for the ROC AUC computation."""
        if hasattr(self.pipe, "decision_function"):
            return self.pipe.decision_function(X)
        # Classifiers such as MultinomialNB expose no decision_function;
        # use the probability of the second class (classes_[1]) instead.
        return self.pipe.predict_proba(X)[:, 1]

    def predict(self, X_test, store_auc=True, y_true=None):
        """Predict labels for ``X_test``.

        Args:
            X_test: samples to classify.
            store_auc: when true, also compute classifier scores for the AUC.
            y_true: optional ground-truth labels for ``X_test``. When given,
                ``auc_score`` is updated immediately. When omitted, the scores
                are cached and the AUC is computed by the next
                :meth:`evaluate` call that receives labels of equal length.

        Returns:
            The predicted labels from the fitted pipeline.
        """
        y_pred = self.pipe.predict(X_test)
        self._pending_scores = None
        if store_auc:
            scores = self._positive_class_scores(X_test)
            if y_true is not None:
                self.auc_score = round(roc_auc_score(y_true, scores), 4)
            else:
                # No labels here, and the method must not reach for a global
                # such as y_test; defer the AUC until evaluate().
                self._pending_scores = scores
        return y_pred

    def evaluate(self, y_true, y_pred):
        """Return a dict of metrics (rounded to 4 decimals) for the predictions.

        Keys, in order: ``author{i}`` for each label, then per-author
        precision/recall/f1, then weighted/micro/macro precision, recall and
        f1, then ``auc_score`` and ``accuracy``.
        """
        pending = self._pending_scores
        if pending is not None and len(pending) == len(y_true):
            # Finish the AUC deferred by predict() now that labels are known.
            self.auc_score = round(roc_auc_score(y_true, pending), 4)
            self._pending_scores = None

        authors = np.unique(y_true)
        metrics = dict()
        # Two passes keep all author names ahead of the per-author scores.
        for i, author in enumerate(authors, start=1):
            metrics[f"author{i}"] = author
        for i, author in enumerate(authors, start=1):
            metrics[f"precision_author{i}"] = round(precision_score(y_true, y_pred, pos_label=author), 4)
            metrics[f"recall_author{i}"] = round(recall_score(y_true, y_pred, pos_label=author), 4)
            metrics[f"f1_score_author{i}"] = round(f1_score(y_true, y_pred, pos_label=author), 4)

        scorers = (("precision", precision_score),
                   ("recall", recall_score),
                   ("f1", f1_score))
        for prefix, scorer in scorers:
            for average in ("weighted", "micro", "macro"):
                metrics[f"{prefix}_{average}"] = round(scorer(y_true, y_pred, average=average), 4)

        metrics["auc_score"] = self.auc_score
        metrics["accuracy"] = round(accuracy_score(y_true, y_pred), 4)

        return metrics


In [7]:
# Hold out a third of the comments for testing; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data["comment"],
    data["username"],
    test_size=0.33,
    random_state=42,
)

# Train with logistic regression as the final step, then score the
# held-out comments.
clf = AuthorClassifier(clf=LogisticRegression())
pipe = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf.evaluate(y_test, y_pred)

{'author1': 'MaxVonHabsburg',
 'author2': 'TheGza1',
 'precision_author1': 0.9197,
 'recall_author1': 0.8155,
 'f1_score_author1': 0.8645,
 'precision_author2': 0.8371,
 'recall_author2': 0.9302,
 'f1_score_author2': 0.8812,
 'precision_weighted': 0.878,
 'precision_micro': 0.8734,
 'precision_macro': 0.8784,
 'recall_weighted': 0.8734,
 'recall_micro': 0.8734,
 'recall_macro': 0.8728,
 'f1_weighted': 0.8729,
 'f1_micro': 0.8734,
 'f1_macro': 0.8728,
 'auc_score': 0.9432,
 'accuracy': 0.8734}