# Preprocessing

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import seaborn as sns

import sys
sys.path.insert(1, '../../libs')
from utils import get_data

In [2]:
# Load the comment corpus, then restrict it to the first two distinct
# usernames reported by `unique()` so the task is a two-author problem.
data = get_data("../../data/authors.csv")
selected_authors = data.username.unique()[:2]
data = data[data.username.isin(selected_authors)]

In [3]:
class SparseToArray():
    """
    Pipeline step that converts a sparse matrix to a dense array by calling
    its ``toarray()`` method.

    Input without a ``toarray`` method is treated as already dense and is
    returned unchanged instead of raising ``AttributeError``.

    https://stackoverflow.com/questions/28384680/scikit-learns-pipeline-a-sparse-matrix-was-passed-but-dense-data-is-required
    """

    def __repr__(self):
        return "SparseToArray()"

    # str() and repr() deliberately render the same text.
    __str__ = __repr__

    def fit(self, X, y=None, **fit_params):
        """Learn nothing (the step is stateless) and return ``self``."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X.toarray()`` when ``X`` provides it, otherwise ``X`` itself."""
        to_dense = getattr(X, "toarray", None)
        if callable(to_dense):
            return to_dense()
        # No ``toarray``: nothing to densify, hand the input back untouched.
        return X

In [4]:
class AuthorClassifier:
    """
    Author classifier that chains a vectorizer, an optional scaler, an
    optional PCA step and a final classifier into a scikit-learn ``Pipeline``.

    Parameters
    ----------
    vectorizer : estimator, optional
        First pipeline step. A new ``CountVectorizer()`` is created for each
        instance when omitted.
    clf : estimator, optional
        Final pipeline step. A new ``MultinomialNB()`` is created for each
        instance when omitted.
    scaler : estimator, optional
        Added after the vectorizer when given.
    pca : estimator, optional
        Added after the scaler (if any) when given.

    Attributes
    ----------
    pipe : Pipeline or None
        The fitted pipeline; ``None`` until ``fit`` has succeeded.
    """

    def __init__(self,
                 vectorizer=None,
                 clf=None,
                 scaler=None,
                 pca=None):
        # Defaults are built here, not in the signature: a default evaluated
        # once at definition time would be one object shared (and re-fitted)
        # by every AuthorClassifier instance.
        self.vectorizer = CountVectorizer() if vectorizer is None else vectorizer
        self.clf = MultinomialNB() if clf is None else clf
        self.scaler = scaler
        self.pca = pca
        self.pipe = None

    def fit(self, X_train, y_train):
        """
        Build the pipeline from the configured steps and fit it.

        Each step is named after ``str()`` of its estimator. Returns the
        fitted ``Pipeline``, which is also stored on ``self.pipe``.
        """
        steps = [(str(self.vectorizer), self.vectorizer)]

        # Compare against None explicitly: truthiness of an arbitrary
        # estimator is unreliable when the object defines __len__.
        optional_steps = [
            step for step in (self.scaler, self.pca) if step is not None
        ]
        if optional_steps:
            # A densifying step is inserted once, ahead of scaler and/or PCA.
            steps.append(("SparseToArray()", SparseToArray()))
            steps.extend((str(step), step) for step in optional_steps)

        steps.append((str(self.clf), self.clf))

        pipe = Pipeline(steps)
        pipe.fit(X_train, y_train)
        # Only publish the pipeline once fitting has succeeded.
        self.pipe = pipe
        return pipe

    def predict(self, X_test):
        """
        Predict labels for ``X_test`` with the fitted pipeline.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called. This subclasses both
            ``AttributeError`` and ``ValueError``.
        """
        if self.pipe is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This AuthorClassifier instance is not fitted yet; "
                "call 'fit' before 'predict'."
            )
        return self.pipe.predict(X_test)

    def evaluate(self, y_true, y_pred):
        """Return scikit-learn's ``classification_report`` for the predictions."""
        return classification_report(y_true, y_pred)

In [5]:
# Hold out 33% of the comments for evaluation; the fixed random_state keeps
# the split reproducible across runs.
features, labels = data.comment, data.username
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.33, random_state=42
)

# Fit the default pipeline and show its composition.
clf = AuthorClassifier()
fitted_pipeline = clf.fit(X_train, y_train)
print(fitted_pipeline)

# Score the held-out comments and print the per-author report.
y_pred = clf.predict(X_test)
report = clf.evaluate(y_test, y_pred)
print(report)

Pipeline(steps=[('CountVectorizer()', CountVectorizer()),
                ('MultinomialNB()', MultinomialNB())])
              precision    recall  f1-score   support

BluePirate89       0.98      0.65      0.78       319
    Manada_2       0.74      0.99      0.85       328

    accuracy                           0.82       647
   macro avg       0.86      0.82      0.81       647
weighted avg       0.86      0.82      0.81       647

