In [1]:
import pandas as pd
import numpy as np

from scipy.stats import mode

from collections import deque

import utils as u

import pickle
import time

import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

from scipy.stats import entropy

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.pipeline import FeatureUnion
from imblearn.pipeline import Pipeline

# vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer

# feature selectors
from sklearn.feature_selection import SelectKBest, chi2

# scalers
from sklearn.preprocessing import MinMaxScaler

# classifiers
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression

# samplers
from imblearn.under_sampling import TomekLinks

# calibration
from sklearn.calibration import CalibratedClassifierCV

In [2]:
# Load the base DataFrame that every later cell filters and extends.
# Presumably the merged, lemmatised corpus (going by the file name) — TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced by this project.
df = pd.read_pickle('../pandas/lemma_delivered_merged_df.pkl')

In [3]:
# Attach three precomputed columns to `df`, each read from its own pickle:
#   'ft'       -> consumed downstream as the embedding column (see `fasttext = 'ft'`)
#   'is_nn'    -> boolean-like flag used to filter rows out of both corpora
#   'is_trash' -> boolean-like flag used to filter rows out of the unlabeled corpus
# NOTE(review): if the pickles hold Series, assignment aligns on the index and silently
# yields NaN for unmatched rows — TODO confirm the indexes match `df`.
# Alternative embedding file (labeled-vocab variant, per its file name), kept for reference:
#df['ft'] = pd.read_pickle('../pandas/FT_TFIDF_lemma_labeled_vocab.pkl')
df['ft'] = pd.read_pickle('../pandas/FT_TFIDF_lemma_full_vocab.pkl')
df['is_nn'] = pd.read_pickle('../pandas/is_nn_full.pkl')
df['is_trash'] = pd.read_pickle('../pandas/is_trash_full.pkl')

In [4]:
# Column names used by the split and by the model specifications below.
target = 'agg_label'
text = 'lemma_delivered'
fasttext = 'ft'
numeric = ['raw_len', 'raw_word_count']

# Labeled rows: agg_label is anything but -1 and the row is not flagged is_nn.
labeled_corpus = df.loc[df['agg_label'].ne(-1) & df['is_nn'].eq(False)]

# Unlabeled rows: agg_label is -1 and the row is flagged neither is_nn nor is_trash.
unlabeled_corpus = df.loc[
    df['agg_label'].eq(-1)
    & df['is_nn'].eq(False)
    & df['is_trash'].eq(False)
]

In [5]:
# Step 1: stratified 60/40 split of the labeled corpus. The X side keeps the
# whole DataFrame (all columns); the y side is the target column only.
train_X, vali_X, train_y, vali_y = train_test_split(
    labeled_corpus, labeled_corpus[target],
    stratify=labeled_corpus[target],
    test_size=0.4, random_state=1)

# Step 2: halve the 40% hold-out, again stratified, giving a test part and a
# validation part of 20% of the labeled corpus each.
test_X, validation_X, test_y, validation_y = train_test_split(
    vali_X, vali_y,
    stratify=vali_y,
    test_size=0.5, random_state=1)

In [6]:
# models
clf_LinearSVC = LinearSVC(**{'C': 0.5, 'loss': 'squared_hinge', 'tol': 0.01})
baseline_model = {
    'title': 'baseline_model',
    'features': ('Text+FT', {'text': text, 'ft': fasttext}),
    'vectorizer': ('TfidfVectorizer', TfidfVectorizer, {'lowercase': False, 'max_df': 0.25}),
    'scaler': ('MinMaxScaler', MinMaxScaler, {'feature_range': (0,1)}),
    'sampler': ('TomekLinks', TomekLinks, {'random_state': 1, 'sampling_strategy': 'majority', 'n_jobs': -1}),
    #'sampler': ('No_sampling', u.PassThrough, {}),
    'classifier': ('CalibratedClassifierCV_LinearSVC', CalibratedClassifierCV, {'base_estimator': clf_LinearSVC, 'method': 'isotonic', 'cv': 3}),
    #'classifier': ('LinearSVC', LinearSVC, {'C': 0.5, 'loss': 'squared_hinge', 'tol': 0.01}),
}

quick_svm_model = {
    'title': 'quick_svm_model',
    'features': ('Text+FT', {'text': text}),
    'vectorizer': ('TfidfVectorizer', TfidfVectorizer, {'lowercase': False, 'max_df': 0.25}),
    'scaler': ('No_scaling', u.PassThrough, {}),
    'sampler': ('No_sampling', u.PassThrough, {}),
    'classifier': ('LinearSVC', LinearSVC, {'C': 0.5, 'loss': 'squared_hinge', 'tol': 0.01}),
}

clf_ComplementNB = ComplementNB(**{'alpha': 0.2})
quick_model = {
    'title': 'quick_model',
    'features': ('TextOnly', {'text': text}),
    'vectorizer': ('TfidfVectorizer', TfidfVectorizer, {'lowercase': False, 'max_df': 0.25}),
    'scaler': ('No_scaling', u.PassThrough, {}),
    'sampler': ('No_sampling', u.PassThrough, {}),
    'classifier': ('CalibratedClassifierCV_ComplementNB', CalibratedClassifierCV, {'base_estimator': clf_ComplementNB, 'method': 'isotonic', 'cv': 3}),
    #'classifier': ('CalibratedClassifierCV_ComplementNB', CalibratedClassifierCV, {'base_estimator': clf_ComplementNB, 'cv': 3}),
}

clf_LogisticRegression = LogisticRegression(**{'solver': 'saga', 'tol': 0.0001, 'C': 8, 'n_jobs': -1, 'random_state': 1})
log_model = {
    'title': 'log_model',
    'features': ('TextOnly', {'text': text}),
    'vectorizer': ('TfidfVectorizer', TfidfVectorizer, {'lowercase': False, 'max_df': 0.25}),
    'scaler': ('No_scaling', u.PassThrough, {}),
    'sampler': ('No_sampling', u.PassThrough, {}),
    'classifier': ('CalibratedClassifierCV_LogisticRegression', CalibratedClassifierCV, {'base_estimator': clf_LogisticRegression, 'method': 'isotonic', 'cv': 3}),
}

wordembed_model = {
    'title': 'wordembed_model',
    'features': ('FT', {'ft': fasttext}),
    'vectorizer': ('TfidfVectorizer', u.PassThrough, {}),
    'scaler': ('MinMaxScaler', MinMaxScaler, {'feature_range': (0,1)}),
    #'sampler': ('TomekLinks', TomekLinks, {'random_state': 1, 'sampling_strategy': 'majority', 'n_jobs': -1}),
    'sampler': ('No_sampling', u.PassThrough, {}),
    'classifier': ('CalibratedClassifierCV_LinearSVC', CalibratedClassifierCV, {'base_estimator': clf_LinearSVC, 'method': 'isotonic', 'cv': 3}),
    #'classifier': ('LinearSVC', LinearSVC, {'C': 0.5, 'loss': 'squared_hinge', 'tol': 0.01}),
}

In [None]:
class FastTextSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns one DataFrame column into a stacked array.

    Parameters
    ----------
    col : hashable
        Label of the column to pull out of the frame passed to ``transform``.
    """

    def __init__(self, col):
        self.col = col

    def fit(self, df, y=None):
        """Learn nothing; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, df):
        """Return ``np.stack`` applied to the values of column ``self.col``.

        Assumes every cell holds an equally shaped vector, so the result has
        one row per input row — TODO confirm against the embedding pickle.
        """
        column_values = df[self.col].to_numpy()
        return np.stack(column_values)

def modeller(data, model):
    """Build, fit and evaluate the pipeline described by a model specification.

    Parameters
    ----------
    data : mapping
        Must contain 'train_X', 'train_y' and 'validation_X'. When
        'validation_y' is also present, evaluation metrics are computed.
    model : dict
        Specification with the keys 'features', 'vectorizer', 'scaler',
        'sampler' and 'classifier'. 'features' is ``(label, columns)`` where
        ``columns`` may hold a 'text' and/or an 'ft' column name; the other
        entries are ``(label, factory, kwargs)`` tuples.

    Returns
    -------
    preds
        Predictions for ``data['validation_X']``.
    probs
        Output of ``predict_proba`` for ``data['validation_X']``, or ``None``
        when the final classifier has no ``predict_proba``.
    metrics : dict
        Empty unless 'validation_y' is in ``data``; otherwise holds 'acc',
        'prec', 'reca' and 'fsco' (the last three macro-averaged).
    classes
        The fitted classifier's ``classes_`` attribute.

    Raises
    ------
    ValueError
        If the specification selects neither a 'text' nor an 'ft' feature.
    """
    feature_columns = model['features'][1]
    features = []

    if 'text' in feature_columns:
        vectorizer_factory, vectorizer_kwargs = model['vectorizer'][1], model['vectorizer'][2]
        features.append(
            ('text', Pipeline([
                # presumably selects the text column from the frame — confirm in utils
                ('article', u.ColumnSelector(feature_columns['text'])),
                ('vectorizer', vectorizer_factory(**vectorizer_kwargs)),
            ]))
        )

    if 'ft' in feature_columns:
        scaler_factory, scaler_kwargs = model['scaler'][1], model['scaler'][2]
        features.append(
            ('embeddings', Pipeline([
                ('ft', FastTextSelector(feature_columns['ft'])),
                ('scaler', scaler_factory(**scaler_kwargs)),
            ]))
        )

    if not features:
        # Fail early with a clear message instead of an opaque error from FeatureUnion.
        raise ValueError(
            "model['features'] must select at least one of 'text' or 'ft'; got keys: %r"
            % (sorted(feature_columns),)
        )

    # A separate name keeps the `model` specification intact instead of
    # rebinding the parameter to the fitted pipeline.
    pipeline = Pipeline([
        ('features', FeatureUnion(features)),
        ('sampler', model['sampler'][1](**model['sampler'][2])),
        ('classifier', model['classifier'][1](**model['classifier'][2]))
    ])

    pipeline.fit(data['train_X'], data['train_y'])
    preds = pipeline.predict(data['validation_X'])

    classifier = pipeline.named_steps['classifier']
    # Some classifiers (e.g. the bare LinearSVC in `quick_svm_model`) have no
    # predict_proba; return None for them instead of raising.
    if hasattr(classifier, 'predict_proba'):
        probs = pipeline.predict_proba(data['validation_X'])
    else:
        probs = None

    metrics = {}

    if 'validation_y' in data:
        metrics['acc'] = accuracy_score(data['validation_y'], preds)
        metrics['prec'], metrics['reca'], metrics['fsco'], _ = precision_recall_fscore_support(
            data['validation_y'], preds, average='macro')

    return preds, probs, metrics, classifier.classes_