In [1]:
import pandas as pd
import numpy as np

import utils as u

import pickle
import time

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.pipeline import FeatureUnion
from imblearn.pipeline import Pipeline

# vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer

# feature selectors
from sklearn.feature_selection import SelectKBest, chi2

# scalers
from sklearn.preprocessing import MinMaxScaler

# classifiers
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import ComplementNB

# samplers
from imblearn.under_sampling import TomekLinks

# calibration
from sklearn.calibration import CalibratedClassifierCV

In [2]:
# Main DataFrame used throughout the notebook.
# NOTE: unpickling can execute arbitrary code, so this file must be trusted.
corpus_pickle_path = '../pandas/lemma_delivered_merged_df.pkl'
df = pd.read_pickle(corpus_pickle_path)

In [3]:
# Extra columns stored as separate pickles, attached to `df` in this order.
# (Alternative source for 'ft': '../pandas/FT_TFIDF_lemma_labeled_vocab.pkl'.)
precomputed_columns = {
    'ft': '../pandas/FT_TFIDF_lemma_full_vocab.pkl',
    'is_nn': '../pandas/is_nn_full.pkl',
}
for column_name, pickle_path in precomputed_columns.items():
    df[column_name] = pd.read_pickle(pickle_path)

In [4]:
# Row masks: rows flagged `is_nn` are excluded from both corpora; an
# `agg_label` of -1 marks a row as unlabeled.
not_nn_mask = df['is_nn'] == False
has_label_mask = df['agg_label'] != -1
no_label_mask = df['agg_label'] == -1

labeled_corpus = df[has_label_mask & not_nn_mask]
unlabeled_corpus = df[no_label_mask & not_nn_mask]

# Column names used when building features and targets.
target = 'agg_label'
text = 'lemma_delivered'
fasttext = 'ft'
numeric = ['raw_len', 'raw_word_count']

In [5]:
# Two-stage stratified split of the labeled rows: 60% train, then the remaining
# 40% holdout is halved into test and validation (20% each).
# The *_X frames are full rows of `labeled_corpus`, target column included.
labeled_targets = labeled_corpus[target]
train_X, vali_X, train_y, vali_y = train_test_split(
    labeled_corpus, labeled_targets,
    test_size=0.4, random_state=1, stratify=labeled_targets)

# vali_X / vali_y is the 40% holdout produced by the first split.
test_X, validation_X, test_y, validation_y = train_test_split(
    vali_X, vali_y,
    test_size=0.5, random_state=1, stratify=vali_y)

In [6]:
class FastTextSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that stacks one DataFrame column into an array.

    The column named by ``col`` is expected to hold one array-like per row
    (presumably equal-length embedding vectors - ``np.stack`` requires equal
    shapes).
    """

    def __init__(self, col):
        # Name of the DataFrame column to read in `transform`.
        self.col = col

    def fit(self, df, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, df):
        """Return ``np.stack`` applied to the values of ``df[self.col]``."""
        per_row_values = df[self.col].to_numpy()
        return np.stack(per_row_values)

def modeller(data, feats, vectorizer, scaler, selector, sampler, classifier):
    """Build the feature -> sampler -> classifier pipeline, fit it and predict.

    Parameters
    ----------
    data : dict
        Must contain 'train_X', 'train_y' and 'validation_X'. When
        'validation_y' is also present, metrics are computed against it.
    feats : dict
        Selects the feature branches to build. Recognised keys are 'text',
        'numeric' and 'ft'; each value is passed to that branch's column
        selector.
    vectorizer, scaler, selector, sampler, classifier : dict
        Each holds a class (under 'vec', 'sca', 'sel', 'smpl', 'clf'
        respectively) and its constructor kwargs under 'params'.
        ``selector`` is accepted for interface compatibility but is not
        wired into the pipeline at the moment.

    Returns
    -------
    preds : predictions for ``data['validation_X']``.
    probs : class probabilities for the same rows, or ``None`` when the
        fitted pipeline does not expose ``predict_proba`` (e.g. a plain
        ``LinearSVC`` as the final step).
    metrics : dict with 'acc', 'prec', 'reca', 'fsco' (macro-averaged) when
        'validation_y' was supplied, otherwise empty.

    Raises
    ------
    ValueError
        If ``feats`` contains none of the recognised keys.
    """

    def new_scaler():
        # Each branch gets its own scaler instance so they are fitted independently.
        return scaler['sca'](**scaler['params'])

    features = []

    if 'text' in feats:
        features.append(
            ('text', Pipeline([
                ('article', u.ColumnSelector(feats['text'])),
                ('vectorizer', vectorizer['vec'](**vectorizer['params'])),
            ]))
        )

    if 'numeric' in feats:
        features.append(
            ('numerical', Pipeline([
                ('numeric', u.ColumnSelector(feats['numeric'])),
                ('scaler', new_scaler()),
            ]))
        )

    if 'ft' in feats:
        features.append(
            ('embeddings', Pipeline([
                ('ft', FastTextSelector(feats['ft'])),
                ('scaler', new_scaler()),
            ]))
        )

    if not features:
        # Fail early with a readable message instead of an empty FeatureUnion.
        raise ValueError(
            "feats must contain at least one of 'text', 'numeric' or 'ft'; "
            "got keys: {}".format(sorted(feats)))

    model = Pipeline([
        ('features', FeatureUnion(features)),
        ('sampler', sampler['smpl'](**sampler['params'])),
        ('classifier', classifier['clf'](**classifier['params'])),
    ])

    model.fit(data['train_X'], data['train_y'])

    scoring_X = data['validation_X']
    preds = model.predict(scoring_X)
    # Not every classifier offers probabilities; only ask when the pipeline
    # exposes the method rather than crashing after an expensive fit.
    probs = model.predict_proba(scoring_X) if hasattr(model, 'predict_proba') else None

    metrics = {}

    if 'validation_y' in data:
        metrics['acc'] = accuracy_score(data['validation_y'], preds)
        metrics['prec'], metrics['reca'], metrics['fsco'], _ = precision_recall_fscore_support(
            data['validation_y'], preds, average='macro')

    return preds, probs, metrics

In [7]:
# --- experiment configuration ---
# `feats` holds (display name, feature-key -> column mapping) tuples; every
# other list holds (display name, class, [constructor kwargs, ...]) tuples.
# Alternatives that were tried are kept as comments next to the active entry.

# feature combinations
feats = [
    # alternatives:
    #   ('TextOnly',  {'text': text})
    #   ('FTOnly',    {'ft': fasttext})
    #   ('Text+DLWC', {'text': text, 'numeric': numeric})
    #   ('All',       {'text': text, 'numeric': numeric, 'ft': fasttext})
    ('Text+FT', dict(text=text, ft=fasttext)),
]

# vectorizers
vectorizers = [
    # alternative: ('No_vectorizer', u.PassThrough, [{}])
    ('TfidfVectorizer', TfidfVectorizer, [dict(lowercase=False, max_df=0.25)]),
]

# feature selectors
selectors = [
    # alternative: ('SelectKBest', SelectKBest, [{'score_func': chi2, 'k': 40000}])
    ('No_selector', u.PassThrough, [dict()]),
]

# scalers
scalers = [
    # alternative: ('No_scaling', u.PassThrough, [{}])
    ('MinMaxScaler', MinMaxScaler, [dict(feature_range=(0, 1))]),
]

# classifiers
# Base estimator wrapped by the calibrated classifier below.
clf = LinearSVC(C=0.5, loss='squared_hinge', tol=0.01)

classifiers = [
    # alternative: ('LinearSVC', LinearSVC, [{'C': 0.5, 'loss': 'squared_hinge', 'tol': 0.01}])
    ('CalibratedClassifierCV_LinearSVC', CalibratedClassifierCV,
     [dict(base_estimator=clf, method='isotonic', cv=3)]),
]

# samplers
samplers = [
    # alternative: ('No_sampling', u.PassThrough, [{}])
    ('TomekLinks', TomekLinks,
     [dict(random_state=1, sampling_strategy='majority', n_jobs=-1)]),
]


def model_caller(data):
    """Run `modeller` on `data` using the first entry of every config list.

    Prints a dict naming the selected components and returns modeller's
    ``(preds, probs, metrics)`` unchanged.
    """
    # (modeller keyword, key modeller reads the class from, configuration list)
    component_specs = (
        ('vectorizer', 'vec', vectorizers),
        ('scaler', 'sca', scalers),
        ('selector', 'sel', selectors),
        ('sampler', 'smpl', samplers),
        ('classifier', 'clf', classifiers),
    )

    chosen_feats = feats[0]
    modeller_kwargs = {'data': data, 'feats': chosen_feats[1]}
    model_title = {'features': chosen_feats[0]}

    for keyword, class_key, options in component_specs:
        chosen = options[0]
        # chosen = (display name, class, [kwargs, ...]); use the first kwargs dict
        modeller_kwargs[keyword] = {class_key: chosen[1], 'params': chosen[2][0]}
        model_title[keyword] = chosen[0]

    preds, probs, metrics = modeller(**modeller_kwargs)

    print(model_title)
    return preds, probs, metrics

In [18]:
# Binary "trash" target: 1 when the aggregate label is 91, 92 or 93, else 0.
trash_labels = [91, 92, 93]


def _as_trash_flags(labels):
    """Return a list with 1 for every label found in `trash_labels`, else 0."""
    return [int(label in trash_labels) for label in labels]


train_trash_y = _as_trash_flags(train_y)
validation_trash_y = _as_trash_flags(validation_y)
test_trash_y = _as_trash_flags(test_y)

In [24]:
# Fit on the training split with the binary trash target and predict for every
# row of `df`; modeller takes the rows to predict from the 'validation_X' key.
data_in = dict(
    train_X=train_X,
    train_y=train_trash_y,
    validation_X=df,
)

# No 'validation_y' is supplied, so the third return value (metrics) is discarded.
preds, probs, _ = model_caller(data_in)

{'features': 'Text+FT', 'vectorizer': 'TfidfVectorizer', 'scaler': 'MinMaxScaler', 'selector': 'No_selector', 'sampler': 'TomekLinks', 'classifier': 'CalibratedClassifierCV_LinearSVC'}


In [22]:
# `preds` holds one prediction per row of `df` (the model scored the whole
# frame), while `test_trash_y` only covers the test split. Pick out the test
# rows' positions in `df` before comparing, otherwise the lengths mismatch.
# get_indexer needs a unique `df` index and returns -1 for unknown labels.
test_positions = df.index.get_indexer(test_X.index)
assert (test_positions >= 0).all(), "some test rows were not found in df"
np.mean(np.asarray(preds)[test_positions] == np.asarray(test_trash_y))

0.9003219747786423

In [36]:
# Boolean flag per row of `df`: True where the model predicted class 1 (trash).
df['is_trash'] = np.asarray(preds) == 1

In [38]:
# Persist the flag column on its own (a pickled Series carrying `df`'s index).
is_trash_pickle_path = '../pandas/is_trash_full.pkl'
df['is_trash'].to_pickle(is_trash_pickle_path)

In [51]:
# Labeled rows whose aggregate label is NOT 91/92/93 but that were flagged as trash.
labeled_rows = df.loc[labeled_corpus.index]
label_is_not_trash = ~labeled_rows['agg_label'].isin([91, 92, 93])
flagged_as_trash = labeled_rows['is_trash'] == True
labeled_rows[label_is_not_trash & flagged_as_trash]['agg_label'].value_counts().sum()

793

In [35]:
# Number of labeled rows whose aggregate label is 91, 92 or 93.
df.loc[labeled_corpus.index, 'agg_label'].isin([91, 92, 93]).sum()

7623

In [44]:
# Number of unlabeled rows that the model flagged as trash.
df.loc[unlabeled_corpus.index, 'is_trash'].sum()

764869