In [1]:
import pandas as pd
import numpy as np

import utils as u

import pickle
import time
import re

import murmurhash as mhash

from IPython.display import display
%matplotlib inline

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold

# vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

# feature selectors
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

from sklearn.cluster import FeatureAgglomeration


# scalers
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

# classifiers
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

# samplers
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks

from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

# calibration
from sklearn.calibration import CalibratedClassifierCV

In [2]:
# Load the pickled corpus object that every later cell refers to as `df`.
# NOTE(review): read_pickle executes whatever the pickle contains; only load
# files produced by this project.
#df = pd.read_pickle('../pandas/lemma_delivered_merged_ft_s_dropped2_df.pkl')
df = pd.read_pickle('../pandas/lemma_delivered_merged_df.pkl')

In [3]:
# Attach the pickled object as column 'ft' (referenced later via `fasttext`).
# The commented lines are alternative pickles; exactly one should be active.
# NOTE(review): presumably one vector per row, aligned with df's index — confirm.
#df['ft'] = pd.read_pickle('../pandas/FT_TFIDF_lemma_full_vocab.pkl')
df['ft'] = pd.read_pickle('../pandas/FT_TFIDF_lemma_labeled_vocab.pkl')
#df['ft'] = pd.read_pickle('../pandas/FT_TFIDF_labeled_vocab.pkl')
#df['ft'] = pd.read_pickle('../pandas/FT_TFIDF_full_vocab.pkl')

In [4]:
# Attach the pickled 'is_nn' flag; the next cell keeps only rows where it is False.
df['is_nn'] = pd.read_pickle('../pandas/is_nn_full.pkl')

In [5]:
# Column names shared by the cells below.
target = 'agg_label'
text = 'lemma_delivered'
fasttext = 'ft'
numeric = ['raw_len', 'raw_word_count']

# Both subsets keep only rows whose 'is_nn' flag is False; they differ in
# whether 'agg_label' carries the -1 marker (unlabeled) or any other value.
# Earlier variants tried here: no 'is_nn' filter, and labels restricted to
# agg_label < 90.
labeled_corpus = df.loc[df['agg_label'].ne(-1) & df['is_nn'].eq(False)]
unlabeled_corpus = df.loc[df['agg_label'].eq(-1) & df['is_nn'].eq(False)]

In [6]:
# First split: 60% train / 40% holdout (`vali_*`), stratified on the target.
# The whole labeled frame is passed as X, so train_X / vali_X still contain
# the target column alongside the feature columns.
train_X, vali_X, train_y, vali_y = train_test_split(
    labeled_corpus,
    labeled_corpus[target],
    test_size=0.4,
    random_state=1,
    stratify=labeled_corpus[target])

# Second split: halve the 40% holdout into `test_*` and `validation_*`
# (each half of `vali_*`), again stratified and with the same seed.
# Naming gotcha: `vali_*` is the union of `test_*` and `validation_*`.
test_X, validation_X, test_y, validation_y = train_test_split(
    vali_X,
    vali_y,
    test_size=0.5,
    random_state=1,
    stratify=vali_y)

In [None]:
# Read the stopword list as an array of strings; it is only referenced by the
# commented-out 'stop_words' option of the TF-IDF grid below.
# NOTE(review): loadtxt treats '#' as a comment marker and splits on
# whitespace — confirm the file has one plain token per line.
stopwords_lemma = np.loadtxt('stopwords_lemma.txt', dtype=str)

In [11]:
def tokenizer(txt):
    """Split ``txt`` on whitespace and strip digits that touch letter runs.

    Inside each whitespace-separated word, every run of non-digit word
    characters (letters and underscore) is kept, and the digits directly
    before and after that run are removed. A word without such a run (for
    example a pure number) is returned unchanged.

    Returns a list with one cleaned string per word, in input order.
    """
    digit_padded_run = re.compile(r"\d*([^\d\W]+)\d*")
    return [digit_padded_run.sub(r"\1", word) for word in txt.split()]

In [7]:
# vectorizer params
# Each *_params name holds the fully expanded ParameterGrid (a list of keyword
# dicts) for one vectorizer class.
TfidfVectorizer_params = list(ParameterGrid({
    'strip_accents': ['ascii', 'unicode', None],
    # document-frequency ceilings 0.05, 0.15, 0.25, 0.35 (proportions)
    'max_df': [round(0.01*x, 2) for x in range(5,45,10)],
    #'tokenizer': [lambda x: x.split(), None],
    'lowercase': [True, False],
    # 10k, 20k, 30k, 40k, or unlimited vocabulary
    'max_features': [1000*x for x in range(10,50,10)] + [None],
    #'stop_words': [list(stopwords_lemma), None],
    #'norm': ['l2', None],
    #'use_idf': [True, False]
}))

CountVectorizer_params = list(ParameterGrid({
    'strip_accents': ['ascii', 'unicode', None],
    # Proportions 0.3, 0.5, 0.7, 0.9 plus 1.0 ("no ceiling"). The last value
    # must be the float 1.0: an integer 1 is read as an absolute document
    # count and would drop every term that occurs in more than one document.
    'max_df': [round(0.1*x, 2) for x in range(3,11,2)] + [1.0],
    # 5k, 25k, 45k, or unlimited vocabulary
    'max_features': [1000*x for x in range(5,50,20)] + [None],
}))

HashingVectorizer_params = list(ParameterGrid({
    'strip_accents': ['ascii', 'unicode', None],
    # 2**15, 2**17, ..., 2**23 hash buckets
    'n_features': [2**x for x in range(15,25,2)],
    'norm': ['l2'],
    'alternate_sign': [False]
}))

# feature selector params
# Full grid: three scoring functions x four values of k.
# (A narrower grid tried earlier: score_func=[chi2], k=[30000, 35000, 40000].)
SelectKBest_params = list(ParameterGrid(dict(
    score_func=[chi2, f_classif, mutual_info_classif],
    k=[10000, 25000, 40000, 55000],
)))

# scaler params
# Single-point grid: scale every feature into the (0, 1) range.
MinMaxScaler_params = list(ParameterGrid(dict(feature_range=[(0, 1)])))

# classifier params
# One expanded ParameterGrid (list of keyword dicts) per classifier class.
MultinomialNB_params = list(ParameterGrid(dict(
    alpha=[0.01, 0.0015, 0.001, 0.0005, 0.0001],
)))

# Current LinearSVC grid. Ranges explored earlier, kept for reference:
#   loss: ['hinge', 'squared_hinge']
#   tol:  [0.01, 0.001, 0.0001, 0.00001] and [0.01, 0.005, 0.001, 0.0005]
#   C:    [0.001, 0.01, 0.1, 1, 10], [0.5, 1, 1.5, 2] and [1]
#   class_weight: ['balanced', None] and [None]
#   max_iter: [1500]
LinearSVC_params = list(ParameterGrid(dict(
    loss=['squared_hinge'],
    tol=[0.008, 0.01, 0.015, 0.02],
    C=[0.1, 0.3, 0.5, 0.8],
)))

ComplementNB_params = list(ParameterGrid(dict(
    alpha=[0.1, 0.2, 0.5, 1],
)))

# NOTE(review): newer scikit-learn releases spell the 'log' loss 'log_loss' —
# confirm against the installed version before re-running this grid.
SGDClassifier_params = list(ParameterGrid(dict(
    loss=['hinge', 'log', 'squared_hinge', 'perceptron'],
    alpha=[1e-3, 1e-4, 1e-5, 1e-6],
    n_jobs=[-1],
    random_state=[1],
)))

LogisticRegression_params = list(ParameterGrid(dict(
    tol=[1e-3, 1e-4, 1e-5, 1e-6],
    C=[6, 8, 10, 12],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'saga'],
    n_jobs=[-1],
    random_state=[1],
)))

# sampler params
# One expanded ParameterGrid (list of keyword dicts) per resampler class.
ClusterCentroids_params = list(ParameterGrid({
    'random_state': [1],
    'sampling_strategy': ['majority', 'not minority', 'all'],
    'n_jobs': [-1]
}))

RandomUnderSampler_params = list(ParameterGrid({
    'random_state': [1],
    'sampling_strategy': ['majority', 'auto'],
    'replacement': [True, False]
}))

# TomekLinks is a deterministic cleaning method, so no 'random_state' entry:
# the argument was deprecated (and ignored) in imbalanced-learn 0.4 and the
# constructor rejects it from 0.6 on. The grid size is unchanged.
TomekLinks_params = list(ParameterGrid({
    'sampling_strategy': ['majority', 'auto'],
    'n_jobs': [-1]
}))

# Per-label integer targets keyed by label id. Nothing in the visible cells
# reads this dict.
# NOTE(review): presumably meant as a dict-valued 'sampling_strategy' — confirm.
label_targets = {
    #12: 2388,
    #93: 2388,
    #91: 2388,
    15: 2000,
    16: 2000,
    20: 2000,
    3: 2000,
    10: 1500,
    92: 1500,
    17: 1500,
    19: 1500,
    4: 1000,
    13: 1000,
    1: 1000,
    23: 1000,
    8: 1000,
    6: 1000,
    7: 1000,
    25: 1000,
    5: 1000,
    2: 1000,
    9: 1000,
    24: 800,
    14: 800,
    26: 800,
    21: 800,
    18: 800,
}

ADASYN_params = list(ParameterGrid({
    'random_state': [1],
    'sampling_strategy': ['minority'],
    'n_neighbors': [23, 27, 31, 33],
    #'n_neighbors': [31,41,61,71,81],
    'n_jobs': [-1]
}))

SMOTE_params = list(ParameterGrid({
    'random_state': [1],
    'sampling_strategy': ['minority', 'auto'],
    'k_neighbors': [3,5,7,13],
    'n_jobs': [-1]
}))

RandomOverSampler_params = list(ParameterGrid({
    'random_state': [1],
    'sampling_strategy': ['minority', 'auto']
}))

# calibration params: 3-fold calibration with both supported methods
CalibratedClassifierCV_params = list(ParameterGrid({
    'cv': [3],
    'method': ['sigmoid', 'isotonic']
}))

In [8]:
# feature combinations
# Each entry is (label, mapping); the mapping's values are the column names
# defined earlier (`text`, `fasttext`, `numeric`). Uncomment an entry to add
# it to the run. How u.model_iterator consumes the mapping is not visible here.
# The last five commented entries reference names (pos_features, ont_features,
# numerical_features) that are not defined in the visible cells.
feats = [
    #('TextOnly', {'text': text}),
    ('FTOnly', {'ft': fasttext}),
    #('Text+FT', {'text': text, 'ft': fasttext}),
    #('Text+DLWC', {'text': text, 'numeric': numeric}),
    #('TextOnly', False),
    #('TextPOS', pos_features),
    #('TextONT', ont_features),
    #('TextDLWC', numerical_features),
    #('All', pos_features + ont_features + numerical_features),
]


# vectorizers
# Each entry is (label, class, list of keyword dicts). Passing one of the
# *_params grids instead of a single-dict list sweeps that whole grid.
# NOTE(review): presumably u.model_iterator instantiates class(**params) for
# every dict — confirm in utils.
vectorizers = [
    #('No_vectorizer', u.PassThrough, [{}]),
    #('TfidfVectorizer', TfidfVectorizer, [{}]),
    #('TfidfVectorizer', TfidfVectorizer, [{'max_df': 0.3, 'max_features': None, 'norm': 'l2', 'strip_accents': 'ascii', 'use_idf': False}]),
    #('TfidfVectorizer', TfidfVectorizer, TfidfVectorizer_params),
    ('TfidfVectorizer', TfidfVectorizer, [{'lowercase': False, 'max_df': 0.25}]), 
    #('TfidfVectorizer', TfidfVectorizer, [{'lowercase': False, 'max_df': 0.25, 'tokenizer': tokenizer}]), 
    #('CountVectorizer', CountVectorizer, CountVectorizer_params),
    #('HashingVectorizer', HashingVectorizer, HashingVectorizer_params),  
]

#corpus = ['empty']
#fastt = False

# feature selectors
# Same (label, class, list of keyword dicts) layout as the vectorizer list.
# u.PassThrough with an empty dict is the "no selection" placeholder.
selectors = [
    ('No_selector', u.PassThrough, [{}]),
    #('SelectKBest', SelectKBest, [{'score_func': f_classif, 'k': 35000}]),
    #('SelectKBest', SelectKBest, [{'score_func': chi2, 'k': 30000}]),
    #('SelectKBest', SelectKBest, [{'score_func': chi2, 'k': 40000}]),
    #('SelectKBest', SelectKBest, SelectKBest_params),
]

# scalers
# Currently a single MinMaxScaler configuration mapping features into (0, 1).
scalers = [
    #('No_scaling', u.PassThrough, [{}]),
    ('MinMaxScaler', MinMaxScaler, [{'feature_range': (0,1)}]),
    #('MinMaxScaler', MinMaxScaler, MinMaxScaler_params),
    #('MaxAbsScaler', MaxAbsScaler, [{}]),
]


# Pre-built LinearSVC instance; only referenced by the commented-out
# CalibratedClassifierCV entry below (as its 'base_estimator').
clf_LinearSVC = LinearSVC(**{'C': 0.5, 'loss': 'squared_hinge', 'tol': 0.01})

# classifiers
# Same (label, class, list of keyword dicts) layout as the lists above. The
# commented entries are earlier experiments; the two `mt.` entries reference a
# module that is not imported in the visible cells.
classifiers = [
    #('MultinomialNB', MultinomialNB, MultinomialNB_params),
    #('LinearSVC', LinearSVC, LinearSVC_params),
    ('LinearSVC', LinearSVC, [{'C': 0.5, 'loss': 'squared_hinge', 'tol': 0.01}]),
    #('CalibratedClassifierCV_LinearSVC', CalibratedClassifierCV, [{'base_estimator': clf_LinearSVC, 'method': 'isotonic', 'cv': 3}]),
    #('ComplementNB', ComplementNB, ComplementNB_params),
    #('ComplementNB', ComplementNB, [{'alpha': 0.2}]),
    #('SGDClassifier', SGDClassifier, SGDClassifier_params),
    #('SGDClassifier', SGDClassifier, [{'alpha': 0.00001, 'loss': 'log', 'n_jobs': -1, 'random_state': 1}]),
    #('MultinomialNB', MultinomialNB, [{'alpha': 0.0015}]),
    #('LogisticRegression', LogisticRegression, [{}]),
    #('LogisticRegression', LogisticRegression, LogisticRegression_params),
    #('LogisticRegression', LogisticRegression, [{'solver': 'saga', 'tol': 0.0001, 'C': 8, 'n_jobs': -1, 'random_state': 1}]),
    #('SGDClassifier', SGDClassifier, [{}]),
    #('LinearSVC', LinearSVC, [{}]),
    #('MultinomialNB', MultinomialNB, [{}]),
    #('ComplementNB', ComplementNB, [{}]),
    #('LinearSVC', LinearSVC, [{'C': 1, 'class_weight': None, 'loss': 'hinge', 'tol': 0.01}]),
    #('LinearSVC', LinearSVC, [{'C': 0.1, 'class_weight': None, 'loss': 'squared_hinge', 'tol': 0.01}]),
    #('ComplementNB', ComplementNB, [{'alpha': 0.2, 'norm': False}]),
    #('SGDClassifier', SGDClassifier, [{'alpha': 0.01, 'loss': 'hinge', 'n_jobs': -1}]),
    #('LogisticRegression', LogisticRegression, [{'solver': 'lbfgs', 'multi_class': 'multinomial', 'max_iter': 500}]),
    #('XGBClassifier', XGBClassifier, [{}]),
    #('PassiveAggressiveClassifier', PassiveAggressiveClassifier, [{}]),
    #('RandomForestClassifier', RandomForestClassifier, [{'n_estimators': 1000, 'criterion': 'entropy', 'max_depth': 3, 'random_state': 0}]),
    #('MLPClassifier', MLPClassifier, [{'hidden_layer_sizes': (40,), 'max_iter': 500, 'alpha': 1e-4, 'solver': 'sgd', 'verbose': 10, 'random_state': 1, 'learning_rate_init': .1}]),
    #('BucketClassifier', mt.BucketClassifier, [{'hashfunc': mhash, 'classifiers': [LogisticRegression(**{'solver': 'lbfgs', 'multi_class': 'multinomial'}) for _ in range(10)], 'judge': MultinomialNB(alpha=0.001), 'num_buckets': 8}]),
    #('EnsembleClassifier', mt.EnsembleClassifier, [{'binary_clf': LogisticRegression, 'binary_clf_params': {'solver': 'lbfgs'}, 'bucket_clf': MultinomialNB, 'bucket_clf_params': {'alpha': 0.001}, 'judge': MultinomialNB(alpha=0.001)}]),
]

# samplers
# Same (label, class, list of keyword dicts) layout as the lists above.
# u.PassThrough with an empty dict is the "no resampling" placeholder; the
# commented entries are resampling experiments tried earlier.
samplers = [
    #('SMOTE', SMOTE, [{'random_state': 1, 'sampling_strategy': 'minority', 'k_neighbors': 3, 'n_jobs': -1}]),
    #('SMOTE', SMOTE, [{'random_state': 1, 'sampling_strategy': 'minority', 'k_neighbors': 7, 'n_jobs': -1}]),
    ('No_sampling', u.PassThrough, [{}]),
    #('ClusterCentroids', ClusterCentroids, ClusterCentroids_params),
    #('ClusterCentroids', ClusterCentroids, [{'random_state': 1, 'sampling_strategy': 'majority', 'n_jobs': -1}]),
    #('RandomUnderSampler', RandomUnderSampler, RandomUnderSampler_params),
    #('RandomUnderSampler', RandomUnderSampler, [{'random_state': 1, 'sampling_strategy': 'majority', 'replacement': False}]),
    #('TomekLinks', TomekLinks, TomekLinks_params),
    #('TomekLinks', TomekLinks, [{'random_state': 1, 'sampling_strategy': 'majority', 'n_jobs': -1}]),
    #('ADASYN', ADASYN, ADASYN_params),
    #('ADASYN', ADASYN, [{'n_jobs': -1, 'n_neighbors': 27, 'random_state': 1, 'sampling_strategy': 'minority'}]),
    #('SMOTE', SMOTE, SMOTE_params),
    #('SMOTE', SMOTE, [{'n_jobs': -1, 'k_neighbors': 3, 'random_state': 1, 'sampling_strategy': 'minority'}]),
    #('RandomOverSampler', RandomOverSampler, RandomOverSampler_params),
    #('RandomOverSampler', RandomOverSampler, [{'random_state': 1, 'sampling_strategy': 'minority'}]),
]

In [9]:
# Run every configured combination through u.model_iterator.
# Gotcha: the 'validation_X' / 'validation_y' keys are fed the *test* split
# here; the real validation split (commented out) is left untouched.
# The recorded run labels below follow the argument order
# feats/vectorizer/classifier/sampler/selector/scaler.
results, preds_validation = u.model_iterator({
    'train_X': train_X,
    'train_y': train_y,
    #'validation_X': validation_X,
    #'validation_y': validation_y,
    'validation_X': test_X,
    'validation_y': test_y
}, feats, vectorizers, classifiers, samplers, selectors, scalers)

# Best runs first: rank by accuracy, ties broken by precision.
display(results.sort_values(by=['Accuracy', 'Precision'], ascending=False))

1_FTOnly/TfidfVectorizer/LinearSVC/No_sampling/No_selector/MinMaxScaler
{'lowercase': False, 'max_df': 0.25}
{'C': 0.5, 'loss': 'squared_hinge', 'tol': 0.01}
{}
{}
{'feature_range': (0, 1)}
0.6713174134692782 0.6715701110883445 0.6180819985345474 0.6303858624039064


Unnamed: 0,Features,Vectorizer,V.params,Selector,Sel.params,Scaler,Sca.params,Classifier,C.params,Sampler,S.params,Accuracy,Precision,Recall,Fscore
1_FTOnly/TfidfVectorizer/LinearSVC/No_sampling/No_selector/MinMaxScaler,FTOnly,TfidfVectorizer,"{'lowercase': False, 'max_df': 0.25}",No_selector,{},MinMaxScaler,"{'feature_range': (0, 1)}",LinearSVC,"{'C': 0.5, 'loss': 'squared_hinge', 'tol': 0.01}",No_sampling,{},0.671317,0.67157,0.618082,0.630386


In [68]:
# Smallest single value across all stacked 'ft' vectors. The recorded output
# is negative, so the raw vectors are not non-negative.
np.stack(df['ft'].to_numpy()).min()

-1.5358802111241325

In [62]:
# Classifier parameter dicts, best run first (accuracy, then precision).
# NOTE(review): the recorded output below lists LogisticRegression settings,
# i.e. it comes from an earlier run than the LinearSVC results shown above.
results.sort_values(by=['Accuracy', 'Precision'], ascending=False)['C.params'].tolist()

[{'C': 6,
  'n_jobs': -1,
  'random_state': 1,
  'solver': 'liblinear',
  'tol': 0.0001},
 {'C': 6,
  'n_jobs': -1,
  'random_state': 1,
  'solver': 'liblinear',
  'tol': 1e-05},
 {'C': 6,
  'n_jobs': -1,
  'random_state': 1,
  'solver': 'liblinear',
  'tol': 1e-06},
 {'C': 8, 'n_jobs': -1, 'random_state': 1, 'solver': 'lbfgs', 'tol': 0.001},
 {'C': 8, 'n_jobs': -1, 'random_state': 1, 'solver': 'lbfgs', 'tol': 0.0001},
 {'C': 8, 'n_jobs': -1, 'random_state': 1, 'solver': 'lbfgs', 'tol': 1e-05},
 {'C': 8, 'n_jobs': -1, 'random_state': 1, 'solver': 'lbfgs', 'tol': 1e-06},
 {'C': 6, 'n_jobs': -1, 'random_state': 1, 'solver': 'saga', 'tol': 0.0001},
 {'C': 6, 'n_jobs': -1, 'random_state': 1, 'solver': 'saga', 'tol': 1e-05},
 {'C': 6, 'n_jobs': -1, 'random_state': 1, 'solver': 'saga', 'tol': 1e-06},
 {'C': 8,
  'n_jobs': -1,
  'random_state': 1,
  'solver': 'liblinear',
  'tol': 0.0001},
 {'C': 6, 'n_jobs': -1, 'random_state': 1, 'solver': 'saga', 'tol': 0.001},
 {'C': 12, 'n_jobs': -1, 'ra

In [None]:
# Look up the classifier parameters of one specific run by its row label.
# NOTE(review): this label comes from an earlier grid run; the results table
# shown above holds only the '1_FTOnly/...' row, so this lookup would fail
# against it — update the label or remove the cell.
results.loc['71_TextOnly/TfidfVectorizer/LogisticRegression/No_sampling/No_selector/No_scaling']['C.params']


In [None]:
# Show the five best runs of each classifier (ranked by accuracy, then precision).
display(results.sort_values(by=['Accuracy', 'Precision'], ascending=False).groupby('Classifier').head(5))

In [None]:
# Persist the results table and the predictions under ../stats/, both tagged
# with the same Unix timestamp (whole seconds) so the two files pair up.
timestamp = str(round(time.time()))
results.to_pickle('../stats/' + timestamp + '_tfidfgrid_results.pkl')
with open('../stats/' + timestamp + '_tfidfgrid_preds.pkl', 'wb') as f:
    pickle.dump(preds_validation, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Sampler parameter dicts, best run first (accuracy, then precision).
display(results.sort_values(by=['Accuracy', 'Precision'], ascending=False)['S.params'].tolist())

In [None]:
# Pass the true labels and the first stored prediction set to u.plot_cf.
# NOTE(review): presumably draws a confusion matrix; what it returns into
# `testy` is not visible here — confirm in utils.
testy = u.plot_cf(preds_validation['validation_y'], preds_validation['preds'][0], title = 'test')

In [None]:
# Ad-hoc experiment: TF-IDF -> FeatureAgglomeration (50 merged features)
# -> MultinomialNB.
# NOTE(review): this scores on vali_X / vali_y (the full 40% holdout), whereas
# the u.model_iterator run is scored on test_X / test_y, so the two
# accuracies are not directly comparable — confirm this is intended.
# NOTE(review): FeatureAgglomeration may reject the sparse TF-IDF matrix —
# verify on a fresh kernel; this cell has no recorded execution.
vec = TfidfVectorizer(lowercase=False, max_df=0.25)
vec_train = vec.fit_transform(train_X[text])
vec_test = vec.transform(vali_X[text])

agg = FeatureAgglomeration(n_clusters=50)
vec_train_reduced = agg.fit_transform(vec_train)
vec_test_reduced = agg.transform(vec_test)

clf = MultinomialNB(alpha=0.001)
clf.fit(vec_train_reduced, train_y)
preds = clf.predict(vec_test_reduced)

# Fraction of holdout rows predicted correctly; shown as the cell's output.
np.mean(preds == vali_y)
