# Imports

In [1]:
import string
import pandas as pd
import numpy as np
import scipy.stats as stats
import json
import matplotlib.pyplot as plt

from collections import defaultdict
from preprocessing import Preprocessor
from data_loader import DataLoader, Encoder
from helpers import select_n_components, pos_check
from textblob import TextBlob
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator
from sklearn.utils.fixes import loguniform
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.base import TransformerMixin
from tempfile import mkdtemp
from shutil import rmtree
from joblib import Memory

[nltk_data] Downloading package stopwords to
[nltk_data]     /afs/inf.ed.ac.uk/user/s21/s2125219/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [2]:
train, validate, test = DataLoader().create_dataframe(preprocess=True, split=True, remove_duplicates=True)

  1%|▏         | 3454/239073 [00:00<00:06, 34534.19it/s]

Preprocessing...


100%|██████████| 239073/239073 [00:05<00:00, 42436.23it/s]


Deduplicating...


In [3]:
train.head()

Unnamed: 0,phrase_id,phrase,phrase_clean,sentiment_val,label_id,label,word_count
190264,212000,so second-rate,secondrate,0.26389,2.0,Negative,1
120749,121243,go with this claustrophobic concept,go claustrophobic concept,0.36111,2.0,Negative,3
189861,211915,so completely,completely,0.5,3.0,Neutral,1
106888,6123,ease and confidence,ease confidence,0.65278,4.0,Positive,2
223674,42917,treachery and,treachery,0.45833,3.0,Neutral,1


In [8]:
train.shape

(126124, 7)

# Train, test, dev split

In [9]:
X_train, y_train = train['phrase_clean'], train['label_id']
X_val, y_val = validate['phrase_clean'], validate['label_id']
X_test, y_test = test['phrase_clean'], test['label_id']

# Feature Engineering & Selection

- Features to include:

    - phrase length
    - punctuation count
    - capital letters count

In [10]:
punct_count = lambda l1, l2: sum([1 for x in l1 if x in l2])
caps_count = lambda l1: sum([1 for x in l1 if x.isupper()])

def get_phrase_length(text):
    return np.array([len(t) for t in tqdm(text)]).reshape(-1, 1)

def get_num_punct(text):
    return np.array([punct_count(t, set(string.punctuation)) for t in tqdm(text)]).reshape(-1, 1)

def get_num_caps(text):
    return np.array([caps_count(t) for t in tqdm(text)]).reshape(-1, 1)

In [11]:
tfidf_vect = TfidfVectorizer(analyzer='word', 
                             max_features=10000, 
                             use_idf=True, 
                             ngram_range=(1,3))

# Prediction

In [12]:
class Result:
    def __init__(self):
        self.accuracy = 0
        self.precision = 0
        self.recall = 0
        self.f1 = 0
        self.kappa = 0
        
class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, y=None, **fit_params):
        print('Making dense transformation...\n')
        return X.todense()

class ClassificationPipeline():
    def __init__(self, clf_id, clf, vectorizer, feature_processing, pipe=None):
        self.pipe = pipe 
        self.clf_id = clf_id 
        self.clf = clf
        self.vectorizer = vectorizer
        self.feature_processing = feature_processing
                   
    def create_feature_pipeline(self, memory):
        self.pipe = Pipeline([
            ('feature_pipeline', FeatureUnion([
                ('text', Pipeline([
                    ('vectorizer', self.vectorizer),
                    ('to_dense', DenseTransformer()),
                    ('lda', LinearDiscriminantAnalysis(n_components=4)),
                ])),
                ('feature_processing', self.feature_processing)
            ])),
            (self.clf_id, self.clf)
        ], memory=memory)
    
    def create_pipeline(self, memory):
        self.pipe = Pipeline([
            ('feature_pipeline', FeatureUnion([
                ('text', Pipeline([
                    ('vectorizer', self.vectorizer),
                    ('to_dense', DenseTransformer()),
                    ('lda', LinearDiscriminantAnalysis(n_components=4)),
                ])),
            ])),
            (self.clf_id, self.clf)
        ], memory=memory)
            
    def train_and_evaluate(self, X_train, y_train, X_val, y_val, memory, use_features=True):
        if use_features:
            self.create_feature_pipeline(memory=memory)
        else:
            self.create_pipeline(memory=memory)
            
        self.pipe.fit(X_train, y_train)
        preds = self.pipe.predict(X_val)
        
        accuracy = accuracy_score(y_val, preds)
        precision = precision_score(y_val, preds, average='macro')
        recall = recall_score(y_val, preds, average='macro')
        f1 = f1_score(y_val, preds, average='macro')
        kappa = cohen_kappa_score(y_val, preds)
        
        return accuracy, precision, recall, f1, kappa

In [26]:
# Features
features = FeatureUnion([
    ('phrase_length', Pipeline([
        ('f1', FunctionTransformer(get_phrase_length, validate=False))]
    )),
#     ('num_punct', Pipeline([
#         ('f2', FunctionTransformer(get_num_punct, validate=False))]
#     )),
#     ('num_caps', Pipeline([
#         ('f3', FunctionTransformer(get_num_caps, validate=False))
#     ]))
])

feature_processing = Pipeline([('features', features)])

# Classifiers
dc = ClassificationPipeline(clf_id='dc', 
                            clf=DummyClassifier(strategy='most_frequent'),
                            vectorizer=tfidf_vect,
                            feature_processing=feature_processing)
gnb = ClassificationPipeline(clf_id='gnb',
                            clf=GaussianNB(),
                            vectorizer=tfidf_vect,
                            feature_processing=feature_processing)
lr = ClassificationPipeline(clf_id='lr', 
                           clf=LogisticRegression(max_iter=10000),
                           vectorizer=tfidf_vect,
                           feature_processing=feature_processing)
lin_svm = ClassificationPipeline(clf_id='lin_svm', 
                                 clf=LinearSVC(),
                                 vectorizer=tfidf_vect,
                                 feature_processing=feature_processing)
rbf_svm = ClassificationPipeline(clf_id='rbf_svm', 
                                 clf=SVC(kernel='rbf'), 
                                 vectorizer=tfidf_vect,
                                 feature_processing=feature_processing)
rf = ClassificationPipeline(clf_id='rf', 
                            clf=RandomForestClassifier(max_depth=10, n_estimators=50), 
                            vectorizer=tfidf_vect,
                            feature_processing=feature_processing)
mlp = ClassificationPipeline(clf_id='mlp', 
                             clf=MLPClassifier(max_iter=800),
                             vectorizer=tfidf_vect,
                             feature_processing=feature_processing)

In [27]:
# Train and evaluate classifiers using additional features
clfs = [dc, gnb, lr, lin_svm, rbf_svm, rf, mlp]

# Train classifiers using additional features

In [15]:
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)

results = defaultdict(Result)

for clf in clfs:
    print(f'Training {clf.clf_id}...\n')
    
    accuracy, precision, recall, f1, kappa = clf.train_and_evaluate(X_train, y_train, X_val, y_val, 
                                                          use_features=True, memory=memory)
    
    results[clf.clf_id].accuracy = accuracy 
    results[clf.clf_id].precision = precision
    results[clf.clf_id].recall = recall
    results[clf.clf_id].f1 = f1
    results[clf.clf_id].kappa = kappa
    
rmtree(cachedir)

You provided "cachedir='/tmp/tmpdtl1wrtb'", use "location='/tmp/tmpdtl1wrtb'" instead.
  


Training dc...

________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(transformer_list=[('text',
                                Pipeline(steps=[('vectorizer',
                                                 TfidfVectorizer(max_features=10000,
                                                                 ngram_range=(1,
                                                                              3))),
                                                ('to_dense',
                                                 <__main__.DenseTransformer object at 0x7fc4cd151110>),
                                                ('lda',
                                                 LinearDiscriminantAnalysis(n_components=4))])),
                             ..., 
190264                   secondrate
120749    go claustrophobic concept
189861                   completely
106888        

100%|██████████| 126124/126124 [00:00<00:00, 2175496.36it/s]
100%|██████████| 126124/126124 [00:00<00:00, 425104.35it/s]
100%|██████████| 126124/126124 [00:00<00:00, 499577.30it/s]
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


______________________________________________fit_transform_one - 252.9s, 4.2min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2281414.55it/s]
100%|██████████| 47470/47470 [00:00<00:00, 410605.88it/s]
100%|██████████| 47470/47470 [00:00<00:00, 447261.02it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Training gnb...

[Memory]257.3s, 4.3min  : Loading _fit_transform_one from /tmp/tmpdtl1wrtb/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2304199.92it/s]
100%|██████████| 47470/47470 [00:00<00:00, 401877.16it/s]
100%|██████████| 47470/47470 [00:00<00:00, 491283.91it/s]


Training lr...

[Memory]260.2s, 4.3min  : Loading _fit_transform_one from /tmp/tmpdtl1wrtb/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1821058.51it/s]
100%|██████████| 47470/47470 [00:00<00:00, 405182.85it/s]
100%|██████████| 47470/47470 [00:00<00:00, 470054.04it/s]


Training lin_svm...

[Memory]277.2s, 4.6min  : Loading _fit_transform_one from /tmp/tmpdtl1wrtb/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min




Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1856028.59it/s]
100%|██████████| 47470/47470 [00:00<00:00, 406544.64it/s]
100%|██████████| 47470/47470 [00:00<00:00, 490732.96it/s]


Training rbf_svm...

[Memory]334.9s, 5.6min  : Loading _fit_transform_one from /tmp/tmpdtl1wrtb/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1844884.37it/s]
100%|██████████| 47470/47470 [00:00<00:00, 404043.22it/s]
100%|██████████| 47470/47470 [00:00<00:00, 479240.00it/s]


Training rf...

[Memory]1059.1s, 17.7min: Loading _fit_transform_one from /tmp/tmpdtl1wrtb/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1985160.04it/s]
100%|██████████| 47470/47470 [00:00<00:00, 397256.98it/s]
100%|██████████| 47470/47470 [00:00<00:00, 450708.45it/s]


Training mlp...

[Memory]1068.8s, 17.8min: Loading _fit_transform_one from /tmp/tmpdtl1wrtb/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2247066.91it/s]
100%|██████████| 47470/47470 [00:00<00:00, 411023.76it/s]
100%|██████████| 47470/47470 [00:00<00:00, 450236.56it/s]


In [16]:
for k, v in json.loads(Encoder().encode(results)).items():
    accuracy = v['accuracy']
    precision = v['precision']
    recall = v['recall']
    f1 = v['f1']
    kappa = v['kappa']
    
    print(f'{k} --- Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}, Kappa: {kappa:.3f}')
    print()

dc --- Accuracy: 0.498, Precision: 0.100, Recall: 0.200, F1: 0.133, Kappa: 0.000

gnb --- Accuracy: 0.602, Precision: 0.501, Recall: 0.493, F1: 0.495, Kappa: 0.391

lr --- Accuracy: 0.623, Precision: 0.554, Recall: 0.469, F1: 0.499, Kappa: 0.406

lin_svm --- Accuracy: 0.574, Precision: 0.559, Recall: 0.377, F1: 0.378, Kappa: 0.343

rbf_svm --- Accuracy: 0.628, Precision: 0.560, Recall: 0.465, F1: 0.494, Kappa: 0.415

rf --- Accuracy: 0.626, Precision: 0.549, Recall: 0.487, F1: 0.510, Kappa: 0.419

mlp --- Accuracy: 0.622, Precision: 0.541, Recall: 0.495, F1: 0.513, Kappa: 0.419



# Train classifiers without using additional features

In [17]:
# Train and evaulate classifiers without additional features
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)

results = defaultdict(Result)

for clf in clfs:
    print(f'Training {clf.clf_id}...\n')
    
    accuracy, precision, recall, f1, kappa = clf.train_and_evaluate(X_train, y_train, X_val, y_val, 
                                                          use_features=True, memory=memory)
    
    print(f'{clf.clf_id} --- Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1}, Kappa: {kappa}')
    
    results[clf.clf_id].accuracy = accuracy 
    results[clf.clf_id].precision = precision
    results[clf.clf_id].recall = recall
    results[clf.clf_id].f1 = f1
    results[clf.clf_id].kappa = kappa
    

rmtree(cachedir)

You provided "cachedir='/tmp/tmpnm_h1at5'", use "location='/tmp/tmpnm_h1at5'" instead.
  This is separate from the ipykernel package so we can avoid doing imports until


Training dc...

________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(transformer_list=[('text',
                                Pipeline(steps=[('vectorizer',
                                                 TfidfVectorizer(max_features=10000,
                                                                 ngram_range=(1,
                                                                              3))),
                                                ('to_dense',
                                                 <__main__.DenseTransformer object at 0x7fc4ce7ffb50>),
                                                ('lda',
                                                 LinearDiscriminantAnalysis(n_components=4))])),
                             ..., 
190264                   secondrate
120749    go claustrophobic concept
189861                   completely
106888        

100%|██████████| 126124/126124 [00:00<00:00, 2151737.03it/s]
100%|██████████| 126124/126124 [00:00<00:00, 417352.97it/s]
100%|██████████| 126124/126124 [00:00<00:00, 485675.75it/s]
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


______________________________________________fit_transform_one - 247.1s, 4.1min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2224149.18it/s]
100%|██████████| 47470/47470 [00:00<00:00, 407646.69it/s]
100%|██████████| 47470/47470 [00:00<00:00, 441582.06it/s]
  _warn_prf(average, modifier, msg_start, len(result))


dc --- Accuracy: 0.498, Precision: 0.100, Recall: 0.200, F1: 0.13290195526797016, Kappa: 0.0
Training gnb...

[Memory]251.5s, 4.2min  : Loading _fit_transform_one from /tmp/tmpnm_h1at5/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2247777.22it/s]
100%|██████████| 47470/47470 [00:00<00:00, 410644.83it/s]
100%|██████████| 47470/47470 [00:00<00:00, 478028.40it/s]


gnb --- Accuracy: 0.602, Precision: 0.501, Recall: 0.493, F1: 0.4946757953164787, Kappa: 0.39078783708428255
Training lr...

[Memory]254.5s, 4.2min  : Loading _fit_transform_one from /tmp/tmpnm_h1at5/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2220849.63it/s]
100%|██████████| 47470/47470 [00:00<00:00, 404847.53it/s]
100%|██████████| 47470/47470 [00:00<00:00, 436102.81it/s]


lr --- Accuracy: 0.623, Precision: 0.554, Recall: 0.469, F1: 0.49862062348428465, Kappa: 0.405959194323282
Training lin_svm...

[Memory]270.7s, 4.5min  : Loading _fit_transform_one from /tmp/tmpnm_h1at5/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min




Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1840926.93it/s]
100%|██████████| 47470/47470 [00:00<00:00, 405135.03it/s]
100%|██████████| 47470/47470 [00:00<00:00, 474522.83it/s]


lin_svm --- Accuracy: 0.610, Precision: 0.560, Recall: 0.416, F1: 0.43706289160450995, Kappa: 0.38530510430823295
Training rbf_svm...

[Memory]330.6s, 5.5min  : Loading _fit_transform_one from /tmp/tmpnm_h1at5/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2256694.14it/s]
100%|██████████| 47470/47470 [00:00<00:00, 409873.87it/s]
100%|██████████| 47470/47470 [00:00<00:00, 463852.56it/s]


rbf_svm --- Accuracy: 0.628, Precision: 0.560, Recall: 0.465, F1: 0.49421423105242646, Kappa: 0.4148779886577302
Training rf...

[Memory]1045.3s, 17.4min: Loading _fit_transform_one from /tmp/tmpnm_h1at5/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1812537.42it/s]
100%|██████████| 47470/47470 [00:00<00:00, 408850.42it/s]
100%|██████████| 47470/47470 [00:00<00:00, 477564.04it/s]


rf --- Accuracy: 0.628, Precision: 0.552, Recall: 0.489, F1: 0.5127251485666179, Kappa: 0.42219492158266414
Training mlp...

[Memory]1055.2s, 17.6min: Loading _fit_transform_one from /tmp/tmpnm_h1at5/joblib/sklearn/pipeline/_fit_transform_one/6e7cb0abd50c5cf851df923d78866d86
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1424457.96it/s]
100%|██████████| 47470/47470 [00:00<00:00, 405753.42it/s]
100%|██████████| 47470/47470 [00:00<00:00, 475003.96it/s]


mlp --- Accuracy: 0.625, Precision: 0.547, Recall: 0.489, F1: 0.5113847957621365, Kappa: 0.42154886706205574


In [18]:
for k, v in json.loads(Encoder().encode(results)).items():
    accuracy = v['accuracy']
    precision = v['precision']
    recall = v['recall']
    f1 = v['f1']
    kappa = v['kappa']
    
    print(f'{k} --- Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}, Kappa: {kappa:.3f}')
    print()

dc --- Accuracy: 0.498, Precision: 0.100, Recall: 0.200, F1: 0.133, Kappa: 0.000

gnb --- Accuracy: 0.602, Precision: 0.501, Recall: 0.493, F1: 0.495, Kappa: 0.391

lr --- Accuracy: 0.623, Precision: 0.554, Recall: 0.469, F1: 0.499, Kappa: 0.406

lin_svm --- Accuracy: 0.610, Precision: 0.560, Recall: 0.416, F1: 0.437, Kappa: 0.385

rbf_svm --- Accuracy: 0.628, Precision: 0.560, Recall: 0.465, F1: 0.494, Kappa: 0.415

rf --- Accuracy: 0.628, Precision: 0.552, Recall: 0.489, F1: 0.513, Kappa: 0.422

mlp --- Accuracy: 0.625, Precision: 0.547, Recall: 0.489, F1: 0.511, Kappa: 0.422



# Best performing classifier on test set

In [19]:
# Dedup validation set to avoid adding any bias
validate = DataLoader().dedup(validate)

Deduplicating...


In [20]:
validate.shape

(45449, 7)

In [21]:
X_val, y_val = validate['phrase_clean'], validate['label_id']

In [22]:
X = pd.concat([X_train, X_val])
y = pd.concat([y_train, y_val])

In [23]:
X.shape

(171573,)

In [28]:
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)

accuracy, precision, recall, f1, kappa = rf.train_and_evaluate(X, y, 
                                                                X_test, y_test, 
                                                                use_features=True, 
                                                                memory=memory)
rmtree(cachedir)

You provided "cachedir='/tmp/tmpp4slgsi_'", use "location='/tmp/tmpp4slgsi_'" instead.
  


________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(transformer_list=[('text',
                                Pipeline(steps=[('vectorizer',
                                                 TfidfVectorizer(max_features=10000,
                                                                 ngram_range=(1,
                                                                              3))),
                                                ('to_dense',
                                                 <__main__.DenseTransformer object at 0x7fc4cd305f50>),
                                                ('lda',
                                                 LinearDiscriminantAnalysis(n_components=4))])),
                               ('feature_processing',
                                Pipeline(steps=[('features',
                                                 FeatureUn

100%|██████████| 171573/171573 [00:00<00:00, 1930854.09it/s]
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


______________________________________________fit_transform_one - 411.5s, 6.9min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1688492.10it/s]


In [29]:
accuracy, precision, recall, f1, kappa

(0.6350747840741521,
 0.5729196788805456,
 0.5003701461077453,
 0.5275887782465907,
 0.434207220408908)

# Unsupervised