In [20]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import scipy
from sklearn.utils import shuffle

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV, PredefinedSplit, ParameterGrid
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ramsha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
yelp_train = shuffle(pd.read_csv("../data/yelp/yelp-train.txt", sep='\t', lineterminator='\n', header=None, names=['review', 'label']))
yelp_valid = shuffle(pd.read_csv("../data/yelp/yelp-valid.txt", sep='\t', lineterminator='\n', header=None, names=['review', 'label']))
yelp_test = shuffle(pd.read_csv("../data/yelp/yelp-test.txt", sep='\t', lineterminator='\n', header=None, names=['review', 'label']))

IMDB_train = shuffle(pd.read_csv("../data/IMDB/IMDB-train.txt", sep='\t', lineterminator='\n', header=None, names=['review', 'label']))
IMDB_valid = shuffle(pd.read_csv("../data/IMDB/IMDB-valid.txt", sep='\t', lineterminator='\n', header=None, names=['review', 'label']))
IMDB_test = shuffle(pd.read_csv("../data/IMDB/IMDB-test.txt", sep='\t', lineterminator='\n', header=None, names=['review', 'label']))


In [22]:
print(yelp_train.head())

                                                 review  label
3320  I always just get one thing from Vietnamese pl...      3
460   I really like eating here.  I have only been f...      4
3453  I decided to break from routine and get a hair...      1
6718  MARGARITA time....I love their ritas on the ro...      5
1118  My wife and I have going to this restaurant on...      1


In [23]:
datasets = {
    'yelp': {'train': yelp_train, 'valid': yelp_valid, 'test': yelp_test},
    'IMDB': {'train': IMDB_train, 'valid': IMDB_valid, 'test': IMDB_test},
    
}

In [24]:
for dataset in datasets.values():
    for split in dataset.values():
        split['review'] = split['review'].str.replace('<br /><br />', ' ').str.replace('[^\w\s]', '').str.lower()

In [25]:
stop_words = stopwords.words("english")
MAX_FEATURES = 10000
vocabulary = {}
for dataset_name, dataset in datasets.items():
    # accumulate all words
    all_words = [word for sentence in dataset['train']['review'].str.split().tolist() for word in sentence]
    # keep most frequent words, exclude stop words
    freq_words = Counter(word for word in all_words if word not in stop_words).most_common(MAX_FEATURES)
    # create dictionary for most frequent words
    vocabulary[dataset_name] = {word[0]: i for i, word in enumerate(freq_words)}

In [26]:
def get_BOW(dataset, vocabulary, x_name='review', y_name='label'):
    # convert each split (train/valid/test) for each dataset (yelp/IMDB) to BoW representations
    BBOW = {}
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    for split_name, split in dataset.items():
        vector = vectorizer.fit_transform(split[x_name])
        vector[vector > 1] = 1
        BBOW[split_name] = [vector, split[y_name]]
    return BBOW

yelp_BBOW= get_BOW(datasets['yelp'], vocabulary['yelp'])
IMDB_BBOW = get_BOW(datasets['IMDB'], vocabulary['IMDB'])

In [27]:
models = {
    
    'Logistic Regression': {
        'classifer' : LogisticRegression(),
         'param' : {
            'tol': np.arange(0.0005, 0.001, .0011),
            'C': np.arange(0.01, 1.01, 0.01)
            }
            
        },
    'Bernoulli Naive Bayes': {
        'classifer' : BernoulliNB(),
        'param' : {
            'alpha': np.arange(0.01, 1.01, 0.01)
        }
    },
   
    'Decision Trees': {
        'classifer' : DecisionTreeClassifier(),
        'param' : {
            'max_depth': np.arange(13, 17),
            'max_features': np.arange(0.1, 0.5, 0.1),
            'min_samples_leaf': np.arange(3, 6)
        },
    },
    'Linear SVM': {
        'classifer' : LinearSVC(),
        'param' : {
            'C': np.logspace(-2, 2, num=8),
            'max_iter': np.arange(1000, 2000, 100)
        }
    }
}

In [28]:
def classifier(models, model, dataset_BOW, tune=False, average='micro'):
    # assign classifier and hyperparameters
    classifier = models[model]['classifer']
    param = models[model]['param']
    # tune hyperparameters
    train_x = dataset_BOW['train'][0]
    train_y = dataset_BOW['train'][1]
    if tune and param is not None:
        valid_x = dataset_BOW['valid'][0]
        valid_y = dataset_BOW['valid'][1]
        ps = PredefinedSplit(test_fold=[-1 if i < len(train_y) else 0 for i in range(len(train_y) + len(valid_y))])
        classifier = GridSearchCV(classifier, param, cv=ps, scoring='f1_micro', n_jobs=2, verbose=10)
        train_x = scipy.sparse.vstack([train_x, valid_x])
        train_y = np.concatenate([train_y, valid_y])                                          
    # fit model
    classifier.fit(train_x, train_y)
    # predict and compute f1 score for every split (train/valid/test)
    print(f"{model.upper()}\n")
    if tune and param is not None:
        print(f"Hypertuning Parameters:")
        means = classifier.cv_results_['mean_test_score']
        for mean, params in zip(means, classifier.cv_results_['params']):
            print("%0.4f for %r" % (mean, params))
        print(f"\nOptimal Parameters: {classifier.best_params_}")
    print(f"Corresponding F1 Scores:")
    for split_name, split in dataset_BOW.items():
        y_true = split[1]
        y_pred = classifier.predict(split[0])
        f1 = f1_score(y_true, y_pred, average=average)   
        #print(f"{split_name.upper()}: {f1}")
        print("%s: %0.4f" % (split_name.upper(), f1))

# Yelp

In [29]:
classifier(models, 'Logistic Regression', yelp_BBOW, tune=True)

Fitting 1 folds for each of 100 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    1.9s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    2.7s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    4.6s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    6.2s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    9.5s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   12.9s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   17.8s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   22.7s
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:   30.5s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:   37.4s
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:   46.7s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:   56.3s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  1.0min finished


LOGISTIC REGRESSION

Hypertuning Parameters:
0.4650 for {'C': 0.01, 'tol': 0.0005}
0.4900 for {'C': 0.02, 'tol': 0.0005}
0.4900 for {'C': 0.03, 'tol': 0.0005}
0.4890 for {'C': 0.04, 'tol': 0.0005}
0.4840 for {'C': 0.05, 'tol': 0.0005}
0.4880 for {'C': 0.060000000000000005, 'tol': 0.0005}
0.4870 for {'C': 0.06999999999999999, 'tol': 0.0005}
0.4850 for {'C': 0.08, 'tol': 0.0005}
0.4860 for {'C': 0.09, 'tol': 0.0005}
0.4830 for {'C': 0.09999999999999999, 'tol': 0.0005}
0.4830 for {'C': 0.11, 'tol': 0.0005}
0.4880 for {'C': 0.12, 'tol': 0.0005}
0.4900 for {'C': 0.13, 'tol': 0.0005}
0.4890 for {'C': 0.14, 'tol': 0.0005}
0.4860 for {'C': 0.15000000000000002, 'tol': 0.0005}
0.4850 for {'C': 0.16, 'tol': 0.0005}
0.4840 for {'C': 0.17, 'tol': 0.0005}
0.4840 for {'C': 0.18000000000000002, 'tol': 0.0005}
0.4820 for {'C': 0.19, 'tol': 0.0005}
0.4820 for {'C': 0.2, 'tol': 0.0005}
0.4820 for {'C': 0.21000000000000002, 'tol': 0.0005}
0.4830 for {'C': 0.22, 'tol': 0.0005}
0.4830 for {'C': 0.23, 'tol':

In [32]:
classifier(models, 'Bernoulli Naive Bayes', yelp_BBOW, tune=True)

Fitting 1 folds for each of 100 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Batch computation too fast (0.0839s.) Setting batch_size=4.
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done  24 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done  44 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done  72 tasks      | elapsed:    1.7s


BERNOULLI NAIVE BAYES

Hypertuning Parameters:
0.4290 for {'alpha': 0.01}
0.4330 for {'alpha': 0.02}
0.4370 for {'alpha': 0.03}
0.4340 for {'alpha': 0.04}
0.4320 for {'alpha': 0.05}
0.4330 for {'alpha': 0.060000000000000005}
0.4330 for {'alpha': 0.06999999999999999}
0.4300 for {'alpha': 0.08}
0.4260 for {'alpha': 0.09}
0.4260 for {'alpha': 0.09999999999999999}
0.4220 for {'alpha': 0.11}
0.4210 for {'alpha': 0.12}
0.4170 for {'alpha': 0.13}
0.4150 for {'alpha': 0.14}
0.4150 for {'alpha': 0.15000000000000002}
0.4140 for {'alpha': 0.16}
0.4140 for {'alpha': 0.17}
0.4140 for {'alpha': 0.18000000000000002}
0.4140 for {'alpha': 0.19}
0.4140 for {'alpha': 0.2}
0.4150 for {'alpha': 0.21000000000000002}
0.4150 for {'alpha': 0.22}
0.4130 for {'alpha': 0.23}
0.4130 for {'alpha': 0.24000000000000002}
0.4130 for {'alpha': 0.25}
0.4150 for {'alpha': 0.26}
0.4140 for {'alpha': 0.27}
0.4140 for {'alpha': 0.28}
0.4150 for {'alpha': 0.29000000000000004}
0.4130 for {'alpha': 0.3}
0.4110 for {'alpha': 0.3

[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    2.2s finished


In [33]:
classifier(models, 'Decision Trees', yelp_BBOW, tune=True)

Fitting 1 folds for each of 48 candidates, totalling 48 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    1.1s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    1.7s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    2.8s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    3.8s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    5.4s
[Parallel(n_jobs=2)]: Done  48 out of  48 | elapsed:    7.4s finished


DECISION TREES

Hypertuning Parameters:
0.3620 for {'max_depth': 13, 'max_features': 0.1, 'min_samples_leaf': 3}
0.4020 for {'max_depth': 13, 'max_features': 0.1, 'min_samples_leaf': 4}
0.3970 for {'max_depth': 13, 'max_features': 0.1, 'min_samples_leaf': 5}
0.3860 for {'max_depth': 13, 'max_features': 0.2, 'min_samples_leaf': 3}
0.3970 for {'max_depth': 13, 'max_features': 0.2, 'min_samples_leaf': 4}
0.3810 for {'max_depth': 13, 'max_features': 0.2, 'min_samples_leaf': 5}
0.3800 for {'max_depth': 13, 'max_features': 0.30000000000000004, 'min_samples_leaf': 3}
0.3970 for {'max_depth': 13, 'max_features': 0.30000000000000004, 'min_samples_leaf': 4}
0.3640 for {'max_depth': 13, 'max_features': 0.30000000000000004, 'min_samples_leaf': 5}
0.4100 for {'max_depth': 13, 'max_features': 0.4, 'min_samples_leaf': 3}
0.3780 for {'max_depth': 13, 'max_features': 0.4, 'min_samples_leaf': 4}
0.3780 for {'max_depth': 13, 'max_features': 0.4, 'min_samples_leaf': 5}
0.3730 for {'max_depth': 14, 'max_fe

In [34]:
classifier(models, 'Linear SVM', yelp_BBOW, tune=True)

Fitting 1 folds for each of 80 candidates, totalling 80 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    1.1s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    2.0s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    3.7s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    5.8s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   11.6s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   21.8s
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:   40.3s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:  1.1min
[Parallel(n_jobs=2)]: Done  80 out of  80 | elapsed:  1.6min finished


LINEAR SVM

Hypertuning Parameters:
0.4810 for {'C': 0.01, 'max_iter': 1000}
0.4810 for {'C': 0.01, 'max_iter': 1100}
0.4810 for {'C': 0.01, 'max_iter': 1200}
0.4810 for {'C': 0.01, 'max_iter': 1300}
0.4810 for {'C': 0.01, 'max_iter': 1400}
0.4810 for {'C': 0.01, 'max_iter': 1500}
0.4810 for {'C': 0.01, 'max_iter': 1600}
0.4810 for {'C': 0.01, 'max_iter': 1700}
0.4810 for {'C': 0.01, 'max_iter': 1800}
0.4810 for {'C': 0.01, 'max_iter': 1900}
0.4680 for {'C': 0.0372759372031494, 'max_iter': 1000}
0.4680 for {'C': 0.0372759372031494, 'max_iter': 1100}
0.4680 for {'C': 0.0372759372031494, 'max_iter': 1200}
0.4680 for {'C': 0.0372759372031494, 'max_iter': 1300}
0.4680 for {'C': 0.0372759372031494, 'max_iter': 1400}
0.4680 for {'C': 0.0372759372031494, 'max_iter': 1500}
0.4680 for {'C': 0.0372759372031494, 'max_iter': 1600}
0.4680 for {'C': 0.0372759372031494, 'max_iter': 1700}
0.4680 for {'C': 0.0372759372031494, 'max_iter': 1800}
0.4680 for {'C': 0.0372759372031494, 'max_iter': 1900}
0.47

# IMDB

In [35]:
classifier(models, 'Logistic Regression', IMDB_BBOW, tune=True)

Fitting 1 folds for each of 100 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    2.7s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    4.2s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    8.2s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   11.6s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   16.2s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   19.8s
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:   25.3s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:   30.3s
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:   37.4s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:   44.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   47.5s finished


LOGISTIC REGRESSION

Hypertuning Parameters:
0.8700 for {'C': 0.01, 'tol': 0.0005}
0.8757 for {'C': 0.02, 'tol': 0.0005}
0.8774 for {'C': 0.03, 'tol': 0.0005}
0.8783 for {'C': 0.04, 'tol': 0.0005}
0.8795 for {'C': 0.05, 'tol': 0.0005}
0.8792 for {'C': 0.060000000000000005, 'tol': 0.0005}
0.8794 for {'C': 0.06999999999999999, 'tol': 0.0005}
0.8803 for {'C': 0.08, 'tol': 0.0005}
0.8797 for {'C': 0.09, 'tol': 0.0005}
0.8800 for {'C': 0.09999999999999999, 'tol': 0.0005}
0.8791 for {'C': 0.11, 'tol': 0.0005}
0.8785 for {'C': 0.12, 'tol': 0.0005}
0.8780 for {'C': 0.13, 'tol': 0.0005}
0.8776 for {'C': 0.14, 'tol': 0.0005}
0.8779 for {'C': 0.15000000000000002, 'tol': 0.0005}
0.8772 for {'C': 0.16, 'tol': 0.0005}
0.8770 for {'C': 0.17, 'tol': 0.0005}
0.8775 for {'C': 0.18000000000000002, 'tol': 0.0005}
0.8771 for {'C': 0.19, 'tol': 0.0005}
0.8768 for {'C': 0.2, 'tol': 0.0005}
0.8768 for {'C': 0.21000000000000002, 'tol': 0.0005}
0.8761 for {'C': 0.22, 'tol': 0.0005}
0.8755 for {'C': 0.23, 'tol':

In [36]:
classifier(models, 'Bernoulli Naive Bayes', IMDB_BBOW, tune=True)

Fitting 1 folds for each of 100 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    1.0s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    1.6s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    2.1s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    2.8s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    3.5s
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:    4.4s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:    5.2s
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:    6.3s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:    7.4s


BERNOULLI NAIVE BAYES

Hypertuning Parameters:
0.8463 for {'alpha': 0.01}
0.8463 for {'alpha': 0.02}
0.8463 for {'alpha': 0.03}
0.8465 for {'alpha': 0.04}
0.8465 for {'alpha': 0.05}
0.8466 for {'alpha': 0.060000000000000005}
0.8465 for {'alpha': 0.06999999999999999}
0.8465 for {'alpha': 0.08}
0.8465 for {'alpha': 0.09}
0.8464 for {'alpha': 0.09999999999999999}
0.8464 for {'alpha': 0.11}
0.8464 for {'alpha': 0.12}
0.8465 for {'alpha': 0.13}
0.8465 for {'alpha': 0.14}
0.8465 for {'alpha': 0.15000000000000002}
0.8465 for {'alpha': 0.16}
0.8464 for {'alpha': 0.17}
0.8461 for {'alpha': 0.18000000000000002}
0.8461 for {'alpha': 0.19}
0.8461 for {'alpha': 0.2}
0.8461 for {'alpha': 0.21000000000000002}
0.8461 for {'alpha': 0.22}
0.8460 for {'alpha': 0.23}
0.8461 for {'alpha': 0.24000000000000002}
0.8461 for {'alpha': 0.25}
0.8461 for {'alpha': 0.26}
0.8461 for {'alpha': 0.27}
0.8461 for {'alpha': 0.28}
0.8460 for {'alpha': 0.29000000000000004}
0.8461 for {'alpha': 0.3}
0.8459 for {'alpha': 0.3

[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    7.9s finished


In [37]:
classifier(models, 'Decision Trees', IMDB_BBOW, tune=True)

Fitting 1 folds for each of 48 candidates, totalling 48 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    1.4s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    4.1s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    7.1s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:   10.3s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   13.8s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   19.1s
[Parallel(n_jobs=2)]: Done  48 out of  48 | elapsed:   26.2s finished


DECISION TREES

Hypertuning Parameters:
0.7154 for {'max_depth': 13, 'max_features': 0.1, 'min_samples_leaf': 3}
0.7212 for {'max_depth': 13, 'max_features': 0.1, 'min_samples_leaf': 4}
0.7125 for {'max_depth': 13, 'max_features': 0.1, 'min_samples_leaf': 5}
0.7215 for {'max_depth': 13, 'max_features': 0.2, 'min_samples_leaf': 3}
0.7240 for {'max_depth': 13, 'max_features': 0.2, 'min_samples_leaf': 4}
0.7241 for {'max_depth': 13, 'max_features': 0.2, 'min_samples_leaf': 5}
0.7225 for {'max_depth': 13, 'max_features': 0.30000000000000004, 'min_samples_leaf': 3}
0.7184 for {'max_depth': 13, 'max_features': 0.30000000000000004, 'min_samples_leaf': 4}
0.7259 for {'max_depth': 13, 'max_features': 0.30000000000000004, 'min_samples_leaf': 5}
0.7214 for {'max_depth': 13, 'max_features': 0.4, 'min_samples_leaf': 3}
0.7195 for {'max_depth': 13, 'max_features': 0.4, 'min_samples_leaf': 4}
0.7210 for {'max_depth': 13, 'max_features': 0.4, 'min_samples_leaf': 5}
0.7060 for {'max_depth': 14, 'max_fe

In [38]:
classifier(models, 'Linear SVM', IMDB_BBOW, tune=True)

Fitting 1 folds for each of 80 candidates, totalling 80 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    1.6s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    2.5s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    4.5s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    7.2s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   13.7s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   24.0s
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:   40.8s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:  1.1min
[Parallel(n_jobs=2)]: Done  80 out of  80 | elapsed:  1.4min finished


LINEAR SVM

Hypertuning Parameters:
0.8784 for {'C': 0.01, 'max_iter': 1000}
0.8784 for {'C': 0.01, 'max_iter': 1100}
0.8784 for {'C': 0.01, 'max_iter': 1200}
0.8784 for {'C': 0.01, 'max_iter': 1300}
0.8784 for {'C': 0.01, 'max_iter': 1400}
0.8784 for {'C': 0.01, 'max_iter': 1500}
0.8784 for {'C': 0.01, 'max_iter': 1600}
0.8784 for {'C': 0.01, 'max_iter': 1700}
0.8784 for {'C': 0.01, 'max_iter': 1800}
0.8784 for {'C': 0.01, 'max_iter': 1900}
0.8681 for {'C': 0.0372759372031494, 'max_iter': 1000}
0.8681 for {'C': 0.0372759372031494, 'max_iter': 1100}
0.8681 for {'C': 0.0372759372031494, 'max_iter': 1200}
0.8681 for {'C': 0.0372759372031494, 'max_iter': 1300}
0.8681 for {'C': 0.0372759372031494, 'max_iter': 1400}
0.8681 for {'C': 0.0372759372031494, 'max_iter': 1500}
0.8681 for {'C': 0.0372759372031494, 'max_iter': 1600}
0.8681 for {'C': 0.0372759372031494, 'max_iter': 1700}
0.8681 for {'C': 0.0372759372031494, 'max_iter': 1800}
0.8681 for {'C': 0.0372759372031494, 'max_iter': 1900}
0.85