In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier

## Helper Functions


In [2]:
def get_vocab(data: pd.DataFrame, input_col: str):
    data[input_col] = data[input_col].str.lower()
    data[input_col] = data[input_col].str.replace('[^\w\s]', '').str.replace('<br />\|...', '')
    tokens = data[input_col].str.split(expand=True)
    return pd.Series(tokens.stack().value_counts()[:10000])

In [3]:
def pre_process_binary(data, input_col, vocab):
    data[input_col] = data[input_col].str.lower()
    data[input_col] = data[input_col].str.replace('[^\w\s]', '').str.replace('<br />\|...', '')
    vocab_dict = vocab.to_dict()
    vocab_dict = {x: i for i, x in enumerate(list(vocab_dict.keys()))}
    bow_matrix = np.zeros(shape=(data.shape[0], 10000))
    data_tokens = data[input_col].str.split(expand=True)
    for i, row in data_tokens.iterrows():
        for word in row.iteritems():
            if word[1] in vocab_dict:
                bow_matrix[i, vocab_dict.get(word[1])] = 1
    return bow_matrix

In [4]:
def pre_process_frequency(data: pd.DataFrame, input_col: str, vocab):
    data[input_col] = data[input_col].str.lower()
    data[input_col] = data[input_col].str.replace('[^\w\s]', '')
    vocab_dict = vocab.to_dict()
    vocab_dict = {x: i for i, x in enumerate(list(vocab_dict.keys()))}
    data_tokens = data[input_col].str.split(expand=True)
    bow_matrix = np.zeros(shape=(data_tokens.shape[0], 10000))
    for i, row in data_tokens.iterrows():
        word_count = 0
        for word in row.iteritems():
            if word[1] in vocab_dict:
                bow_matrix[i, vocab_dict.get(word[1])] += 1
                word_count += 1
        if word_count != 0:
            for j in range(bow_matrix[i].shape[0]):
                bow_matrix[i][j] /= word_count
    return bow_matrix


In [5]:
def save_vocab(fname, vocab):
    file = open(fname, 'w')
    vocab_dict = vocab.to_dict()
    for i, word in enumerate(list(vocab_dict.keys())):
        row_string = word + " " + str(i + 1) + " " + str(vocab_dict[word]) + "\n"
        file.write(row_string)
    file.close()


In [6]:
def save_data(fname, data, vocab, input_col, class_col):
    file = open(fname, 'w')
    input_data = data[input_col]
    classes = data[class_col]
    vocab_dict = vocab.to_dict()
    vocab_dict = {x: i + 1 for i, x in enumerate(list(vocab_dict.keys()))}
    data_tokens = input_data.str.split(expand=True)
    for i, row in data_tokens.iterrows():
        row_string = ""
        for word in row.iteritems():
            if word[1] in vocab_dict:
                index = vocab_dict.get(word[1])
                row_string += str(index) + " "
        row_string = row_string[:-1]
        row_string += "\t" + str(classes[i]) + "\n"
        file.write(row_string)
    file.close()


#### Read the data

In [7]:
imdb_train = pd.read_csv('./data/IMDB-train.txt', sep='\t', names=["Comments", "Rating"])
imdb_valid = pd.read_csv('./data/IMDB-valid.txt', sep='\t', names=["Comments", "Rating"])
imdb_test = pd.read_csv('./data/IMDB-test.txt', sep='\t', names=["Comments", "Rating"])

#### Create and save vocabulary

In [8]:
vocab = get_vocab(data=imdb_train, input_col="Comments")

In [9]:
save_vocab("./data-modified/IMDB-vocab.txt", vocab)

#### Save the reformatted reviews


In [10]:
save_data("./data-modified/IMDB-train.txt", imdb_train, vocab, "Comments", "Rating")
save_data("./data-modified/IMDB-valid.txt", imdb_valid, vocab, "Comments", "Rating")
save_data("./data-modified/IMDB-test.txt", imdb_test, vocab, "Comments", "Rating")

#### Pre-process the data into binary bag-of-words

In [11]:
imdb_train_bin_x = pre_process_binary(imdb_train, "Comments", vocab)
imdb_valid_bin_x = pre_process_binary(imdb_valid, "Comments", vocab)
imdb_test_bin_x = pre_process_binary(imdb_test, "Comments", vocab)

#### Pre-process the data into binary bag-of-words


In [9]:
imdb_train_freq_x = pre_process_frequency(imdb_train, "Comments", vocab)
imdb_valid_freq_x = pre_process_frequency(imdb_valid, "Comments", vocab)
imdb_test_freq_x = pre_process_frequency(imdb_test, "Comments", vocab)

In [10]:
imdb_train_y = np.array(imdb_train["Rating"])
imdb_valid_y = np.array(imdb_valid["Rating"])
imdb_test_y = np.array(imdb_test["Rating"])


#### Pre-process the data into binary bag-of-words
The following strategy has been used to be able to take advantage of the MultiThread enabled GridSearchCV function. 

The function is originally created for Cross Validation, but with a neat trick it is possible to use it for predefined splits as well.

First we combine both training and validation sets together. Next, we create a fold index (1D array with length len(training) + len(validation)) that has values -1 for all indices corresponding to training data, and 0 set for all indices corresponding to validation data. 

Using this array we can create a PredefinedSplit which then allows having one fold cross validation with our specific training and validation datasets taken into account.


In [11]:
imdb_train_valid_x = np.append(imdb_train_bin_x, imdb_valid_bin_x, axis=0)
imdb_train_valid_y = np.append(imdb_train_y, imdb_valid_y, axis=0)
imdb_train_valid_fold = np.zeros(shape=(imdb_train_valid_x.shape[0]))
imdb_train_valid_fold[0:imdb_train_bin_x.shape[0]] = -1
imdb_bin_ps = PredefinedSplit(imdb_train_valid_fold)

NameError: name 'imdb_train_bin_x' is not defined

#### Create Majority Class Classifier

In [14]:
imdb_majority_clf_bin = DummyClassifier(strategy="most_frequent", random_state=1000)
imdb_majority_clf_bin.fit(imdb_train_bin_x, imdb_train_y)

DummyClassifier(constant=None, random_state=1000, strategy='most_frequent')

In [15]:
print("Classifier: Majority Class | Dataset: IMDB | Bag of Words: Binary")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("This classifier has no hyper parameters")
print()
print("Training set prediction F1 Score (macro):")
imdb_majority_train_predicted = imdb_majority_clf_bin.predict(imdb_train_bin_x)
print("%.3f" % f1_score(imdb_train_y, imdb_majority_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
imdb_majority_valid_predicted = imdb_majority_clf_bin.predict(imdb_valid_bin_x)
print("%.3f" % f1_score(imdb_valid_y, imdb_majority_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
imdb_majority_test_predicted = imdb_majority_clf_bin.predict(imdb_test_bin_x)
print("%.3f" % f1_score(imdb_test_y, imdb_majority_test_predicted, average='macro'))
print()

Classifier: Majority Class | Dataset: IMDB | Bag of Words: Binary
Tuning hyper-parameters for F1 score (macro)

This classifier has no hyper parameters

Training set prediction F1 Score (macro):
0.333
Validation set prediction F1 Score (macro):
0.333
Test set prediction F1 Score (macro):
0.333



  'precision', 'predicted', average, warn_for)


#### Create Random Class Classifier


In [16]:
imdb_random_clf_bin = DummyClassifier(strategy="uniform", random_state=1000)
imdb_random_clf_bin.fit(imdb_train_bin_x, imdb_train_y)

DummyClassifier(constant=None, random_state=1000, strategy='uniform')

In [17]:
print("Classifier: Random | Dataset: IMDB | Bag of Words: Binary")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("This classifier has no hyper parameters")
print()
print("Training set prediction F1 Score (macro):")
imdb_random_train_predicted = imdb_random_clf_bin.predict(imdb_train_bin_x)
print("%.3f" % f1_score(imdb_train_y, imdb_random_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
imdb_random_valid_predicted = imdb_random_clf_bin.predict(imdb_valid_bin_x)
print("%.3f" % f1_score(imdb_valid_y, imdb_random_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
imdb_random_test_predicted = imdb_random_clf_bin.predict(imdb_test_bin_x)
print("%.3f" % f1_score(imdb_test_y, imdb_random_test_predicted, average='macro'))
print()


Classifier: Random | Dataset: IMDB | Bag of Words: Binary
Tuning hyper-parameters for F1 score (macro)

This classifier has no hyper parameters

Training set prediction F1 Score (macro):
0.503
Validation set prediction F1 Score (macro):
0.501
Test set prediction F1 Score (macro):
0.497



#### Create Bernoulli Naive Bayes Classifier for Binary Bag of Words
Parameter ranges: 
* Alpha: 15 equally spread values over 10^-2^ to 10^0^

In [18]:
imdb_nb_clf = BernoulliNB()
imdb_nb_alphas = np.logspace(-2, 0, 15)
imdb_nb_params = [{'alpha': imdb_nb_alphas}]
imdb_nb_gs = GridSearchCV(imdb_nb_clf, imdb_nb_params, cv=imdb_bin_ps, scoring='f1_macro', refit=True, n_jobs=2, verbose=2)
imdb_nb_gs.fit(imdb_train_valid_x, imdb_train_valid_y)
imdb_nb_best_clf = imdb_nb_gs.best_estimator_
imdb_nb_best_clf.fit(imdb_train_bin_x, imdb_train_y)


Fitting 1 folds for each of 15 candidates, totalling 15 fits


[CV] alpha=0.01 ......................................................


[CV] alpha=0.013894954943731374 ......................................


[CV] ....................... alpha=0.013894954943731374, total= 1.5min


[CV] ....................................... alpha=0.01, total= 1.5min


[CV] alpha=0.019306977288832496 ......................................


[CV] ....................... alpha=0.019306977288832496, total=   6.5s


[CV] alpha=0.02682695795279726 .......................................


[CV] alpha=0.0372759372031494 ........................................


[CV] ........................ alpha=0.02682695795279726, total=  10.8s


[CV] alpha=0.0517947467923121 ........................................


[CV] ......................... alpha=0.0372759372031494, total=  33.2s


[CV] alpha=0.07196856730011521 .......................................


[CV] ......................... alpha=0.0517947467923121, total=  32.4s


[CV] alpha=0.1 .......................................................


[CV] ........................ alpha=0.07196856730011521, total=  27.5s


[CV] alpha=0.13894954943731375 .......................................


[CV] ........................................ alpha=0.1, total=  12.3s


[CV] alpha=0.19306977288832497 .......................................


[CV] ........................ alpha=0.13894954943731375, total=  10.7s


[CV] alpha=0.2682695795279725 ........................................


[CV] ........................ alpha=0.19306977288832497, total=   9.6s


[CV] ......................... alpha=0.2682695795279725, total=   9.5s


[CV] alpha=0.372759372031494 .........................................


[CV] alpha=0.517947467923121 .........................................


[CV] .......................... alpha=0.372759372031494, total=   6.6s


[CV] alpha=0.7196856730011517 ........................................


[CV] .......................... alpha=0.517947467923121, total=  11.7s


[CV] alpha=1.0 .......................................................


[CV] ......................... alpha=0.7196856730011517, total=  10.4s


[CV] ........................................ alpha=1.0, total=   6.4s


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:  4.5min finished


BernoulliNB(alpha=0.019306977288832496, binarize=0.0, class_prior=None,
      fit_prior=True)

In [19]:
print("Classifier: Bernoulli Naive Bayes | Dataset: IMDB | Bag of Words: Binary")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("Best Parameters found on validation set:")
print(imdb_nb_gs.best_params_)
print()
print("Best Scores on validation set:")
print("%.3f" % imdb_nb_gs.best_score_)
print()
print("Training set prediction F1 Score (macro):")
imdb_nb_train_predicted = imdb_nb_best_clf.predict(imdb_train_bin_x)
print("%.3f" % f1_score(imdb_train_y, imdb_nb_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
imdb_nb_valid_predicted = imdb_nb_best_clf.predict(imdb_valid_bin_x)
print("%.3f" % f1_score(imdb_valid_y, imdb_nb_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
imdb_nb_test_predicted = imdb_nb_gs.predict(imdb_test_bin_x)
print("%.3f" % f1_score(imdb_test_y, imdb_nb_test_predicted, average='macro'))
print()


Classifier: Bernoulli Naive Bayes | Dataset: IMDB | Bag of Words: Binary
Tuning hyper-parameters for F1 score (macro)

Best Parameters found on validation set:
{'alpha': 0.019306977288832496}

Best Scores on validation set:
0.843

Training set prediction F1 Score (macro):


0.872
Validation set prediction F1 Score (macro):


0.843
Test set prediction F1 Score (macro):


0.836



#### Create Decision Tree Classifier for Binary Bag of Words

Parameter ranges:
* Criterion: 'gini' and 'entropy'
* Max Depth: 10 equally spread values over 2^3^ to 2^11^

In [20]:
imdb_dt_criterion_bin = np.array(['gini', 'entropy'])
imdb_dt_max_depth_bin = np.logspace(3, 11, 10, base=2)
imdb_dt_params_bin = [{'criterion': imdb_dt_criterion_bin, 'max_depth': imdb_dt_max_depth_bin, 'random_state': [1000]}]
imdb_dt_gs = GridSearchCV(DecisionTreeClassifier(), imdb_dt_params_bin, 'f1_macro', n_jobs=2, cv=imdb_bin_ps, refit=True, verbose=2)
imdb_dt_gs.fit(imdb_train_valid_x, imdb_train_valid_y)


Fitting 1 folds for each of 20 candidates, totalling 20 fits


[CV] criterion=gini, max_depth=8.0, random_state=1000 ................


[CV] criterion=gini, max_depth=14.813995396596646, random_state=1000 .


[CV] . criterion=gini, max_depth=8.0, random_state=1000, total=  33.1s


[CV] criterion=gini, max_depth=27.43180745129833, random_state=1000 ..


[CV]  criterion=gini, max_depth=14.813995396596646, random_state=1000, total=  50.5s


[CV] criterion=gini, max_depth=50.796833662982365, random_state=1000 .


[CV]  criterion=gini, max_depth=27.43180745129833, random_state=1000, total= 1.2min


[CV] criterion=gini, max_depth=94.06300750563831, random_state=1000 ..


[CV]  criterion=gini, max_depth=50.796833662982365, random_state=1000, total= 1.4min


[CV] criterion=gini, max_depth=174.18112002232027, random_state=1000 .


[CV]  criterion=gini, max_depth=94.06300750563831, random_state=1000, total= 1.6min


[CV] criterion=gini, max_depth=322.53978877308725, random_state=1000 .


[CV]  criterion=gini, max_depth=174.18112002232027, random_state=1000, total= 1.7min


[CV] criterion=gini, max_depth=597.2628682629713, random_state=1000 ..


[CV]  criterion=gini, max_depth=322.53978877308725, random_state=1000, total= 1.6min


[CV] criterion=gini, max_depth=1105.9811726257212, random_state=1000 .


[CV]  criterion=gini, max_depth=597.2628682629713, random_state=1000, total= 1.7min


[CV] criterion=gini, max_depth=2048.0, random_state=1000 .............


[CV]  criterion=gini, max_depth=1105.9811726257212, random_state=1000, total= 1.7min


[CV] criterion=entropy, max_depth=8.0, random_state=1000 .............


[CV]  criterion=entropy, max_depth=8.0, random_state=1000, total=  35.3s


[CV] criterion=entropy, max_depth=14.813995396596646, random_state=1000 


[CV]  criterion=gini, max_depth=2048.0, random_state=1000, total= 1.7min


[CV] criterion=entropy, max_depth=27.43180745129833, random_state=1000 


[CV]  criterion=entropy, max_depth=14.813995396596646, random_state=1000, total=  46.3s


[CV] criterion=entropy, max_depth=50.796833662982365, random_state=1000 


[CV]  criterion=entropy, max_depth=27.43180745129833, random_state=1000, total=  57.0s


[CV] criterion=entropy, max_depth=94.06300750563831, random_state=1000 


[CV]  criterion=entropy, max_depth=50.796833662982365, random_state=1000, total= 1.1min


[CV] criterion=entropy, max_depth=174.18112002232027, random_state=1000 


[CV]  criterion=entropy, max_depth=94.06300750563831, random_state=1000, total= 1.1min


[CV] criterion=entropy, max_depth=322.53978877308725, random_state=1000 


[CV]  criterion=entropy, max_depth=174.18112002232027, random_state=1000, total= 1.1min


[CV] criterion=entropy, max_depth=597.2628682629713, random_state=1000 


[CV]  criterion=entropy, max_depth=322.53978877308725, random_state=1000, total= 1.1min


[CV] criterion=entropy, max_depth=1105.9811726257212, random_state=1000 


[CV]  criterion=entropy, max_depth=597.2628682629713, random_state=1000, total= 1.1min


[CV] criterion=entropy, max_depth=2048.0, random_state=1000 ..........


[CV]  criterion=entropy, max_depth=1105.9811726257212, random_state=1000, total= 1.1min


[CV]  criterion=entropy, max_depth=2048.0, random_state=1000, total= 1.0min


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed: 12.5min finished


GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=2,
       param_grid=[{'criterion': array(['gini', 'entropy'], dtype='<U7'), 'max_depth': array([   8.     ,   14.814  ,   27.43181,   50.79683,   94.06301,
        174.18112,  322.53979,  597.26287, 1105.98117, 2048.     ]), 'random_state': [1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=2)

In [21]:
imdb_dt_best_clf_bin = imdb_dt_gs.best_estimator_
imdb_dt_best_clf_bin.fit(imdb_train_bin_x, imdb_train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini',
            max_depth=14.813995396596646, max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1000, splitter='best')

In [22]:
print("Classifier: Decision Tree | Dataset: imdb | Bag of Words: Binary")
print("Tuning hyper-parameters for F1 Score (macro)")
print()
print("Best Parameters found on validation set:")
print(imdb_dt_gs.best_params_)
print()
print("Best Scores on validation set:")
print("%.3f" % imdb_dt_gs.best_score_)
print()
print("Training set prediction F1 Score (macro):")
imdb_dt_train_predicted = imdb_dt_best_clf_bin.predict(imdb_train_bin_x)
print("%.3f" % f1_score(imdb_train_y, imdb_dt_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
imdb_dt_valid_predicted = imdb_dt_best_clf_bin.predict(imdb_valid_bin_x)
print("%.3f" % f1_score(imdb_valid_y, imdb_dt_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
imdb_dt_test_predicted = imdb_dt_best_clf_bin.predict(imdb_test_bin_x)
print("%.3f" % f1_score(imdb_test_y, imdb_dt_test_predicted, average='macro'))


Classifier: Decision Tree | Dataset: imdb | Bag of Words: Binary
Tuning hyper-parameters for F1 Score (macro)

Best Parameters found on validation set:
{'criterion': 'gini', 'max_depth': 14.813995396596646, 'random_state': 1000}

Best Scores on validation set:
0.722

Training set prediction F1 Score (macro):


0.815
Validation set prediction F1 Score (macro):


0.722
Test set prediction F1 Score (macro):


0.728


#### Creating Linear Support Vector Classifier for Binary Bag of Words
Parameters:
* C: 20 equally spread values from 10^-3^ to 10^1^
* Tolerance: 20 equally spread values from 10^-3^ to 10^-1^

In [23]:
imdb_svm_clf_bin = LinearSVC()
imdb_svm_c_bin = np.logspace(-3, 1, 7)
imdb_svm_tol_bin = np.logspace(-3, -1, 7)
imdb_svm_dual_bin = [True]
imdb_svm_random_state_bin = [1000]
imdb_svm_params_bin = [{'C': imdb_svm_c_bin, 'tol': imdb_svm_tol_bin, 'random_state': imdb_svm_random_state_bin, 'dual': imdb_svm_dual_bin}]
imdb_svm_gs_bin = GridSearchCV(imdb_svm_clf_bin, imdb_svm_params_bin, cv=imdb_bin_ps, scoring='f1_macro', n_jobs=2, refit=True, verbose=2)


In [24]:
imdb_svm_gs_bin.fit(imdb_train_valid_x, imdb_train_valid_y)
imdb_svm_best_clf_bin = imdb_svm_gs_bin.best_estimator_
imdb_svm_best_clf_bin.fit(imdb_train_bin_x, imdb_train_y)

Fitting 1 folds for each of 49 candidates, totalling 49 fits


[CV] C=0.001, dual=True, random_state=1000, tol=0.001 ................


[CV] C=0.001, dual=True, random_state=1000, tol=0.0021544346900318843 


[CV] . C=0.001, dual=True, random_state=1000, tol=0.001, total=  13.3s


[CV]  C=0.001, dual=True, random_state=1000, tol=0.0021544346900318843, total=   9.4s


[CV] C=0.001, dual=True, random_state=1000, tol=0.004641588833612777 .


[CV]  C=0.001, dual=True, random_state=1000, tol=0.004641588833612777, total=   3.4s


[CV] C=0.001, dual=True, random_state=1000, tol=0.01 .................


[CV] .. C=0.001, dual=True, random_state=1000, tol=0.01, total=   3.1s


[CV] C=0.001, dual=True, random_state=1000, tol=0.021544346900318832 .


[CV] C=0.001, dual=True, random_state=1000, tol=0.046415888336127774 .


[CV]  C=0.001, dual=True, random_state=1000, tol=0.021544346900318832, total=   3.5s


[CV] C=0.001, dual=True, random_state=1000, tol=0.1 ..................


[CV]  C=0.001, dual=True, random_state=1000, tol=0.046415888336127774, total=   3.3s


[CV] C=0.004641588833612777, dual=True, random_state=1000, tol=0.001 .


[CV] ... C=0.001, dual=True, random_state=1000, tol=0.1, total=   3.4s


[CV] C=0.004641588833612777, dual=True, random_state=1000, tol=0.0021544346900318843 


[CV]  C=0.004641588833612777, dual=True, random_state=1000, tol=0.001, total=   3.4s


[CV] C=0.004641588833612777, dual=True, random_state=1000, tol=0.004641588833612777 


[CV]  C=0.004641588833612777, dual=True, random_state=1000, tol=0.0021544346900318843, total=   4.1s


[CV] C=0.004641588833612777, dual=True, random_state=1000, tol=0.01 ..


[CV]  C=0.004641588833612777, dual=True, random_state=1000, tol=0.004641588833612777, total=   3.6s


[CV] C=0.004641588833612777, dual=True, random_state=1000, tol=0.021544346900318832 


[CV]  C=0.004641588833612777, dual=True, random_state=1000, tol=0.01, total=   3.4s


[CV] C=0.004641588833612777, dual=True, random_state=1000, tol=0.046415888336127774 


[CV]  C=0.004641588833612777, dual=True, random_state=1000, tol=0.021544346900318832, total=   3.3s


[CV] C=0.004641588833612777, dual=True, random_state=1000, tol=0.1 ...


[CV]  C=0.004641588833612777, dual=True, random_state=1000, tol=0.046415888336127774, total=   3.4s


[CV] C=0.021544346900318832, dual=True, random_state=1000, tol=0.001 .


[CV]  C=0.004641588833612777, dual=True, random_state=1000, tol=0.1, total=   3.4s


[CV] C=0.021544346900318832, dual=True, random_state=1000, tol=0.0021544346900318843 


[CV]  C=0.021544346900318832, dual=True, random_state=1000, tol=0.001, total=   3.5s


[CV] C=0.021544346900318832, dual=True, random_state=1000, tol=0.004641588833612777 


[CV]  C=0.021544346900318832, dual=True, random_state=1000, tol=0.0021544346900318843, total=   3.6s


[CV] C=0.021544346900318832, dual=True, random_state=1000, tol=0.01 ..


[CV]  C=0.021544346900318832, dual=True, random_state=1000, tol=0.004641588833612777, total=   4.1s


[CV] C=0.021544346900318832, dual=True, random_state=1000, tol=0.021544346900318832 


[CV]  C=0.021544346900318832, dual=True, random_state=1000, tol=0.01, total=   4.1s


[CV] C=0.021544346900318832, dual=True, random_state=1000, tol=0.046415888336127774 


[CV]  C=0.021544346900318832, dual=True, random_state=1000, tol=0.021544346900318832, total=   4.2s


[CV] C=0.021544346900318832, dual=True, random_state=1000, tol=0.1 ...


[CV]  C=0.021544346900318832, dual=True, random_state=1000, tol=0.046415888336127774, total=   3.7s


[CV] C=0.1, dual=True, random_state=1000, tol=0.001 ..................


[CV]  C=0.021544346900318832, dual=True, random_state=1000, tol=0.1, total=   3.5s


[CV] C=0.1, dual=True, random_state=1000, tol=0.0021544346900318843 ..


[CV] ... C=0.1, dual=True, random_state=1000, tol=0.001, total=   4.3s


[CV] C=0.1, dual=True, random_state=1000, tol=0.004641588833612777 ...


[CV]  C=0.1, dual=True, random_state=1000, tol=0.0021544346900318843, total=   4.1s


[CV] C=0.1, dual=True, random_state=1000, tol=0.01 ...................


[CV]  C=0.1, dual=True, random_state=1000, tol=0.004641588833612777, total=   4.0s


[CV] C=0.1, dual=True, random_state=1000, tol=0.021544346900318832 ...


[CV] .... C=0.1, dual=True, random_state=1000, tol=0.01, total=   3.8s


[CV] C=0.1, dual=True, random_state=1000, tol=0.046415888336127774 ...


[CV]  C=0.1, dual=True, random_state=1000, tol=0.021544346900318832, total=   3.8s


[CV] C=0.1, dual=True, random_state=1000, tol=0.1 ....................


[CV]  C=0.1, dual=True, random_state=1000, tol=0.046415888336127774, total=   4.2s


[CV] C=0.46415888336127775, dual=True, random_state=1000, tol=0.001 ..


[CV] ..... C=0.1, dual=True, random_state=1000, tol=0.1, total=   3.9s


[CV] C=0.46415888336127775, dual=True, random_state=1000, tol=0.0021544346900318843 


[CV]  C=0.46415888336127775, dual=True, random_state=1000, tol=0.001, total=   5.2s


[CV] C=0.46415888336127775, dual=True, random_state=1000, tol=0.004641588833612777 


[CV]  C=0.46415888336127775, dual=True, random_state=1000, tol=0.0021544346900318843, total=   5.6s


[CV] C=0.46415888336127775, dual=True, random_state=1000, tol=0.01 ...


[CV]  C=0.46415888336127775, dual=True, random_state=1000, tol=0.004641588833612777, total=   5.5s


[CV] C=0.46415888336127775, dual=True, random_state=1000, tol=0.021544346900318832 


[CV]  C=0.46415888336127775, dual=True, random_state=1000, tol=0.01, total=   5.1s


[CV] C=0.46415888336127775, dual=True, random_state=1000, tol=0.046415888336127774 


[CV]  C=0.46415888336127775, dual=True, random_state=1000, tol=0.021544346900318832, total=   4.5s


[CV] C=0.46415888336127775, dual=True, random_state=1000, tol=0.1 ....


[CV]  C=0.46415888336127775, dual=True, random_state=1000, tol=0.046415888336127774, total=   4.0s


[CV] C=2.154434690031882, dual=True, random_state=1000, tol=0.001 ....


[CV]  C=0.46415888336127775, dual=True, random_state=1000, tol=0.1, total=   4.1s


[CV] C=2.154434690031882, dual=True, random_state=1000, tol=0.0021544346900318843 


[CV]  C=2.154434690031882, dual=True, random_state=1000, tol=0.001, total=   8.0s


[CV] C=2.154434690031882, dual=True, random_state=1000, tol=0.004641588833612777 


[CV]  C=2.154434690031882, dual=True, random_state=1000, tol=0.0021544346900318843, total=   7.5s


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  2.8min


[CV] C=2.154434690031882, dual=True, random_state=1000, tol=0.01 .....


[CV]  C=2.154434690031882, dual=True, random_state=1000, tol=0.004641588833612777, total=   6.1s


[CV] C=2.154434690031882, dual=True, random_state=1000, tol=0.021544346900318832 


[CV]  C=2.154434690031882, dual=True, random_state=1000, tol=0.01, total=   5.6s


[CV] C=2.154434690031882, dual=True, random_state=1000, tol=0.046415888336127774 


[CV]  C=2.154434690031882, dual=True, random_state=1000, tol=0.021544346900318832, total=   4.7s


[CV] C=2.154434690031882, dual=True, random_state=1000, tol=0.1 ......


[CV]  C=2.154434690031882, dual=True, random_state=1000, tol=0.046415888336127774, total=   4.7s


[CV] C=10.0, dual=True, random_state=1000, tol=0.001 .................


[CV]  C=2.154434690031882, dual=True, random_state=1000, tol=0.1, total=   4.4s


[CV] C=10.0, dual=True, random_state=1000, tol=0.0021544346900318843 .


[CV] .. C=10.0, dual=True, random_state=1000, tol=0.001, total=   7.2s


[CV] C=10.0, dual=True, random_state=1000, tol=0.004641588833612777 ..


[CV]  C=10.0, dual=True, random_state=1000, tol=0.0021544346900318843, total=   6.1s


[CV] C=10.0, dual=True, random_state=1000, tol=0.01 ..................


[CV]  C=10.0, dual=True, random_state=1000, tol=0.004641588833612777, total=   5.8s


[CV] C=10.0, dual=True, random_state=1000, tol=0.021544346900318832 ..


[CV] ... C=10.0, dual=True, random_state=1000, tol=0.01, total=   5.5s


[CV] C=10.0, dual=True, random_state=1000, tol=0.046415888336127774 ..


[CV]  C=10.0, dual=True, random_state=1000, tol=0.021544346900318832, total=   4.8s


[CV] C=10.0, dual=True, random_state=1000, tol=0.1 ...................


[CV]  C=10.0, dual=True, random_state=1000, tol=0.046415888336127774, total=   4.8s


[CV] .... C=10.0, dual=True, random_state=1000, tol=0.1, total=   4.3s


[Parallel(n_jobs=2)]: Done  49 out of  49 | elapsed:  3.6min finished


LinearSVC(C=0.004641588833612777, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
     max_iter=1000, multi_class='ovr', penalty='l2', random_state=1000,
     tol=0.001, verbose=0)

In [25]:
print("Classifier: Linear Support Vector Classifier | Dataset: imdb | Bag of Words: Binary")
print("Tuning hyper-parameters for F1 Score (macro)")
print()
print("Best Parameters found on validation set:")
print(imdb_svm_gs_bin.best_params_)
print()
print("Best Scores on validation set:")
print("%.3f" % imdb_svm_gs_bin.best_score_)
print()
print("Training set prediction F1 Score (macro):")
imdb_svm_train_predicted = imdb_svm_best_clf_bin.predict(imdb_train_bin_x)
print("%.3f" % f1_score(imdb_train_y, imdb_svm_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
imdb_svm_valid_predicted = imdb_svm_best_clf_bin.predict(imdb_valid_bin_x)
print("%.3f" % f1_score(imdb_valid_y, imdb_svm_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
imdb_svm_test_predicted = imdb_svm_best_clf_bin.predict(imdb_test_bin_x)
print("%.3f" % f1_score(imdb_test_y, imdb_svm_test_predicted, average='macro'))


Classifier: Linear Support Vector Classifier | Dataset: imdb | Bag of Words: Binary
Tuning hyper-parameters for F1 Score (macro)

Best Parameters found on validation set:
{'C': 0.004641588833612777, 'dual': True, 'random_state': 1000, 'tol': 0.001}

Best Scores on validation set:
0.878

Training set prediction F1 Score (macro):


0.946
Validation set prediction F1 Score (macro):


0.878
Test set prediction F1 Score (macro):


0.876


#### Create Majority Class Classifier for Frequency Bag of Words

In [12]:
imdb_majority_clf_freq = DummyClassifier(strategy="most_frequent", random_state=1000)
imdb_majority_clf_freq.fit(imdb_train_freq_x, imdb_train_y)

DummyClassifier(constant=None, random_state=1000, strategy='most_frequent')

In [13]:
print("Classifier: Majority Class | Dataset: IMDB | Bag of Words: Frequency")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("This classifier has no hyper parameters")
print()
print("Training set prediction F1 Score (macro):")
imdb_majority_train_predicted_freq = imdb_majority_clf_freq.predict(imdb_train_freq_x)
print("%.3f" % f1_score(imdb_train_y, imdb_majority_train_predicted_freq, average='macro'))
print("Validation set prediction F1 Score (macro):")
imdb_majority_valid_predicted_freq = imdb_majority_clf_freq.predict(imdb_valid_freq_x)
print("%.3f" % f1_score(imdb_valid_y, imdb_majority_valid_predicted_freq, average='macro'))
print("Test set prediction F1 Score (macro):")
imdb_majority_test_predicted_freq = imdb_majority_clf_freq.predict(imdb_test_freq_x)
print("%.3f" % f1_score(imdb_test_y, imdb_majority_test_predicted_freq, average='macro'))
print()

Classifier: Majority Class | Dataset: IMDB | Bag of Words: Frequency
Tuning hyper-parameters for F1 score (macro)

This classifier has no hyper parameters

Training set prediction F1 Score (macro):
0.333
Validation set prediction F1 Score (macro):
0.333
Test set prediction F1 Score (macro):
0.333



  'precision', 'predicted', average, warn_for)


#### Create Random Class Classifier for Frequency Bag of Words

In [14]:
imdb_random_clf_freq = DummyClassifier(strategy="uniform", random_state=1000)
imdb_random_clf_freq.fit(imdb_train_freq_x, imdb_train_y)

DummyClassifier(constant=None, random_state=1000, strategy='uniform')

In [15]:
print("Classifier: Random | Dataset: IMDB | Bag of Words: Frequency")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("This classifier has no hyper parameters")
print()
print("Training set prediction F1 Score (macro):")
imdb_random_train_predicted_freq = imdb_random_clf_freq.predict(imdb_train_freq_x)
print("%.3f" % f1_score(imdb_train_y, imdb_random_train_predicted_freq, average='macro'))
print("Validation set prediction F1 Score (macro):")
imdb_random_valid_predicted_freq = imdb_random_clf_freq.predict(imdb_valid_freq_x)
print("%.3f" % f1_score(imdb_valid_y, imdb_random_valid_predicted_freq, average='macro'))
print("Test set prediction F1 Score (macro):")
imdb_random_test_predicted_freq = imdb_random_clf_freq.predict(imdb_test_freq_x)
print("%.3f" % f1_score(imdb_test_y, imdb_random_test_predicted_freq, average='macro'))
print()


Classifier: Random | Dataset: IMDB | Bag of Words: Frequency
Tuning hyper-parameters for F1 score (macro)

This classifier has no hyper parameters

Training set prediction F1 Score (macro):
0.503
Validation set prediction F1 Score (macro):
0.501
Test set prediction F1 Score (macro):
0.497



#### Create Gaussian Naive Bayes Classifier for Frequency Bag of Words
Parameters:
* None

In [16]:
imdb_gnb_clf = GaussianNB()
imdb_gnb_clf.fit(imdb_train_freq_x, imdb_train_y)

GaussianNB(priors=None)

In [17]:
print("Classifier: Gaussian Naive Bayes | Dataset: imdb | Bag of Words: Frequency")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("This classifier does not have any hyper-parameters")
print()
print("Training set prediction F1 Score (macro):")
imdb_gnb_train_predicted = imdb_gnb_clf.predict(imdb_train_freq_x)
print("%.3f" % f1_score(imdb_train_y, imdb_gnb_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
imdb_gnb_valid_predicted = imdb_gnb_clf.predict(imdb_valid_freq_x)
print("%.3f" % f1_score(imdb_valid_y, imdb_gnb_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
imdb_gnb_test_predicted = imdb_gnb_clf.predict(imdb_test_freq_x)
print("%.3f" % f1_score(imdb_test_y, imdb_gnb_test_predicted, average='macro'))
print()

Classifier: Gaussian Naive Bayes | Dataset: imdb | Bag of Words: Frequency
Tuning hyper-parameters for F1 score (macro)

This classifier does not have any hyper-parameters

Training set prediction F1 Score (macro):


0.864
Validation set prediction F1 Score (macro):


0.761
Test set prediction F1 Score (macro):


0.694



In [18]:
imdb_freq_train_valid_x = np.append(imdb_train_freq_x, imdb_valid_freq_x, axis=0)
imdb_freq_train_valid_y = np.append(imdb_train_y, imdb_valid_y, axis=0)
imdb_freq_train_valid_fold = np.zeros(shape=(imdb_freq_train_valid_x.shape[0]))
imdb_freq_train_valid_fold[0:imdb_train_freq_x.shape[0]] = -1
imdb_freq_ps = PredefinedSplit(imdb_freq_train_valid_fold)

#### Create Decision Tree Classifier for Frequency Bag of Words
Parameters:
* Criterion: 'gini' and 'entropy'
* Max Depth: 10 equally spred values over 2^3^ to 2^11^


In [19]:
imdb_dt_clf_freq = DecisionTreeClassifier()
imdb_dt_criterion_freq = ['gini', 'entropy']
imdb_dt_max_depth_freq = np.logspace(3, 11, 10, base=2)
imdb_dt_params_freq = [{'criterion': imdb_dt_criterion_freq, 'max_depth': imdb_dt_max_depth_freq, 'random_state': [1000]}]
imdb_dt_gs_freq = GridSearchCV(imdb_dt_clf_freq, imdb_dt_params_freq, cv=imdb_freq_ps, scoring='f1_macro', n_jobs=2, refit=True, verbose=2)
imdb_dt_gs_freq.fit(imdb_freq_train_valid_x, imdb_freq_train_valid_y)
imdb_dt_best_clf_freq = imdb_dt_gs_freq.best_estimator_
imdb_dt_best_clf_freq.fit(imdb_train_freq_x, imdb_train_y)

Fitting 1 folds for each of 20 candidates, totalling 20 fits


[CV] criterion=gini, max_depth=8.0, random_state=1000 ................


[CV] criterion=gini, max_depth=14.813995396596646, random_state=1000 .


[CV] . criterion=gini, max_depth=8.0, random_state=1000, total= 1.8min


[CV] criterion=gini, max_depth=27.43180745129833, random_state=1000 ..


[CV]  criterion=gini, max_depth=14.813995396596646, random_state=1000, total= 1.9min


[CV] criterion=gini, max_depth=50.796833662982365, random_state=1000 .


[CV]  criterion=gini, max_depth=27.43180745129833, random_state=1000, total= 1.1min


[CV] criterion=gini, max_depth=94.06300750563831, random_state=1000 ..


[CV]  criterion=gini, max_depth=50.796833662982365, random_state=1000, total= 1.3min


[CV] criterion=gini, max_depth=174.18112002232027, random_state=1000 .


[CV]  criterion=gini, max_depth=94.06300750563831, random_state=1000, total= 1.7min


[CV] criterion=gini, max_depth=322.53978877308725, random_state=1000 .


[CV]  criterion=gini, max_depth=174.18112002232027, random_state=1000, total= 1.6min


[CV] criterion=gini, max_depth=597.2628682629713, random_state=1000 ..


[CV]  criterion=gini, max_depth=322.53978877308725, random_state=1000, total= 1.6min


[CV] criterion=gini, max_depth=1105.9811726257212, random_state=1000 .


[CV]  criterion=gini, max_depth=597.2628682629713, random_state=1000, total= 1.6min


[CV] criterion=gini, max_depth=2048.0, random_state=1000 .............


[CV]  criterion=gini, max_depth=1105.9811726257212, random_state=1000, total= 1.6min


[CV] criterion=entropy, max_depth=8.0, random_state=1000 .............


[CV]  criterion=gini, max_depth=2048.0, random_state=1000, total= 1.6min


[CV] criterion=entropy, max_depth=14.813995396596646, random_state=1000 


[CV]  criterion=entropy, max_depth=8.0, random_state=1000, total=  38.7s


[CV] criterion=entropy, max_depth=27.43180745129833, random_state=1000 


[CV]  criterion=entropy, max_depth=14.813995396596646, random_state=1000, total=  47.7s


[CV] criterion=entropy, max_depth=50.796833662982365, random_state=1000 


[CV]  criterion=entropy, max_depth=27.43180745129833, random_state=1000, total=  59.0s


[CV] criterion=entropy, max_depth=94.06300750563831, random_state=1000 


[CV]  criterion=entropy, max_depth=50.796833662982365, random_state=1000, total= 1.1min


[CV] criterion=entropy, max_depth=174.18112002232027, random_state=1000 


[CV]  criterion=entropy, max_depth=94.06300750563831, random_state=1000, total= 1.1min


[CV] criterion=entropy, max_depth=322.53978877308725, random_state=1000 


[CV]  criterion=entropy, max_depth=174.18112002232027, random_state=1000, total= 1.1min


[CV] criterion=entropy, max_depth=597.2628682629713, random_state=1000 


[CV]  criterion=entropy, max_depth=322.53978877308725, random_state=1000, total= 1.1min


[CV] criterion=entropy, max_depth=1105.9811726257212, random_state=1000 


[CV]  criterion=entropy, max_depth=597.2628682629713, random_state=1000, total= 1.1min


[CV] criterion=entropy, max_depth=2048.0, random_state=1000 ..........


[CV]  criterion=entropy, max_depth=1105.9811726257212, random_state=1000, total= 1.1min


[CV]  criterion=entropy, max_depth=2048.0, random_state=1000, total= 1.0min


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed: 13.5min finished


DecisionTreeClassifier(class_weight=None, criterion='gini',
            max_depth=27.43180745129833, max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1000, splitter='best')

In [20]:
print("Classifier: Decision Tree | Dataset: imdb | Bag of Words: Frequency")
print("Tuning hyper-parameters for F1 Score (macro)")
print()
print("Best Parameters found on validation set:")
print(imdb_dt_gs_freq.best_params_)
print()
print("Best Scores on validation set:")
print("%.3f" % imdb_dt_gs_freq.best_score_)
print()
print("Training set prediction F1 Score (macro):")
imdb_dt_train_freq_predicted = imdb_dt_best_clf_freq.predict(imdb_train_freq_x)
print("%.3f" % f1_score(imdb_train_y, imdb_dt_train_freq_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
imdb_dt_valid_freq_predicted = imdb_dt_best_clf_freq.predict(imdb_valid_freq_x)
print("%.3f" % f1_score(imdb_valid_y, imdb_dt_valid_freq_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
imdb_dt_test_freq_predicted = imdb_dt_best_clf_freq.predict(imdb_test_freq_x)
print("%.3f" % f1_score(imdb_test_y, imdb_dt_test_freq_predicted, average='macro'))


Classifier: Decision Tree | Dataset: imdb | Bag of Words: Frequency
Tuning hyper-parameters for F1 Score (macro)

Best Parameters found on validation set:
{'criterion': 'gini', 'max_depth': 27.43180745129833, 'random_state': 1000}

Best Scores on validation set:
0.705

Training set prediction F1 Score (macro):


0.930
Validation set prediction F1 Score (macro):


0.705
Test set prediction F1 Score (macro):


0.706


#### Create Linear Support Vector Classifier for Frequency Bag of Words
Parameters:
* C: 7 equally distributed values over 2^7^ to 2^9^
* Tolerance: 7 equally distributed values over 10^-1^ to 10^1^

In [31]:
imdb_svm_clf_freq = LinearSVC()
imdb_svm_c_freq = np.logspace(1, 8, 7, base=2)
imdb_svm_tol_freq = np.logspace(-5, -2, 7)
imdb_svm_dual_freq = [False]
imdb_svm_random_state_freq = [1000]
imdb_svm_params_freq = [{'C': imdb_svm_c_freq, 'tol': imdb_svm_tol_freq, 'random_state': imdb_svm_random_state_freq, 'dual': imdb_svm_dual_freq}]
imdb_svm_gs_freq = GridSearchCV(imdb_svm_clf_freq, imdb_svm_params_freq, cv=imdb_freq_ps, scoring='f1_macro', n_jobs=2, refit=True, verbose=2)

In [32]:
imdb_svm_gs_freq.fit(imdb_freq_train_valid_x, imdb_freq_train_valid_y)
imdb_svm_best_clf_freq = imdb_svm_gs_freq.best_estimator_
imdb_svm_best_clf_freq.fit(imdb_train_freq_x, imdb_train_y)

Fitting 1 folds for each of 49 candidates, totalling 49 fits


[CV] C=2.0, dual=False, random_state=1000, tol=1e-05 .................


[CV] C=2.0, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV] .. C=2.0, dual=False, random_state=1000, tol=1e-05, total=   3.4s


[CV] C=2.0, dual=False, random_state=1000, tol=0.0001 ................


[CV]  C=2.0, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   3.6s


[CV] C=2.0, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV] . C=2.0, dual=False, random_state=1000, tol=0.0001, total=   3.9s


[CV] C=2.0, dual=False, random_state=1000, tol=0.001 .................


[CV]  C=2.0, dual=False, random_state=1000, tol=0.00031622776601683794, total=   4.1s


[CV] C=2.0, dual=False, random_state=1000, tol=0.0031622776601683794 .


[CV] .. C=2.0, dual=False, random_state=1000, tol=0.001, total=   3.6s


[CV] C=2.0, dual=False, random_state=1000, tol=0.01 ..................


[CV]  C=2.0, dual=False, random_state=1000, tol=0.0031622776601683794, total=   3.3s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=1e-05 ...


[CV] ... C=2.0, dual=False, random_state=1000, tol=0.01, total=   3.6s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=1e-05, total=   3.7s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=0.0001 ..


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   4.0s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=0.0001, total=   3.7s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=0.001 ...


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=0.00031622776601683794, total=   3.6s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=0.001, total=   3.6s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=0.01 ....


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=0.0031622776601683794, total=   3.6s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=1e-05 ..


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=0.01, total=   4.1s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=1e-05, total=   4.3s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=0.0001 .


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   4.0s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=0.0001, total=   4.2s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=0.001 ..


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=0.00031622776601683794, total=   3.9s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=0.001, total=   4.0s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=0.01 ...


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=0.0031622776601683794, total=   3.8s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=1e-05 ..


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=0.01, total=   3.7s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=1e-05, total=   4.7s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=0.0001 .


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   5.3s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=0.0001, total=   4.6s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=0.001 ..


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=0.00031622776601683794, total=   4.3s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=0.001, total=   3.9s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=0.01 ...


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=0.0031622776601683794, total=   4.2s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=1e-05 ..


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=0.01, total=   3.7s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=1e-05, total=   4.7s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=0.0001 .


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   4.6s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=0.0001, total=   4.7s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=0.001 ..


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=0.00031622776601683794, total=   4.5s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=0.001, total=   4.2s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=0.01 ...


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=0.0031622776601683794, total=   4.3s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=1e-05 ..


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=0.01, total=   3.8s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=1e-05, total=   5.7s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=0.0001 .


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   5.8s


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  2.3min


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=0.0001, total=   5.6s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=0.001 ..


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=0.00031622776601683794, total=   5.9s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=0.001, total=   5.1s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=0.01 ...


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=0.0031622776601683794, total=   4.6s


[CV] C=256.0, dual=False, random_state=1000, tol=1e-05 ...............


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=0.01, total=   4.5s


[CV] C=256.0, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=256.0, dual=False, random_state=1000, tol=1e-05, total=   5.7s


[CV] C=256.0, dual=False, random_state=1000, tol=0.0001 ..............


[CV]  C=256.0, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   5.7s


[CV] C=256.0, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=256.0, dual=False, random_state=1000, tol=0.0001, total=   5.3s


[CV] C=256.0, dual=False, random_state=1000, tol=0.001 ...............


[CV]  C=256.0, dual=False, random_state=1000, tol=0.00031622776601683794, total=   5.8s


[CV] C=256.0, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=256.0, dual=False, random_state=1000, tol=0.001, total=   5.5s


[CV] C=256.0, dual=False, random_state=1000, tol=0.01 ................


[CV]  C=256.0, dual=False, random_state=1000, tol=0.0031622776601683794, total=   4.9s


[CV] . C=256.0, dual=False, random_state=1000, tol=0.01, total=   4.1s


[Parallel(n_jobs=2)]: Done  49 out of  49 | elapsed:  3.1min finished


LinearSVC(C=114.03503592196348, class_weight=None, dual=False,
     fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
     max_iter=1000, multi_class='ovr', penalty='l2', random_state=1000,
     tol=1e-05, verbose=0)

In [33]:
print("Classifier: Linear Support Vector Classifier | Dataset: imdb | Bag of Words: Frequency")
print("Tuning hyper-parameters for F1 Score (macro)")
print()
print("Best Parameters found on validation set:")
print(imdb_svm_gs_freq.best_params_)
print()
print("Best Scores on validation set:")
print("%.3f" % imdb_svm_gs_freq.best_score_)
print()
print("Training set prediction F1 Score (macro):")
imdb_svm_train_freq_predicted = imdb_svm_best_clf_freq.predict(imdb_train_freq_x)
print("%.3f" % f1_score(imdb_train_y, imdb_svm_train_freq_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
imdb_svm_valid_freq_predicted = imdb_svm_best_clf_freq.predict(imdb_valid_freq_x)
print("%.3f" % f1_score(imdb_valid_y, imdb_svm_valid_freq_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
imdb_svm_test_freq_predicted = imdb_svm_best_clf_freq.predict(imdb_test_freq_x)
print("%.3f" % f1_score(imdb_test_y, imdb_svm_test_freq_predicted, average='macro'))

Classifier: Linear Support Vector Classifier | Dataset: imdb | Bag of Words: Frequency
Tuning hyper-parameters for F1 Score (macro)

Best Parameters found on validation set:
{'C': 114.03503592196348, 'dual': False, 'random_state': 1000, 'tol': 1e-05}

Best Scores on validation set:
0.880

Training set prediction F1 Score (macro):


0.951
Validation set prediction F1 Score (macro):


0.880
Test set prediction F1 Score (macro):


0.875
