In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier 

## Helper Functions

In [2]:
def get_vocab(data: pd.DataFrame, input_col: str):
    data[input_col] = data[input_col].str.lower()
    data[input_col] = data[input_col].str.replace('[^\w\s]', '').str.replace('<br />\|...', '')
    tokens = data[input_col].str.split(expand=True)
    return pd.Series(tokens.stack().value_counts()[:10000])

In [3]:
def pre_process_binary(data, input_col, vocab):
    data[input_col] = data[input_col].str.lower()
    data[input_col] = data[input_col].str.replace('[^\w\s]', '').str.replace('<br />\|...', '')
    vocab_dict = vocab.to_dict()
    vocab_dict = {x: i for i, x in enumerate(list(vocab_dict.keys()))}
    bow_matrix = np.zeros(shape=(data.shape[0], 10000))
    data_tokens = data[input_col].str.split(expand=True)
    for i, row in data_tokens.iterrows():
        for word in row.iteritems():
            if word[1] in vocab_dict:
                bow_matrix[i, vocab_dict.get(word[1])] = 1
    return bow_matrix

In [4]:
def pre_process_frequency(data: pd.DataFrame, input_col: str, vocab):
    data[input_col] = data[input_col].str.lower()
    data[input_col] = data[input_col].str.replace('[^\w\s]', '').str.replace('<br />\|...', '')
    vocab_dict = vocab.to_dict()
    vocab_dict = {x: i for i, x in enumerate(list(vocab_dict.keys()))}
    data_tokens = data[input_col].str.split(expand=True)
    bow_matrix = np.zeros(shape=(data_tokens.shape[0], 10000))
    for i, row in data_tokens.iterrows():
        word_count = 0
        for word in row.iteritems():
            if word[1] in vocab_dict:
                bow_matrix[i, vocab_dict.get(word[1])] += 1
                word_count += 1
        if word_count != 0:
            for j in range(bow_matrix[i].shape[0]):
                bow_matrix[i][j] /= word_count
    return bow_matrix


In [5]:
def save_data(fname, data, vocab, input_col, class_col):
    file = open(fname, 'w')
    input_data = data[input_col]
    classes = data[class_col]
    vocab_dict = vocab.to_dict()
    vocab_dict = {x: i + 1 for i, x in enumerate(list(vocab_dict.keys()))}
    data_tokens = input_data.str.split(expand=True)
    for i, row in data_tokens.iterrows():
        row_string = ""
        for word in row.iteritems():
            if word[1] in vocab_dict:
                index = vocab_dict.get(word[1])
                row_string += str(index) + " "
        row_string = row_string[:-1]
        row_string += "\t" + str(classes[i]) + "\n"
        file.write(row_string)
    file.close()

In [6]:
def save_vocab(fname, vocab):
    file = open(fname, 'w')
    vocab_dict = vocab.to_dict()
    for i, word in enumerate(list(vocab_dict.keys())):
        row_string = word + " " + str(i + 1) + " " + str(vocab_dict[word]) + "\n"
        file.write(row_string)
    file.close()


#### Read the data

In [7]:
yelp_train = pd.read_csv('./data/yelp-train.txt', sep='\t', names=["Comments", "Rating"])
yelp_valid = pd.read_csv('./data/yelp-valid.txt', sep='\t', names=["Comments", "Rating"])
yelp_test = pd.read_csv('./data/yelp-test.txt', sep='\t', names=["Comments", "Rating"])

#### Create and save vocabulary

In [8]:
vocab = get_vocab(data=yelp_train, input_col="Comments")

In [14]:
save_vocab("./data-modified/yelp-vocab.txt", vocab)

#### Save the reformatted reviews

In [15]:
save_data("./data-modified/yelp-train.txt", yelp_train, vocab, "Comments", "Rating")
save_data("./data-modified/yelp-valid.txt", yelp_valid, vocab, "Comments", "Rating")
save_data("./data-modified/yelp-test.txt", yelp_test, vocab, "Comments", "Rating")

#### Pre-process the data into binary bag-of-words

In [9]:
yelp_train_bin_x = pre_process_binary(yelp_train, "Comments", vocab)
yelp_valid_bin_x = pre_process_binary(yelp_valid, "Comments", vocab)
yelp_test_bin_x = pre_process_binary(yelp_test, "Comments", vocab)


#### Pre-process the data into frequency bag-of-words

In [9]:
yelp_train_freq_x = pre_process_frequency(yelp_train, "Comments", vocab)
yelp_valid_freq_x = pre_process_frequency(yelp_valid, "Comments", vocab)
yelp_test_freq_x = pre_process_frequency(yelp_test, "Comments", vocab)

In [10]:
yelp_train_y = np.array(yelp_train["Rating"])
yelp_valid_y = np.array(yelp_valid["Rating"])
yelp_test_y = np.array(yelp_test["Rating"])

#### Append the training and validation set together to perform PredefinedSplit cross validation

The following strategy has been used to be able to take advantage of the MultiThread enabled GridSearchCV function. 

The function is originally created for Cross Validation, but with a neat trick it is possible to use it for predefined splits as well.

First we combine both training and validation sets together. Next, we create a fold index (1D array with length len(training) + len(validation)) that has values -1 for all indices corresponding to training data, and 0 set for all indices corresponding to validation data. 

Using this array we can create a PredefinedSplit which then allows having one fold cross validation with our specific training and validation datasets taken into account.

In [11]:
yelp_train_valid_x = np.append(yelp_train_bin_x, yelp_valid_bin_x, axis=0)
yelp_train_valid_y = np.append(yelp_train_y, yelp_valid_y, axis=0)
yelp_train_valid_fold = np.zeros(shape=(yelp_train_valid_x.shape[0]))
yelp_train_valid_fold[0:yelp_train_bin_x.shape[0]] = -1
yelp_bin_ps = PredefinedSplit(yelp_train_valid_fold)

#### Create Majority Class Classifier

In [19]:
yelp_majority_clf_bin = DummyClassifier("most_frequent", random_state=1000)
yelp_majority_clf_bin.fit(yelp_train_bin_x, yelp_train_y)

DummyClassifier(constant=None, random_state=1000, strategy='most_frequent')

In [20]:
print("Classifier: Majority Class | Dataset: Yelp | Bag of Words: Binary")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("This classifier has no hyper parameters")
print()
print("Training set prediction F1 Score (macro):")
yelp_majority_train_predicted = yelp_majority_clf_bin.predict(yelp_train_bin_x)
print("%.3f" % f1_score(yelp_train_y, yelp_majority_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
yelp_majority_valid_predicted = yelp_majority_clf_bin.predict(yelp_valid_bin_x)
print("%.3f" % f1_score(yelp_valid_y, yelp_majority_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
yelp_majority_test_predicted = yelp_majority_clf_bin.predict(yelp_test_bin_x)
print("%.3f" % f1_score(yelp_test_y, yelp_majority_test_predicted, average='macro'))
print()

Classifier: Majority Class | Dataset: Yelp | Bag of Words: Binary
Tuning hyper-parameters for F1 score (macro)

This classifier has no hyper parameters

Training set prediction F1 Score (macro):
0.104
Validation set prediction F1 Score (macro):
0.105
Test set prediction F1 Score (macro):
0.104



  'precision', 'predicted', average, warn_for)


#### Create Random Class Classifier

In [21]:
yelp_random_clf_bin = DummyClassifier(strategy="uniform", random_state=1000)
yelp_random_clf_bin.fit(yelp_train_bin_x, yelp_train_y)

DummyClassifier(constant=None, random_state=1000, strategy='uniform')

In [22]:
print("Classifier: Random | Dataset: Yelp | Bag of Words: Binary")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("This classifier has no hyper parameters")
print()
print("Training set prediction F1 Score (macro):")
yelp_random_train_predicted = yelp_random_clf_bin.predict(yelp_train_bin_x)
print("%.3f" % f1_score(yelp_train_y, yelp_random_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
yelp_random_valid_predicted = yelp_random_clf_bin.predict(yelp_valid_bin_x)
print("%.3f" % f1_score(yelp_valid_y, yelp_random_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
yelp_random_test_predicted = yelp_random_clf_bin.predict(yelp_test_bin_x)
print("%.3f" % f1_score(yelp_test_y, yelp_random_test_predicted, average='macro'))
print()

Classifier: Random | Dataset: Yelp | Bag of Words: Binary
Tuning hyper-parameters for F1 score (macro)

This classifier has no hyper parameters

Training set prediction F1 Score (macro):
0.182
Validation set prediction F1 Score (macro):
0.197
Test set prediction F1 Score (macro):
0.195



#### Create Bernoulli Naive Bayes Classifier for Binary Bag of Words
Parameter ranges: 
* Alpha: 15 equally spread values over 10^-2^ to 10^0^

In [25]:
yelp_nb_clf = BernoulliNB()
yelp_nb_alphas = np.logspace(-2, 0, 15)
yelp_nb_params = [{'alpha': yelp_nb_alphas}]
yelp_nb_gs = GridSearchCV(yelp_nb_clf, yelp_nb_params, cv=yelp_bin_ps, scoring='f1_macro', refit=True, n_jobs=2, verbose=2)
yelp_nb_gs.fit(yelp_train_valid_x, yelp_train_valid_y)
yelp_nb_best_clf = yelp_nb_gs.best_estimator_
yelp_nb_best_clf.fit(yelp_train_bin_x, yelp_train_y)


Fitting 1 folds for each of 15 candidates, totalling 15 fits


[CV] alpha=0.01 ......................................................


[CV] alpha=0.013894954943731374 ......................................


[CV] ....................................... alpha=0.01, total=   2.2s


[CV] alpha=0.019306977288832496 ......................................


[CV] ....................... alpha=0.013894954943731374, total=   2.4s


[CV] alpha=0.02682695795279726 .......................................


[CV] ....................... alpha=0.019306977288832496, total=   2.3s


[CV] alpha=0.0372759372031494 ........................................


[CV] ........................ alpha=0.02682695795279726, total=   2.3s


[CV] alpha=0.0517947467923121 ........................................


[CV] ......................... alpha=0.0372759372031494, total=   2.3s


[CV] alpha=0.07196856730011521 .......................................


[CV] ......................... alpha=0.0517947467923121, total=   2.3s


[CV] alpha=0.1 .......................................................


[CV] ........................ alpha=0.07196856730011521, total=   2.4s


[CV] alpha=0.13894954943731375 .......................................


[CV] ........................................ alpha=0.1, total=   2.4s


[CV] alpha=0.19306977288832497 .......................................


[CV] ........................ alpha=0.13894954943731375, total=   3.7s


[CV] alpha=0.2682695795279725 ........................................


[CV] ........................ alpha=0.19306977288832497, total=   3.4s


[CV] alpha=0.372759372031494 .........................................


[CV] ......................... alpha=0.2682695795279725, total=   3.2s


[CV] alpha=0.517947467923121 .........................................


[CV] .......................... alpha=0.372759372031494, total=   3.0s


[CV] alpha=0.7196856730011517 ........................................


[CV] .......................... alpha=0.517947467923121, total=   2.7s


[CV] alpha=1.0 .......................................................


[CV] ......................... alpha=0.7196856730011517, total=   2.9s


[CV] ........................................ alpha=1.0, total=   2.5s


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:   33.2s finished


BernoulliNB(alpha=0.013894954943731374, binarize=0.0, class_prior=None,
      fit_prior=True)

In [26]:
print("Classifier: Bernoulli Naive Bayes | Dataset: Yelp | Bag of Words: Binary")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("Best Parameters found on validation set:")
print(yelp_nb_gs.best_params_)
print()
print("Best Scores on validation set:")
print("%.3f" % yelp_nb_gs.best_score_)
print()
print("Training set prediction F1 Score (macro):")
yelp_nb_train_predicted = yelp_nb_best_clf.predict(yelp_train_bin_x)
print("%.3f" % f1_score(yelp_train_y, yelp_nb_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
yelp_nb_valid_predicted = yelp_nb_best_clf.predict(yelp_valid_bin_x)
print("%.3f" % f1_score(yelp_valid_y, yelp_nb_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
yelp_nb_test_predicted = yelp_nb_gs.predict(yelp_test_bin_x)
print("%.3f" % f1_score(yelp_test_y, yelp_nb_test_predicted, average='macro'))
print()


Classifier: Bernoulli Naive Bayes | Dataset: Yelp | Bag of Words: Binary
Tuning hyper-parameters for F1 score (macro)

Best Parameters found on validation set:
{'alpha': 0.013894954943731374}

Best Scores on validation set:
0.383

Training set prediction F1 Score (macro):


0.768
Validation set prediction F1 Score (macro):
0.383
Test set prediction F1 Score (macro):


0.374



#### Create Decision Tree Classifier for Binary Bag of Words

Parameter ranges:
* Criterion: 'gini' and 'entropy'
* Max Depth: 10 equally spread values over 2^3^ to 2^11^

In [27]:
yelp_dt_clf_bin = DecisionTreeClassifier()
yelp_dt_criterion_bin = ['gini', 'entropy']
yelp_dt_max_depth_bin = list(np.logspace(3, 11, 10, base=2))
yelp_dt_params_bin = [{'criterion': yelp_dt_criterion_bin, 'max_depth': yelp_dt_max_depth_bin, 'random_state': [1000]}]
yelp_dt_gs_bin = GridSearchCV(yelp_dt_clf_bin, yelp_dt_params_bin, cv=yelp_bin_ps, scoring='f1_macro', n_jobs=2, refit=True, verbose=2)
yelp_dt_gs_bin.fit(yelp_train_valid_x, yelp_train_valid_y)
yelp_dt_best_clf_bin = yelp_dt_gs_bin.best_estimator_
yelp_dt_best_clf_bin.fit(yelp_train_bin_x, yelp_train_y)

Fitting 1 folds for each of 20 candidates, totalling 20 fits


[CV] criterion=gini, max_depth=8.0, random_state=1000 ................


[CV] criterion=gini, max_depth=14.813995396596646, random_state=1000 .


[CV] . criterion=gini, max_depth=8.0, random_state=1000, total=   9.9s


[CV] criterion=gini, max_depth=27.43180745129833, random_state=1000 ..


[CV]  criterion=gini, max_depth=14.813995396596646, random_state=1000, total=  12.1s


[CV] criterion=gini, max_depth=50.796833662982365, random_state=1000 .


[CV]  criterion=gini, max_depth=27.43180745129833, random_state=1000, total=  13.5s


[CV] criterion=gini, max_depth=94.06300750563831, random_state=1000 ..


[CV]  criterion=gini, max_depth=50.796833662982365, random_state=1000, total=  14.1s


[CV] criterion=gini, max_depth=174.18112002232027, random_state=1000 .


[CV]  criterion=gini, max_depth=94.06300750563831, random_state=1000, total=  13.2s


[CV] criterion=gini, max_depth=322.53978877308725, random_state=1000 .


[CV]  criterion=gini, max_depth=174.18112002232027, random_state=1000, total=  13.0s


[CV] criterion=gini, max_depth=597.2628682629713, random_state=1000 ..


[CV]  criterion=gini, max_depth=322.53978877308725, random_state=1000, total=  13.1s


[CV] criterion=gini, max_depth=1105.9811726257212, random_state=1000 .


[CV]  criterion=gini, max_depth=597.2628682629713, random_state=1000, total=  13.1s


[CV] criterion=gini, max_depth=2048.0, random_state=1000 .............


[CV]  criterion=gini, max_depth=1105.9811726257212, random_state=1000, total=  13.0s


[CV] criterion=entropy, max_depth=8.0, random_state=1000 .............


[CV]  criterion=gini, max_depth=2048.0, random_state=1000, total=  13.2s


[CV] criterion=entropy, max_depth=14.813995396596646, random_state=1000 


[CV]  criterion=entropy, max_depth=8.0, random_state=1000, total=   9.2s


[CV] criterion=entropy, max_depth=27.43180745129833, random_state=1000 


[CV]  criterion=entropy, max_depth=14.813995396596646, random_state=1000, total=  11.5s


[CV] criterion=entropy, max_depth=50.796833662982365, random_state=1000 


[CV]  criterion=entropy, max_depth=27.43180745129833, random_state=1000, total=  12.6s


[CV] criterion=entropy, max_depth=94.06300750563831, random_state=1000 


[CV]  criterion=entropy, max_depth=50.796833662982365, random_state=1000, total=  13.0s


[CV] criterion=entropy, max_depth=174.18112002232027, random_state=1000 


[CV]  criterion=entropy, max_depth=94.06300750563831, random_state=1000, total=  13.5s


[CV] criterion=entropy, max_depth=322.53978877308725, random_state=1000 


[CV]  criterion=entropy, max_depth=174.18112002232027, random_state=1000, total=  13.9s


[CV] criterion=entropy, max_depth=597.2628682629713, random_state=1000 


[CV]  criterion=entropy, max_depth=322.53978877308725, random_state=1000, total=  13.7s


[CV] criterion=entropy, max_depth=1105.9811726257212, random_state=1000 


[CV]  criterion=entropy, max_depth=597.2628682629713, random_state=1000, total=  13.7s


[CV] criterion=entropy, max_depth=2048.0, random_state=1000 ..........


[CV]  criterion=entropy, max_depth=1105.9811726257212, random_state=1000, total=  13.1s


[CV]  criterion=entropy, max_depth=2048.0, random_state=1000, total=  12.4s


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:  2.2min finished


DecisionTreeClassifier(class_weight=None, criterion='entropy',
            max_depth=27.43180745129833, max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1000, splitter='best')

In [28]:
print("Classifier: Decision Tree | Dataset: Yelp | Bag of Words: Binary")
print("Tuning hyper-parameters for F1 Score (macro)")
print()
print("Best Parameters found on validation set:")
print(yelp_dt_gs_bin.best_params_)
print()
print("Best Scores on validation set:")
print("%.3f" % yelp_dt_gs_bin.best_score_)
print()
print("Training set prediction F1 Score (macro):")
yelp_dt_train_predicted = yelp_dt_best_clf_bin.predict(yelp_train_bin_x)
print("%.3f" % f1_score(yelp_train_y, yelp_dt_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
yelp_dt_valid_predicted = yelp_dt_best_clf_bin.predict(yelp_valid_bin_x)
print("%.3f" % f1_score(yelp_valid_y, yelp_dt_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
yelp_dt_test_predicted = yelp_dt_best_clf_bin.predict(yelp_test_bin_x)
print("%.3f" % f1_score(yelp_test_y, yelp_dt_test_predicted, average='macro'))


Classifier: Decision Tree | Dataset: Yelp | Bag of Words: Binary
Tuning hyper-parameters for F1 Score (macro)

Best Parameters found on validation set:
{'criterion': 'entropy', 'max_depth': 27.43180745129833, 'random_state': 1000}

Best Scores on validation set:
0.302

Training set prediction F1 Score (macro):
0.925
Validation set prediction F1 Score (macro):


0.302
Test set prediction F1 Score (macro):
0.283


#### Creating Linear Support Vector Classifier for Binary Bag of Words
Parameters:
* C: 20 equally spread values from 10^-3^ to 10^1^
* Tolerance: 20 equally spread values from 10^-3^ to 10^-1^

In [19]:
yelp_svm_clf_bin = LinearSVC()
yelp_svm_c_bin = np.logspace(-4, -1, 7, base=2)
yelp_svm_tol_bin = np.logspace(-5, -2, 7)
yelp_svm_dual_bin = [False]
yelp_svm_random_state_bin = [1000]
yelp_svm_params_bin = [{'C': yelp_svm_c_bin, 'tol': yelp_svm_tol_bin, 'random_state': yelp_svm_random_state_bin, 'dual': yelp_svm_dual_bin}]
yelp_svm_gs_bin = GridSearchCV(yelp_svm_clf_bin, yelp_svm_params_bin, cv=yelp_bin_ps, scoring='f1_macro', n_jobs=2, refit=True, verbose=2)

In [20]:
yelp_svm_gs_bin.fit(yelp_train_valid_x, yelp_train_valid_y)
yelp_svm_best_clf_bin = yelp_svm_gs_bin.best_estimator_
yelp_svm_best_clf_bin.fit(yelp_train_bin_x, yelp_train_y)

Fitting 1 folds for each of 49 candidates, totalling 49 fits


[CV] C=0.0625, dual=False, random_state=1000, tol=1e-05 ..............


[CV] C=0.0625, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=0.0625, dual=False, random_state=1000, tol=1e-05, total=   3.1s


[CV] C=0.0625, dual=False, random_state=1000, tol=0.0001 .............


[CV]  C=0.0625, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   3.1s


[CV] C=0.0625, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=0.0625, dual=False, random_state=1000, tol=0.0001, total=   2.9s


[CV] C=0.0625, dual=False, random_state=1000, tol=0.001 ..............


[CV]  C=0.0625, dual=False, random_state=1000, tol=0.00031622776601683794, total=   2.7s


[CV] C=0.0625, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=0.0625, dual=False, random_state=1000, tol=0.001, total=   2.5s


[CV] C=0.0625, dual=False, random_state=1000, tol=0.01 ...............


[CV]  C=0.0625, dual=False, random_state=1000, tol=0.0031622776601683794, total=   2.3s


[CV] C=0.08838834764831845, dual=False, random_state=1000, tol=1e-05 .


[CV]  C=0.0625, dual=False, random_state=1000, tol=0.01, total=   2.1s


[CV] C=0.08838834764831845, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=0.08838834764831845, dual=False, random_state=1000, tol=1e-05, total=   3.3s


[CV] C=0.08838834764831845, dual=False, random_state=1000, tol=0.0001 


[CV]  C=0.08838834764831845, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   3.0s


[CV] C=0.08838834764831845, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=0.08838834764831845, dual=False, random_state=1000, tol=0.0001, total=   3.2s


[CV] C=0.08838834764831845, dual=False, random_state=1000, tol=0.001 .


[CV]  C=0.08838834764831845, dual=False, random_state=1000, tol=0.00031622776601683794, total=   3.0s


[CV] C=0.08838834764831845, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=0.08838834764831845, dual=False, random_state=1000, tol=0.001, total=   2.9s


[CV] C=0.08838834764831845, dual=False, random_state=1000, tol=0.01 ..


[CV]  C=0.08838834764831845, dual=False, random_state=1000, tol=0.0031622776601683794, total=   2.5s


[CV] C=0.125, dual=False, random_state=1000, tol=1e-05 ...............


[CV]  C=0.08838834764831845, dual=False, random_state=1000, tol=0.01, total=   2.3s


[CV] C=0.125, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=0.125, dual=False, random_state=1000, tol=1e-05, total=   3.5s


[CV] C=0.125, dual=False, random_state=1000, tol=0.0001 ..............


[CV]  C=0.125, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   3.4s


[CV] C=0.125, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=0.125, dual=False, random_state=1000, tol=0.0001, total=   3.1s


[CV] C=0.125, dual=False, random_state=1000, tol=0.001 ...............


[CV]  C=0.125, dual=False, random_state=1000, tol=0.00031622776601683794, total=   3.2s


[CV] C=0.125, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=0.125, dual=False, random_state=1000, tol=0.001, total=   2.8s


[CV] C=0.125, dual=False, random_state=1000, tol=0.01 ................


[CV]  C=0.125, dual=False, random_state=1000, tol=0.0031622776601683794, total=   2.6s


[CV] C=0.1767766952966369, dual=False, random_state=1000, tol=1e-05 ..


[CV] . C=0.125, dual=False, random_state=1000, tol=0.01, total=   2.5s


[CV] C=0.1767766952966369, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=0.1767766952966369, dual=False, random_state=1000, tol=1e-05, total=   4.0s


[CV] C=0.1767766952966369, dual=False, random_state=1000, tol=0.0001 .


[CV]  C=0.1767766952966369, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   3.9s


[CV] C=0.1767766952966369, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=0.1767766952966369, dual=False, random_state=1000, tol=0.0001, total=   4.2s


[CV] C=0.1767766952966369, dual=False, random_state=1000, tol=0.001 ..


[CV]  C=0.1767766952966369, dual=False, random_state=1000, tol=0.00031622776601683794, total=   3.8s


[CV] C=0.1767766952966369, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=0.1767766952966369, dual=False, random_state=1000, tol=0.001, total=   3.4s


[CV] C=0.1767766952966369, dual=False, random_state=1000, tol=0.01 ...


[CV]  C=0.1767766952966369, dual=False, random_state=1000, tol=0.0031622776601683794, total=   3.1s


[CV] C=0.25, dual=False, random_state=1000, tol=1e-05 ................


[CV]  C=0.1767766952966369, dual=False, random_state=1000, tol=0.01, total=   2.9s


[CV] C=0.25, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV] . C=0.25, dual=False, random_state=1000, tol=1e-05, total=   4.4s


[CV] C=0.25, dual=False, random_state=1000, tol=0.0001 ...............


[CV]  C=0.25, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   4.1s


[CV] C=0.25, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=0.25, dual=False, random_state=1000, tol=0.0001, total=   4.0s


[CV] C=0.25, dual=False, random_state=1000, tol=0.001 ................


[CV]  C=0.25, dual=False, random_state=1000, tol=0.00031622776601683794, total=   3.9s


[CV] C=0.25, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV] . C=0.25, dual=False, random_state=1000, tol=0.001, total=   3.6s


[CV] C=0.25, dual=False, random_state=1000, tol=0.01 .................


[CV]  C=0.25, dual=False, random_state=1000, tol=0.0031622776601683794, total=   3.2s


[CV] C=0.3535533905932738, dual=False, random_state=1000, tol=1e-05 ..


[CV] .. C=0.25, dual=False, random_state=1000, tol=0.01, total=   2.8s


[CV] C=0.3535533905932738, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=0.3535533905932738, dual=False, random_state=1000, tol=1e-05, total=   6.4s


[CV] C=0.3535533905932738, dual=False, random_state=1000, tol=0.0001 .


[CV]  C=0.3535533905932738, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   5.5s


[CV] C=0.3535533905932738, dual=False, random_state=1000, tol=0.00031622776601683794 


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  1.2min


[CV]  C=0.3535533905932738, dual=False, random_state=1000, tol=0.0001, total=   5.4s


[CV] C=0.3535533905932738, dual=False, random_state=1000, tol=0.001 ..


[CV]  C=0.3535533905932738, dual=False, random_state=1000, tol=0.00031622776601683794, total=   4.8s


[CV] C=0.3535533905932738, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=0.3535533905932738, dual=False, random_state=1000, tol=0.001, total=   3.9s


[CV] C=0.3535533905932738, dual=False, random_state=1000, tol=0.01 ...


[CV]  C=0.3535533905932738, dual=False, random_state=1000, tol=0.0031622776601683794, total=   3.4s


[CV] C=0.5, dual=False, random_state=1000, tol=1e-05 .................


[CV]  C=0.3535533905932738, dual=False, random_state=1000, tol=0.01, total=   2.8s


[CV] C=0.5, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=0.5, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   9.8s


[CV] .. C=0.5, dual=False, random_state=1000, tol=1e-05, total=  11.9s


[CV] C=0.5, dual=False, random_state=1000, tol=0.0001 ................


[CV] C=0.5, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=0.5, dual=False, random_state=1000, tol=0.00031622776601683794, total=   6.9s


[CV] C=0.5, dual=False, random_state=1000, tol=0.001 .................


[CV] . C=0.5, dual=False, random_state=1000, tol=0.0001, total=   9.1s


[CV] C=0.5, dual=False, random_state=1000, tol=0.0031622776601683794 .


[CV]  C=0.5, dual=False, random_state=1000, tol=0.0031622776601683794, total=   3.8s


[CV] C=0.5, dual=False, random_state=1000, tol=0.01 ..................


[CV] .. C=0.5, dual=False, random_state=1000, tol=0.001, total=   5.1s


[CV] ... C=0.5, dual=False, random_state=1000, tol=0.01, total=   2.6s


[Parallel(n_jobs=2)]: Done  49 out of  49 | elapsed:  1.8min finished


LinearSVC(C=0.125, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=1000,
     tol=0.00031622776601683794, verbose=0)

In [21]:
print("Classifier: Linear Support Vector Classifier | Dataset: Yelp | Bag of Words: Binary")
print("Tuning hyper-parameters for F1 Score (macro)")
print()
print("Best Parameters found on validation set:")
print(yelp_svm_gs_bin.best_params_)
print()
print("Best Scores on validation set:")
print("%.3f" % yelp_svm_gs_bin.best_score_)
print()
print("Training set prediction F1 Score (macro):")
yelp_svm_train_predicted = yelp_svm_best_clf_bin.predict(yelp_train_bin_x)
print("%.3f" % f1_score(yelp_train_y, yelp_svm_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
yelp_svm_valid_predicted = yelp_svm_best_clf_bin.predict(yelp_valid_bin_x)
print("%.3f" % f1_score(yelp_valid_y, yelp_svm_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
yelp_svm_test_predicted = yelp_svm_best_clf_bin.predict(yelp_test_bin_x)
print("%.3f" % f1_score(yelp_test_y, yelp_svm_test_predicted, average='macro'))


Classifier: Linear Support Vector Classifier | Dataset: Yelp | Bag of Words: Binary
Tuning hyper-parameters for F1 Score (macro)

Best Parameters found on validation set:
{'C': 0.125, 'dual': False, 'random_state': 1000, 'tol': 0.00031622776601683794}

Best Scores on validation set:
0.443

Training set prediction F1 Score (macro):


0.981
Validation set prediction F1 Score (macro):
0.443
Test set prediction F1 Score (macro):
0.435


#### Create Majority Class Classifier for Frequency Bag of Words

In [22]:
yelp_majority_clf_freq = DummyClassifier(strategy="most_frequent", random_state=1000)
yelp_majority_clf_freq.fit(yelp_train_freq_x, yelp_train_y)


NameError: name 'yelp_train_freq_x' is not defined

In [12]:
print("Classifier: Majority Class | Dataset: Yelp | Bag of Words: Frequency")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("This classifier has no hyper parameters")
print()
print("Training set prediction F1 Score (macro):")
yelp_majority_train_predicted_freq = yelp_majority_clf_freq.predict(yelp_train_freq_x)
print("%.3f" % f1_score(yelp_train_y, yelp_majority_train_predicted_freq, average='macro'))
print("Validation set prediction F1 Score (macro):")
yelp_majority_valid_predicted_freq = yelp_majority_clf_freq.predict(yelp_valid_freq_x)
print("%.3f" % f1_score(yelp_valid_y, yelp_majority_valid_predicted_freq, average='macro'))
print("Test set prediction F1 Score (macro):")
yelp_majority_test_predicted_freq = yelp_majority_clf_freq.predict(yelp_test_freq_x)
print("%.3f" % f1_score(yelp_test_y, yelp_majority_test_predicted_freq, average='macro'))
print()


Classifier: Majority Class | Dataset: Yelp | Bag of Words: Frequency
Tuning hyper-parameters for F1 score (macro)

This classifier has no hyper parameters

Training set prediction F1 Score (macro):
0.104
Validation set prediction F1 Score (macro):
0.105
Test set prediction F1 Score (macro):
0.104



  'precision', 'predicted', average, warn_for)


#### Create Random Class Classifier for Frequency Bag of Words

In [11]:
yelp_random_clf_freq = DummyClassifier(strategy="uniform", random_state=1000)
yelp_random_clf_freq.fit(yelp_train_freq_x, yelp_train_y)

DummyClassifier(constant=None, random_state=1000, strategy='uniform')

In [12]:
print("Classifier: Random | Dataset: Yelp | Bag of Words: Frequency")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("This classifier has no hyper parameters")
print()
print("Training set prediction F1 Score (macro):")
yelp_random_train_predicted_freq = yelp_random_clf_freq.predict(yelp_train_freq_x)
print("%.3f" % f1_score(yelp_train_y, yelp_random_train_predicted_freq, average='macro'))
print("Validation set prediction F1 Score (macro):")
yelp_random_valid_predicted_freq = yelp_random_clf_freq.predict(yelp_valid_freq_x)
print("%.3f" % f1_score(yelp_valid_y, yelp_random_valid_predicted_freq, average='macro'))
print("Test set prediction F1 Score (macro):")
yelp_random_test_predicted_freq = yelp_random_clf_freq.predict(yelp_test_freq_x)
print("%.3f" % f1_score(yelp_test_y, yelp_random_test_predicted_freq, average='macro'))
print()

Classifier: Random | Dataset: Yelp | Bag of Words: Frequency
Tuning hyper-parameters for F1 score (macro)

This classifier has no hyper parameters

Training set prediction F1 Score (macro):
0.182
Validation set prediction F1 Score (macro):
0.197
Test set prediction F1 Score (macro):
0.195



#### Create Gaussian Naive Bayes Classifier for Frequency Bag of Words
Parameters:
* None

In [15]:
yelp_gnb_clf = GaussianNB()
yelp_gnb_clf.fit(yelp_train_freq_x, yelp_train_y)

GaussianNB(priors=None)

In [16]:
print("Classifier: Gaussian Naive Bayes | Dataset: Yelp | Bag of Words: Frequency")
print("Tuning hyper-parameters for F1 score (macro)")
print()
print("This classifier does not have any hyper-parameters")
print()
print("Training set prediction F1 Score (macro):")
yelp_gnb_train_predicted = yelp_gnb_clf.predict(yelp_train_freq_x)
print("%.3f" % f1_score(yelp_train_y, yelp_gnb_train_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
yelp_gnb_valid_predicted = yelp_gnb_clf.predict(yelp_valid_freq_x)
print("%.3f" % f1_score(yelp_valid_y, yelp_gnb_valid_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
yelp_gnb_test_predicted = yelp_gnb_clf.predict(yelp_test_freq_x)
print("%.3f" % f1_score(yelp_test_y, yelp_gnb_test_predicted, average='macro'))
print()

Classifier: Gaussian Naive Bayes | Dataset: Yelp | Bag of Words: Frequency
Tuning hyper-parameters for F1 score (macro)

This classifier does not have any hyper-parameters

Training set prediction F1 Score (macro):


0.788
Validation set prediction F1 Score (macro):


0.244
Test set prediction F1 Score (macro):


0.250



In [17]:
yelp_freq_train_valid_x = np.append(yelp_train_freq_x, yelp_valid_freq_x, axis=0)
yelp_freq_train_valid_y = np.append(yelp_train_y, yelp_valid_y, axis=0)
yelp_freq_train_valid_fold = np.zeros(shape=(yelp_freq_train_valid_x.shape[0]))
yelp_freq_train_valid_fold[0:yelp_train_freq_x.shape[0]] = -1
yelp_freq_ps = PredefinedSplit(yelp_freq_train_valid_fold)

#### Create Decision Tree Classifier for Frequency Bag of Words
Parameters:
* Criterion: 'gini' and 'entropy'
* Max Depth: 10 equally spred values over 2^3^ to 2^11^

In [18]:
yelp_dt_clf_freq = DecisionTreeClassifier()
yelp_dt_criterion_freq = ['gini', 'entropy']
yelp_dt_max_depth_freq = np.logspace(3, 11, 10, base=2)
yelp_dt_params_freq = [{'criterion': yelp_dt_criterion_freq, 'max_depth': yelp_dt_max_depth_freq, 'random_state': [1000]}]
yelp_dt_gs_freq = GridSearchCV(yelp_dt_clf_freq, yelp_dt_params_freq, cv=yelp_freq_ps, scoring='f1_macro', n_jobs=2, refit=True, verbose=2)
yelp_dt_gs_freq.fit(yelp_freq_train_valid_x, yelp_freq_train_valid_y)
yelp_dt_best_clf_freq = yelp_dt_gs_freq.best_estimator_
yelp_dt_best_clf_freq.fit(yelp_train_freq_x, yelp_train_y)

Fitting 1 folds for each of 20 candidates, totalling 20 fits


[CV] criterion=gini, max_depth=8.0, random_state=1000 ................


[CV] criterion=gini, max_depth=14.813995396596646, random_state=1000 .


[CV] . criterion=gini, max_depth=8.0, random_state=1000, total=  10.2s


[CV] criterion=gini, max_depth=27.43180745129833, random_state=1000 ..


[CV]  criterion=gini, max_depth=14.813995396596646, random_state=1000, total=  12.4s


[CV] criterion=gini, max_depth=50.796833662982365, random_state=1000 .


[CV]  criterion=gini, max_depth=27.43180745129833, random_state=1000, total=  14.2s


[CV] criterion=gini, max_depth=94.06300750563831, random_state=1000 ..


[CV]  criterion=gini, max_depth=50.796833662982365, random_state=1000, total=  14.3s


[CV] criterion=gini, max_depth=174.18112002232027, random_state=1000 .


[CV]  criterion=gini, max_depth=94.06300750563831, random_state=1000, total=  14.5s


[CV] criterion=gini, max_depth=322.53978877308725, random_state=1000 .


[CV]  criterion=gini, max_depth=174.18112002232027, random_state=1000, total=  14.1s


[CV] criterion=gini, max_depth=597.2628682629713, random_state=1000 ..


[CV]  criterion=gini, max_depth=322.53978877308725, random_state=1000, total=  14.0s


[CV] criterion=gini, max_depth=1105.9811726257212, random_state=1000 .


[CV]  criterion=gini, max_depth=597.2628682629713, random_state=1000, total=  14.3s


[CV] criterion=gini, max_depth=2048.0, random_state=1000 .............


[CV]  criterion=gini, max_depth=1105.9811726257212, random_state=1000, total=  14.0s


[CV] criterion=entropy, max_depth=8.0, random_state=1000 .............


[CV]  criterion=gini, max_depth=2048.0, random_state=1000, total=  14.2s


[CV] criterion=entropy, max_depth=14.813995396596646, random_state=1000 


[CV]  criterion=entropy, max_depth=8.0, random_state=1000, total=  10.5s


[CV] criterion=entropy, max_depth=27.43180745129833, random_state=1000 


[CV]  criterion=entropy, max_depth=14.813995396596646, random_state=1000, total=  13.7s


[CV] criterion=entropy, max_depth=50.796833662982365, random_state=1000 


[CV]  criterion=entropy, max_depth=27.43180745129833, random_state=1000, total=  15.4s


[CV] criterion=entropy, max_depth=94.06300750563831, random_state=1000 


[CV]  criterion=entropy, max_depth=50.796833662982365, random_state=1000, total=  15.8s


[CV] criterion=entropy, max_depth=174.18112002232027, random_state=1000 


[CV]  criterion=entropy, max_depth=94.06300750563831, random_state=1000, total=  15.9s


[CV] criterion=entropy, max_depth=322.53978877308725, random_state=1000 


[CV]  criterion=entropy, max_depth=174.18112002232027, random_state=1000, total=  15.9s


[CV] criterion=entropy, max_depth=597.2628682629713, random_state=1000 


[CV]  criterion=entropy, max_depth=322.53978877308725, random_state=1000, total=  15.8s


[CV] criterion=entropy, max_depth=1105.9811726257212, random_state=1000 


[CV]  criterion=entropy, max_depth=597.2628682629713, random_state=1000, total=  15.9s


[CV] criterion=entropy, max_depth=2048.0, random_state=1000 ..........


[CV]  criterion=entropy, max_depth=1105.9811726257212, random_state=1000, total=  15.7s


[CV]  criterion=entropy, max_depth=2048.0, random_state=1000, total=  15.4s


[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:  2.5min finished


DecisionTreeClassifier(class_weight=None, criterion='entropy',
            max_depth=14.813995396596646, max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1000, splitter='best')

In [19]:
print("Classifier: Decision Tree | Dataset: Yelp | Bag of Words: Frequency")
print("Tuning hyper-parameters for F1 Score (macro)")
print()
print("Best Parameters found on validation set:")
print(yelp_dt_gs_freq.best_params_)
print()
print("Best Scores on validation set:")
print("%.3f" % yelp_dt_gs_freq.best_score_)
print()
print("Training set prediction F1 Score (macro):")
yelp_dt_train_freq_predicted = yelp_dt_best_clf_freq.predict(yelp_train_freq_x)
print("%.3f" % f1_score(yelp_train_y, yelp_dt_train_freq_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
yelp_dt_valid_freq_predicted = yelp_dt_best_clf_freq.predict(yelp_valid_freq_x)
print("%.3f" % f1_score(yelp_valid_y, yelp_dt_valid_freq_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
yelp_dt_test_freq_predicted = yelp_dt_best_clf_freq.predict(yelp_test_freq_x)
print("%.3f" % f1_score(yelp_test_y, yelp_dt_test_freq_predicted, average='macro'))


Classifier: Decision Tree | Dataset: Yelp | Bag of Words: Frequency
Tuning hyper-parameters for F1 Score (macro)

Best Parameters found on validation set:
{'criterion': 'entropy', 'max_depth': 14.813995396596646, 'random_state': 1000}

Best Scores on validation set:
0.310

Training set prediction F1 Score (macro):


0.618
Validation set prediction F1 Score (macro):
0.310
Test set prediction F1 Score (macro):
0.296


#### Create Linear Support Vector Classifier for Frequency Bag of Words
Parameters:
* C: 7 equally distributed values over 2^1^ to 2^8^
* Tolerance: 7 equally distributed values over 10^-5^ to 10^-2^

In [44]:
yelp_svm_clf_freq = LinearSVC()
yelp_svm_c_freq = np.logspace(1, 8, 7, base=2)
yelp_svm_tol_freq = np.logspace(-5, -2, 7)
yelp_svm_dual_freq = [False]
yelp_svm_random_state_freq = [1000]
yelp_svm_params_freq = [{'C': yelp_svm_c_freq, 'tol': yelp_svm_tol_freq, 'random_state': yelp_svm_random_state_freq, 'dual': yelp_svm_dual_freq}]
yelp_svm_gs_freq = GridSearchCV(yelp_svm_clf_freq, yelp_svm_params_freq, cv=yelp_freq_ps, scoring='f1_macro', n_jobs=2, refit=True, verbose=2)

In [45]:
yelp_svm_gs_freq.fit(yelp_freq_train_valid_x, yelp_freq_train_valid_y)
yelp_svm_best_clf_freq = yelp_svm_gs_freq.best_estimator_
yelp_svm_best_clf_freq.fit(yelp_train_freq_x, yelp_train_y)

Fitting 1 folds for each of 49 candidates, totalling 49 fits


[CV] C=2.0, dual=False, random_state=1000, tol=1e-05 .................


[CV] C=2.0, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV] .. C=2.0, dual=False, random_state=1000, tol=1e-05, total=   2.1s


[CV] C=2.0, dual=False, random_state=1000, tol=0.0001 ................


[CV]  C=2.0, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   2.1s


[CV] C=2.0, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV] . C=2.0, dual=False, random_state=1000, tol=0.0001, total=   1.9s


[CV] C=2.0, dual=False, random_state=1000, tol=0.001 .................


[CV]  C=2.0, dual=False, random_state=1000, tol=0.00031622776601683794, total=   1.8s


[CV] C=2.0, dual=False, random_state=1000, tol=0.0031622776601683794 .


[CV] .. C=2.0, dual=False, random_state=1000, tol=0.001, total=   1.7s


[CV] C=2.0, dual=False, random_state=1000, tol=0.01 ..................


[CV]  C=2.0, dual=False, random_state=1000, tol=0.0031622776601683794, total=   1.7s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=1e-05 ...


[CV] ... C=2.0, dual=False, random_state=1000, tol=0.01, total=   1.6s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=1e-05, total=   2.3s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=0.0001 ..


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   2.3s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=0.0001, total=   2.1s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=0.001 ...


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=0.00031622776601683794, total=   2.0s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=0.001, total=   1.9s


[CV] C=4.489848193237493, dual=False, random_state=1000, tol=0.01 ....


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=0.0031622776601683794, total=   1.9s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=1e-05 ..


[CV]  C=4.489848193237493, dual=False, random_state=1000, tol=0.01, total=   1.7s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=1e-05, total=   2.8s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=0.0001 .


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   2.7s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=0.0001, total=   2.4s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=0.001 ..


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=0.00031622776601683794, total=   2.4s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=0.001, total=   2.1s


[CV] C=10.079368399158986, dual=False, random_state=1000, tol=0.01 ...


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=0.0031622776601683794, total=   2.1s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=1e-05 ..


[CV]  C=10.079368399158986, dual=False, random_state=1000, tol=0.01, total=   1.8s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=1e-05, total=   3.4s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=0.0001 .


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   3.2s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=0.0001, total=   2.9s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=0.001 ..


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=0.00031622776601683794, total=   2.7s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=0.001, total=   2.5s


[CV] C=22.627416997969522, dual=False, random_state=1000, tol=0.01 ...


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=0.0031622776601683794, total=   2.3s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=1e-05 ..


[CV]  C=22.627416997969522, dual=False, random_state=1000, tol=0.01, total=   2.0s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=1e-05, total=   4.4s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=0.0001 .


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   3.7s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=0.0001, total=   3.7s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=0.001 ..


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=0.00031622776601683794, total=   3.3s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=0.001, total=   3.0s


[CV] C=50.796833662982394, dual=False, random_state=1000, tol=0.01 ...


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=0.0031622776601683794, total=   2.7s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=1e-05 ..


[CV]  C=50.796833662982394, dual=False, random_state=1000, tol=0.01, total=   2.2s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=1e-05, total=   6.3s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=0.0001 .


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=   5.8s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=0.00031622776601683794 


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   57.7s


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=0.0001, total=   5.3s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=0.001 ..


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=0.00031622776601683794, total=   4.9s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=0.001, total=   3.9s


[CV] C=114.03503592196348, dual=False, random_state=1000, tol=0.01 ...


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=0.0031622776601683794, total=   3.1s


[CV] C=256.0, dual=False, random_state=1000, tol=1e-05 ...............


[CV]  C=114.03503592196348, dual=False, random_state=1000, tol=0.01, total=   2.7s


[CV] C=256.0, dual=False, random_state=1000, tol=3.1622776601683795e-05 


[CV]  C=256.0, dual=False, random_state=1000, tol=1e-05, total=  10.9s


[CV] C=256.0, dual=False, random_state=1000, tol=0.0001 ..............


[CV]  C=256.0, dual=False, random_state=1000, tol=3.1622776601683795e-05, total=  10.1s


[CV] C=256.0, dual=False, random_state=1000, tol=0.00031622776601683794 


[CV]  C=256.0, dual=False, random_state=1000, tol=0.00031622776601683794, total=   7.1s


[CV] C=256.0, dual=False, random_state=1000, tol=0.001 ...............


[CV]  C=256.0, dual=False, random_state=1000, tol=0.0001, total=   9.0s


[CV] C=256.0, dual=False, random_state=1000, tol=0.0031622776601683794 


[CV]  C=256.0, dual=False, random_state=1000, tol=0.0031622776601683794, total=   3.7s


[CV] C=256.0, dual=False, random_state=1000, tol=0.01 ................


[CV]  C=256.0, dual=False, random_state=1000, tol=0.001, total=   5.1s


[CV] . C=256.0, dual=False, random_state=1000, tol=0.01, total=   2.2s


[Parallel(n_jobs=2)]: Done  49 out of  49 | elapsed:  1.6min finished


LinearSVC(C=114.03503592196348, class_weight=None, dual=False,
     fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
     max_iter=1000, multi_class='ovr', penalty='l2', random_state=1000,
     tol=0.001, verbose=0)

In [46]:
print("Classifier: Linear Support Vector Classifier | Dataset: Yelp | Bag of Words: Frequency")
print("Tuning hyper-parameters for F1 Score (macro)")
print()
print("Best Parameters found on validation set:")
print(yelp_svm_gs_freq.best_params_)
print()
print("Best Scores on validation set:")
print("%.3f" % yelp_svm_gs_freq.best_score_)
print()
print("Training set prediction F1 Score (macro):")
yelp_svm_train_freq_predicted = yelp_svm_best_clf_freq.predict(yelp_train_freq_x)
print("%.3f" % f1_score(yelp_train_y, yelp_svm_train_freq_predicted, average='macro'))
print("Validation set prediction F1 Score (macro):")
yelp_svm_valid_freq_predicted = yelp_svm_best_clf_freq.predict(yelp_valid_freq_x)
print("%.3f" % f1_score(yelp_valid_y, yelp_svm_valid_freq_predicted, average='macro'))
print("Test set prediction F1 Score (macro):")
yelp_svm_test_freq_predicted = yelp_svm_best_clf_freq.predict(yelp_test_freq_x)
print("%.3f" % f1_score(yelp_test_y, yelp_svm_test_freq_predicted, average='macro'))

Classifier: Linear Support Vector Classifier | Dataset: Yelp | Bag of Words: Frequency
Tuning hyper-parameters for F1 Score (macro)

Best Parameters found on validation set:
{'C': 114.03503592196348, 'dual': False, 'random_state': 1000, 'tol': 0.001}

Best Scores on validation set:
0.465

Training set prediction F1 Score (macro):


0.897
Validation set prediction F1 Score (macro):
0.465
Test set prediction F1 Score (macro):
0.460
