In [1]:
import os
import sklearn
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn import decomposition
from glob import glob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.decomposition import PCA

In [2]:
# Use NLTK to remove stopwords and lemmatize.
# You might need to run nltk.download() to fetch the stopword package in "all packages".
# You might also need to run nltk.download("punkt").
# Both objects below are module-level globals read by clean_input_text.
english_stopwords = set(stopwords.words("english"))
wordnet_lemmatizer = WordNetLemmatizer()

In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character outside ``A-Za-z0-9(),!?'` `` is replaced by a space;
      2. the clitics ``'s 've n't 're 'd 'll`` are split off with a leading space;
      3. ``, ! ( ) ?`` are surrounded by spaces so they become separate tokens;
      4. runs of whitespace are collapsed, the result is stripped and lower-cased.

    Unlike the original script, the punctuation is emitted as-is: the old
    replacement strings (``" \\( "`` etc.) were invalid escape sequences that
    leaked a literal backslash into the output (``"\\("`` instead of ``"("``).

    Parameters
    ----------
    string : str
        Raw text (typically one line of a dataset file).

    Returns
    -------
    str
        The cleaned, lower-cased, space-separated text.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Literal substitutions: str.replace avoids regex/escape pitfalls entirely.
    # The order matches the original chain of re.sub calls.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


In [4]:
def get_data(filepath, label, data_dir='./data/', encoding=None):
    """Read a text file and return its cleaned lines with one label per line.

    Parameters
    ----------
    filepath : str
        Path of the file relative to ``data_dir``.
    label : object
        Label attached to every line of the file.
    data_dir : str, optional
        Directory the dataset lives in. Defaults to ``'./data/'``, the
        location that used to be hard-coded.
    encoding : str or None, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        previous behaviour of using the platform default encoding; pass an
        explicit value to make the read reproducible across machines.

    Returns
    -------
    (list of str, list)
        ``clean_str`` applied to each line, and a list of the same length
        holding ``label`` once per line.
    """
    path = os.path.join(data_dir, filepath)
    with open(path, 'r', encoding=encoding) as f:
        # Iterate the handle directly instead of materialising readlines().
        data = [clean_str(line) for line in f]
    labels = [label] * len(data)
    return data, labels

In [5]:
def clean_input_text(text):
    """Remove English stopwords from each sentence and lemmatize what is left.

    Parameters
    ----------
    text : iterable of str
        Sentences to process.

    Returns
    -------
    list of str
        One string per input sentence: the lemmas of the kept tokens, each
        followed by a single space (so non-empty results end with a space).
    """
    def _rebuild(sentence):
        # The stopword test is done on the raw token, before lemmatization.
        kept = (
            wordnet_lemmatizer.lemmatize(tok) + " "
            for tok in word_tokenize(sentence)
            if tok not in english_stopwords
        )
        return "".join(kept)

    return [_rebuild(sentence) for sentence in text]

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import Normalizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

import six
from abc import ABCMeta
import numpy as np
from scipy import sparse
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import normalize, binarize, LabelBinarizer


class NBSVM(six.with_metaclass(ABCMeta, BaseEstimator, ClassifierMixin)):
    """One-vs-rest linear SVMs trained on Naive-Bayes-ratio scaled features.

    For every class a ratio vector is computed from the smoothed per-class
    feature sums (see ``_compute_ratios``). ``fit`` then trains one
    ``LinearSVC`` per class on ``X`` multiplied feature-wise by that class's
    ratio vector, and ``predict`` returns the class whose SVM yields the
    largest decision value.

    Parameters
    ----------
    alpha : float
        Additive smoothing applied to the per-class feature sums.
    C, max_iter, penalty, loss
        Forwarded unchanged to every ``LinearSVC``.

    Attributes set by ``fit``
    -------------------------
    classes_ : labels as returned by ``LabelBinarizer``.
    class_count_ : zero vector with one entry per effective class; only its
        length is used (by ``predict``).
    ratios_ : CSR matrix, one row of log-ratios per effective class.
    svm_ : list with one fitted ``LinearSVC`` per effective class.
    """

    def __init__(self, alpha=1.0, C=1.0, max_iter=10000, penalty='l2', loss='squared_hinge'):
        # Only hyper-parameters live here. ``svm_`` is fitted state and is
        # (re)created in ``fit`` so that refitting cannot stack stale models.
        self.alpha = alpha
        self.max_iter = max_iter
        self.C = C
        self.penalty = penalty
        self.loss = loss

    def fit(self, X, y):
        """Compute the per-class ratios and train one LinearSVC per class.

        Parameters
        ----------
        X : array-like or sparse matrix, non-negative feature values.
        y : array-like of labels.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` contains negative values (raised by ``_compute_ratios``).
        """
        X, y = check_X_y(X, y, accept_sparse='csr')
        # Dense input is accepted by check_X_y; convert so .multiply() below works.
        X = sparse.csr_matrix(X)
        _, n_features = X.shape

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            # Binary problems come back as a single column; expand to two.
            Y = np.concatenate((1 - Y, Y), axis=1)

        # Indicator matrix as float so the class/feature sums are computed
        # in floating point.
        Y = Y.astype(np.float64)

        n_effective_classes = Y.shape[1]
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        # Start every cell at alpha (smoothing); feature sums are added on top.
        self.ratios_ = np.full((n_effective_classes, n_features), self.alpha,
                               dtype=np.float64)
        self._compute_ratios(X, Y)

        # Fresh list on every fit: predict() indexes svm_[i] by class position.
        self.svm_ = []
        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            svm = LinearSVC(C=self.C, max_iter=self.max_iter,
                            penalty=self.penalty, loss=self.loss)
            svm.fit(X_i, Y[:, i])
            self.svm_.append(svm)

        return self

    def predict(self, X):
        """Return, for each row of ``X``, the class with the largest decision value.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if not hasattr(self, "svm_"):
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This NBSVM instance is not fitted yet. Call 'fit' before using 'predict'."
            )

        X = sparse.csr_matrix(check_array(X, accept_sparse='csr'))
        n_effective_classes = self.class_count_.shape[0]
        n_examples = X.shape[0]

        D = np.zeros((n_effective_classes, n_examples))
        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            D[i] = self.svm_[i].decision_function(X_i)

        return self.classes_[np.argmax(D, axis=0)]

    def _compute_ratios(self, X, Y):
        """Count feature occurrences and compute ratios.

        Expects ``self.ratios_`` to be pre-filled with ``alpha``. Adds the
        per-class feature sums ``Y.T @ X``, L1-normalises every row to ``r``
        and stores ``log(r / (1 - r))`` as a CSR matrix in ``self.ratios_``.
        """
        if np.any((X.data if issparse(X) else X) < 0):
            raise ValueError("Input X must be non-negative")

        self.ratios_ += safe_sparse_dot(Y.T, X)  # alpha + feature sum per class
        # Use the returned array instead of relying on the in-place side effect.
        self.ratios_ = normalize(self.ratios_, norm='l1', axis=1, copy=False)
        self.ratios_ = np.log(self.ratios_ / (1 - self.ratios_))
        check_array(self.ratios_)  # rejects NaN / inf produced by the log
        self.ratios_ = sparse.csr_matrix(self.ratios_)
        
class Features:
    """Builds the text-classification pipeline used by the experiments."""

    def transform_bag_of_words(self, X_train):
        """Vectorise ``X_train`` with ``self.count_transformer``, then apply tf-idf.

        NOTE(review): ``self.count_transformer`` is not assigned anywhere in
        this class; the caller must set it before using this method.
        """
        X_train_counts = self.count_transformer.transform(X_train)
        return self.transform_tfidf(X_train_counts)

    def transform_tfidf(self, unmodified_set):
        """Apply ``self.tfidf_transformer`` to ``unmodified_set``.

        NOTE(review): ``self.tfidf_transformer`` is not assigned anywhere in
        this class; the caller must set it before using this method.
        """
        return self.tfidf_transformer.transform(unmodified_set)

    def pipelined_features(self, with_random_search = True, with_nb_features = True,
                           n_iter = 50, cv = 2, random_state = 50, verbose = 10):
        """Create the count -> tf-idf -> normalise -> classifier pipeline.

        Parameters
        ----------
        with_random_search : bool
            If true, wrap the pipeline in a ``RandomizedSearchCV`` over the
            classifier hyper-parameters; otherwise return the bare pipeline.
        with_nb_features : bool
            If true the classifier is ``NBSVM`` (and ``alpha`` is searched as
            well); otherwise a plain ``LinearSVC``.
        n_iter, cv, random_state, verbose
            Forwarded to ``RandomizedSearchCV``. The defaults reproduce the
            values that used to be hard-coded.

        Returns
        -------
        RandomizedSearchCV or Pipeline
            Unfitted estimator.
        """
        from sklearn.pipeline import Pipeline
        from sklearn.model_selection import RandomizedSearchCV

        # NOTE(review): token_pattern contains one capturing group, so the
        # vectorizer presumably emits the group content (possibly '') rather
        # than the full match -- likely why '' is listed here; confirm.
        stop_words = ['', ' ', 'a', 'this', 'of', 'if', 'is', 'are']
        if with_nb_features:
            model = NBSVM()
        else:
            model = LinearSVC()

        pclf = Pipeline([
            ('vect', CountVectorizer(ngram_range = (1,2), token_pattern = r'\b(?:\d+(?:,\d{3,3})+(?:\.\d+)?)\b|(?:\b(?:([a-zA-Z]{2,}|\d+)(?::?\d{0,2}(?:am|pm)?|\'?\w?))\b)', stop_words = stop_words, binary = False)),
            ('tfidf', TfidfTransformer()),
            ('norm', Normalizer()),
            ('clf', model),
        ])

        if not with_random_search:
            return pclf

        # Candidate values; the 'clf__' prefix routes them to the last pipeline step.
        alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        max_iter = [5000, 4000, 6000, 7000, 3500, 10000, 9000, 11000]
        C = [1000, 100, 10, 1]
        penalty = ['l2']
        loss = ['hinge', 'squared_hinge']
        if with_nb_features:
            parameters = {'clf__C': C, 'clf__alpha': alpha, 'clf__max_iter':max_iter, 'clf__loss': loss, 'clf__penalty': penalty}
        else:
            # LinearSVC has no alpha parameter.
            parameters = {'clf__C': C,'clf__max_iter':max_iter, 'clf__loss': loss, 'clf__penalty': penalty}

        return RandomizedSearchCV(pclf, param_distributions = parameters, cv = cv,
                                  verbose = verbose, random_state = random_state,
                                  n_iter = n_iter)

In [7]:
# Pipeline factory; later cells call f.pipelined_features() to get a fresh, unfitted search.
f = Features()

## MR Polarity Dataset with NB-SVM


In [8]:
# Unfitted randomized search over the NB-SVM pipeline.
pclf = f.pipelined_features()

# get_data returns (documents, labels); every line of a file gets the same label.
neg_values = get_data('rt-polaritydata/rt-polarity.neg', 'neg')
pos_values = get_data('rt-polaritydata/rt-polarity.pos', 'pos')
data = {'neg': neg_values[0], 'pos': pos_values[0]}

# Negative examples first, then positive; documents and labels stay aligned.
train_docs = data['neg'] + data['pos']
train_labels = neg_values[1] + pos_values[1]

# Hold out 10% of the sentences for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    train_docs, train_labels, test_size=0.1, random_state=42
)

In [9]:
# Run the randomized hyper-parameter search on the MR polarity training split.
pclf.fit(X_train, y_train)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1, score=0.7594831179658191, total=   3.8s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.5s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1, score=0.7488013341671879, total=   1.6s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.7s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10, score=0.7596915381408921, total=   9.3s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   16.7s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10, score=0.7490097977902856, total=   1.5s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   18.8s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100, score=0.763443101292205, total=   1.6s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   21.1s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100, score=0.7540129247446321, total=   1.3s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   22.9s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100, score=0.7613588995414756, total=   1.6s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   25.3s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100, score=0.7510944340212633, total=   1.6s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   27.6s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1, score=0.7596915381408921, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   29.6s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1, score=0.7490097977902856, total=   1.6s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000, score=0.7596915381408921, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000, score=0.7490097977902856, total=   1.7s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10, score=0.7701125468945393, total=   1.7s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10, score=0.7550552428601209, total=   1.7s
[CV] clf__p

[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.01, clf__C=10, score=0.7671946644435181, total=   1.3s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.01, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.01, clf__C=10, score=0.7554721701063164, total=   1.2s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=100, score=0.7701125468945393, total=   1.5s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=100, score=0.7550552428601209, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=10, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=10, clf__C=1, score=0.7596915381408921, total=   1.5s
[CV] clf

[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=0.01, clf__C=10, score=0.7554721701063164, total=   1.3s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=1000, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=1000, clf__C=1, score=0.7596915381408921, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=1000, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=1000, clf__C=1, score=0.7488013341671879, total=   1.5s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100, score=0.7671946644435181, total=   1.5s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100, score=0.75547217010

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  4.5min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['', ' ', '..., norm='l2')), ('clf', NBSVM(C=1.0, alpha=1.0, loss='squared_hinge', max_iter=10000, penalty='l2'))]),
          fit_params=None, iid='warn', n_iter=50, n_jobs=None,
          param_distributions={'clf__C': [1000, 100, 10, 1], 'clf__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'clf__max_iter': [5000, 4000, 6000, 7000, 3500, 10000, 9000, 11000], 'clf__loss': ['hinge', 'squared_hinge'], 'clf__penalty': ['l2']},
          pre_dispatch='2*n_jobs', random_state=50, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [10]:
from sklearn import metrics
# Predict on the held-out split with the fitted search object.
y_pred_val = pclf.predict(X_test)
print(metrics.classification_report(y_test, y_pred_val))
# Element-wise comparison of predictions and true labels; its mean is the accuracy.
accuracy = np.mean(y_pred_val == y_test)

              precision    recall  f1-score   support

         neg       0.82      0.81      0.81       518
         pos       0.82      0.83      0.82       549

   micro avg       0.82      0.82      0.82      1067
   macro avg       0.82      0.82      0.82      1067
weighted avg       0.82      0.82      0.82      1067



In [11]:
# Report the held-out accuracy computed in the previous cell.
print("Accuracy of model = ", accuracy)

Accuracy of model =  0.8191190253045924


## Customer Review Dataset with NB-SVM

In [12]:
def load_reviews_dataset(data_dir='./data/customer_reviews/', skip_unlabeled=False):
    """Load the five customer-review product files as (sentences, labels).

    Each line is expected to look like ``<annotation>##<sentence>``. Lines
    without ``##`` are ignored. A sentence is labelled 1 (positive) when its
    annotation contains a ``+`` and 0 otherwise.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the product ``.txt`` files. Defaults to the
        location that used to be hard-coded.
    skip_unlabeled : bool, optional
        When true, lines whose annotation before ``##`` is empty are dropped
        instead of being labelled 0. Defaults to ``False``, which keeps the
        previous behaviour.
        NOTE(review): with the default, sentences that carry no annotation
        at all end up in the negative class -- confirm this is intended.

    Returns
    -------
    (list of str, list of int)
        Sentences cleaned with ``clean_str`` and their 0/1 labels.
    """
    products = ["Apex AD2600 Progressive-scan DVD player.txt"
    ,"Canon G3.txt"
    ,"Creative Labs Nomad Jukebox Zen Xtra 40GB.txt"
    ,"Nikon coolpix 4300.txt"
    ,"Nokia 6610.txt"]

    examples = []
    for product in products:
        # Context manager so each file handle is closed deterministically.
        with open(os.path.join(data_dir, product), "r", encoding="utf-8") as review_file:
            examples.extend(review_file)

    x_text, y = [], []
    for example in examples:
        # Split on the FIRST "##" only: a sentence that itself contains "##"
        # must not break the (annotation, sentence) unpacking.
        temp_label, separator, temp_sentence = example.partition("##")
        if not separator:
            # No "##" at all: not a sentence line.
            continue
        if skip_unlabeled and not temp_label.strip():
            continue

        # 1 if the annotation carries any positive mark, else 0.
        final_label = 1 if "+" in temp_label else 0

        x_text.append(clean_str(temp_sentence.strip()))
        y.append(final_label)
    return x_text, y
        
        
# Load the review sentences, then drop stopwords and lemmatize them.
x_text, y = load_reviews_dataset()
x_text = clean_input_text(x_text)
# NOTE: rebinds X_train / X_test / y_train / y_test, replacing the MR polarity split.
X_train, X_test, y_train, y_test = train_test_split(x_text, y, test_size=0.1, random_state=42)

In [13]:
# Fresh, unfitted search for the customer-review data (rebinds pclf from the MR experiment).
pclf = f.pipelined_features()
pclf.fit(X_train, y_train)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1, score=0.7729577464788733, total=   1.2s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1, score=0.7491544532130778, total=   0.7s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.2s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10, score=0.7695774647887323, total=   3.0s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.4s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10, score=0.7446448703494927, total=   2.3s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.9s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100, score=0.7791549295774648, total=   3.0s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   11.1s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100, score=0.7587373167981961, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   11.5s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100, score=0.7746478873239436, total=   2.5s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   14.2s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100, score=0.7519729425028185, total=   0.7s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   15.1s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1, score=0.7712676056338028, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   15.5s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1, score=0.7435174746335964, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000 




[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000, score=0.7690140845070422, total=   4.9s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000, score=0.7435174746335964, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10, score=0.7752112676056339, total=   0.4s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10, score=0.762119503945885, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=0.1, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=0.1, clf__C=10, score=0.7791549295774648, total=   0.6s
[CV] clf_



[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=hinge, clf__alpha=0.001, clf__C=100, score=0.7729577464788733, total=   2.7s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=hinge, clf__alpha=0.001, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=hinge, clf__alpha=0.001, clf__C=100, score=0.762119503945885, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.1, clf__C=1000 




[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.1, clf__C=1000, score=0.7769014084507042, total=   4.3s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.1, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.1, clf__C=1000, score=0.7587373167981961, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=1000 




[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=1000, score=0.7774647887323943, total=   3.6s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=1000, score=0.7638105975197295, total=   2.3s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=1000, clf__C=100 




[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=1000, clf__C=100, score=0.7690140845070422, total=   3.2s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=1000, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=1000, clf__C=100, score=0.7423900789177001, total=   0.6s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=1000, clf__C=100 




[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=1000, clf__C=100, score=0.7661971830985915, total=   2.9s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=1000, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=1000, clf__C=100, score=0.7423900789177001, total=   0.3s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10 




[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10, score=0.7701408450704226, total=   4.7s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10, score=0.7446448703494927, total=   2.0s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100 




[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100, score=0.7769014084507042, total=   3.8s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100, score=0.7638105975197295, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__alpha=1000, clf__C=100 




[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__alpha=1000, clf__C=100, score=0.767887323943662, total=   7.0s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__alpha=1000, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__alpha=1000, clf__C=100, score=0.7423900789177001, total=   0.7s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=10 




[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=10, score=0.7769014084507042, total=   2.3s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=10, score=0.762119503945885, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=10, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=10, clf__C=1, score=0.7712676056338028, total=   0.3s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=10, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=10, clf__C=1, score=0.7435174746335964, total=   0.3s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__alpha=100, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__alpha=100, clf__C=100, score=0.7707042253521127, total=   4.8s




[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=100, score=0.7735211267605634, total=   6.2s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=100, score=0.762119503945885, total=   2.3s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=10, score=0.7752112676056339, total=   0.4s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=10, score=0.762119503945885, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=100 




[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=100, score=0.7769014084507042, total=   6.1s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=100, score=0.758173618940248, total=   0.8s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=10, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=10, clf__C=10, score=0.7712676056338028, total=   0.5s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=10, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=10, clf__C=10, score=0.7435174746335964, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.01, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.01, clf__C=10, score=0.7797183098591549, total= 



[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=100, clf__C=1000, score=0.7695774647887323, total=   7.6s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=100, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=100, clf__C=1000, score=0.7423900789177001, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=1000, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=1000, clf__C=100, score=0.7701408450704226, total=   4.6s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=1000, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=1000, clf__C=100, score=0.7423900789177001, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=1000 




[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=1000, score=0.7740845070422535, total=   3.8s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=1000, score=0.762119503945885, total=   2.4s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=10, score=0.7746478873239436, total=   5.8s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=10, score=0.7626832018038332, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=100 




[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=100, score=0.7740845070422535, total=   2.7s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=100 




[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=100, score=0.762119503945885, total=   1.9s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=1000 




[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=1000, score=0.7774647887323943, total=   4.5s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=1000, score=0.7587373167981961, total=   2.1s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=10, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=10, clf__C=1, score=0.772394366197183, total=   1.1s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=10, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=10, clf__C=1, score=0.7480270574971815, total=   0.7s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__alpha=0.1, clf__C=1000 




[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__alpha=0.1, clf__C=1000, score=0.775774647887324, total=   7.1s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__alpha=0.1, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__alpha=0.1, clf__C=1000, score=0.7587373167981961, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=1, score=0.7808450704225353, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=squared_hinge, clf__alpha=0.1, clf__C=1, score=0.7694475760992108, total=   0.5s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.01, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.01, clf__C=10, score=0.7797183098591549, 



[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=10, score=0.7746478873239436, total=   4.1s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__alpha=0.001, clf__C=10, score=0.7626832018038332, total=   2.3s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=100, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=100, clf__C=1, score=0.7707042253521127, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=100, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=100, clf__C=1, score=0.7423900789177001, total=   0.3s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=1000 




[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=1000, score=0.7769014084507042, total=   7.4s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=1000, score=0.7638105975197295, total=   2.0s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=1000, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=1000, clf__C=1, score=0.7701408450704226, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=1000, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=1000, clf__C=1, score=0.7423900789177001, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=1000, clf__C=1000 




[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=1000, clf__C=1000, score=0.7673239436619719, total=   4.2s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=1000, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=1000, clf__C=1000, score=0.7423900789177001, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=0.01, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=0.01, clf__C=10, score=0.7797183098591549, total=   0.5s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=0.01, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=0.01, clf__C=10, score=0.7638105975197295, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=1000, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=1000, clf__C=1, score=0.7701408450704226, total=   0.2s
[CV]



[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100, score=0.7791549295774648, total=   2.5s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100, score=0.7638105975197295, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=1000, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=1000, clf__C=100, score=0.7701408450704226, total=   4.2s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=1000, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=hinge, clf__alpha=1000, clf__C=100, score=0.7423900789177001, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=100, clf__C=1000 




[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=100, clf__C=1000, score=0.7673239436619719, total=   3.4s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=100, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__alpha=100, clf__C=1000, score=0.7423900789177001, total=   1.9s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=10 




[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=10, score=0.7785915492957747, total=   3.3s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=10, score=0.7649379932356257, total=   1.3s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=100, score=0.7752112676056339, total=   3.3s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=100, score=0.762119503945885, total=   0.3s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__alpha=1, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__alpha=1, clf__C=1, score=0.777

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  3.4min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['', ' ', '..., norm='l2')), ('clf', NBSVM(C=1.0, alpha=1.0, loss='squared_hinge', max_iter=10000, penalty='l2'))]),
          fit_params=None, iid='warn', n_iter=50, n_jobs=None,
          param_distributions={'clf__C': [1000, 100, 10, 1], 'clf__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'clf__max_iter': [5000, 4000, 6000, 7000, 3500, 10000, 9000, 11000], 'clf__loss': ['hinge', 'squared_hinge'], 'clf__penalty': ['l2']},
          pre_dispatch='2*n_jobs', random_state=50, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [14]:
# Evaluate the fitted `pclf` on X_test / y_test (both defined in an earlier cell):
# per-class precision/recall/F1 first, then the overall accuracy.
from sklearn import metrics
y_pred_val = pclf.predict(X_test)
print(metrics.classification_report(y_test, y_pred_val))
# Fraction of predictions equal to the reference label.
# Assumes predict() returns a NumPy array so that `==` compares element-wise -- TODO confirm.
accuracy = np.mean(y_pred_val == y_test)
print("Accuracy of model = ", accuracy)

              precision    recall  f1-score   support

           0       0.83      0.88      0.86       292
           1       0.60      0.50      0.55       103

   micro avg       0.78      0.78      0.78       395
   macro avg       0.72      0.69      0.70       395
weighted avg       0.77      0.78      0.78       395

Accuracy of model =  0.7848101265822784


In [44]:
# Build the pipeline with the randomized hyper-parameter search switched off and fit it
# directly on x_text1 / y_train1.
# NOTE(review): x_text1 / y_train1 (and `f`) are defined outside the visible cells, and this
# cell's execution count (44) is out of sequence with its neighbours -- confirm the notebook
# still works under Restart & Run All.
pclf = f.pipelined_features(with_random_search = False)
pclf.fit(x_text1, y_train1)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['', ' ', '..., norm='l2')), ('clf', NBSVM(C=1.0, alpha=1.0, loss='squared_hinge', max_iter=10000, penalty='l2'))])

In [55]:
# Evaluate the fitted `pclf` on x_text2, using y_train2 as the reference labels.
# NOTE(review): x_text2 / y_train2 are defined outside the visible cells; despite the
# "train" in its name, y_train2 is used here as the evaluation target -- confirm intent.
from sklearn import metrics
y_pred_val = pclf.predict(x_text2)
print(metrics.classification_report(y_train2, y_pred_val))
# Fraction of predictions equal to the reference label.
# Assumes the comparison is element-wise (NumPy array operands) -- TODO confirm.
accuracy = np.mean(y_pred_val == y_train2)
print("Accuracy of model = ", accuracy)

              precision    recall  f1-score   support

           0       0.46      0.29      0.35      2008
           1       0.59      0.46      0.52      9255
           2       0.86      0.92      0.89     56548
           3       0.57      0.55      0.56     10998
           4       0.59      0.44      0.50      3791

   micro avg       0.78      0.78      0.78     82600
   macro avg       0.61      0.53      0.56     82600
weighted avg       0.77      0.78      0.77     82600

Accuracy of model =  0.781731234866828


## SST-2 Dataset NBSVM

In [20]:
def get_data_sst2(filepath):
    """
    Load a binary sentiment file in which every line is "<label> <sentence>".

    The first space-separated token of a line is the label: '1' is mapped to
    'pos', anything else to 'neg'. The remainder of the line is the sentence.

    Parameters
    ----------
    filepath : str
        Path of the file, relative to the './data/' directory.

    Returns
    -------
    (data, labels) : tuple of two lists of equal length
        data[i] is the sentence text without its line terminator,
        labels[i] is 'pos' or 'neg'.

    Notes
    -----
    Blank (whitespace-only) lines are skipped so they do not become empty
    samples labelled 'neg'. The line terminator is removed before parsing so it
    neither sticks to the last word of the sentence nor corrupts the label of a
    line that contains nothing but the label.
    """
    data = []
    labels = []
    with open('./data/' + filepath, 'r') as f:
        # Iterate lazily instead of materialising the whole file with readlines().
        for line in f:
            line = line.rstrip('\n')
            if not line.strip():
                continue
            # Split once: label before the first space, sentence after it.
            label, _, text = line.partition(' ')
            labels.append('pos' if label == '1' else 'neg')
            data.append(text)
    return data, labels

In [21]:
# Fresh feature/pipeline builder with its default options (Features is defined outside
# this excerpt; the fit output below shows a RandomizedSearchCV wrapping an NBSVM step).
f = Features()
pclf = f.pipelined_features()
# get_data_sst2 returns a (texts, labels) tuple; paths are resolved under ./data/.
train = get_data_sst2('stanfordSentimentTreeBank/stsa.binary.train')
test = get_data_sst2('stanfordSentimentTreeBank/stsa.binary.test')

In [22]:
# Fit on the SST-2 training texts (train[0]) and their labels (train[1]).
pclf.fit(train[0], train[1])

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1, score=0.7826589595375723, total=   2.7s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1000, clf__C=1, score=0.7757225433526012, total=   2.0s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.6s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10, score=0.7826589595375723, total=   7.6s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   13.7s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__alpha=10, clf__C=10, score=0.7780346820809249, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   15.6s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100, score=0.7846820809248555, total=   1.1s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.1s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=hinge, clf__alpha=0.1, clf__C=100, score=0.7800578034682081, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   18.8s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100, score=0.7826589595375723, total=   1.2s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   20.6s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=1, clf__C=100, score=0.7823699421965318, total=   1.1s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   22.2s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1, score=0.7826589595375723, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   23.6s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__alpha=10, clf__C=1, score=0.7780346820809249, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000, score=0.7826589595375723, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=10, clf__C=1000, score=0.7780346820809249, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10, score=0.784971098265896, total=   0.8s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=0.001, clf__C=10, score=0.7823699421965318, total=   1.0s
[CV] clf__pe

[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.01, clf__C=10, score=0.7858381502890174, total=   0.8s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.01, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.01, clf__C=10, score=0.7817919075144508, total=   1.1s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=100, score=0.784971098265896, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__alpha=0.001, clf__C=100, score=0.7823699421965318, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=10, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=10, clf__C=1, score=0.7826589595375723, total=   1.0s
[CV] clf_

[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__alpha=0.01, clf__C=10, score=0.7817919075144508, total=   1.1s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=1000, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=1000, clf__C=1, score=0.7829479768786127, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=1000, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__alpha=1000, clf__C=1, score=0.7754335260115607, total=   1.3s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100, score=0.7858381502890174, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__alpha=0.01, clf__C=100, score=0.78179190751

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  3.4min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['', ' ', '..., norm='l2')), ('clf', NBSVM(C=1.0, alpha=1.0, loss='squared_hinge', max_iter=10000, penalty='l2'))]),
          fit_params=None, iid='warn', n_iter=50, n_jobs=None,
          param_distributions={'clf__C': [1000, 100, 10, 1], 'clf__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'clf__max_iter': [5000, 4000, 6000, 7000, 3500, 10000, 9000, 11000], 'clf__loss': ['hinge', 'squared_hinge'], 'clf__penalty': ['l2']},
          pre_dispatch='2*n_jobs', random_state=50, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [23]:
from sklearn import metrics
y_pred_val = pclf.predict(test[0])
print(metrics.classification_report(test[1], y_pred_val))
accuracy = np.mean(y_pred_val == test[1])
print("Accuracy of model = ", accuracy)


              precision    recall  f1-score   support

         neg       0.83      0.79      0.81       912
         pos       0.80      0.84      0.82       909

   micro avg       0.82      0.82      0.82      1821
   macro avg       0.82      0.82      0.82      1821
weighted avg       0.82      0.82      0.82      1821

Accuracy of model =  0.8154859967051071


# LinearSVC Implementation

## MR Dataset LinearSVC

In [24]:
# MR polarity data for the pipeline built without the NB features.
f = Features()
pclf = f.pipelined_features(with_nb_features = False)

# get_data yields a (texts, labels) pair for each polarity file.
neg_values = get_data('rt-polaritydata/rt-polarity.neg', 'neg')
pos_values = get_data('rt-polaritydata/rt-polarity.pos', 'pos')
data = {'neg': neg_values[0], 'pos': pos_values[0]}

# Negative examples first, then positive; labels follow the same order.
train_docs = data['neg'] + data['pos']
train_labels = neg_values[1] + pos_values[1]

# Hold out 10% of the examples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_docs, train_labels, test_size=0.1, random_state=42
)

In [25]:
# Fit `pclf` on the MR training split.
pclf.fit(X_train, y_train)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000, score=0.759899958315965, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000, score=0.7485928705440901, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10 .


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.9s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10, score=0.759899958315965, total=   1.6s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10 .


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.2s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10, score=0.7485928705440901, total=   1.2s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.0s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100, score=0.759899958315965, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.0s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100, score=0.7485928705440901, total=   1.3s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   11.8s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100, score=0.759899958315965, total=   1.2s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   13.6s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100, score=0.7485928705440901, total=   1.5s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   15.7s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10, score=0.760108378491038, total=   1.8s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   18.2s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10, score=0.7502605795288723, total=   1.2s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1 ..
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1, score=0.7611504793664027, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1 ..
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1, score=0.7440066708359391, total=   1.2s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10 .
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10, score=0.759899958315965, total=   1.4s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10 .
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10, score=0.7485928705440901, total=   1.7s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__C=



[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__C=1000, score=0.760108378491038, total=   9.2s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=squared_hinge, clf__C=1000, score=0.7485928705440901, total=   1.6s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=1, score=0.7586494372655272, total=   1.6s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=1, score=0.747759016051699, total=   1.2s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__C=10, score=0.760108378491038, total=   1.6s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__C=10 
[CV]  clf__penal

[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=10, score=0.7485928705440901, total=   1.8s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=1000, score=0.760108378491038, total=  10.6s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=1000, score=0.7485928705440901, total=   1.2s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__C=1, score=0.7586494372655272, total=   1.2s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__C=1, score=0.747759016051699, total=   1.2s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__C=10 
[CV]  clf__penalty=l

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  4.7min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['', ' ', '...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
          fit_params=None, iid='warn', n_iter=50, n_jobs=None,
          param_distributions={'clf__C': [1000, 100, 10, 1], 'clf__max_iter': [5000, 4000, 6000, 7000, 3500, 10000, 9000, 11000], 'clf__loss': ['hinge', 'squared_hinge'], 'clf__penalty': ['l2']},
          pre_dispatch='2*n_jobs', random_state=50, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [26]:
# Evaluate the fitted `pclf` on the MR hold-out split; `accuracy` is printed by the next cell.
from sklearn import metrics
y_pred_val = pclf.predict(X_test)
print(metrics.classification_report(y_test, y_pred_val))
# y_test is a plain Python list; assumes y_pred_val is a NumPy array so `==` is
# element-wise -- TODO confirm.
accuracy = np.mean(y_pred_val == y_test)

              precision    recall  f1-score   support

         neg       0.81      0.80      0.81       518
         pos       0.81      0.82      0.82       549

   micro avg       0.81      0.81      0.81      1067
   macro avg       0.81      0.81      0.81      1067
weighted avg       0.81      0.81      0.81      1067



In [27]:
# `accuracy` was computed in the preceding evaluation cell.
print("Accuracy of model = ", accuracy)

Accuracy of model =  0.8116213683223993


## CR Dataset LinearSVC

In [28]:
# Load the reviews data (load_reviews_dataset is defined outside this excerpt), then drop
# English stop words and lemmatise every sentence with clean_input_text.
x_text, y = load_reviews_dataset()
x_text = clean_input_text(x_text)
# Hold out 10% of the examples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x_text, y, test_size=0.1, random_state=42)

In [29]:
# NOTE(review): no pipeline is created in this section -- `pclf` is the object built with
# with_nb_features=False in the MR LinearSVC section, fitted again here on the CR split.
pclf.fit(X_train, y_train)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000, score=0.7701408450704226, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000, score=0.7423900789177001, total=   0.3s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10 .


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.5s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10, score=0.7774647887323943, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10 .


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.9s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10, score=0.7564825253664036, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.4s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100, score=0.7701408450704226, total=   0.5s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.1s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100, score=0.7423900789177001, total=   0.4s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    3.7s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100, score=0.7701408450704226, total=   0.5s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    4.3s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100, score=0.7423900789177001, total=   0.4s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    4.9s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10, score=0.7791549295774648, total=   0.3s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    5.5s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10, score=0.7604284103720406, total=   0.3s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1 ..
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1, score=0.8005633802816902, total=   0.3s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1 ..
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1, score=0.7897406989853438, total=   0.1s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10 .
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10, score=0.7774647887323943, total=   0.3s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10 .
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10, score=0.7564825253664036, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__C



[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__C=1000, score=0.767887323943662, total=   1.8s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__C=1000, score=0.7429537767756482, total=   1.6s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=hinge, clf__C=10 .
[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=hinge, clf__C=10, score=0.7774647887323943, total=   0.3s
[CV] clf__penalty=l2, clf__max_iter=3500, clf__loss=hinge, clf__C=10 .
[CV]  clf__penalty=l2, clf__max_iter=3500, clf__loss=hinge, clf__C=10, score=0.7564825253664036, total=   0.2s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__C=1, score=0.7977464788732395, total=   0.1s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=110

[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__C=1000, score=0.7429537767756482, total=   1.3s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__C=1, score=0.7977464788732395, total=   0.1s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__C=1, score=0.7818489289740699, total=   0.1s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__C=1000, score=0.7684507042253521, total=   3.9s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__C=1000, score=0.7429537767756482, total=   2.9s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=1 ..
[CV]  clf__p

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.4min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['', ' ', '...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
          fit_params=None, iid='warn', n_iter=50, n_jobs=None,
          param_distributions={'clf__C': [1000, 100, 10, 1], 'clf__max_iter': [5000, 4000, 6000, 7000, 3500, 10000, 9000, 11000], 'clf__loss': ['hinge', 'squared_hinge'], 'clf__penalty': ['l2']},
          pre_dispatch='2*n_jobs', random_state=50, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [30]:
# Evaluate the fitted model on the held-out split.
# `pclf`, `X_test` and `y_test` are not defined in this cell; they must already
# exist in the kernel from earlier cells.
from sklearn import metrics

y_pred_val = pclf.predict(X_test)

# Per-class precision / recall / F1 table.
print(metrics.classification_report(y_test, y_pred_val))

# Use accuracy_score instead of np.mean(y_pred_val == y_test): it checks that
# both label sequences are consistent in length, whereas an elementwise `==`
# on mismatched inputs can collapse to a single scalar and silently yield a
# misleading accuracy. For matching inputs the value is identical.
accuracy = metrics.accuracy_score(y_test, y_pred_val)

              precision    recall  f1-score   support

           0       0.83      0.94      0.88       292
           1       0.73      0.47      0.57       103

   micro avg       0.82      0.82      0.82       395
   macro avg       0.78      0.70      0.73       395
weighted avg       0.81      0.82      0.80       395



In [31]:
# Show the value held in `accuracy`, which is assigned outside this cell.
# print() is given two arguments, so its default separator adds a second space after "=".
print("Accuracy of model = ", accuracy)

Accuracy of model =  0.8151898734177215


## SST2 with LinearSVC

In [36]:
# SST2 experiment: build the model via Features and fit it on the training file.
# `Features` and `get_data_sst2` are defined outside this cell.
f = Features()
# with_nb_features=False: presumably leaves out the naive-Bayes-derived features.
# TODO confirm against Features.pipelined_features.
pclf = f.pipelined_features(with_nb_features = False)
# NOTE(review): `data` is never read in this cell; confirm a later cell needs it,
# otherwise this assignment can be dropped.
data = {}
# `train` is indexed as [0] / [1] in the fit call below; presumably the loader
# returns (texts, labels). TODO confirm against get_data_sst2.
train = get_data_sst2('stanfordSentimentTreeBank/stsa.binary.train')
test = get_data_sst2('stanfordSentimentTreeBank/stsa.binary.test')
# Kept as the last expression so the fitted object's repr is displayed as the cell output.
pclf.fit(train[0], train[1])

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000, score=0.7841040462427745, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=1000, score=0.7754335260115607, total=   1.3s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10 .


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.3s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10, score=0.7841040462427745, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10 .


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.7s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=hinge, clf__C=10, score=0.7754335260115607, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.2s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100, score=0.7841040462427745, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.5s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=hinge, clf__C=100, score=0.7754335260115607, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    8.9s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100, score=0.7841040462427745, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   10.5s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=hinge, clf__C=100, score=0.7754335260115607, total=   1.1s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   11.9s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10, score=0.7829479768786127, total=   1.1s
[CV] clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   13.4s remaining:    0.0s


[CV]  clf__penalty=l2, clf__max_iter=9000, clf__loss=squared_hinge, clf__C=10, score=0.7754335260115607, total=   1.1s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1 ..
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1, score=0.7841040462427745, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1 ..
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1, score=0.780635838150289, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10 .
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10, score=0.7841040462427745, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10 .
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=hinge, clf__C=10, score=0.7754335260115607, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__C=100 
[CV]  clf__penalty=l2, clf__max_iter=10000, clf__loss=squared_hinge, clf__C=

[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__C=10, score=0.7829479768786127, total=   1.1s
[CV] clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__C=10 
[CV]  clf__penalty=l2, clf__max_iter=5000, clf__loss=squared_hinge, clf__C=10, score=0.7754335260115607, total=   1.3s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=squared_hinge, clf__C=1, score=0.7867052023121387, total=   1.1s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=squared_hinge, clf__C=1, score=0.7791907514450868, total=   0.9s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1000, score=0.7841040462427745, total=   1.1s
[CV] clf__penalty=l2, clf__max_iter=7000, clf__loss=hinge, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=



[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__C=1000, score=0.7841040462427745, total=   6.3s
[CV] clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=4000, clf__loss=squared_hinge, clf__C=1000, score=0.7754335260115607, total=   2.0s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__C=1, score=0.7867052023121387, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__C=1 
[CV]  clf__penalty=l2, clf__max_iter=6000, clf__loss=squared_hinge, clf__C=1, score=0.7791907514450868, total=   1.0s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__C=1000 
[CV]  clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__C=1000, score=0.7841040462427745, total=  10.0s
[CV] clf__penalty=l2, clf__max_iter=11000, clf__loss=squared_hinge, clf__C=1000 
[CV]

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  3.0min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['', ' ', '...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
          fit_params=None, iid='warn', n_iter=50, n_jobs=None,
          param_distributions={'clf__C': [1000, 100, 10, 1], 'clf__max_iter': [5000, 4000, 6000, 7000, 3500, 10000, 9000, 11000], 'clf__loss': ['hinge', 'squared_hinge'], 'clf__penalty': ['l2']},
          pre_dispatch='2*n_jobs', random_state=50, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [37]:
# Evaluate the fitted model on the SST2 test data.
# `pclf` and `test` are not defined in this cell; they must already exist in
# the kernel. `test[0]` is passed to predict() and `test[1]` is used as the
# reference labels.
from sklearn import metrics

y_pred_val = pclf.predict(test[0])

# Per-class precision / recall / F1 table.
print(metrics.classification_report(test[1], y_pred_val))

# Use accuracy_score instead of np.mean(y_pred_val == test[1]): it checks that
# both label sequences are consistent in length, whereas an elementwise `==`
# on mismatched inputs can collapse to a single scalar and silently yield a
# misleading accuracy. For matching inputs the value is identical.
accuracy = metrics.accuracy_score(test[1], y_pred_val)
print("Accuracy of model = ", accuracy)


              precision    recall  f1-score   support

         neg       0.83      0.79      0.81       912
         pos       0.80      0.83      0.82       909

   micro avg       0.81      0.81      0.81      1821
   macro avg       0.81      0.81      0.81      1821
weighted avg       0.81      0.81      0.81      1821

Accuracy of model =  0.8127402526084568
