In [1]:
import os
import sklearn
import numpy as np
import re
from sklearn.model_selection import train_test_split
from glob import glob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.decomposition import PCA

In [2]:
# use nltk to remove stopwords and lemmatize
# you might need to run: nltk.download() to fetch the stopword package in "all packages"
# you might also need to run ntlk.download("punkt")
english_stopwords = set(stopwords.words("english"))
wordnet_lemmatizer = WordNetLemmatizer()

In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


In [4]:
def get_data(filepath, label):
    data = []
    labels = []
    with open('./data/' + filepath, 'r') as f:
        for line in f.readlines():
            data.append(clean_str(line))
            labels.append(label)
    return data, labels

In [5]:
def clean_input_text(text):
    clean_text = []
    for sent in text:
        clean_sent = ""
        sent_tokens = word_tokenize(sent)
        for token in sent_tokens:
            clean_sent += wordnet_lemmatizer.lemmatize(token) + " " if token not in english_stopwords else ""
        clean_text.append(clean_sent)
    return clean_text

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import Normalizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

import six
from abc import ABCMeta
import numpy as np
from scipy import sparse
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import normalize, binarize, LabelBinarizer


class NBSVM(six.with_metaclass(ABCMeta, BaseEstimator, ClassifierMixin)):

    def __init__(self, alpha=1.0, C=1.0, max_iter=10000):
        self.alpha = alpha
        self.max_iter = max_iter
        self.C = C
        self.svm_ = []

    def fit(self, X, y):
        X, y = check_X_y(X, y, 'csr')
        _, n_features = X.shape

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64.
        # so we don't have to cast X to floating point
        Y = Y.astype(np.float64)

        # Count raw events from data
        n_effective_classes = Y.shape[1]
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.ratios_ = np.full((n_effective_classes, n_features), self.alpha,
                                 dtype=np.float64)
        self._compute_ratios(X, Y)

        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            svm = LinearSVC(C=self.C, max_iter=self.max_iter)
            Y_i = Y[:,i]
            svm.fit(X_i, Y_i)
            self.svm_.append(svm) 

        return self

    def predict(self, X):
        n_effective_classes = self.class_count_.shape[0]
        n_examples = X.shape[0]

        D = np.zeros((n_effective_classes, n_examples))

        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            D[i] = self.svm_[i].decision_function(X_i)
        
        return self.classes_[np.argmax(D, axis=0)]
        
    def _compute_ratios(self, X, Y):
        """Count feature occurrences and compute ratios."""
        if np.any((X.data if issparse(X) else X) < 0):
            raise ValueError("Input X must be non-negative")

        self.ratios_ += safe_sparse_dot(Y.T, X)  # ratio + feature_occurrance_c
        normalize(self.ratios_, norm='l1', axis=1, copy=False)
        row_calc = lambda r: np.log(np.divide(r, (1 - r)))
        self.ratios_ = np.apply_along_axis(row_calc, axis=1, arr=self.ratios_)
        check_array(self.ratios_)
        self.ratios_ = sparse.csr_matrix(self.ratios_)

class Features:

	def transform_bag_of_words(self, X_train):
		X_train_counts = self.count_transformer.transform(X_train)
		return self.transform_tfidf(X_train_counts)

	def transform_tfidf(self, unmodified_set):
		modified_set = self.tfidf_transformer.transform(unmodified_set)
		return modified_set
	def pipelined_features(self):
		from sklearn.pipeline import Pipeline
		stop_words = ['', ' ', 'a', 'this', 'of', 'if', 'is', 'are']
		pclf = Pipeline([
		    ('vect', CountVectorizer(ngram_range = (1,2), token_pattern = r'\b(?:\d+(?:,\d{3,3})+(?:\.\d+)?)\b|(?:\b(?:([a-zA-Z]{2,}|\d+)(?::?\d{0,2}(?:am|pm)?|\'?\w?))\b)', stop_words = stop_words, binary = False)),
		    ('tfidf', TfidfTransformer()),
		    ('norm', Normalizer()),
            ('clf', NBSVM()),
		])

		from sklearn.model_selection import RandomizedSearchCV
		from scipy.stats import randint as randint
		from scipy.stats import uniform

		params = {
		           "clf__alpha": [1, 2, 3, 4, 5],
		           "clf__C": [1, 2, 3, 0.5, 1.5, 2.5, 3.5],
		           "clf__max_iter": [1000,1500, 2000,2500, 3000,3500, 10000, 9000],

		           }

		seed = 50

		random_search = RandomizedSearchCV(pclf, param_distributions = params, cv=2, verbose = 10, random_state = seed, n_iter = 30)

		return random_search

In [7]:
f = Features()

## MR Polarity Dataset with NB-SVM


In [8]:
pclf = f.pipelined_features()
data = {}
neg_values = get_data('rt-polaritydata/rt-polarity.neg', 'neg')
pos_values = get_data('rt-polaritydata/rt-polarity.pos', 'pos')
data['neg'] = neg_values[0]
data['pos'] = pos_values[0]
train_docs = []
train_docs.extend(data['neg'])
train_docs.extend(data['pos'])
train_labels = []
train_labels.extend(neg_values[1])
train_labels.extend(pos_values[1])
X_train, X_test, y_train, y_test = train_test_split(train_docs, train_labels, test_size=0.2, random_state=42)

In [9]:
pclf.fit(X_train, y_train)

Fitting 2 folds for each of 30 candidates, totalling 60 fits
[CV] clf__max_iter=3500, clf__alpha=4, clf__C=2 ......................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__max_iter=3500, clf__alpha=4, clf__C=2, score=0.757327080890973, total=   2.3s
[CV] clf__max_iter=3500, clf__alpha=4, clf__C=2 ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.6s remaining:    0.0s


[CV]  clf__max_iter=3500, clf__alpha=4, clf__C=2, score=0.7413227016885553, total=   0.6s
[CV] clf__max_iter=9000, clf__alpha=5, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.5s remaining:    0.0s


[CV]  clf__max_iter=9000, clf__alpha=5, clf__C=1, score=0.7575615474794841, total=   2.0s
[CV] clf__max_iter=9000, clf__alpha=5, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.9s remaining:    0.0s


[CV]  clf__max_iter=9000, clf__alpha=5, clf__C=1, score=0.7410881801125704, total=   0.7s
[CV] clf__max_iter=3000, clf__alpha=3, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.8s remaining:    0.0s


[CV]  clf__max_iter=3000, clf__alpha=3, clf__C=1, score=0.7575615474794841, total=   1.8s
[CV] clf__max_iter=3000, clf__alpha=3, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.9s remaining:    0.0s


[CV]  clf__max_iter=3000, clf__alpha=3, clf__C=1, score=0.7422607879924953, total=   0.6s
[CV] clf__max_iter=1500, clf__alpha=1, clf__C=2.5 ....................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    9.9s remaining:    0.0s


[CV]  clf__max_iter=1500, clf__alpha=1, clf__C=2.5, score=0.7592028135990622, total=   2.5s
[CV] clf__max_iter=1500, clf__alpha=1, clf__C=2.5 ....................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   12.7s remaining:    0.0s


[CV]  clf__max_iter=1500, clf__alpha=1, clf__C=2.5, score=0.7436679174484052, total=   0.6s
[CV] clf__max_iter=1000, clf__alpha=2, clf__C=1.5 ....................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   13.6s remaining:    0.0s


[CV]  clf__max_iter=1000, clf__alpha=2, clf__C=1.5, score=0.7582649472450176, total=   2.1s
[CV] clf__max_iter=1000, clf__alpha=2, clf__C=1.5 ....................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   16.1s remaining:    0.0s


[CV]  clf__max_iter=1000, clf__alpha=2, clf__C=1.5, score=0.7431988742964353, total=   0.6s
[CV] clf__max_iter=2500, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=2500, clf__alpha=3, clf__C=3, score=0.7577960140679953, total=   3.2s
[CV] clf__max_iter=2500, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=2500, clf__alpha=3, clf__C=3, score=0.7422607879924953, total=   0.6s
[CV] clf__max_iter=1500, clf__alpha=4, clf__C=2.5 ....................
[CV]  clf__max_iter=1500, clf__alpha=4, clf__C=2.5, score=0.7575615474794841, total=   2.8s
[CV] clf__max_iter=1500, clf__alpha=4, clf__C=2.5 ....................
[CV]  clf__max_iter=1500, clf__alpha=4, clf__C=2.5, score=0.7410881801125704, total=   0.6s
[CV] clf__max_iter=2000, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=2000, clf__alpha=3, clf__C=3, score=0.7577960140679953, total=   3.0s
[CV] clf__max_iter=2000, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=2000, cl



[CV]  clf__max_iter=1000, clf__alpha=5, clf__C=3.5, score=0.757327080890973, total=   3.5s
[CV] clf__max_iter=1000, clf__alpha=5, clf__C=3.5 ....................
[CV]  clf__max_iter=1000, clf__alpha=5, clf__C=3.5, score=0.7410881801125704, total=   0.7s
[CV] clf__max_iter=2000, clf__alpha=5, clf__C=2 ......................
[CV]  clf__max_iter=2000, clf__alpha=5, clf__C=2, score=0.757327080890973, total=   2.5s
[CV] clf__max_iter=2000, clf__alpha=5, clf__C=2 ......................
[CV]  clf__max_iter=2000, clf__alpha=5, clf__C=2, score=0.7410881801125704, total=   1.1s
[CV] clf__max_iter=2000, clf__alpha=5, clf__C=2.5 ....................
[CV]  clf__max_iter=2000, clf__alpha=5, clf__C=2.5, score=0.757327080890973, total=   3.4s
[CV] clf__max_iter=2000, clf__alpha=5, clf__C=2.5 ....................
[CV]  clf__max_iter=2000, clf__alpha=5, clf__C=2.5, score=0.7410881801125704, total=   0.6s
[CV] clf__max_iter=1500, clf__alpha=4, clf__C=0.5 ....................
[CV]  clf__max_iter=1500, clf



[CV]  clf__max_iter=1000, clf__alpha=3, clf__C=3.5, score=0.7577960140679953, total=   2.6s
[CV] clf__max_iter=1000, clf__alpha=3, clf__C=3.5 ....................
[CV]  clf__max_iter=1000, clf__alpha=3, clf__C=3.5, score=0.7422607879924953, total=   0.8s
[CV] clf__max_iter=1000, clf__alpha=1, clf__C=2.5 ....................




[CV]  clf__max_iter=1000, clf__alpha=1, clf__C=2.5, score=0.7592028135990622, total=   2.5s
[CV] clf__max_iter=1000, clf__alpha=1, clf__C=2.5 ....................
[CV]  clf__max_iter=1000, clf__alpha=1, clf__C=2.5, score=0.7436679174484052, total=   0.6s
[CV] clf__max_iter=3500, clf__alpha=2, clf__C=2.5 ....................
[CV]  clf__max_iter=3500, clf__alpha=2, clf__C=2.5, score=0.7582649472450176, total=   2.6s
[CV] clf__max_iter=3500, clf__alpha=2, clf__C=2.5 ....................
[CV]  clf__max_iter=3500, clf__alpha=2, clf__C=2.5, score=0.7431988742964353, total=   0.6s
[CV] clf__max_iter=3500, clf__alpha=4, clf__C=0.5 ....................
[CV]  clf__max_iter=3500, clf__alpha=4, clf__C=0.5, score=0.757327080890973, total=   1.2s
[CV] clf__max_iter=3500, clf__alpha=4, clf__C=0.5 ....................
[CV]  clf__max_iter=3500, clf__alpha=4, clf__C=0.5, score=0.7415572232645403, total=   0.6s
[CV] clf__max_iter=9000, clf__alpha=2, clf__C=3.5 ....................
[CV]  clf__max_iter=900

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  2.0min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['', ' ', '...rue)), ('norm', Normalizer(copy=True, norm='l2')), ('clf', NBSVM(C=1.0, alpha=1.0, max_iter=10000))]),
          fit_params=None, iid='warn', n_iter=30, n_jobs=None,
          param_distributions={'clf__alpha': [1, 2, 3, 4, 5], 'clf__C': [1, 2, 3, 0.5, 1.5, 2.5, 3.5], 'clf__max_iter': [1000, 1500, 2000, 2500, 3000, 3500, 10000, 9000]},
          pre_dispatch='2*n_jobs', random_state=50, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [10]:
from sklearn import metrics
y_pred_val = pclf.predict(X_test)
print(metrics.classification_report(y_test, y_pred_val))
accuracy = np.mean(y_pred_val == y_test)


              precision    recall  f1-score   support

         neg       0.79      0.79      0.79      1062
         pos       0.79      0.79      0.79      1071

   micro avg       0.79      0.79      0.79      2133
   macro avg       0.79      0.79      0.79      2133
weighted avg       0.79      0.79      0.79      2133



In [11]:
print("Accuracy of model = ", accuracy)

Accuracy of model =  0.7876230661040787


In [12]:
pclf = f.pipelined_features()
data = {}
neg_values = get_data('sensitivity/plot.tok.gt9.5000', 'obj')
pos_values = get_data('sensitivity/quote.tok.gt9.5000', 'sub')
data['obj'] = neg_values[0]
data['sub'] = pos_values[0]
train_docs = []
train_docs.extend(data['obj'])
train_docs.extend(data['sub'])
train_labels = []
train_labels.extend(neg_values[1])
train_labels.extend(pos_values[1])
X_train, X_test, y_train, y_test = train_test_split(train_docs, train_labels, test_size=0.2, random_state=42)
pclf.fit(X_train, y_train)

Fitting 2 folds for each of 30 candidates, totalling 60 fits
[CV] clf__max_iter=3500, clf__alpha=4, clf__C=2 ......................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__max_iter=3500, clf__alpha=4, clf__C=2, score=0.90525, total=   0.7s
[CV] clf__max_iter=3500, clf__alpha=4, clf__C=2 ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  clf__max_iter=3500, clf__alpha=4, clf__C=2, score=0.8985, total=   0.7s
[CV] clf__max_iter=9000, clf__alpha=5, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.0s remaining:    0.0s


[CV]  clf__max_iter=9000, clf__alpha=5, clf__C=1, score=0.90475, total=   0.8s
[CV] clf__max_iter=9000, clf__alpha=5, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.1s remaining:    0.0s


[CV]  clf__max_iter=9000, clf__alpha=5, clf__C=1, score=0.89825, total=   0.7s
[CV] clf__max_iter=3000, clf__alpha=3, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.1s remaining:    0.0s


[CV]  clf__max_iter=3000, clf__alpha=3, clf__C=1, score=0.905, total=   0.7s
[CV] clf__max_iter=3000, clf__alpha=3, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.1s remaining:    0.0s


[CV]  clf__max_iter=3000, clf__alpha=3, clf__C=1, score=0.899, total=   0.6s
[CV] clf__max_iter=1500, clf__alpha=1, clf__C=2.5 ....................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    6.2s remaining:    0.0s


[CV]  clf__max_iter=1500, clf__alpha=1, clf__C=2.5, score=0.90675, total=   0.9s
[CV] clf__max_iter=1500, clf__alpha=1, clf__C=2.5 ....................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    7.5s remaining:    0.0s


[CV]  clf__max_iter=1500, clf__alpha=1, clf__C=2.5, score=0.901, total=   0.7s
[CV] clf__max_iter=1000, clf__alpha=2, clf__C=1.5 ....................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    8.5s remaining:    0.0s


[CV]  clf__max_iter=1000, clf__alpha=2, clf__C=1.5, score=0.90525, total=   0.7s
[CV] clf__max_iter=1000, clf__alpha=2, clf__C=1.5 ....................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    9.6s remaining:    0.0s


[CV]  clf__max_iter=1000, clf__alpha=2, clf__C=1.5, score=0.89975, total=   0.7s
[CV] clf__max_iter=2500, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=2500, clf__alpha=3, clf__C=3, score=0.90525, total=   0.6s
[CV] clf__max_iter=2500, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=2500, clf__alpha=3, clf__C=3, score=0.899, total=   0.6s
[CV] clf__max_iter=1500, clf__alpha=4, clf__C=2.5 ....................
[CV]  clf__max_iter=1500, clf__alpha=4, clf__C=2.5, score=0.90525, total=   0.7s
[CV] clf__max_iter=1500, clf__alpha=4, clf__C=2.5 ....................
[CV]  clf__max_iter=1500, clf__alpha=4, clf__C=2.5, score=0.8985, total=   0.6s
[CV] clf__max_iter=2000, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=2000, clf__alpha=3, clf__C=3, score=0.90525, total=   0.6s
[CV] clf__max_iter=2000, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=2000, clf__alpha=3, clf__C=3, score=0.899, total=   0.6s
[CV] clf__max_iter=2

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  1.3min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['', ' ', '...rue)), ('norm', Normalizer(copy=True, norm='l2')), ('clf', NBSVM(C=1.0, alpha=1.0, max_iter=10000))]),
          fit_params=None, iid='warn', n_iter=30, n_jobs=None,
          param_distributions={'clf__alpha': [1, 2, 3, 4, 5], 'clf__C': [1, 2, 3, 0.5, 1.5, 2.5, 3.5], 'clf__max_iter': [1000, 1500, 2000, 2500, 3000, 3500, 10000, 9000]},
          pre_dispatch='2*n_jobs', random_state=50, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [13]:
from sklearn import metrics
y_pred_val = pclf.predict(X_test)
print(metrics.classification_report(y_test, y_pred_val))
accuracy = np.mean(y_pred_val == y_test)
print("Accuracy of model = ", accuracy)

              precision    recall  f1-score   support

         obj       0.92      0.91      0.91      1012
         sub       0.91      0.91      0.91       988

   micro avg       0.91      0.91      0.91      2000
   macro avg       0.91      0.91      0.91      2000
weighted avg       0.91      0.91      0.91      2000

Accuracy of model =  0.913


## Customer Review Dataset with NB-SVM

In [14]:
def load_reviews_dataset():
    #src = list(files.upload().values())[0]
    #open('customer review data','wb').write(src)

    products = ["Apex AD2600 Progressive-scan DVD player.txt"
    ,"Canon G3.txt"
    ,"Creative Labs Nomad Jukebox Zen Xtra 40GB.txt"
    ,"Nikon coolpix 4300.txt"
    ,"Nokia 6610.txt"]
    examples = []
    for product in products:
        examples += list(open('./data/customer_reviews/' + product, "r", encoding="utf-8").readlines())
    
    # for every examples, keep the one starting with a ranking
    x_text, y = [],[]
    for example in examples:
        final_label = 0
        temp_split = example.split("##")
        # don't consider unlabeled sentences
        if len(temp_split) <= 1:
            continue
        temp_label, temp_sentence = temp_split
        # parse the temp_label to find positive or negative
        positive_label = temp_label.split("+")
        #print("len positive label: {}".format(len(positive_label)))
        if len(positive_label) > 1:
            final_label = 1
        
        # so the final_label is either 0 or 1, 0 if negative, 1 if positive
        final_sentence = clean_str(temp_sentence.strip())
        x_text.append(final_sentence)
        y.append(final_label)
    return x_text, y
        
        
x_text, y = load_reviews_dataset()
x_text = clean_input_text(x_text)
X_train, X_test, y_train, y_test = train_test_split(x_text, y, test_size=0.2, random_state=42)

In [15]:
pclf = f.pipelined_features()
pclf.fit(X_train, y_train)

Fitting 2 folds for each of 30 candidates, totalling 60 fits
[CV] clf__max_iter=3500, clf__alpha=4, clf__C=2 ......................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__max_iter=3500, clf__alpha=4, clf__C=2, score=0.7743979721166033, total=   0.9s
[CV] clf__max_iter=3500, clf__alpha=4, clf__C=2 ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV]  clf__max_iter=3500, clf__alpha=4, clf__C=2, score=0.7425491439441978, total=   0.4s
[CV] clf__max_iter=9000, clf__alpha=5, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.5s remaining:    0.0s


[CV]  clf__max_iter=9000, clf__alpha=5, clf__C=1, score=0.7762991128010139, total=   0.4s
[CV] clf__max_iter=9000, clf__alpha=5, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.0s remaining:    0.0s


[CV]  clf__max_iter=9000, clf__alpha=5, clf__C=1, score=0.7476220672162334, total=   0.3s
[CV] clf__max_iter=3000, clf__alpha=3, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.5s remaining:    0.0s


[CV]  clf__max_iter=3000, clf__alpha=3, clf__C=1, score=0.7775665399239544, total=   0.4s
[CV] clf__max_iter=3000, clf__alpha=3, clf__C=1 ......................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.0s remaining:    0.0s


[CV]  clf__max_iter=3000, clf__alpha=3, clf__C=1, score=0.7476220672162334, total=   0.2s
[CV] clf__max_iter=1500, clf__alpha=1, clf__C=2.5 ....................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    3.4s remaining:    0.0s


[CV]  clf__max_iter=1500, clf__alpha=1, clf__C=2.5, score=0.7782002534854245, total=   0.6s
[CV] clf__max_iter=1500, clf__alpha=1, clf__C=2.5 ....................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    4.2s remaining:    0.0s


[CV]  clf__max_iter=1500, clf__alpha=1, clf__C=2.5, score=0.7482561826252377, total=   0.4s
[CV] clf__max_iter=1000, clf__alpha=2, clf__C=1.5 ....................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    4.7s remaining:    0.0s


[CV]  clf__max_iter=1000, clf__alpha=2, clf__C=1.5, score=0.7782002534854245, total=   0.5s
[CV] clf__max_iter=1000, clf__alpha=2, clf__C=1.5 ....................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    5.3s remaining:    0.0s


[CV]  clf__max_iter=1000, clf__alpha=2, clf__C=1.5, score=0.7463538363982245, total=   0.3s
[CV] clf__max_iter=2500, clf__alpha=3, clf__C=3 ......................




[CV]  clf__max_iter=2500, clf__alpha=3, clf__C=3, score=0.7724968314321926, total=   0.9s
[CV] clf__max_iter=2500, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=2500, clf__alpha=3, clf__C=3, score=0.7431832593532023, total=   0.5s
[CV] clf__max_iter=1500, clf__alpha=4, clf__C=2.5 ....................




[CV]  clf__max_iter=1500, clf__alpha=4, clf__C=2.5, score=0.7712294043092522, total=   0.6s
[CV] clf__max_iter=1500, clf__alpha=4, clf__C=2.5 ....................
[CV]  clf__max_iter=1500, clf__alpha=4, clf__C=2.5, score=0.7419150285351934, total=   0.5s
[CV] clf__max_iter=2000, clf__alpha=3, clf__C=3 ......................




[CV]  clf__max_iter=2000, clf__alpha=3, clf__C=3, score=0.7724968314321926, total=   0.7s
[CV] clf__max_iter=2000, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=2000, clf__alpha=3, clf__C=3, score=0.7431832593532023, total=   0.7s
[CV] clf__max_iter=2000, clf__alpha=1, clf__C=3.5 ....................




[CV]  clf__max_iter=2000, clf__alpha=1, clf__C=3.5, score=0.7743979721166033, total=   0.9s
[CV] clf__max_iter=2000, clf__alpha=1, clf__C=3.5 ....................
[CV]  clf__max_iter=2000, clf__alpha=1, clf__C=3.5, score=0.7469879518072289, total=   0.5s
[CV] clf__max_iter=1000, clf__alpha=4, clf__C=1 ......................




[CV]  clf__max_iter=1000, clf__alpha=4, clf__C=1, score=0.7762991128010139, total=   0.5s
[CV] clf__max_iter=1000, clf__alpha=4, clf__C=1 ......................
[CV]  clf__max_iter=1000, clf__alpha=4, clf__C=1, score=0.7476220672162334, total=   0.4s
[CV] clf__max_iter=1500, clf__alpha=1, clf__C=3 ......................




[CV]  clf__max_iter=1500, clf__alpha=1, clf__C=3, score=0.7756653992395437, total=   0.7s
[CV] clf__max_iter=1500, clf__alpha=1, clf__C=3 ......................
[CV]  clf__max_iter=1500, clf__alpha=1, clf__C=3, score=0.7476220672162334, total=   0.7s
[CV] clf__max_iter=1000, clf__alpha=5, clf__C=3.5 ....................




[CV]  clf__max_iter=1000, clf__alpha=5, clf__C=3.5, score=0.7712294043092522, total=   0.4s
[CV] clf__max_iter=1000, clf__alpha=5, clf__C=3.5 ....................




[CV]  clf__max_iter=1000, clf__alpha=5, clf__C=3.5, score=0.7438173747622068, total=   0.4s
[CV] clf__max_iter=2000, clf__alpha=5, clf__C=2 ......................




[CV]  clf__max_iter=2000, clf__alpha=5, clf__C=2, score=0.7743979721166033, total=   0.9s
[CV] clf__max_iter=2000, clf__alpha=5, clf__C=2 ......................
[CV]  clf__max_iter=2000, clf__alpha=5, clf__C=2, score=0.7425491439441978, total=   0.5s
[CV] clf__max_iter=2000, clf__alpha=5, clf__C=2.5 ....................




[CV]  clf__max_iter=2000, clf__alpha=5, clf__C=2.5, score=0.7712294043092522, total=   0.8s
[CV] clf__max_iter=2000, clf__alpha=5, clf__C=2.5 ....................
[CV]  clf__max_iter=2000, clf__alpha=5, clf__C=2.5, score=0.7419150285351934, total=   0.5s
[CV] clf__max_iter=1500, clf__alpha=4, clf__C=0.5 ....................
[CV]  clf__max_iter=1500, clf__alpha=4, clf__C=0.5, score=0.7807351077313055, total=   0.3s
[CV] clf__max_iter=1500, clf__alpha=4, clf__C=0.5 ....................
[CV]  clf__max_iter=1500, clf__alpha=4, clf__C=0.5, score=0.7469879518072289, total=   0.2s
[CV] clf__max_iter=9000, clf__alpha=1, clf__C=3.5 ....................
[CV]  clf__max_iter=9000, clf__alpha=1, clf__C=3.5, score=0.7743979721166033, total=   1.4s
[CV] clf__max_iter=9000, clf__alpha=1, clf__C=3.5 ....................
[CV]  clf__max_iter=9000, clf__alpha=1, clf__C=3.5, score=0.7469879518072289, total=   0.4s
[CV] clf__max_iter=1000, clf__alpha=3, clf__C=3.5 ....................




[CV]  clf__max_iter=1000, clf__alpha=3, clf__C=3.5, score=0.7718631178707225, total=   0.4s
[CV] clf__max_iter=1000, clf__alpha=3, clf__C=3.5 ....................




[CV]  clf__max_iter=1000, clf__alpha=3, clf__C=3.5, score=0.7444514901712111, total=   0.4s
[CV] clf__max_iter=1000, clf__alpha=1, clf__C=2.5 ....................




[CV]  clf__max_iter=1000, clf__alpha=1, clf__C=2.5, score=0.7775665399239544, total=   0.4s
[CV] clf__max_iter=1000, clf__alpha=1, clf__C=2.5 ....................




[CV]  clf__max_iter=1000, clf__alpha=1, clf__C=2.5, score=0.7482561826252377, total=   0.3s
[CV] clf__max_iter=3500, clf__alpha=2, clf__C=2.5 ....................
[CV]  clf__max_iter=3500, clf__alpha=2, clf__C=2.5, score=0.7737642585551331, total=   1.1s
[CV] clf__max_iter=3500, clf__alpha=2, clf__C=2.5 ....................
[CV]  clf__max_iter=3500, clf__alpha=2, clf__C=2.5, score=0.7444514901712111, total=   0.7s
[CV] clf__max_iter=3500, clf__alpha=4, clf__C=0.5 ....................
[CV]  clf__max_iter=3500, clf__alpha=4, clf__C=0.5, score=0.7807351077313055, total=   0.3s
[CV] clf__max_iter=3500, clf__alpha=4, clf__C=0.5 ....................
[CV]  clf__max_iter=3500, clf__alpha=4, clf__C=0.5, score=0.7469879518072289, total=   0.1s
[CV] clf__max_iter=9000, clf__alpha=2, clf__C=3.5 ....................
[CV]  clf__max_iter=9000, clf__alpha=2, clf__C=3.5, score=0.7731305449936628, total=   1.5s
[CV] clf__max_iter=9000, clf__alpha=2, clf__C=3.5 ....................
[CV]  clf__max_iter=90



[CV]  clf__max_iter=1500, clf__alpha=2, clf__C=1.5, score=0.7782002534854245, total=   0.6s
[CV] clf__max_iter=1500, clf__alpha=2, clf__C=1.5 ....................
[CV]  clf__max_iter=1500, clf__alpha=2, clf__C=1.5, score=0.7463538363982245, total=   0.4s
[CV] clf__max_iter=10000, clf__alpha=3, clf__C=3.5 ...................
[CV]  clf__max_iter=10000, clf__alpha=3, clf__C=3.5, score=0.7724968314321926, total=   1.5s
[CV] clf__max_iter=10000, clf__alpha=3, clf__C=3.5 ...................
[CV]  clf__max_iter=10000, clf__alpha=3, clf__C=3.5, score=0.7444514901712111, total=   0.5s
[CV] clf__max_iter=1000, clf__alpha=2, clf__C=1 ......................




[CV]  clf__max_iter=1000, clf__alpha=2, clf__C=1, score=0.7775665399239544, total=   0.4s
[CV] clf__max_iter=1000, clf__alpha=2, clf__C=1 ......................
[CV]  clf__max_iter=1000, clf__alpha=2, clf__C=1, score=0.7469879518072289, total=   0.2s
[CV] clf__max_iter=9000, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=9000, clf__alpha=3, clf__C=3, score=0.7724968314321926, total=   1.2s
[CV] clf__max_iter=9000, clf__alpha=3, clf__C=3 ......................
[CV]  clf__max_iter=9000, clf__alpha=3, clf__C=3, score=0.7431832593532023, total=   0.4s
[CV] clf__max_iter=2000, clf__alpha=5, clf__C=1 ......................
[CV]  clf__max_iter=2000, clf__alpha=5, clf__C=1, score=0.7762991128010139, total=   0.5s
[CV] clf__max_iter=2000, clf__alpha=5, clf__C=1 ......................
[CV]  clf__max_iter=2000, clf__alpha=5, clf__C=1, score=0.7476220672162334, total=   0.3s
[CV] clf__max_iter=3500, clf__alpha=1, clf__C=3.5 ....................




[CV]  clf__max_iter=3500, clf__alpha=1, clf__C=3.5, score=0.7743979721166033, total=   1.1s
[CV] clf__max_iter=3500, clf__alpha=1, clf__C=3.5 ....................
[CV]  clf__max_iter=3500, clf__alpha=1, clf__C=3.5, score=0.7469879518072289, total=   0.5s
[CV] clf__max_iter=1500, clf__alpha=4, clf__C=1 ......................
[CV]  clf__max_iter=1500, clf__alpha=4, clf__C=1, score=0.7762991128010139, total=   0.5s
[CV] clf__max_iter=1500, clf__alpha=4, clf__C=1 ......................
[CV]  clf__max_iter=1500, clf__alpha=4, clf__C=1, score=0.7476220672162334, total=   0.2s
[CV] clf__max_iter=3000, clf__alpha=2, clf__C=2.5 ....................
[CV]  clf__max_iter=3000, clf__alpha=2, clf__C=2.5, score=0.7737642585551331, total=   1.2s
[CV] clf__max_iter=3000, clf__alpha=2, clf__C=2.5 ....................
[CV]  clf__max_iter=3000, clf__alpha=2, clf__C=2.5, score=0.7444514901712111, total=   0.7s
[CV] clf__max_iter=3500, clf__alpha=1, clf__C=3 ......................
[CV]  clf__max_iter=3500, 

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   45.0s finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['', ' ', '...rue)), ('norm', Normalizer(copy=True, norm='l2')), ('clf', NBSVM(C=1.0, alpha=1.0, max_iter=10000))]),
          fit_params=None, iid='warn', n_iter=30, n_jobs=None,
          param_distributions={'clf__alpha': [1, 2, 3, 4, 5], 'clf__C': [1, 2, 3, 0.5, 1.5, 2.5, 3.5], 'clf__max_iter': [1000, 1500, 2000, 2500, 3000, 3500, 10000, 9000]},
          pre_dispatch='2*n_jobs', random_state=50, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [16]:
from sklearn import metrics
y_pred_val = pclf.predict(X_test)
print(metrics.classification_report(y_test, y_pred_val))
accuracy = np.mean(y_pred_val == y_test)
print("Accuracy of model = ", accuracy)

              precision    recall  f1-score   support

           0       0.84      0.87      0.86       574
           1       0.62      0.56      0.59       215

   micro avg       0.79      0.79      0.79       789
   macro avg       0.73      0.71      0.72       789
weighted avg       0.78      0.79      0.78       789

Accuracy of model =  0.7858048162230672


In [17]:
print(pclf.best_estimator_.named_steps['vect'].get_feature_names())


