# About this Script

This script computes embeddings with Gensim word2vec Algorithm from our indeed review dataset and evaluates different classifiers with it.

### Create embeddings with Gensims Word2Vec Algorithm

##### Preparing Environment

In [10]:
import numpy as np
import pandas as pd
import gensim
import time
import nltk
import re
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import preprocessing
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from gensim.models import KeyedVectors

data = pd.read_csv('indeed_reviews_preprocessed_balanced.csv').sample(frac=1)
# x_corpus = data['review'] # for TFIDF Feature Generation
data.head()

# Reduce data_set for training
LIMIT = 1000
data = data[0:LIMIT]
data.head()

Unnamed: 0,company,review,rating,sentiment
13979,university of sydney,My experience at the Sydney Uni has been great...,3.0,1
8094,dentsu aegis network,"When the merger happened, the new guys in char...",4.0,0
544,spectrum health,Mental facilities crazy people come though the...,4.0,0
5147,televisa,"Buena empresa, ambiente por lo general bueno, ...",5.0,0
8356,engie,Great people to work for and with. Internation...,5.0,0


##### Helper Functions

In [11]:
# Right Target Labeling
def target_names (targets): #inherits to confusion_matrix and print_results function
    le = preprocessing.LabelEncoder()
    le.fit(data[targets])

    target_names = le.classes_
    
    return list(map(str,target_names)) # Convert target names into strings to avoid "TypeError: object of type 'numpy.int64' has no len()""

# Evaluation
def confusion_matrix(classifier, x_test, y_test, target_names):
    titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
    for title, normalize in titles_options:
        disp = plot_confusion_matrix(classifier, x_test, y_test,
                                 display_labels=target_names,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
        disp.ax_.set_title(title)
    
        print(title)
        print(disp.confusion_matrix)

    plt.show()

# Print Classification Report
def print_results(time_train, time_predict,y_testings,y_predictions, target_names):
    print("Training time: %fs; Prediction time: %fs" % (time_train, time_predict))
    print(classification_report(y_test, y_predic, target_names= target_names))

    
# returns text without special chars and punctuations
def cleanText(text): 
    cleaned = re.sub("[^a-zA-Z0-9']"," ",text)
    lowered = cleaned.lower()
    return lowered.strip()

# Test
# cleanText("WH@T !N THE H$§L")


# Removing common stopwords
def remove_stopwords(input_text):
    '''
    Function to remove English stopwords from a Pandas Series.
    
    Parameters:
        input_text : text to clean
    Output:
        cleaned Pandas Series 
    '''
    stopwords_list = stopwords.words('english')
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"]
    words = input_text.split() 
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
    return " ".join(clean_words) 


# Lemmatizaton and Tokenization
def lem_and_tok(tokens):
    le = WordNetLemmatizer()
    lemmatized_tokens_empty = []
    for word in tokens:
        lemmatized_tokens = lemmatized_tokens_empty.append(le.lemmatize(word))
    tokenizated = [[w for w in sentence.split(" ") if w != ""] for sentence in tokens]
    return tokenizated

# Compute Embedding Sequences 
class Sequencer():
    
    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 embedding_matrix
                ):
        
        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix
        """
        temp_vocab = Vocab which has all the unique words
        self.vocab = Our last vocab which has only most used N words.
    
        """
        temp_vocab = list(set(all_words))
        self.vocab = []
        self.word_cnts = {}
        """
        Now we'll create a hash map (dict) which includes words and their occurencies
        """
        for word in temp_vocab:
            # 0 does not have a meaning, you can add the word to the list or something different.
            count = len([0 for w in all_words if w == word])
            self.word_cnts[word] = count
            counts = list(self.word_cnts.values())
            indexes = list(range(len(counts)))
        
        # Now we'll sort counts and while sorting them also will sort indexes.
        # We'll use those indexes to find most used N word.
        cnt = 0
        while cnt + 1 != len(counts):
            cnt = 0
            for i in range(len(counts)-1):
                if counts[i] < counts[i+1]:
                    counts[i+1],counts[i] = counts[i],counts[i+1]
                    indexes[i],indexes[i+1] = indexes[i+1],indexes[i]
                else:
                    cnt += 1
        
        for ind in indexes[:max_words]:
            self.vocab.append(temp_vocab[ind])
                    
    def textToVector(self,text):
        # First we need to split the text into its tokens and learn the length
        # If length is shorter than the max len we'll add some spaces (100D vectors which has only zero values)
        # If it's longer than the max len we'll trim from the end.
        tokens = text.split()
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1
        vec = []
        for tok in tokens[:len_v]:
            try:
                vec.append(self.embed_matrix[tok])
            except Exception as E:
                pass
        
        last_pieces = self.seq_len - len(vec)
        for i in range(last_pieces):
            vec.append(np.zeros(100,))
        
        return np.asarray(vec).flatten()

# Create Feature Generation with TFIDF
def tfidf_feat_generator(corpus):
    vectorizer = TfidfVectorizer()
    vec_corpus = vectorizer.fit_transform(corpus)
    return vec_corpus

##### Cleaning and Tokenization

In [12]:
data.review = data.review.apply(remove_stopwords)

x = np.asarray(data["review"])

x_cleaned = [cleanText(t) for t in x]
# x_cleaned[:4]

# Tokenization
x_tokenized = lem_and_tok(x_cleaned)
print("Number of reviews: " + str(len(x_tokenized)))
x_tokenized[0:10]

Number of reviews: 1000


[['my',
  'experience',
  'sydney',
  'uni',
  'great',
  'it',
  'amazing',
  'learning',
  'experience',
  'friendly',
  'colleagues',
  'guiding',
  'supervisors',
  'good',
  'work',
  'space',
  'umpteen',
  'number',
  'tasks',
  'done',
  'employment',
  'work',
  'environment',
  'conducive',
  'made',
  'things',
  'enjoyable',
  'the',
  'hardest',
  'part',
  'job',
  'data',
  'collection',
  'management',
  'managed',
  'well',
  'one',
  'thing',
  'job',
  'taught',
  'me',
  'the',
  'best',
  'part',
  'job',
  'getting',
  'meet',
  'many',
  'different',
  'people',
  'various',
  'skills',
  'getting',
  'opportunity',
  'learn',
  'many',
  'new',
  'things'],
 ['when',
  'merger',
  'happened',
  'new',
  'guys',
  'charge',
  'london',
  'really',
  'see',
  'power',
  'vision',
  'they',
  'media',
  'guys',
  'pure',
  'simple',
  'see',
  'potential',
  'become',
  'first',
  'collaborative',
  'truly',
  'integrated',
  'holding',
  'company',
  'model',
  'b

##### Create word2vec Model with Gensim

In [13]:
import time

start = time.time()

model = gensim.models.Word2Vec(x_tokenized,
                 vector_size=100 # Size is the length of our vector.
                )

end = round(time.time()-start,2)
print("The modeling took",end,"seconds.")

# Test
model.wv.most_similar("company")
# model.wv[0]

# Save word2vec embeddings
# word_vectors = model.wv
# word_vectors.save('w2v_vectors.wv')

# Fetch learned word - vector pairs
w2v = dict({})
for idx, key in enumerate(model.wv.key_to_index):
    w2v[key] = model.wv.get_vector(key)

# save learned word - vector pairs as csv
pd.DataFrame(w2v).to_csv('w2v_embeddings.csv', index=False)

# Most frequent words/features
pd.DataFrame(w2v).head()

The modeling took 0.11 seconds.


Unnamed: 0,work,not,the,company,great,management,good,job,people,no,...,india,feed,luck,harder,interested,budget,cons,engineer,leads,physics
0,-0.383646,-0.407361,-0.366816,-0.367624,-0.298257,-0.380843,-0.275764,-0.356595,-0.397731,-0.416969,...,-0.04814,-0.042542,-0.037204,-0.024157,-0.04323,-0.048765,-0.027576,-0.026452,-0.047843,-0.063602
1,0.549129,0.572836,0.526904,0.524276,0.41684,0.533489,0.401154,0.517539,0.564518,0.579732,...,0.064204,0.070277,0.059058,0.050873,0.064915,0.058431,0.040753,0.036688,0.051115,0.100589
2,0.000877,0.00173,-0.012193,-0.004413,-0.011494,-0.003757,-0.006691,0.005978,0.000765,-0.012023,...,0.008862,0.007304,-0.00881,-0.002457,-0.002444,-0.001367,0.005118,-0.000188,0.006879,-0.001133
3,-0.023733,-0.034095,-0.038609,-0.037982,-0.029568,-0.048472,-0.02672,-0.028073,-0.029292,-0.034246,...,-0.011099,-0.007229,-0.013901,-0.009113,-0.006322,-0.002696,-0.008845,0.00191,0.003909,-0.003168
4,0.060236,0.077343,0.072054,0.068571,0.055268,0.056565,0.051837,0.059421,0.07565,0.070805,...,0.008641,0.007672,0.002797,0.014774,0.003657,0.004309,0.007428,0.004794,0.000267,0.014657


##### Create Sequences

In constructor function our class takes 4 parameters: 
- all_words
    - Transforms our dataset into al list including all word tokens (not list of lists (sentences) concatenate all the sentences)
- max_words
    - If dataset has a lot of unique words, limit the number of words. This parameter will be used in finding most used N (max_words) word.
- seq_length
    - In machine learning dataset's number of variable has to be specified. But in real life each sentence might has a different length. In order to prevent this problem we'll determine a length and adapt our sentences to that length (padding).
- embedding_matrix
    - Contains the learned word embeddings from the word2vec model

In [14]:
# Initialize Sequencer
sequencer = Sequencer(all_words = [token for seq in x_tokenized for token in seq],
              max_words = 1000, # Number of unique words which are considered
              seq_len = 15, # Every sentence has different length --> seq_len computes equal length for every sentence
              embedding_matrix = model.wv
             )

# Test Sequencer
test_vec = sequencer.textToVector("hello its me again")
test_vec
test_vec.shape

(1500,)

##### Reduce dimensionality with PCA

- Principal Component Analysis (PCA) is an unsupervised, non-parametric statistical technique primarily used for dimensionality reduction in machine learning. Most used, hence lower impact on computational performance

In [15]:
# create vectors for our each vector
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in x_tokenized])
print(x_vecs.shape)

from sklearn.decomposition import PCA
pca_model = PCA(n_components=300)
pca_model.fit(x_vecs)
print("accuracy after dimension reduction := " + "Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))
# The pca.explained_variance_ratio_ parameter returns a vector of the variance explained by each dimension.
# For non techs: "How much accuracy is kept, after dimensionality reduction"

# Shape after reduction
x_comps = pca_model.transform(x_vecs)
x_comps.shape


(1000, 1500)
accuracy after dimension reduction := Sum of variance ratios:  0.999581515621615


(1000, 300)

##### Train and Test split

In [16]:
# x_tfidf = tfidf_feat_generator(x_corpus) # --> Train with TFIDF Features

x_train,x_test,y_train,y_test = train_test_split(x_comps, # replace x_comps with x_tfidf for tfidf
                                                 data.sentiment,
                                                 test_size=0.2,
                                                 random_state=42)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

'''
About Normalization and Word Embeddings (cosine similarity [0,1]):
--> From Levy et al., 2015 (and, actually, most of the literature on word embeddings):
    Vectors are normalized to unit length before they are used for similarity calculation, 
    making cosine similarity and dot-product equivalent.

--> Also from Wilson and Schakel, 2015:
    Most applications of word embeddings explore not the word vectors themselves, 
    but relations between them to solve, for example, similarity and word relation tasks. 
    For these tasks, it was found that using normalised word vectors improves performance.

--> Schakel and Wilson, 2015 observed some interesting facts regarding the length of word vectors:
    A word that is consistently used in a similar context will be represented by a longer vector 
    than a word of the same frequency that is used in different contexts. Not only the direction, 
    but also the length of word vectors carries important information. Word vector length furnishes, 
    in combination with term frequency, a useful measure of word significance.
'''

(800, 300)
(200, 300)
(800,)
(200,)


'\nAbout Normalization and Word Embeddings (cosine similarity [0,1]):\n--> From Levy et al., 2015 (and, actually, most of the literature on word embeddings):\n    Vectors are normalized to unit length before they are used for similarity calculation, \n    making cosine similarity and dot-product equivalent.\n\n--> Also from Wilson and Schakel, 2015:\n    Most applications of word embeddings explore not the word vectors themselves, \n    but relations between them to solve, for example, similarity and word relation tasks. \n    For these tasks, it was found that using normalised word vectors improves performance.\n\n--> Schakel and Wilson, 2015 observed some interesting facts regarding the length of word vectors:\n    A word that is consistently used in a similar context will be represented by a longer vector \n    than a word of the same frequency that is used in different contexts. Not only the direction, \n    but also the length of word vectors carries important information. Word ve

### Classifiers

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn import svm

# RandomForestClassifier
print("Random Forest: ")
rfc = RandomForestClassifier()
t0 = time.time()
rfc.fit(x_train,y_train)
t1 = time.time()
y_predic = rfc.predict(x_test)
t2 = time.time()
time_random_train = t1-t0
time_random_predict = t2-t1
print_results(time_random_train, time_random_predict, y_test, y_predic, target_names('sentiment'))
# confusion_matrix(rfc, x_test, y_test, target_names('sentiment')) #classifier,#x_test,#y_test #target_names

# LogisticRegression
print("Logistic Regression: ")
logreg = LogisticRegression()
t0 = time.time()
logreg.fit(x_train,y_train)
t1 = time.time()
y_predic = logreg.predict(x_test)
t2 = time.time()
time_log_train = t1-t0
time_log_predict = t2-t1
print_results(time_log_train, time_log_predict, y_test, y_predic, target_names('sentiment'))
# confusion_matrix(logreg, x_test, y_test, target_names('sentiment')) #classifier,#x_test,#y_test #target_names

# GausianNB
print("GausianNB: ")
gaus = GaussianNB()
t0 = time.time()
gaus.fit(x_train,y_train)
t1 = time.time()
y_predic = gaus.predict(x_test)
t2 = time.time()
time_gaus_train = t1-t0
time_gaus_predict = t2-t1
print_results(time_gaus_train, time_gaus_predict, y_test, y_predic, target_names('sentiment'))
# confusion_matrix(gaus, x_test, y_test, target_names('sentiment')) #classifier,#x_test,#y_test #target_names

# BernoulliNB
print("BernoulliNB: ")
bern = BernoulliNB()
t0 = time.time()
bern.fit(x_train,y_train)
t1 = time.time()
y_predic = bern.predict(x_test)
t2 = time.time()
time_bern_train = t1-t0
time_bern_predict = t2-t1
print_results(time_bern_train, time_bern_predict, y_test, y_predic, target_names('sentiment'))
# confusion_matrix(bern, x_test, y_test, target_names('sentiment')) #classifier,#x_test,#y_test #target_names

# LinearSVC
print("LinearSVC: ")
lin_svc = svm.LinearSVC()
# by default square hinge loss --> worse f1-scores than SVC with linear kernel
# hinge loss function spills out similiar f1-scores as SVC with linear kernel

t0 = time.time()
lin_svc.fit(x_train,y_train)
t1 = time.time()
y_predic = lin_svc.predict(x_test)
t2 = time.time()
time_lin_svc_train = t1-t0
time_lin_svc_predict = t2-t1
print_results(time_lin_svc_train, time_lin_svc_predict, y_test, y_predic, target_names('sentiment'))
# confusion_matrix(lin_svc, x_test, y_test, target_names('sentiment')) #classifier,#x_test,#y_test #target_names

# SVC
print("SVC: ")
svc = svm.SVC()
# by default square hinge loss --> worse f1-scores than SVC with linear kernel
# hinge loss function spills out similiar f1-scores as SVC with linear kernel

t0 = time.time()
svc.fit(x_train,y_train)
t1 = time.time()
y_predic = lin_svc.predict(x_test)
t2 = time.time()
time_svc_train = t1-t0
time_svc_predict = t2-t1
print_results(time_svc_train, time_svc_predict, y_test, y_predic, target_names('sentiment'))
# confusion_matrix(svc, x_test, y_test, target_names('sentiment')) #classifier,#x_test,#y_test #target_names

'''
--> ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.

--> Solving the linear SVM is just solving a quadratic optimization problem. 
    The solver is typically an iterative algorithm that keeps a running 
    estimate of the solution (i.e., the weight and bias for the SVM). It 
    stops running when the solution corresponds to an objective value 
    that is optimal for this convex optimization problem, or when it hits 
    the maximum number of iterations set. If the algorithm does not converge, 
    then the current estimate of the SVM's parameters are not guaranteed to 
    be any good, hence the predictions can also be complete garbage.

--> Solution:
    1)  Normalize your training data so that the problem hopefully becomes more well conditioned, 
        which in turn can speed up convergence. One possibility is to scale your data to 0 mean, 
        unit standard deviation using Scikit-Learn's StandardScaler for an example. Note that you 
        have to apply the StandardScaler fitted on the training data to the test data.
        
    2)  Related to 1), make sure the other arguments such as regularization weight, C, is set appropriately.
    
    3)  Set max_iter to a larger value. The default is 1000.
    4)  Set dual = True if number of features > number of examples and vice versa. 
        This solves the SVM optimization problem using the dual formulation. 
        Use a different solver, for e.g., the L-BFGS solver if you are using 
        Logistic Regression.

Note: Doc2Vec seems to converge just fine - 
'''

Random Forest: 
Training time: 0.775411s; Prediction time: 0.014640s
              precision    recall  f1-score   support

           0       0.50      0.40      0.44        93
           1       0.56      0.65      0.60       107

    accuracy                           0.54       200
   macro avg       0.53      0.53      0.52       200
weighted avg       0.53      0.54      0.53       200

Logistic Regression: 
Training time: 0.013391s; Prediction time: 0.000250s
              precision    recall  f1-score   support

           0       0.48      0.46      0.47        93
           1       0.55      0.56      0.55       107

    accuracy                           0.52       200
   macro avg       0.51      0.51      0.51       200
weighted avg       0.51      0.52      0.51       200

GausianNB: 
Training time: 0.003915s; Prediction time: 0.001980s
              precision    recall  f1-score   support

           0       0.59      0.48      0.53        93
           1       0.61     



###### Hyperparameter Pruning

In [18]:
from sklearn.model_selection import GridSearchCV
  
# defining parameter range
param_grid = {'penalty': ['l1', '12', 'elasticnet', 'none'], # C determines how many data samples are allowed to be placed in different classes. If the value of C is set to a low value, the probability of the outliers is increased, and the general decision boundary is found. If the value of C is set high, the decision boundary is found more carefully.
              'C': [-4, 4, 20], # gamma determines the distance a single data sample exerts influence. That is, the gamma parameter can be said to adjust the curvature of the decision boundary.
              'solver': ['liblinear', 'sag', 'saga']}

grid = GridSearchCV(LogisticRegression(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(x_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END .............C=-4, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END .............C=-4, penalty=l1, solver=liblinear; total time=   0.0s
[CV 3/5] END .............C=-4, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END .............C=-4, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END .............C=-4, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END ...................C=-4, penalty=l1, solver=sag; total time=   0.0s
[CV 2/5] END ...................C=-4, penalty=l1, solver=sag; total time=   0.0s
[CV 3/5] END ...................C=-4, penalty=l1, solver=sag; total time=   0.0s
[CV 4/5] END ...................C=-4, penalty=l1, solver=sag; total time=   0.0s
[CV 5/5] END ...................C=-4, penalty=l1, solver=sag; total time=   0.0s
[CV 1/5] END ..................C=-4, penalty=l1, solver=saga; total time=   0.0s
[CV 2/5] END ..................C=-4, penalty=l1

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1309, in fit
    raise ValueError("Penalty term must be positive; got (C=%r)"
ValueError: Penalty term must be positive; got (C=-4)

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1309, in fit
    raise ValueError("Penalty term must be positive; got (C=%r)"
ValueError: Penalty term must be positive; got (C=-4)

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/skle



[CV 1/5] END ...................C=4, penalty=l1, solver=saga; total time=   0.2s
[CV 2/5] END ...................C=4, penalty=l1, solver=saga; total time=   0.2s




[CV 3/5] END ...................C=4, penalty=l1, solver=saga; total time=   0.2s
[CV 4/5] END ...................C=4, penalty=l1, solver=saga; total time=   0.2s


Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 439, in _check_solver
    raise ValueError("Logistic Regression supports only penalties in %s,"
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got 12.

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logis

[CV 5/5] END ...................C=4, penalty=l1, solver=saga; total time=   0.2s
[CV 1/5] END ..............C=4, penalty=12, solver=liblinear; total time=   0.0s
[CV 2/5] END ..............C=4, penalty=12, solver=liblinear; total time=   0.0s
[CV 3/5] END ..............C=4, penalty=12, solver=liblinear; total time=   0.0s
[CV 4/5] END ..............C=4, penalty=12, solver=liblinear; total time=   0.0s
[CV 5/5] END ..............C=4, penalty=12, solver=liblinear; total time=   0.0s
[CV 1/5] END ....................C=4, penalty=12, solver=sag; total time=   0.0s
[CV 2/5] END ....................C=4, penalty=12, solver=sag; total time=   0.0s
[CV 3/5] END ....................C=4, penalty=12, solver=sag; total time=   0.0s
[CV 4/5] END ....................C=4, penalty=12, solver=sag; total time=   0.0s
[CV 5/5] END ....................C=4, penalty=12, solver=sag; total time=   0.0s
[CV 1/5] END ...................C=4, penalty=12, solver=saga; total time=   0.0s
[CV 2/5] END ...............



[CV 2/5] END ..................C=4, penalty=none, solver=sag; total time=   0.1s
[CV 3/5] END ..................C=4, penalty=none, solver=sag; total time=   0.1s




[CV 4/5] END ..................C=4, penalty=none, solver=sag; total time=   0.1s
[CV 5/5] END ..................C=4, penalty=none, solver=sag; total time=   0.1s




[CV 1/5] END .................C=4, penalty=none, solver=saga; total time=   0.1s
[CV 2/5] END .................C=4, penalty=none, solver=saga; total time=   0.1s




[CV 3/5] END .................C=4, penalty=none, solver=saga; total time=   0.1s
[CV 4/5] END .................C=4, penalty=none, solver=saga; total time=   0.1s


Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in f

[CV 5/5] END .................C=4, penalty=none, solver=saga; total time=   0.1s
[CV 1/5] END .............C=20, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END .............C=20, penalty=l1, solver=liblinear; total time=   0.0s
[CV 3/5] END .............C=20, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END .............C=20, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END .............C=20, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END ...................C=20, penalty=l1, solver=sag; total time=   0.0s
[CV 2/5] END ...................C=20, penalty=l1, solver=sag; total time=   0.0s
[CV 3/5] END ...................C=20, penalty=l1, solver=sag; total time=   0.0s
[CV 4/5] END ...................C=20, penalty=l1, solver=sag; total time=   0.0s
[CV 5/5] END ...................C=20, penalty=l1, solver=sag; total time=   0.0s




[CV 1/5] END ..................C=20, penalty=l1, solver=saga; total time=   0.2s




[CV 2/5] END ..................C=20, penalty=l1, solver=saga; total time=   0.2s
[CV 3/5] END ..................C=20, penalty=l1, solver=saga; total time=   0.2s




[CV 4/5] END ..................C=20, penalty=l1, solver=saga; total time=   0.2s


Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 439, in _check_solver
    raise ValueError("Logistic Regression supports only penalties in %s,"
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got 12.

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logis

[CV 5/5] END ..................C=20, penalty=l1, solver=saga; total time=   0.2s
[CV 1/5] END .............C=20, penalty=12, solver=liblinear; total time=   0.0s
[CV 2/5] END .............C=20, penalty=12, solver=liblinear; total time=   0.0s
[CV 3/5] END .............C=20, penalty=12, solver=liblinear; total time=   0.0s
[CV 4/5] END .............C=20, penalty=12, solver=liblinear; total time=   0.0s
[CV 5/5] END .............C=20, penalty=12, solver=liblinear; total time=   0.0s
[CV 1/5] END ...................C=20, penalty=12, solver=sag; total time=   0.0s
[CV 2/5] END ...................C=20, penalty=12, solver=sag; total time=   0.0s
[CV 3/5] END ...................C=20, penalty=12, solver=sag; total time=   0.0s
[CV 4/5] END ...................C=20, penalty=12, solver=sag; total time=   0.0s
[CV 5/5] END ...................C=20, penalty=12, solver=sag; total time=   0.0s
[CV 1/5] END ..................C=20, penalty=12, solver=saga; total time=   0.0s
[CV 2/5] END ...............



[CV 2/5] END .................C=20, penalty=none, solver=sag; total time=   0.1s
[CV 3/5] END .................C=20, penalty=none, solver=sag; total time=   0.1s




[CV 4/5] END .................C=20, penalty=none, solver=sag; total time=   0.1s
[CV 5/5] END .................C=20, penalty=none, solver=sag; total time=   0.1s




[CV 1/5] END ................C=20, penalty=none, solver=saga; total time=   0.1s
[CV 2/5] END ................C=20, penalty=none, solver=saga; total time=   0.1s




[CV 3/5] END ................C=20, penalty=none, solver=saga; total time=   0.1s
[CV 4/5] END ................C=20, penalty=none, solver=saga; total time=   0.1s
[CV 5/5] END ................C=20, penalty=none, solver=saga; total time=   0.1s
{'C': 4, 'penalty': 'none', 'solver': 'sag'}
LogisticRegression(C=4, penalty='none', solver='sag')


     nan     nan     nan 0.61125     nan 0.56875     nan     nan     nan
     nan     nan     nan     nan 0.61375 0.58    0.58        nan 0.57375
     nan     nan     nan     nan     nan     nan     nan 0.61    0.58   ]


###### Evaluation on best Classifier with pruned hyperparameters (Logistic Regression)

In [19]:
from sklearn.model_selection import GridSearchCV
  
# defining parameter range
param_grid = {'penalty': ['l1', '12', 'elasticnet', 'none'], # C determines how many data samples are allowed to be placed in different classes. If the value of C is set to a low value, the probability of the outliers is increased, and the general decision boundary is found. If the value of C is set high, the decision boundary is found more carefully.
              'C': [-4, 4, 20], # gamma determines the distance a single data sample exerts influence. That is, the gamma parameter can be said to adjust the curvature of the decision boundary.
              'solver': ['liblinear', 'sag', 'saga']}

grid = GridSearchCV(LogisticRegression(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(x_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
# confusion_matrix(logreg, x_test, y_test, target_names('sentiment')) #classifier,#x_test,#y_test #target_names

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1309, in fit
    raise ValueError("Penalty term must be positive; got (C=%r)"
ValueError: Penalty term must be positive; got (C=-4)

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1309, in fit
    raise ValueError("Penalty term must be positive; got (C=%r)"
ValueError: Penalty term must be positive; got (C=-4)

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/skle

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END .............C=-4, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END .............C=-4, penalty=l1, solver=liblinear; total time=   0.0s
[CV 3/5] END .............C=-4, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END .............C=-4, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END .............C=-4, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END ...................C=-4, penalty=l1, solver=sag; total time=   0.0s
[CV 2/5] END ...................C=-4, penalty=l1, solver=sag; total time=   0.0s
[CV 3/5] END ...................C=-4, penalty=l1, solver=sag; total time=   0.0s
[CV 4/5] END ...................C=-4, penalty=l1, solver=sag; total time=   0.0s
[CV 5/5] END ...................C=-4, penalty=l1, solver=sag; total time=   0.0s
[CV 1/5] END ..................C=-4, penalty=l1, solver=saga; total time=   0.0s
[CV 2/5] END ..................C=-4, penalty=l1

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1309, in fit
    raise ValueError("Penalty term must be positive; got (C=%r)"
ValueError: Penalty term must be positive; got (C=-4)

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1309, in fit
    raise ValueError("Penalty term must be positive; got (C=%r)"
ValueError: Penalty term must be positive; got (C=-4)

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/skle



[CV 1/5] END ...................C=4, penalty=l1, solver=saga; total time=   0.2s
[CV 2/5] END ...................C=4, penalty=l1, solver=saga; total time=   0.2s




[CV 3/5] END ...................C=4, penalty=l1, solver=saga; total time=   0.2s
[CV 4/5] END ...................C=4, penalty=l1, solver=saga; total time=   0.2s


Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 439, in _check_solver
    raise ValueError("Logistic Regression supports only penalties in %s,"
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got 12.

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logis

[CV 5/5] END ...................C=4, penalty=l1, solver=saga; total time=   0.2s
[CV 1/5] END ..............C=4, penalty=12, solver=liblinear; total time=   0.0s
[CV 2/5] END ..............C=4, penalty=12, solver=liblinear; total time=   0.0s
[CV 3/5] END ..............C=4, penalty=12, solver=liblinear; total time=   0.0s
[CV 4/5] END ..............C=4, penalty=12, solver=liblinear; total time=   0.0s
[CV 5/5] END ..............C=4, penalty=12, solver=liblinear; total time=   0.0s
[CV 1/5] END ....................C=4, penalty=12, solver=sag; total time=   0.0s
[CV 2/5] END ....................C=4, penalty=12, solver=sag; total time=   0.0s
[CV 3/5] END ....................C=4, penalty=12, solver=sag; total time=   0.0s
[CV 4/5] END ....................C=4, penalty=12, solver=sag; total time=   0.0s
[CV 5/5] END ....................C=4, penalty=12, solver=sag; total time=   0.0s
[CV 1/5] END ...................C=4, penalty=12, solver=saga; total time=   0.0s
[CV 2/5] END ...............



[CV 2/5] END ..................C=4, penalty=none, solver=sag; total time=   0.1s
[CV 3/5] END ..................C=4, penalty=none, solver=sag; total time=   0.1s




[CV 4/5] END ..................C=4, penalty=none, solver=sag; total time=   0.1s
[CV 5/5] END ..................C=4, penalty=none, solver=sag; total time=   0.1s




[CV 1/5] END .................C=4, penalty=none, solver=saga; total time=   0.1s
[CV 2/5] END .................C=4, penalty=none, solver=saga; total time=   0.1s




[CV 3/5] END .................C=4, penalty=none, solver=saga; total time=   0.1s
[CV 4/5] END .................C=4, penalty=none, solver=saga; total time=   0.1s


Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in f

[CV 5/5] END .................C=4, penalty=none, solver=saga; total time=   0.1s
[CV 1/5] END .............C=20, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END .............C=20, penalty=l1, solver=liblinear; total time=   0.0s
[CV 3/5] END .............C=20, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END .............C=20, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END .............C=20, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END ...................C=20, penalty=l1, solver=sag; total time=   0.0s
[CV 2/5] END ...................C=20, penalty=l1, solver=sag; total time=   0.0s
[CV 3/5] END ...................C=20, penalty=l1, solver=sag; total time=   0.0s
[CV 4/5] END ...................C=20, penalty=l1, solver=sag; total time=   0.0s
[CV 5/5] END ...................C=20, penalty=l1, solver=sag; total time=   0.0s




[CV 1/5] END ..................C=20, penalty=l1, solver=saga; total time=   0.2s




[CV 2/5] END ..................C=20, penalty=l1, solver=saga; total time=   0.2s




[CV 3/5] END ..................C=20, penalty=l1, solver=saga; total time=   0.2s




[CV 4/5] END ..................C=20, penalty=l1, solver=saga; total time=   0.2s


Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 439, in _check_solver
    raise ValueError("Logistic Regression supports only penalties in %s,"
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got 12.

Traceback (most recent call last):
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/philipp/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logis

[CV 5/5] END ..................C=20, penalty=l1, solver=saga; total time=   0.2s
[CV 1/5] END .............C=20, penalty=12, solver=liblinear; total time=   0.0s
[CV 2/5] END .............C=20, penalty=12, solver=liblinear; total time=   0.0s
[CV 3/5] END .............C=20, penalty=12, solver=liblinear; total time=   0.0s
[CV 4/5] END .............C=20, penalty=12, solver=liblinear; total time=   0.0s
[CV 5/5] END .............C=20, penalty=12, solver=liblinear; total time=   0.0s
[CV 1/5] END ...................C=20, penalty=12, solver=sag; total time=   0.0s
[CV 2/5] END ...................C=20, penalty=12, solver=sag; total time=   0.0s
[CV 3/5] END ...................C=20, penalty=12, solver=sag; total time=   0.0s
[CV 4/5] END ...................C=20, penalty=12, solver=sag; total time=   0.0s
[CV 5/5] END ...................C=20, penalty=12, solver=sag; total time=   0.0s
[CV 1/5] END ..................C=20, penalty=12, solver=saga; total time=   0.0s
[CV 2/5] END ...............



[CV 2/5] END .................C=20, penalty=none, solver=sag; total time=   0.1s
[CV 3/5] END .................C=20, penalty=none, solver=sag; total time=   0.1s




[CV 4/5] END .................C=20, penalty=none, solver=sag; total time=   0.1s
[CV 5/5] END .................C=20, penalty=none, solver=sag; total time=   0.1s




[CV 1/5] END ................C=20, penalty=none, solver=saga; total time=   0.1s
[CV 2/5] END ................C=20, penalty=none, solver=saga; total time=   0.1s




[CV 3/5] END ................C=20, penalty=none, solver=saga; total time=   0.1s
[CV 4/5] END ................C=20, penalty=none, solver=saga; total time=   0.1s
[CV 5/5] END ................C=20, penalty=none, solver=saga; total time=   0.1s
{'C': 20, 'penalty': 'none', 'solver': 'sag'}
LogisticRegression(C=20, penalty='none', solver='sag')


     nan     nan     nan 0.61125     nan 0.5675      nan     nan     nan
     nan     nan     nan     nan 0.61125 0.5775  0.58        nan 0.57375
     nan     nan     nan     nan     nan     nan     nan 0.61375 0.57875]
