**Mehran Hosseinzadeh**

In [None]:
# Imports and setup
from sklearn.model_selection import train_test_split
import re
import nltk
import string
nltk.download("book")
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
from pandas import read_csv, to_numeric
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model 
from sklearn.model_selection import GridSearchCV
import gensim
from sklearn.neural_network import MLPClassifier



# Shared NLP helpers, created once and reused by prepare_text() for every comment.
stemmer = PorterStemmer()          # applied first to each surviving token in mode 3
lemmatizer = WordNetLemmatizer()   # applied to the stemmer's output in mode 3
# English stop-word list kept as a set so the per-token membership test is a hash lookup.
stop_words = set(stopwords.words('english'))
 

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to /root/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to /root/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package dependency_treebank is already up-to-date!
[nltk_data]    | Downloadi

In [None]:
# Text preparation with three pre-processing levels, selected by `mode`.
def prepare_text(raw_text, mode):
    """Turn one raw comment into a list of tokens.

    Args:
        raw_text: the comment as a single string.
        mode: pre-processing level.
            1     -> nltk.word_tokenize on the untouched text.
            2     -> digits removed, punctuation removed, lower-cased, then tokenized.
            other -> the mode-2 tokens with English stop words dropped and every
                     remaining token stemmed (Porter) and then lemmatized (WordNet).

    Returns:
        list of token strings.
    """
    if mode == 1:
        return nltk.word_tokenize(raw_text)

    # Strip digit runs first, then drop punctuation characters and lower-case the rest.
    without_digits = re.sub(r'\d+', '', raw_text)
    cleaned = "".join(ch.lower() for ch in without_digits if ch not in string.punctuation)
    tokens = nltk.word_tokenize(cleaned)
    if mode == 2:
        return tokens

    # Stop words are filtered on the surface form, before stemming/lemmatizing.
    return [
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if token not in stop_words
    ]

In [None]:
# reading data
# Malformed rows are skipped (with a warning) instead of aborting the load.
# pandas 1.3 introduced `on_bad_lines` and pandas 2.0 removed `error_bad_lines`,
# so try the current keyword first and fall back to the legacy one on old pandas.
try:
    df = read_csv("dataset.csv", encoding='utf-8', on_bad_lines='warn', engine='python')
except TypeError:
    # pandas < 1.3 rejects the unknown `on_bad_lines` keyword with a TypeError.
    df = read_csv("dataset.csv", encoding='utf-8', error_bad_lines=False, engine='python')
X = df['comment'].to_numpy()     # values of the 'comment' column (text fed to prepare_text)
Y = df['sentiment'].to_numpy()   # values of the 'sentiment' column, aligned row-for-row with X

In [None]:
# Bag-of-words features for a collection of comments under the desired mode.
# train=True fits the global `vectorizer` while transforming; otherwise the
# comments are only transformed with the vocabulary fitted by an earlier call.
def bow(X, mode, train):
    """Pre-process every comment and encode the result as a dense count matrix.

    Args:
        X: iterable of raw comment strings.
        mode: pre-processing mode forwarded to prepare_text.
        train: truthy -> vectorizer.fit_transform, falsy -> vectorizer.transform.

    Returns:
        Dense numpy array with one row per comment.

    Relies on the module-level `vectorizer`; it must be bound (and, for
    train=False, already fitted) before this function is called.
    """
    sentences = []
    for count, comment in enumerate(X, start=1):
        # Progress marker every 3000 comments.
        if count % 3000 == 0:
            print("Finisehd pre-processing of {} comments".format(count))
        # The vectorizer expects strings, so tokens are re-joined with spaces.
        sentences.append(" ".join(prepare_text(comment, mode)))
    print("Creating Bag of Words.....")
    encode = vectorizer.fit_transform if train else vectorizer.transform
    return np.array(encode(sentences).toarray())

In [None]:
# module for creating word2vec features according to the desired mode
def w2v(X, mode):
    """Embed every comment as the mean of its tokens' skip-gram Word2Vec vectors.

    A Word2Vec model (100 dimensions, window 3, skip-gram, min_count=1) is
    trained on all comments passed in, then each comment is represented by the
    average of the vectors of its own tokens.

    Args:
        X: iterable of raw comment strings.
        mode: pre-processing mode forwarded to prepare_text.

    Returns:
        2-D numpy array with one row per comment and one column per embedding
        dimension. A comment with no tokens left after pre-processing becomes
        an all-zero row.

    NOTE(review): the model is trained with workers=3 and no explicit seed, so
    the embeddings are presumably not reproducible between runs - confirm
    against the gensim documentation if reproducibility matters.
    """
    tokenized = []
    for count, comment in enumerate(X, start=1):
        if count % 3000 == 0:
            print("Finisehd pre-processing of {} comments".format(count))
        tokenized.append(prepare_text(comment, mode))

    print("Fitting W2V.....")
    w2v_params = dict(min_count=1, workers=3, window=3, sg=1)
    try:
        # gensim >= 4.0 spells the dimensionality keyword `vector_size`.
        w2v_model = gensim.models.Word2Vec(tokenized, vector_size=100, **w2v_params)
    except TypeError:
        # gensim < 4.0 only knows the original `size` keyword.
        w2v_model = gensim.models.Word2Vec(tokenized, size=100, **w2v_params)

    dim = w2v_model.wv.vector_size
    result = []
    for count, tokens in enumerate(tokenized, start=1):
        if tokens:
            # Average over the number of TOKENS. The previous code divided by
            # the character length of the raw comment, which is not a mean.
            # min_count=1 keeps every token in the vocabulary, so the lookup
            # cannot miss.
            result.append(np.mean([w2v_model.wv[token] for token in tokens], axis=0))
        else:
            # Nothing survived pre-processing: keep the row shape consistent.
            result.append(np.zeros(dim, dtype=np.float32))
        if count % 3000 == 0:
            print("Finisehd w2v for {} comments".format(count))
    return np.array(result)

In [None]:
# creating BoW for train
# `vectorizer` is the module-level object that bow() reads. Later cells rebind it,
# so the matching test-set cell has to run before any of them.
# min_df / max_df are given as floats (CountVectorizer document-frequency bounds).
vectorizer = CountVectorizer(min_df=0.01, max_df=0.5)
# 80/20 split of the raw comments with a fixed seed.
X_train_bow, X_test_bow, Y_train_bow, Y_test_bow = train_test_split(X, Y, test_size=0.2, random_state=42)
# X_train_bow is rebound here from raw comments to their bag-of-words matrix
# (pre-processing mode 3); re-running this line alone would feed the matrix back into bow().
X_train_bow = bow(X_train_bow, 3, train=True)

Finisehd pre-processing of 3000 comments
Finisehd pre-processing of 6000 comments
Finisehd pre-processing of 9000 comments
Finisehd pre-processing of 12000 comments
Finisehd pre-processing of 15000 comments
Finisehd pre-processing of 18000 comments
Finisehd pre-processing of 21000 comments
Finisehd pre-processing of 24000 comments
Finisehd pre-processing of 27000 comments
Finisehd pre-processing of 30000 comments
Finisehd pre-processing of 33000 comments
Finisehd pre-processing of 36000 comments
Creating Bag of Words.....


In [None]:
# creating BoW for test
# train=False: the test comments are only transformed with the vocabulary already
# fitted on the training comments. X_test_bow is rebound from raw text to the matrix.
X_test_bow = bow(X_test_bow, 3, train=False)

Finisehd pre-processing of 3000 comments
Finisehd pre-processing of 6000 comments
Finisehd pre-processing of 9000 comments
Creating Bag of Words.....


In [None]:
# module to fit a model based on model_name
def fit_model(X, Y, model_name):
    """Fit one of the notebook's classifiers with its fixed hyperparameters.

    Args:
        X: training feature matrix.
        Y: training labels aligned with X.
        model_name: which estimator to build:
            'svm' -> svm.LinearSVC(C=1)
            'knn' -> KNeighborsClassifier(n_neighbors=7)
            'lr'  -> linear_model.LogisticRegression(warm_start=True, C=0.1)

    Returns:
        The fitted estimator.

    Raises:
        ValueError: if model_name is not one of the names above. Previously an
            unknown name fell through the if/elif chain and failed with an
            UnboundLocalError on `model`.
    """
    # Factories are lazy so only the requested estimator is constructed.
    factories = {
        'svm': lambda: svm.LinearSVC(C=1),
        'knn': lambda: KNeighborsClassifier(n_neighbors=7),
        'lr': lambda: linear_model.LogisticRegression(warm_start=True, C=0.1),
    }
    if model_name not in factories:
        raise ValueError(
            "Unknown model_name {!r}; expected one of {}".format(model_name, sorted(factories))
        )
    model = factories[model_name]()
    model.fit(X, Y)
    return model

In [None]:
# analysis
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def analysis(labels, predictions):
    """Print the classification report, confusion matrix and accuracy.

    Args:
        labels: ground-truth labels.
        predictions: predicted labels, aligned with `labels`.

    NOTE(review): target_names is a fixed ['positive', 'negative'] list, so the
    row captions depend on the order in which classification_report lists the
    classes - confirm that this order matches the encoding of the 'sentiment'
    column.
    """
    # Each metric is computed right before it is printed, in this order.
    sections = (
        ("Classification Report:\n",
         lambda: classification_report(labels, predictions, target_names=['positive', 'negative'])),
        ("Confusion Matrix:\n", lambda: confusion_matrix(labels, predictions)),
        ("Accuracy:\n", lambda: accuracy_score(labels, predictions)),
    )
    for heading, compute in sections:
        print(heading, compute())

In [None]:
# initializing placeholders for every split / pre-process mode needed in part 3.1
# Keys follow the pattern '<split>_bow_<mode>'; values are filled in by later cells.
Xs = {
    '{0}_bow_{1}'.format(split, level): None
    for split in ('train', 'test')
    for level in (1, 2, 3)
}
# Same keys, independent dict, for the matching labels.
Ys = dict.fromkeys(Xs)

In [None]:
# filling in train/test BoW data for pre-process mode 1 (part 3.1)
# A fresh vectorizer is bound to the global name that bow() reads, so the vocabulary
# of this mode does not mix with the one fitted in any other cell.
vectorizer = CountVectorizer(min_df=0.01, max_df=0.5)
# Same test_size and random_state as the other modes, so every mode sees the same split.
Xs['train_bow_1'], Xs['test_bow_1'], Ys['train_bow_1'], Ys['test_bow_1'] = train_test_split(X, Y, test_size=0.2, random_state=42)
# Raw comments are replaced in place by their matrices: fit on train, transform-only on test.
Xs['train_bow_1'] = bow(Xs['train_bow_1'], 1, train=True)
Xs['test_bow_1'] = bow(Xs['test_bow_1'], 1, train=False)

Finisehd pre-processing of 3000 comments
Finisehd pre-processing of 6000 comments
Finisehd pre-processing of 9000 comments
Finisehd pre-processing of 12000 comments
Finisehd pre-processing of 15000 comments
Finisehd pre-processing of 18000 comments
Finisehd pre-processing of 21000 comments
Finisehd pre-processing of 24000 comments
Finisehd pre-processing of 27000 comments
Finisehd pre-processing of 30000 comments
Finisehd pre-processing of 33000 comments
Finisehd pre-processing of 36000 comments
Creating Bag of Words.....
Finisehd pre-processing of 3000 comments
Finisehd pre-processing of 6000 comments
Finisehd pre-processing of 9000 comments
Creating Bag of Words.....


In [None]:
# filling in train/test BoW data for pre-process mode 2 (part 3.1)
# A fresh vectorizer is bound to the global name that bow() reads, so the vocabulary
# of this mode does not mix with the one fitted in any other cell.
vectorizer = CountVectorizer(min_df=0.01, max_df=0.5)
# Same test_size and random_state as the other modes, so every mode sees the same split.
Xs['train_bow_2'], Xs['test_bow_2'], Ys['train_bow_2'], Ys['test_bow_2'] = train_test_split(X, Y, test_size=0.2, random_state=42)
# Raw comments are replaced in place by their matrices: fit on train, transform-only on test.
Xs['train_bow_2'] = bow(Xs['train_bow_2'], 2, train=True)
Xs['test_bow_2'] = bow(Xs['test_bow_2'], 2, train=False)

Finisehd pre-processing of 3000 comments
Finisehd pre-processing of 6000 comments
Finisehd pre-processing of 9000 comments
Finisehd pre-processing of 12000 comments
Finisehd pre-processing of 15000 comments
Finisehd pre-processing of 18000 comments
Finisehd pre-processing of 21000 comments
Finisehd pre-processing of 24000 comments
Finisehd pre-processing of 27000 comments
Finisehd pre-processing of 30000 comments
Finisehd pre-processing of 33000 comments
Finisehd pre-processing of 36000 comments
Creating Bag of Words.....
Finisehd pre-processing of 3000 comments
Finisehd pre-processing of 6000 comments
Finisehd pre-processing of 9000 comments
Creating Bag of Words.....


In [None]:
# filling in train/test BoW data for pre-process mode 3 (part 3.1)
# A fresh vectorizer is bound to the global name that bow() reads, so the vocabulary
# of this mode does not mix with the one fitted in any other cell.
vectorizer = CountVectorizer(min_df=0.01, max_df=0.5)
# Same test_size and random_state as the other modes, so every mode sees the same split.
Xs['train_bow_3'], Xs['test_bow_3'], Ys['train_bow_3'], Ys['test_bow_3'] = train_test_split(X, Y, test_size=0.2, random_state=42)
# Raw comments are replaced in place by their matrices: fit on train, transform-only on test.
Xs['train_bow_3'] = bow(Xs['train_bow_3'], 3, train=True)
Xs['test_bow_3'] = bow(Xs['test_bow_3'], 3, train=False)

Finisehd pre-processing of 3000 comments
Finisehd pre-processing of 6000 comments
Finisehd pre-processing of 9000 comments
Finisehd pre-processing of 12000 comments
Finisehd pre-processing of 15000 comments
Finisehd pre-processing of 18000 comments
Finisehd pre-processing of 21000 comments
Finisehd pre-processing of 24000 comments
Finisehd pre-processing of 27000 comments
Finisehd pre-processing of 30000 comments
Finisehd pre-processing of 33000 comments
Finisehd pre-processing of 36000 comments
Creating Bag of Words.....
Finisehd pre-processing of 3000 comments
Finisehd pre-processing of 6000 comments
Finisehd pre-processing of 9000 comments
Creating Bag of Words.....


**3.1_part_A**

In [None]:
# 3.1_part_A
# One slot per (classifier, pre-process mode) pair, keyed '<classifier>_bow_<mode>'.
models = {
    '{0}_bow_{1}'.format(name, level): None
    for name in ['svm', 'lr', 'knn']
    for level in [1, 2, 3]
}
# Fit every classifier on the bag-of-words training matrix of every mode.
for mode in [1, 2, 3]:
    for model_name in ['svm', 'lr', 'knn']:
        print("Fitting {} on bow with pre-process mode: {}".format(model_name, mode))
        models['{0}_bow_{1}'.format(model_name, mode)] = fit_model(
            Xs['train_bow_{}'.format(mode)],
            Ys['train_bow_{}'.format(mode)],
            model_name,
        )

Fitting svm on bow with pre-process mode: 1




Fitting lr on bow with pre-process mode: 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting knn on bow with pre-process mode: 1
Fitting svm on bow with pre-process mode: 2




Fitting lr on bow with pre-process mode: 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting knn on bow with pre-process mode: 2




Fitting svm on bow with pre-process mode: 3
Fitting lr on bow with pre-process mode: 3
Fitting knn on bow with pre-process mode: 3


**3.1_part_B**

In [None]:
# 3.1_part_B
# Evaluate every fitted classifier on the held-out split of its own pre-process mode.
for model_name in ['svm', 'lr', 'knn']:
    for mode in [1, 2, 3]:
        print("{0} results for bag of words wih pre-process mode {1}:".format(model_name, mode))
        analysis(
            list(Ys['test_bow_{}'.format(mode)]),
            list(
                models['{0}_bow_{1}'.format(model_name, mode)].predict(
                    Xs['test_bow_{}'.format(mode)]
                )
            ),
        )
        print("####################################################")

svm results for bag of words wih pre-process mode 1:
Classification Report:
               precision    recall  f1-score   support

    positive       0.88      0.88      0.88      4508
    negative       0.88      0.88      0.88      4492

    accuracy                           0.88      9000
   macro avg       0.88      0.88      0.88      9000
weighted avg       0.88      0.88      0.88      9000

Confusion Matrix:
 [[3952  556]
 [ 550 3942]]
Accuracy:
 0.8771111111111111
####################################################
svm results for bag of words wih pre-process mode 2:
Classification Report:
               precision    recall  f1-score   support

    positive       0.89      0.87      0.88      4508
    negative       0.87      0.89      0.88      4492

    accuracy                           0.88      9000
   macro avg       0.88      0.88      0.88      9000
weighted avg       0.88      0.88      0.88      9000

Confusion Matrix:
 [[3926  582]
 [ 508 3984]]
Accuracy:
 0.8788

In [None]:
# creating word2vec structures (pre-process mode 3)
# NOTE(review): w2v() trains its Word2Vec model on everything it receives, and the
# whole of X is passed here before the train/test split below - so the embedding
# has seen the comments that later form the test set.
X_w2v = w2v(X, 3)

Finisehd pre-processing of 3000 comments
Finisehd pre-processing of 6000 comments
Finisehd pre-processing of 9000 comments
Finisehd pre-processing of 12000 comments
Finisehd pre-processing of 15000 comments
Finisehd pre-processing of 18000 comments
Finisehd pre-processing of 21000 comments
Finisehd pre-processing of 24000 comments
Finisehd pre-processing of 27000 comments
Finisehd pre-processing of 30000 comments
Finisehd pre-processing of 33000 comments
Finisehd pre-processing of 36000 comments
Finisehd pre-processing of 39000 comments
Finisehd pre-processing of 42000 comments
Finisehd pre-processing of 45000 comments
Fitting W2V.....
Finisehd w2v for 3000 comments
Finisehd w2v for 6000 comments
Finisehd w2v for 9000 comments
Finisehd w2v for 12000 comments
Finisehd w2v for 15000 comments
Finisehd w2v for 18000 comments
Finisehd w2v for 21000 comments
Finisehd w2v for 24000 comments
Finisehd w2v for 27000 comments
Finisehd w2v for 30000 comments
Finisehd w2v for 33000 comments
Finiseh

In [None]:
# splitting word2vec features for train and test
# Same test_size and random_state as the bag-of-words splits of X and Y.
X_train_w2v, X_test_w2v, Y_train_w2v, Y_test_w2v = train_test_split(X_w2v, Y, test_size=0.2, random_state=42)

In [None]:
# selecting hyperparameters from the given candidates with k-fold GridSearchCV (5-fold by default)
def hyperparameter_cv(X, Y, model_name, model_params, cv=5):
    """Grid-search the hyperparameters of one classifier with cross-validation.

    Args:
        X: training feature matrix.
        Y: training labels aligned with X.
        model_name: which estimator to tune:
            'svm' -> svm.LinearSVC()
            'knn' -> KNeighborsClassifier()
            'lr'  -> linear_model.LogisticRegression(warm_start=True)
            'mlp' -> MLPClassifier(warm_start=True, max_iter=300)
        model_params: parameter grid handed to GridSearchCV.
        cv: number of folds (or any value GridSearchCV accepts for `cv`);
            defaults to the 5 folds this notebook's experiments use.

    Returns:
        The fitted GridSearchCV object (best_estimator_, best_params_, ...).

    Raises:
        ValueError: if model_name is not one of the names above. Previously an
            unknown name failed with an UnboundLocalError on `model`.
    """
    # Factories are lazy so only the requested estimator is constructed.
    factories = {
        'svm': lambda: svm.LinearSVC(),
        'knn': lambda: KNeighborsClassifier(),
        'lr': lambda: linear_model.LogisticRegression(warm_start=True),
        'mlp': lambda: MLPClassifier(warm_start=True, max_iter=300),
    }
    if model_name not in factories:
        raise ValueError(
            "Unknown model_name {!r}; expected one of {}".format(model_name, sorted(factories))
        )
    # verbose=3 prints one line per fitted fold.
    clf = GridSearchCV(factories[model_name](), model_params, cv=cv, verbose=3)
    clf.fit(X, Y)
    return clf

**3.2_part_A**

In [None]:
# 3.2_part_A
# Grid searches below run on X_train_bow / Y_train_bow (bag of words, pre-process mode 3).
# `clf` is rebound by each search, so the best estimator and parameters are copied
# into their own names right after every search.

# hyperparameter selection for SVM with bag of words method
print("Hyperparamtere selection for SVM with bag of words method")

clf = hyperparameter_cv(X_train_bow, Y_train_bow, 'svm', {'C': [0.1, 0.5, 1, 2, 5, 10, 15]})
best_svm_bow = clf.best_estimator_
best_svm_bow_params = clf.best_params_
print(clf.best_params_)

# hyperparameter selection for LR with bag of words method
print("Hyperparamtere selection for LR with bag of words method")

clf = hyperparameter_cv(X_train_bow, Y_train_bow, 'lr', {'C': [0.1, 0.5, 1, 2, 5, 10, 15]})
best_lr_bow = clf.best_estimator_
best_lr_bow_params = clf.best_params_
print(clf.best_params_)

# hyperparameter selection for KNN with bag of words method
# NOTE: in the recorded run this search alone took about 170 minutes.
print("Hyperparamtere selection for KNN with bag of words method")

clf = hyperparameter_cv(X_train_bow, Y_train_bow, 'knn', {'n_neighbors': [3, 5, 7]})
best_knn_bow = clf.best_estimator_
best_knn_bow_params = clf.best_params_
print(clf.best_params_)

Hyperparamtere selection for SVM with bag of words method
Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.0s remaining:    0.0s


[CV] ............................... C=0.1, score=0.860, total=   8.0s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   15.6s remaining:    0.0s


[CV] ............................... C=0.1, score=0.865, total=   7.6s
[CV] C=0.1 ...........................................................




[CV] ............................... C=0.1, score=0.854, total=   7.4s
[CV] C=0.1 ...........................................................




[CV] ............................... C=0.1, score=0.860, total=   7.6s
[CV] C=0.1 ...........................................................




[CV] ............................... C=0.1, score=0.864, total=   7.6s
[CV] C=0.5 ...........................................................




[CV] ............................... C=0.5, score=0.859, total=   8.7s
[CV] C=0.5 ...........................................................




[CV] ............................... C=0.5, score=0.863, total=   8.4s
[CV] C=0.5 ...........................................................




[CV] ............................... C=0.5, score=0.855, total=   8.6s
[CV] C=0.5 ...........................................................




[CV] ............................... C=0.5, score=0.860, total=   8.2s
[CV] C=0.5 ...........................................................




[CV] ............................... C=0.5, score=0.864, total=   8.6s
[CV] C=1 .............................................................




[CV] ................................. C=1, score=0.861, total=   9.0s
[CV] C=1 .............................................................




[CV] ................................. C=1, score=0.862, total=   9.1s
[CV] C=1 .............................................................




[CV] ................................. C=1, score=0.857, total=   9.3s
[CV] C=1 .............................................................




[CV] ................................. C=1, score=0.858, total=   9.5s
[CV] C=1 .............................................................




[CV] ................................. C=1, score=0.867, total=   9.4s
[CV] C=2 .............................................................




[CV] ................................. C=2, score=0.862, total=  10.1s
[CV] C=2 .............................................................




[CV] ................................. C=2, score=0.859, total=  10.2s
[CV] C=2 .............................................................




[CV] ................................. C=2, score=0.858, total=  10.1s
[CV] C=2 .............................................................




[CV] ................................. C=2, score=0.858, total=  10.2s
[CV] C=2 .............................................................




[CV] ................................. C=2, score=0.865, total=  10.2s
[CV] C=5 .............................................................




[CV] ................................. C=5, score=0.850, total=  11.4s
[CV] C=5 .............................................................




[CV] ................................. C=5, score=0.856, total=  11.5s
[CV] C=5 .............................................................




[CV] ................................. C=5, score=0.852, total=  11.4s
[CV] C=5 .............................................................




[CV] ................................. C=5, score=0.855, total=  11.8s
[CV] C=5 .............................................................




[CV] ................................. C=5, score=0.858, total=  11.8s
[CV] C=10 ............................................................




[CV] ................................ C=10, score=0.852, total=  12.2s
[CV] C=10 ............................................................




[CV] ................................ C=10, score=0.838, total=  12.0s
[CV] C=10 ............................................................




[CV] ................................ C=10, score=0.841, total=  12.0s
[CV] C=10 ............................................................




[CV] ................................ C=10, score=0.845, total=  12.1s
[CV] C=10 ............................................................




[CV] ................................ C=10, score=0.852, total=  12.1s
[CV] C=15 ............................................................




[CV] ................................ C=15, score=0.836, total=  12.1s
[CV] C=15 ............................................................




[CV] ................................ C=15, score=0.841, total=  12.3s
[CV] C=15 ............................................................




[CV] ................................ C=15, score=0.838, total=  12.1s
[CV] C=15 ............................................................




[CV] ................................ C=15, score=0.829, total=  12.3s
[CV] C=15 ............................................................


[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:  6.0min finished


[CV] ................................ C=15, score=0.835, total=  12.1s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


{'C': 1}
Hyperparamtere selection for LR with bag of words method
Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.863, total=   5.5s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.5s remaining:    0.0s


[CV] ............................... C=0.1, score=0.865, total=   5.1s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.6s remaining:    0.0s


[CV] ............................... C=0.1, score=0.860, total=   5.3s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.862, total=   5.5s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.869, total=   5.6s
[CV] C=0.5 ...........................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ............................... C=0.5, score=0.861, total=   7.4s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.864, total=   6.6s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.855, total=   6.3s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.861, total=   6.4s
[CV] C=0.5 ...........................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ............................... C=0.5, score=0.866, total=   7.5s
[CV] C=1 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................. C=1, score=0.861, total=   7.6s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.863, total=   6.4s
[CV] C=1 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................. C=1, score=0.855, total=   7.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.860, total=   7.3s
[CV] C=1 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................. C=1, score=0.866, total=   7.1s
[CV] C=2 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................. C=2, score=0.861, total=   7.5s
[CV] C=2 .............................................................
[CV] ................................. C=2, score=0.863, total=   6.5s
[CV] C=2 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................. C=2, score=0.854, total=   7.4s
[CV] C=2 .............................................................
[CV] ................................. C=2, score=0.859, total=   6.2s
[CV] C=2 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................. C=2, score=0.866, total=   7.4s
[CV] C=5 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................. C=5, score=0.860, total=   7.3s
[CV] C=5 .............................................................
[CV] ................................. C=5, score=0.863, total=   7.1s
[CV] C=5 .............................................................
[CV] ................................. C=5, score=0.854, total=   6.8s
[CV] C=5 .............................................................
[CV] ................................. C=5, score=0.859, total=   6.5s
[CV] C=5 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................. C=5, score=0.866, total=   7.4s
[CV] C=10 ............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................ C=10, score=0.860, total=   7.7s
[CV] C=10 ............................................................
[CV] ................................ C=10, score=0.863, total=   7.2s
[CV] C=10 ............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................ C=10, score=0.854, total=   7.3s
[CV] C=10 ............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................ C=10, score=0.860, total=   7.2s
[CV] C=10 ............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................ C=10, score=0.865, total=   7.3s
[CV] C=15 ............................................................
[CV] ................................ C=15, score=0.860, total=   6.9s
[CV] C=15 ............................................................
[CV] ................................ C=15, score=0.863, total=   7.0s
[CV] C=15 ............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................ C=15, score=0.854, total=   7.1s
[CV] C=15 ............................................................
[CV] ................................ C=15, score=0.860, total=   7.0s
[CV] C=15 ............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:  4.0min finished


[CV] ................................ C=15, score=0.865, total=   7.3s
{'C': 0.1}
Hyperparamtere selection for KNN with bag of words method
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....................... n_neighbors=3, score=0.616, total=11.4min
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 11.4min remaining:    0.0s


[CV] ....................... n_neighbors=3, score=0.633, total=11.4min
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 22.8min remaining:    0.0s


[CV] ....................... n_neighbors=3, score=0.638, total=11.4min
[CV] n_neighbors=3 ...................................................
[CV] ....................... n_neighbors=3, score=0.630, total=11.2min
[CV] n_neighbors=3 ...................................................
[CV] ....................... n_neighbors=3, score=0.663, total=11.3min
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.636, total=11.4min
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.641, total=11.3min
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.676, total=11.3min
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.642, total=11.3min
[CV] n_neighbors=5 ...................................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 169.9min finished


{'n_neighbors': 7}


**3.2_part_B**

In [None]:
# 3.2_part_B
# Hyperparameter selection on the word2vec features for SVM, LR and KNN.
# Each search goes through hyperparameter_cv (defined earlier in the notebook);
# the best estimator and its parameters are stored per model for later cells.

# --- SVM on word2vec ---
print("Hyperparamtere selection for SVM with word2vec method")
clf = hyperparameter_cv(
    X_train_w2v,
    Y_train_w2v,
    'svm',
    {'C': [0.1, 0.5, 1, 2, 5, 10, 15]},
)
best_svm_w2v, best_svm_w2v_params = clf.best_estimator_, clf.best_params_
print(best_svm_w2v_params)

# --- LR on word2vec (same C candidates as the SVM search) ---
print("Hyperparamtere selection for LR with word2vec method")
clf = hyperparameter_cv(
    X_train_w2v,
    Y_train_w2v,
    'lr',
    {'C': [0.1, 0.5, 1, 2, 5, 10, 15]},
)
best_lr_w2v, best_lr_w2v_params = clf.best_estimator_, clf.best_params_
print(best_lr_w2v_params)

# --- KNN on word2vec: only the neighbour count is searched ---
print("Hyperparamtere selection for knn with word2vec method")
clf = hyperparameter_cv(
    X_train_w2v,
    Y_train_w2v,
    'knn',
    {'n_neighbors': [3, 5, 7]},
)
best_knn_w2v, best_knn_w2v_params = clf.best_estimator_, clf.best_params_
print(best_knn_w2v_params)

Hyperparamtere selection for SVM with bag of words method
Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................... C=0.1, score=0.804, total=   0.4s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ............................... C=0.1, score=0.806, total=   0.4s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV] ............................... C=0.1, score=0.801, total=   0.4s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.803, total=   0.4s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.810, total=   0.4s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.844, total=   0.6s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.845, total=   0.6s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.837, total=   0.6s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.839, total=   0.6s
[CV] C=0.5 ...........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:  1.3min finished


{'C': 10}
Hyperparamtere selection for LR with bag of words method
Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................... C=0.1, score=0.759, total=   0.3s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ............................... C=0.1, score=0.758, total=   0.3s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV] ............................... C=0.1, score=0.759, total=   0.3s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.758, total=   0.3s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.764, total=   0.2s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.791, total=   0.3s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.790, total=   0.4s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.786, total=   0.2s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.792, total=   0.3s
[CV] C=0.5 ...........................................................
[CV] .

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................ C=10, score=0.857, total=   1.1s
[CV] C=15 ............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................................ C=15, score=0.858, total=   1.1s
[CV] C=15 ............................................................
[CV] ................................ C=15, score=0.855, total=   0.9s
[CV] C=15 ............................................................
[CV] ................................ C=15, score=0.853, total=   1.0s
[CV] C=15 ............................................................
[CV] ................................ C=15, score=0.853, total=   1.1s
[CV] C=15 ............................................................
[CV] ................................ C=15, score=0.860, total=   0.9s


[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:   19.1s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


{'C': 15}
Hyperparamtere selection for LR with bag of words method
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] n_neighbors=3 ...................................................
[CV] ....................... n_neighbors=3, score=0.785, total= 1.2min
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


[CV] ....................... n_neighbors=3, score=0.784, total= 1.2min
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.4min remaining:    0.0s


[CV] ....................... n_neighbors=3, score=0.776, total= 1.2min
[CV] n_neighbors=3 ...................................................
[CV] ....................... n_neighbors=3, score=0.779, total= 1.2min
[CV] n_neighbors=3 ...................................................
[CV] ....................... n_neighbors=3, score=0.783, total= 1.2min
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.797, total= 1.2min
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.795, total= 1.2min
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.787, total= 1.2min
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.797, total= 1.2min
[CV] n_neighbors=5 ...................................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 17.9min finished


{'n_neighbors': 7}


**3.2_part_C**

In [None]:
# 3.2_part_C
# Evaluate each tuned word2vec model on the held-out test split, in the order
# SVM, LR, KNN, printing a separator line between consecutive reports.
w2v_reports = (
    ("SVM results for word2vec", best_svm_w2v),
    ("LR results for word2vec", best_lr_w2v),
    ("KNN results for word2vec", best_knn_w2v),
)
for position, (heading, fitted_model) in enumerate(w2v_reports):
    if position:
        # Separator goes between reports only, never before the first one.
        print("######################")
    print(heading)
    analysis(list(Y_test_w2v), list(fitted_model.predict(X_test_w2v)))

SVM results for word2vec
Classification Report:
               precision    recall  f1-score   support

    positive       0.87      0.86      0.86      4508
    negative       0.86      0.87      0.86      4492

    accuracy                           0.86      9000
   macro avg       0.86      0.86      0.86      9000
weighted avg       0.86      0.86      0.86      9000

Confusion Matrix:
 [[3860  648]
 [ 588 3904]]
Accuracy:
 0.8626666666666667
######################
LR results for word2vec
Classification Report:
               precision    recall  f1-score   support

    positive       0.86      0.85      0.86      4508
    negative       0.85      0.86      0.86      4492

    accuracy                           0.86      9000
   macro avg       0.86      0.86      0.86      9000
weighted avg       0.86      0.86      0.86      9000

Confusion Matrix:
 [[3823  685]
 [ 609 3883]]
Accuracy:
 0.8562222222222222
######################
KNN results for word2vec
Classification Report:
   

**3.2_part_D**

In [None]:
# 3.2_part_D
# Evaluate each tuned bag-of-words model on the held-out test split, in the
# order SVM, LR, KNN, printing a separator line between consecutive reports.
bow_reports = (
    ("SVM results for bag of words", best_svm_bow),
    ("LR results for bag of words", best_lr_bow),
    ("KNN results for bag of words", best_knn_bow),
)
for position, (heading, fitted_model) in enumerate(bow_reports):
    if position:
        # Separator goes between reports only, never before the first one.
        print("######################")
    print(heading)
    analysis(list(Y_test_bow), list(fitted_model.predict(X_test_bow)))

SVM results for bag of words
Classification Report:
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      4508
    negative       0.86      0.89      0.87      4492

    accuracy                           0.87      9000
   macro avg       0.87      0.87      0.87      9000
weighted avg       0.87      0.87      0.87      9000

Confusion Matrix:
 [[3868  640]
 [ 514 3978]]
Accuracy:
 0.8717777777777778
######################
LR results for bag of words
Classification Report:
               precision    recall  f1-score   support

    positive       0.88      0.87      0.87      4508
    negative       0.87      0.88      0.88      4492

    accuracy                           0.87      9000
   macro avg       0.87      0.87      0.87      9000
weighted avg       0.87      0.87      0.87      9000

Confusion Matrix:
 [[3905  603]
 [ 528 3964]]
Accuracy:
 0.8743333333333333
######################
KNN results for bag of words
Classification

**3.2_part_E**

In [None]:
# 3.2_part_E
# Persist every tuned model to disk with pickle.
import pickle

# (output file, model) pairs, written in this order. The bag-of-words models
# are additionally saved under the short names LR.pkl / kNN.pkl / SVM.pkl.
models_to_save = [
    ('best_svm_w2v.pkl', best_svm_w2v),
    ('best_lr_w2v.pkl', best_lr_w2v),
    ('best_knn_w2v.pkl', best_knn_w2v),
    ('best_svm_bow.pkl', best_svm_bow),
    ('best_lr_bow.pkl', best_lr_bow),
    ('best_knn_bow.pkl', best_knn_bow),
    ('LR.pkl', best_lr_bow),
    ('kNN.pkl', best_knn_bow),
    ('SVM.pkl', best_svm_bow),
]

for pkl_path, fitted_model in models_to_save:
    # The context manager flushes and closes each file deterministically;
    # passing a bare open(...) to pickle.dump leaves the handle open until
    # garbage collection, which can leave a partially written file behind.
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(fitted_model, pkl_file)

**3.3_part_A**

In [None]:
# 3.3_part_A
# MLP on the bag-of-words features with a single hidden layer. The hidden
# layer width and the activation function are chosen through hyperparameter_cv,
# in the same way as for the earlier models.
cv_params = {
    'hidden_layer_sizes': [(100,), (150,)],
    'activation': ['tanh', 'relu'],
}
clf = hyperparameter_cv(X_train_bow, Y_train_bow, 'mlp', cv_params)
best_mlp_bow, best_mlp_bow_params = clf.best_estimator_, clf.best_params_
print(best_mlp_bow_params)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] activation=tanh, hidden_layer_sizes=(100,) ......................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  activation=tanh, hidden_layer_sizes=(100,), score=0.859, total= 2.3min
[CV] activation=tanh, hidden_layer_sizes=(100,) ......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min remaining:    0.0s


[CV]  activation=tanh, hidden_layer_sizes=(100,), score=0.859, total= 2.4min
[CV] activation=tanh, hidden_layer_sizes=(100,) ......................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.7min remaining:    0.0s


[CV]  activation=tanh, hidden_layer_sizes=(100,), score=0.853, total= 2.3min
[CV] activation=tanh, hidden_layer_sizes=(100,) ......................
[CV]  activation=tanh, hidden_layer_sizes=(100,), score=0.857, total= 2.3min
[CV] activation=tanh, hidden_layer_sizes=(100,) ......................
[CV]  activation=tanh, hidden_layer_sizes=(100,), score=0.863, total= 2.3min
[CV] activation=tanh, hidden_layer_sizes=(150,) ......................
[CV]  activation=tanh, hidden_layer_sizes=(150,), score=0.858, total= 3.0min
[CV] activation=tanh, hidden_layer_sizes=(150,) ......................
[CV]  activation=tanh, hidden_layer_sizes=(150,), score=0.859, total= 3.1min
[CV] activation=tanh, hidden_layer_sizes=(150,) ......................
[CV]  activation=tanh, hidden_layer_sizes=(150,), score=0.855, total= 3.1min
[CV] activation=tanh, hidden_layer_sizes=(150,) ......................
[CV]  activation=tanh, hidden_layer_sizes=(150,), score=0.860, total= 3.1min
[CV] activation=tanh, hidden_layer_

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 45.7min finished


{'activation': 'relu', 'hidden_layer_sizes': (150,)}


**3.3_part_B**

In [None]:
# 3.3_part_B
# Report the tuned bag-of-words MLP on the test split, then persist it.
import pickle

print("MLP results for bag of words")
analysis(list(Y_test_bow), list(best_mlp_bow.predict(X_test_bow)))
print("######################")

# The same model is written under two file names. The context manager closes
# each file deterministically; passing a bare open(...) to pickle.dump leaves
# the handle open until garbage collection.
for pkl_path in ('best_mlp_bow.pkl', 'best.pkl'):
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(best_mlp_bow, pkl_file)

MLP results for bag of words
Classification Report:
               precision    recall  f1-score   support

    positive       0.87      0.87      0.87      4508
    negative       0.87      0.87      0.87      4492

    accuracy                           0.87      9000
   macro avg       0.87      0.87      0.87      9000
weighted avg       0.87      0.87      0.87      9000

Confusion Matrix:
 [[3903  605]
 [ 573 3919]]
Accuracy:
 0.8691111111111111
######################
