In [None]:
import nltk
import pandas as pd
from google_drive_downloader import GoogleDriveDownloader as gdd
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

import advanced_processor_chain_factory
import simple_processor_chain_factory

In [None]:
# Fetch the dataset CSV from Google Drive and store it next to the notebook as ./dataset.csv.
gdd.download_file_from_google_drive(file_id='15JJ6ZysFM57tlUjXo2nHVhkGwePbVMVV',dest_path='./dataset.csv')

In [None]:
# Load the downloaded reviews and encode the sentiment label numerically:
# 'negative' -> 0, 'positive' -> 1. Any other value is left untouched.
dataset = pd.read_csv('./dataset.csv')
dataset['sentiment'] = dataset['sentiment'].replace({'negative': 0, 'positive': 1})
dataset.head()

Unnamed: 0,comment,sentiment
0,"Oh my god, it just doesn't get any worse than ...",0
1,If you're a layman interested in quantum theor...,0
2,It's amazing that this no talent actor Chapa g...,0
3,This must be one of the most overrated Spanish...,0
4,Some critics have compared Chop Shop with the ...,1


In [None]:
def analysis(labels, predictions):
    """Print the classification report, confusion matrix and accuracy for one model.

    Args:
        labels: Ground-truth sentiment labels (0 = negative, 1 = positive,
            as encoded when the dataset is loaded).
        predictions: Labels predicted by the model, aligned with ``labels``.
    """
    # target_names are matched to the labels in sorted order, so index 0 must be
    # the name of class 0 (negative) and index 1 the name of class 1 (positive).
    print("Report: Classification\n", classification_report(labels, predictions, target_names=["negative", "positive"]))
    print("Matrix: Confusion\n", confusion_matrix(labels, predictions))
    print("Accuracy:\n", accuracy_score(labels, predictions))

In [None]:
def evaluate_models_with_data(models, X_train, X_test, Y_train, Y_test):
    """Train every model on the training split and print its test-split metrics.

    Args:
        models: Mapping of display name -> estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        Y_train: Training labels.
        Y_test: Test labels.
    """
    for label in models:
        estimator = models[label]
        print(f'------Evaluating {label}------')
        estimator.fit(X_train, Y_train)
        predicted = estimator.predict(X_test)
        analysis(Y_test, predicted)

In [None]:
# Classifiers compared in every experiment, keyed by the name printed in the reports.
# max_iter is raised above scikit-learn's default because the solver otherwise stops at
# the iteration limit on these features (ConvergenceWarning: "Increase the number of
# iterations (max_iter)"). Raise it further if the warning reappears.
models = {'logistic regression' : LogisticRegression(class_weight = 'balanced', max_iter = 1000),
          'svm' : svm.SVC(),
          'knn' : KNeighborsClassifier(n_neighbors=8)
         }

In [None]:
def prepare_data(vectorizer, processor_chain = None, debug = False, debug_data_size = 4000, random_state = 42):
    """Build train/test splits of vectorized comments and their sentiment labels.

    Args:
        vectorizer: Callable that turns a Series of texts into a feature matrix.
        processor_chain: Optional object whose ``process(text)`` is applied to
            every comment before vectorization. ``None`` skips pre-processing.
        debug: When True, only the first ``debug_data_size`` rows are used.
        debug_data_size: Number of leading rows kept in debug mode.
        random_state: Seed forwarded to ``train_test_split`` so that every
            experiment is evaluated on the same, reproducible split. Pass
            ``None`` to get a different random split on each call.

    Returns:
        The list returned by ``train_test_split``:
        ``[X_train, X_test, Y_train, Y_test]``.
    """
    X , Y = dataset['comment'], dataset['sentiment']
    if debug:
        # Explicit positional slicing: keep the first debug_data_size rows.
        X , Y = X.iloc[:debug_data_size], Y.iloc[:debug_data_size]
    # Compare against None so a chain object that happens to be falsy is still applied.
    if processor_chain is not None:
        X = X.apply(processor_chain.process)
    # NOTE(review): the vectorizer sees all rows before the split, so anything it
    # learns (e.g. a vocabulary) is also learned from rows that end up in the test set.
    X = vectorizer(X)
    return train_test_split(X, Y, random_state=random_state)

# Bag of Words

In [None]:
def count_vectorizer(X):
  """Fit a fresh CountVectorizer (capped at 2000 features) on X and return the transformed X."""
  return CountVectorizer(max_features = 2000).fit_transform(X)

## Without Pre-Process

In [None]:
# Baseline: bag-of-words features on the raw comments (no processor chain),
# restricted to the debug subset (prepare_data's default of 4000 rows).
evaluate_models_with_data(models, *prepare_data(count_vectorizer, debug = True))

------Evaluating logistic regression------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.77      0.81       499
    negative       0.79      0.88      0.83       501

    accuracy                           0.82      1000
   macro avg       0.83      0.82      0.82      1000
weighted avg       0.83      0.82      0.82      1000

Matrix: Confusion
 [[384 115]
 [ 61 440]]
Accuracy:
 0.824
------Evaluating svm------
Report: Classification
               precision    recall  f1-score   support

    positive       0.83      0.70      0.76       499
    negative       0.74      0.85      0.79       501

    accuracy                           0.78      1000
   macro avg       0.78      0.77      0.77      1000
weighted avg       0.78      0.78      0.77      1000

Matrix: Confusion
 [[347 152]
 [ 73 428]]
Accuracy:
 0.775
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.62      0.61      0.61    

## Simple Pre-Process

In [None]:
# Bag-of-words features after running every comment through the chain built by
# simple_processor_chain_factory; debug subset only.
evaluate_models_with_data(models, *prepare_data(count_vectorizer,processor_chain=simple_processor_chain_factory.create(), debug=True))

------Evaluating logistic regression------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.83      0.84       514
    negative       0.83      0.85      0.84       486

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000

Matrix: Confusion
 [[427  87]
 [ 74 412]]
Accuracy:
 0.839
------Evaluating svm------
Report: Classification
               precision    recall  f1-score   support

    positive       0.81      0.75      0.78       514
    negative       0.76      0.82      0.79       486

    accuracy                           0.78      1000
   macro avg       0.79      0.78      0.78      1000
weighted avg       0.79      0.78      0.78      1000

Matrix: Confusion
 [[386 128]
 [ 88 398]]
Accuracy:
 0.784
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.62      0.60      0.61    

## Pre-Process with Lemmatization

In [None]:
# Bag-of-words features after the advanced chain created with the 'lem' option
# (presumably lemmatization, per the section heading); debug subset only.
evaluate_models_with_data(models, *prepare_data(count_vectorizer,processor_chain=advanced_processor_chain_factory.create('lem'), debug=True))

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.83      0.82      0.83       490
    negative       0.83      0.84      0.83       510

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000

Matrix: Confusion
 [[403  87]
 [ 83 427]]
Accuracy:
 0.83
------Evaluating svm------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.78      0.81       490
    negative       0.80      0.87      0.84       510

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000

Matrix: Confusion
 [[381 109]
 [ 64 446]]
Accuracy:
 0.827
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.62      0.75      0.68       490
    negative       0.70      0.56      0.63       510

    accuracy                           0.66      1000
   macro avg       0.66      0.66      0.65      1000
weighted avg       0.67      0.66      0.65      1000

Matrix: Confusion
 [[369 121]
 [222 288]]
Accuracy:
 0.657


## Pre-Process with Stemming

In [None]:
# Bag-of-words features after the advanced chain created with the 'stem' option
# (presumably stemming, per the section heading); debug subset only.
evaluate_models_with_data(models, *prepare_data(count_vectorizer,processor_chain=advanced_processor_chain_factory.create('stem'), debug=True))

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.84      0.83      0.84       509
    negative       0.83      0.84      0.83       491

    accuracy                           0.83      1000
   macro avg       0.83      0.84      0.83      1000
weighted avg       0.84      0.83      0.84      1000

Matrix: Confusion
 [[424  85]
 [ 80 411]]
Accuracy:
 0.835
------Evaluating svm------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.80      0.82       509
    negative       0.80      0.85      0.83       491

    accuracy                           0.82      1000
   macro avg       0.83      0.83      0.82      1000
weighted avg       0.83      0.82      0.82      1000

Matrix: Confusion
 [[407 102]
 [ 73 418]]
Accuracy:
 0.825
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.62      0.73      0.67       509
    negative       0.66      0.54      0.59       491

    accuracy                           0.64      1000
   macro avg       0.64      0.63      0.63      1000
weighted avg       0.64      0.64      0.63      1000

Matrix: Confusion
 [[371 138]
 [227 264]]
Accuracy:
 0.635


## Word2Vec

In [None]:
import itertools

class Word2VecDataProvider:
  """Builds the tokenized-sentence corpus used to train the Word2Vec model.

  After construction, ``self.data`` is a flat list with one entry per sentence
  of every selected comment; each entry is the list of tokens obtained by
  running the sentence through ``processor_chain.process`` and then
  ``nltk.word_tokenize``.
  """

  def __init__(self, processor_chain, debug = False, debug_data_size = 20000):
    """Tokenize the comments of the global ``dataset`` into sentences of words.

    Args:
      processor_chain: Object whose ``process(text)`` cleans one sentence
        before it is word-tokenized.
      debug: When True, only the first ``debug_data_size`` comments are used.
      debug_data_size: Number of leading comments kept in debug mode.
    """
    # Only the comment text is needed here; the sentiment labels are not used.
    comments = dataset['comment']
    if debug:
      comments = comments[:debug_data_size]
    # Split each comment into sentences first, then clean and word-tokenize each
    # sentence, flattening everything into a single list of token lists.
    self.data = [
        nltk.word_tokenize(processor_chain.process(sentence))
        for comment in comments
        for sentence in nltk.sent_tokenize(comment)
    ]

In [None]:
from gensim.models import word2vec
import numpy as np

class Word2Vec: 
  """Wrapper around gensim's Word2Vec that embeds a tokenized comment as the
  average of the vectors of its in-vocabulary words."""

  def __init__(self, num_features=250, min_count=40,workers=4,
               window=10,sample=0.001):
    """Store the hyper-parameters that ``fit`` forwards to gensim.

    Args:
      num_features: Dimensionality of the word vectors (and of ``predict``'s output).
      min_count: Forwarded to gensim as ``min_count``.
      workers: Forwarded to gensim as ``workers``.
      window: Forwarded to gensim as ``window``.
      sample: Forwarded to gensim as ``sample``.
    """
    self.num_features=num_features
    self.min_count=min_count
    self.workers=workers
    self.window=window
    self.sample=sample

  def fit(self,data):
    """Train the underlying gensim model on ``data`` (an iterable of token lists)."""
    # NOTE(review): `size=` and `init_sims` are the gensim 3.x spelling; gensim >= 4
    # renames `size` to `vector_size` -- confirm the pinned gensim version.
    self.model = word2vec.Word2Vec(data, workers = self.workers, 
                            size = self.num_features, min_count = self.min_count,
                            window = self.window, sample = self.sample)
    self.model.init_sims(replace = True)

  def predict(self,comment):
    """Return the mean word vector of ``comment``.

    Args:
      comment: Iterable of tokens.

    Returns:
      A float32 array of length ``num_features``. When none of the tokens is in
      the model vocabulary (including an empty comment) the zero vector is
      returned, rather than the NaN vector a 0/0 division would produce.
    """
    result = np.zeros((self.num_features,), dtype = "float32")
    # Membership and lookup go through the keyed vectors directly, which avoids
    # rebuilding a set of the whole vocabulary on every call.
    vectors = self.model.wv
    nword = 0
    for word in comment:
        if word in vectors:
            nword += 1
            result = np.add(result, vectors[word])
    if nword == 0:
        # No known word: keep the zero vector so downstream estimators never see NaN.
        return result
    return np.divide(result, nword)

  

In [None]:
# Shared "simple" pre-processing chain; w2v_vectorizer below reuses it.
processor_chain = simple_processor_chain_factory.create()
# Tokenized training sentences built from the debug subset of the comments
# (Word2VecDataProvider's default of 20000 rows).
# NOTE(review): that subset contains the rows prepare_data later splits into
# train and test, so the embeddings are trained on text from the test split.
word2vec_data = Word2VecDataProvider(processor_chain=processor_chain, debug=True).data
# Train the embedding wrapper with its default hyper-parameters.
w2v_model = Word2Vec()
w2v_model.fit(word2vec_data)

def w2v_vectorizer(X):
  """Embed every comment in X with the trained ``w2v_model``.

  Each comment is cleaned with the module-level ``processor_chain``,
  word-tokenized, and replaced by the vector ``w2v_model.predict`` returns for
  its tokens; the vector's components become the columns of the result.
  """
  def _embed(comment):
    tokens = nltk.word_tokenize(processor_chain.process(comment))
    return pd.Series(w2v_model.predict(tokens))

  return X.apply(_embed)


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [None]:
# Evaluate all models on averaged Word2Vec features; debug subset only.
# NOTE(review): prepare_data already applies processor_chain.process to each comment
# and w2v_vectorizer applies it again, so every comment is pre-processed twice --
# confirm the chain is idempotent or drop one of the two passes.
evaluate_models_with_data(models, *prepare_data(w2v_vectorizer,processor_chain=processor_chain, debug=True))




------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.77      0.77      0.77       497
    negative       0.77      0.77      0.77       503

    accuracy                           0.77      1000
   macro avg       0.77      0.77      0.77      1000
weighted avg       0.77      0.77      0.77      1000

Matrix: Confusion
 [[381 116]
 [114 389]]
Accuracy:
 0.77
------Evaluating svm------
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.79      0.82       497
    negative       0.81      0.86      0.83       503

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000

Matrix: Confusion
 [[394 103]
 [ 71 432]]
Accuracy:
 0.826
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    p