In [1]:
import pandas as pd
from google_drive_downloader import GoogleDriveDownloader as gdd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import advanced_processor_chain_factory
import simple_processor_chain_factory

In [2]:
gdd.download_file_from_google_drive(file_id='15JJ6ZysFM57tlUjXo2nHVhkGwePbVMVV',dest_path='./dataset.csv')

In [3]:
dataset = pd.read_csv('./dataset.csv')
dataset['sentiment'] = dataset['sentiment'].replace(['negative', 'positive'] , [0, 1])
dataset.head()

Unnamed: 0,comment,sentiment
0,"Oh my god, it just doesn't get any worse than ...",0
1,If you're a layman interested in quantum theor...,0
2,It's amazing that this no talent actor Chapa g...,0
3,This must be one of the most overrated Spanish...,0
4,Some critics have compared Chop Shop with the ...,1


In [4]:
def analysis(labels, predictions):
    print("Report: Classification\n", classification_report(labels, predictions, target_names=["positive", "negative"]))
    print("Matrix: Confusion\n", confusion_matrix(labels, predictions))
    print("Accuracy:\n", accuracy_score(labels, predictions))

In [5]:
def evaluate_models_with_data(models, X_train, X_test, Y_train, Y_test):
    for name, model in models.items():
        print(f'------Evaluating {name}------')
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        analysis(Y_test, pred)

In [6]:
models = {'logistic regression' : LogisticRegression(class_weight = 'balanced'),
          'svm' : svm.SVC(),
          'knn' : KNeighborsClassifier(n_neighbors=8)
         }

In [7]:
def prepare_data(processor_chain = None, debug = False, debug_data_size = 4000):
    X , Y = dataset['comment'], dataset['sentiment']
    if debug:
        X , Y = X[:debug_data_size], Y[:debug_data_size]
    if processor_chain:
        X = X.apply(processor_chain.process)
    vectorizer = CountVectorizer(max_features = 2000)
    X = vectorizer.fit_transform(X)
    return train_test_split(X,Y)

In [8]:
evaluate_models_with_data(models, *prepare_data(debug = True))

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.82      0.81      0.81       508
    negative       0.81      0.81      0.81       492

    accuracy                           0.81      1000
   macro avg       0.81      0.81      0.81      1000
weighted avg       0.81      0.81      0.81      1000

Matrix: Confusion
 [[413  95]
 [ 93 399]]
Accuracy:
 0.812
------Evaluating svm------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Report: Classification
               precision    recall  f1-score   support

    positive       0.78      0.72      0.75       508
    negative       0.73      0.80      0.76       492

    accuracy                           0.76      1000
   macro avg       0.76      0.76      0.76      1000
weighted avg       0.76      0.76      0.76      1000

Matrix: Confusion
 [[364 144]
 [100 392]]
Accuracy:
 0.756
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.62      0.66      0.64       508
    negative       0.62      0.59      0.61       492

    accuracy                           0.62      1000
   macro avg       0.62      0.62      0.62      1000
weighted avg       0.62      0.62      0.62      1000

Matrix: Confusion
 [[333 175]
 [202 290]]
Accuracy:
 0.623


In [9]:
evaluate_models_with_data(models, *prepare_data(processor_chain=simple_processor_chain_factory.create(), debug=True))

------Evaluating logistic regression------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Report: Classification
               precision    recall  f1-score   support

    positive       0.84      0.85      0.84       510
    negative       0.84      0.83      0.83       490

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000

Matrix: Confusion
 [[431  79]
 [ 85 405]]
Accuracy:
 0.836
------Evaluating svm------
Report: Classification
               precision    recall  f1-score   support

    positive       0.79      0.75      0.77       510
    negative       0.75      0.79      0.77       490

    accuracy                           0.77      1000
   macro avg       0.77      0.77      0.77      1000
weighted avg       0.77      0.77      0.77      1000

Matrix: Confusion
 [[380 130]
 [101 389]]
Accuracy:
 0.769
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.62      0.65      0.64    

In [10]:
evaluate_models_with_data(models, *prepare_data(processor_chain=advanced_processor_chain_factory.create('lem'), debug=True))

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.83      0.82      0.83       489
    negative       0.83      0.84      0.84       511

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000

Matrix: Confusion
 [[403  86]
 [ 83 428]]
Accuracy:
 0.831
------Evaluating svm------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.79      0.82       489
    negative       0.81      0.87      0.84       511

    accuracy                           0.83      1000
   macro avg       0.84      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000

Matrix: Confusion
 [[386 103]
 [ 64 447]]
Accuracy:
 0.833
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.59      0.79      0.68       489
    negative       0.71      0.48      0.57       511

    accuracy                           0.63      1000
   macro avg       0.65      0.63      0.62      1000
weighted avg       0.65      0.63      0.62      1000

Matrix: Confusion
 [[387 102]
 [267 244]]
Accuracy:
 0.631


In [11]:
evaluate_models_with_data(models, *prepare_data(processor_chain=advanced_processor_chain_factory.create('stem'), debug=True))

------Evaluating logistic regression------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.81      0.83       521
    negative       0.81      0.85      0.83       479

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000

Matrix: Confusion
 [[424  97]
 [ 74 405]]
Accuracy:
 0.829
------Evaluating svm------
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.78      0.82       521
    negative       0.78      0.87      0.83       479

    accuracy                           0.82      1000
   macro avg       0.83      0.83      0.82      1000
weighted avg       0.83      0.82      0.82      1000

Matrix: Confusion
 [[406 115]
 [ 61 418]]
Accuracy:
 0.824
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.66      0.77      0.71    