Reference: scikit-learn tutorial "Working With Text Data" — https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [14]:
from utils.loader import DataLoader
import numpy as np

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.svm import LinearSVC

from sklearn.naive_bayes import MultinomialNB

from sklearn.linear_model import RidgeClassifier, SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron

from sklearn.neighbors import KNeighborsClassifier


In [2]:
def preprocess(panda_data, text_col=8, label_col=1):
    """Pull the text and label columns out of a loaded dataset frame.

    Columns are addressed by position, not by name.

    Parameters
    ----------
    panda_data : object exposing ``to_numpy()`` (e.g. a pandas DataFrame)
        One sample per row.
    text_col : int, default 8
        Position of the column holding the text of each sample.
    label_col : int, default 1
        Position of the column holding the label of each sample.

    Returns
    -------
    dict
        ``{'text': [...], 'labels': [...]}`` with one entry per row, in the
        original row order. Both lists are empty for an empty frame.

    Notes
    -----
    The defaults presumably target the Amazon layout used in this notebook
    (LABEL at position 1, REVIEW_TEXT at position 8) -- verify against the
    loader if the column order ever changes.
    """
    np_data = panda_data.to_numpy()

    # One pass per column; each row is indexed positionally.
    text = [row[text_col] for row in np_data]
    labels = [row[label_col] for row in np_data]

    return {'text': text,
            'labels': labels}

In [3]:
# Load datasets
# Amazon: ['DOC_ID', 'LABEL', 'RATING', 'VERIFIED_PURCHASE', 'PRODUCT_CATEGORY',
# 'PRODUCT_ID', 'PRODUCT_TITLE', 'REVIEW_TITLE', 'REVIEW_TEXT']
loader = DataLoader()

# Keep the raw loader output under its own names so `train_data` / `test_data`
# only ever refer to the preprocessed dicts used by the cells below.
# NOTE(review): `test_mode` presumably selects the held-out split -- confirm
# in utils.loader.
train_frame = loader.load_amazon(deceptive=False, all=True, test_mode=False)
test_frame = loader.load_amazon(deceptive=False, all=True, test_mode=True)

# Reduce each frame to {'text': [...], 'labels': [...]}.
train_data = preprocess(train_frame)
test_data = preprocess(test_frame)

In [17]:
print('With CountVectorizer & TfidfTransformer')

# Seed the estimators that accept `random_state` and are otherwise unseeded by
# default, so re-running the cell on the same data and library versions gives
# the same scores.
RANDOM_STATE = 0

# Loop-invariant: ground-truth labels as an array for the element-wise compare.
y_test = np.asarray(test_data['labels'])

for clf, name in (
        (LinearSVC(random_state=RANDOM_STATE), "LinearSVC"),
        (MultinomialNB(), "Multi  NB"),
        (RidgeClassifier(), "Ridge cls"),
        (SGDClassifier(random_state=RANDOM_STATE), "SGD   cls"),
        (PassiveAggressiveClassifier(random_state=RANDOM_STATE), "Pass-Aggr"),
        (Perceptron(random_state=RANDOM_STATE), "Perceptro"),
        (KNeighborsClassifier(), "k - N - N"),
        ):

    # Token counts -> tf-idf weighting -> classifier under test.
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf),
    ])

    text_clf.fit(train_data['text'], train_data['labels'])

    predicted = text_clf.predict(test_data['text'])
    # Accuracy = fraction of test samples whose prediction matches the label.
    acc = np.mean(predicted == y_test)

    print(name, acc)

LinearSVC 0.6394285714285715
Multi  NB 0.6573333333333333
Ridge cls 0.6508571428571429
SGD   cls 0.6664761904761904
Pass-Aggr 0.583047619047619
Perceptro 0.591047619047619
k - N - N 0.5971428571428572


In [18]:
print('With CountVectorizer only')

# Seed the estimators that accept `random_state` and are otherwise unseeded by
# default, so re-running the cell on the same data and library versions gives
# the same scores.
RANDOM_STATE = 0

# Loop-invariant: ground-truth labels as an array for the element-wise compare.
y_test = np.asarray(test_data['labels'])

for clf, name in (
        (LinearSVC(random_state=RANDOM_STATE), "LinearSVC"),
        (MultinomialNB(), "Multi  NB"),
        (RidgeClassifier(), "Ridge cls"),
        (SGDClassifier(random_state=RANDOM_STATE), "SGD   cls"),
        (PassiveAggressiveClassifier(random_state=RANDOM_STATE), "Pass-Aggr"),
        (Perceptron(random_state=RANDOM_STATE), "Perceptro"),
        (KNeighborsClassifier(), "k - N - N"),
        ):

    # Raw token counts feed the classifier directly (no tf-idf re-weighting).
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', clf),
    ])

    text_clf.fit(train_data['text'], train_data['labels'])

    predicted = text_clf.predict(test_data['text'])
    # Accuracy = fraction of test samples whose prediction matches the label.
    acc = np.mean(predicted == y_test)

    print(name, acc)

With CountVectorizer only




LinearSVC 0.5977142857142858
Multi  NB 0.6506666666666666
Ridge cls 0.5908571428571429
SGD   cls 0.6152380952380953
Pass-Aggr 0.5946666666666667
Perceptro 0.6123809523809524
k - N - N 0.5436190476190477
