https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [1]:
from utils.loader import DataLoader
import numpy as np

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RidgeClassifier, SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron
from sklearn.neighbors import KNeighborsClassifier


In [2]:
def df2dict(panda_data, text_col=8, label_col=1):
    """Pull the text and label columns out of a DataFrame into a dict of lists.

    Columns are addressed by position, not by name.

    Parameters
    ----------
    panda_data : pandas.DataFrame
        Frame to read; it is converted with ``to_numpy()`` and read row by row.
    text_col : int, optional
        Positional index of the text column. Defaults to 8, which is
        'REVIEW_TEXT' in the Amazon column listing given in the loading cells
        (layout of other loaders is not visible here -- verify before reuse).
    label_col : int, optional
        Positional index of the label column. Defaults to 1 ('LABEL' in the
        same listing).

    Returns
    -------
    dict
        ``{'text': [...], 'labels': [...]}`` with one entry per row, in row
        order. Both lists are empty for a frame with no rows.
    """
    np_data = panda_data.to_numpy()

    data_dict = {'text': [row[text_col] for row in np_data],
                 'labels': [row[label_col] for row in np_data]}

    return data_dict

# Feature extraction uses scikit-learn's CountVectorizer and TfidfTransformer.

In [3]:
def run_classifier(n_range, info, train=None, test=None):
    """Benchmark several classifiers on TF-IDF n-gram features and print accuracies.

    For each classifier a ``CountVectorizer -> TfidfTransformer -> classifier``
    pipeline is fitted on the training texts, evaluated on the test texts, and
    its accuracy is printed. The best (name, accuracy) pair is printed last.

    Parameters
    ----------
    n_range : tuple
        Passed to ``CountVectorizer(ngram_range=...)``.
    info : str
        Header line printed before the results.
    train, test : dict, optional
        Dicts with 'text' and 'labels' entries. When omitted, the notebook-level
        ``train_data`` / ``test_data`` globals are used, so existing
        two-argument calls behave as before.

    Returns
    -------
    None
        Results are only printed.
    """
    # Fall back to the notebook globals to stay compatible with older calls.
    if train is None:
        train = train_data
    if test is None:
        test = test_data

    print(info)
    # No winner yet; the first classifier always becomes the initial best, so
    # the final print cannot hit an unbound name even if every accuracy is 0.
    best_result = None
    for clf, name in (
            (LinearSVC(), "LinearSVC"),
            (MultinomialNB(), "Multi  NB"),
            (RidgeClassifier(), "Ridge cls"),
            (SGDClassifier(), "SGD   cls"),
            # the following 3 perform quite badly
            # (PassiveAggressiveClassifier(), "Pass-Aggr"),
            # (Perceptron(), "Perceptro"),
            # (KNeighborsClassifier(), "k - N - N"),
            ):

        text_clf = Pipeline([
                            ('vect', CountVectorizer(ngram_range=n_range)),
                            ('tfidf', TfidfTransformer()),
                            ('clf', clf),
                            ])

        text_clf.fit(train['text'], train['labels'])

        predicted = text_clf.predict(test['text'])
        # Fraction of test items whose predicted label equals the given label.
        acc = np.mean(predicted == test['labels'])

        print(name, acc)

        # Strictly-greater comparison keeps the earliest classifier on ties.
        if best_result is None or acc > best_result[1]:
            best_result = (name, acc)

    print('Best result in this run: ', best_result)

In [10]:
# Amazon review data. Column order of the Amazon frame:
#   ['DOC_ID', 'LABEL', 'RATING', 'VERIFIED_PURCHASE', 'PRODUCT_CATEGORY',
#    'PRODUCT_ID', 'PRODUCT_TITLE', 'REVIEW_TITLE', 'REVIEW_TEXT']
loader = DataLoader()

# ---- Dataset selection ----
# This cell reads the frames via loader.load_amazon(...). The alternative
# source is loader.load_clean_amazon(test_mode=...), called with the same
# test_mode values.
train_data = loader.load_amazon(test_mode=False, deceptive=False, all=True)
test_data = loader.load_amazon(test_mode=True, deceptive=False, all=True)

# Reduce each frame to the {'text': ..., 'labels': ...} dict; run_classifier
# reads these two module-level names.
train_data = df2dict(train_data)
test_data = df2dict(test_data)

# One benchmark run per n-gram range, in a fixed order.
for ngram_range, banner in (
        ((1, 1), 'ngram range: 1'),
        ((1, 2), 'ngram range: 1 - 2'),
        ((1, 3), 'ngram range: 1 - 3'),
        ((2, 2), 'ngram range: 2'),
        ((2, 3), 'ngram range: 2 - 3'),
):
    run_classifier(ngram_range, banner)

ngram range: 1
LinearSVC 0.6394285714285715
Multi  NB 0.6573333333333333
Ridge cls 0.6508571428571429
SGD   cls 0.6676190476190477
Best result in this run:  ('SGD   cls', 0.6676190476190477)
ngram range: 1 - 2
LinearSVC 0.668
Multi  NB 0.6662857142857143
Ridge cls 0.6716190476190477
SGD   cls 0.6687619047619048
Best result in this run:  ('Ridge cls', 0.6716190476190477)
ngram range: 1 - 3
LinearSVC 0.6693333333333333
Multi  NB 0.6697142857142857
Ridge cls 0.6668571428571428
SGD   cls 0.6657142857142857
Best result in this run:  ('Multi  NB', 0.6697142857142857)
ngram range: 2
LinearSVC 0.6377142857142857
Multi  NB 0.6518095238095238
Ridge cls 0.6407619047619048
SGD   cls 0.6426666666666667
Best result in this run:  ('Multi  NB', 0.6518095238095238)
ngram range: 2 - 3
LinearSVC 0.6426666666666667
Multi  NB 0.6558095238095238
Ridge cls 0.6443809523809524
SGD   cls 0.6449523809523809
Best result in this run:  ('Multi  NB', 0.6558095238095238)


In [11]:
# Amazon review data. Column order of the Amazon frame:
#   ['DOC_ID', 'LABEL', 'RATING', 'VERIFIED_PURCHASE', 'PRODUCT_CATEGORY',
#    'PRODUCT_ID', 'PRODUCT_TITLE', 'REVIEW_TITLE', 'REVIEW_TEXT']
loader = DataLoader()

# ---- Dataset selection ----
# This cell reads the frames via loader.load_clean_amazon(...). The
# alternative source is
# loader.load_amazon(deceptive=False, all=True, test_mode=...), called with
# the same test_mode values.
train_data = loader.load_clean_amazon(test_mode=False)
test_data = loader.load_clean_amazon(test_mode=True)

# Reduce each frame to the {'text': ..., 'labels': ...} dict; run_classifier
# reads these two module-level names.
train_data = df2dict(train_data)
test_data = df2dict(test_data)

# One benchmark run per n-gram range, in a fixed order.
for ngram_range, banner in (
        ((1, 1), 'ngram range: 1'),
        ((1, 2), 'ngram range: 1 - 2'),
        ((1, 3), 'ngram range: 1 - 3'),
        ((2, 2), 'ngram range: 2'),
        ((2, 3), 'ngram range: 2 - 3'),
):
    run_classifier(ngram_range, banner)

ngram range: 1
LinearSVC 0.6394285714285715
Multi  NB 0.6573333333333333
Ridge cls 0.6508571428571429
SGD   cls 0.663047619047619
Best result in this run:  ('SGD   cls', 0.663047619047619)
ngram range: 1 - 2
LinearSVC 0.668
Multi  NB 0.6662857142857143
Ridge cls 0.6716190476190477
SGD   cls 0.6685714285714286
Best result in this run:  ('Ridge cls', 0.6716190476190477)
ngram range: 1 - 3
LinearSVC 0.6693333333333333
Multi  NB 0.6697142857142857
Ridge cls 0.6668571428571428
SGD   cls 0.664952380952381
Best result in this run:  ('Multi  NB', 0.6697142857142857)
ngram range: 2
LinearSVC 0.6377142857142857
Multi  NB 0.6518095238095238
Ridge cls 0.6407619047619048
SGD   cls 0.6424761904761904
Best result in this run:  ('Multi  NB', 0.6518095238095238)
ngram range: 2 - 3
LinearSVC 0.6426666666666667
Multi  NB 0.6558095238095238
Ridge cls 0.6443809523809524
SGD   cls 0.6434285714285715
Best result in this run:  ('Multi  NB', 0.6558095238095238)


In [54]:
from sklearn.neural_network import MLPClassifier

# MLP experiment on 1- to 2-gram TF-IDF features, using whichever
# train_data / test_data dicts are currently defined in the notebook.
n_range = (1, 2)
clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=50, verbose=True)

# Same vectorise -> tf-idf -> classifier layout as the other experiments.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=n_range)),
    ('tfidf', TfidfTransformer()),
    ('clf', clf),
])

# Pipeline.fit returns the fitted pipeline, so prediction is chained onto it.
predicted = text_clf.fit(
    train_data['text'], train_data['labels']
).predict(test_data['text'])

# Fraction of test items whose predicted label equals the given label.
acc = np.mean(predicted == test_data['labels'])

print('neural_network', acc)

Iteration 1, loss = 0.67614572
Iteration 2, loss = 0.48562705
Iteration 3, loss = 0.26432652
Iteration 4, loss = 0.12981548
Iteration 5, loss = 0.06988418
Iteration 6, loss = 0.04317081
Iteration 7, loss = 0.02999616
Iteration 8, loss = 0.02272141
Iteration 9, loss = 0.01835493
Iteration 10, loss = 0.01550927
Iteration 11, loss = 0.01357627
Iteration 12, loss = 0.01219310
Iteration 13, loss = 0.01117648
Iteration 14, loss = 0.01040037
Iteration 15, loss = 0.00979227
Iteration 16, loss = 0.00930762
Iteration 17, loss = 0.00890958
Iteration 18, loss = 0.00857572
Iteration 19, loss = 0.00829250
Iteration 20, loss = 0.00804665
Iteration 21, loss = 0.00783039
Iteration 22, loss = 0.00763796
Iteration 23, loss = 0.00746264
Iteration 24, loss = 0.00730216
Iteration 25, loss = 0.00715345
Iteration 26, loss = 0.00701508
Iteration 27, loss = 0.00688392
Iteration 28, loss = 0.00675901
Iteration 29, loss = 0.00664033
Iteration 30, loss = 0.00652576
Iteration 31, loss = 0.00641506
Iteration 32, los