https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [50]:
from utils.loader import DataLoader
import numpy as np
import pandas as pd
from random import shuffle

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RidgeClassifier, SGDClassifier
# from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron
# from sklearn.neighbors import KNeighborsClassifier


In [93]:
def shuffle_list(*ls, seed=None):
    """Shuffle several sequences in unison.

    The sequences are zipped element-wise, the resulting rows are permuted
    with one random permutation, and the rows are unzipped again, so items
    that shared an index before the shuffle still share one afterwards.

    Args:
        *ls: Sequences to shuffle together. As with ``zip``, rows are only
            built up to the length of the shortest sequence.
        seed: Optional seed for a private ``random.Random`` instance, which
            makes the permutation reproducible. With ``None`` (the default)
            the module-level ``shuffle`` is used and the global random state
            decides the order.

    Returns:
        A tuple containing one tuple per input sequence, in the order the
        sequences were passed. When there are no rows, every returned tuple
        is empty, so ``a, b = shuffle_list([], [])`` unpacks cleanly.
    """
    rows = list(zip(*ls))
    if not rows:
        # zip(*[]) would yield nothing at all, which breaks unpacking in callers.
        return tuple(() for _ in ls)

    if seed is None:
        shuffle(rows)
    else:
        from random import Random  # local import: only needed for seeded runs
        Random(seed).shuffle(rows)

    return tuple(zip(*rows))

def df2dict(panda_data, gpt_data, gpt_type=False, both_labels=False):
    """Build a shuffled ``{'text': ..., 'labels': ...}`` dict from the raw frames.

    Args:
        panda_data: DataFrame of amazon reviews. Columns are read by position:
            column 1 as the label and column 8 as the review text.
            NOTE(review): this positional schema is taken from how the code
            indexes the rows -- confirm it against the loader.
        gpt_data: DataFrame of generated samples with a ``SAMPLE_TYPE`` column;
            its column 1 is read as the sample text. Only accessed when
            ``gpt_type`` selects something, so it may be ``None`` otherwise.
        gpt_type: Which ``SAMPLE_TYPE`` values to include, as a list-like or
            a single string. ``False``, ``None`` or an empty collection means
            that no generated samples are added.
        both_labels: When true, every amazon row is kept with its own label;
            otherwise only rows labelled ``'__label2__'`` are kept.

    Returns:
        dict with keys ``'text'`` and ``'labels'``, each a tuple of the same
        length, shuffled in unison. Generated samples are always labelled
        ``'__label1__'``. Both tuples are empty when nothing was selected.
    """
    # Positional column indices, named so the magic numbers are in one place.
    gpt_text_col = 1
    amazon_label_col = 1
    amazon_text_col = 8
    fake_label = '__label1__'
    real_label = '__label2__'

    # Normalise gpt_type: Series.isin rejects a bare string and None.
    if gpt_type is None or gpt_type is False:
        wanted_types = []
    elif isinstance(gpt_type, str):
        wanted_types = [gpt_type]
    else:
        wanted_types = list(gpt_type)

    text = []
    labels = []

    if wanted_types:
        selected = gpt_data[gpt_data['SAMPLE_TYPE'].isin(wanted_types)]
        gpt_text = selected.iloc[:, gpt_text_col].tolist()
        text.extend(gpt_text)
        labels.extend([fake_label] * len(gpt_text))

    if len(panda_data) > 0:
        amazon_labels = panda_data.iloc[:, amazon_label_col]
        amazon_text = panda_data.iloc[:, amazon_text_col]
        if not both_labels:
            # real data only: use a positional boolean mask to avoid index alignment
            keep = (amazon_labels == real_label).to_numpy()
            amazon_labels = amazon_labels[keep]
            amazon_text = amazon_text[keep]
        text.extend(amazon_text.tolist())
        labels.extend(amazon_labels.tolist())

    if not text:
        # Nothing selected: skip the shuffle, whose empty result cannot be unpacked.
        return {'text': (), 'labels': ()}

    text, labels = shuffle_list(text, labels)

    return {'text': text, 'labels': labels}

# feature_extraction with sklearn methods

In [88]:
def run_classifier(n_range, info, train=None, test=None, random_state=None):
    """Fit several text classifiers and print the accuracy of each.

    Every classifier is wrapped in a CountVectorizer -> TfidfTransformer ->
    classifier pipeline, fitted on the training texts and scored on the test
    texts by plain accuracy (fraction of matching labels).

    Args:
        n_range: ``ngram_range`` tuple handed to ``CountVectorizer``.
        info: Heading printed before the results of this run.
        train: Dict with ``'text'`` and ``'labels'`` entries used for fitting.
            Defaults to the notebook-level ``train_data``.
        test: Dict with ``'text'`` and ``'labels'`` entries used for scoring.
            Defaults to the notebook-level ``test_data``.
        random_state: Forwarded to ``SGDClassifier`` so its result can be
            made reproducible. ``None`` keeps it unseeded.

    Returns:
        None. The per-classifier accuracies and the best ``(name, accuracy)``
        pair are printed.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if train is None:
        train = train_data
    if test is None:
        test = test_data

    print(info)

    candidates = (
        # (LinearSVC(), "LinearSVC"),
        (MultinomialNB(), "Multi  NB"),
        (RidgeClassifier(), "Ridge cls"),
        (SGDClassifier(random_state=random_state), "SGD   cls"),
        # the following 3 perform quite badly
        # (PassiveAggressiveClassifier(), "Pass-Aggr"),
        # (Perceptron(), "Perceptro"),
        # (KNeighborsClassifier(), "k - N - N"),
    )

    expected = np.asarray(test['labels'])

    # Start from None so the summary print below never hits an unbound name,
    # even when every accuracy is 0 or NaN.
    best_result = None
    for clf, name in candidates:
        text_clf = Pipeline([
            ('vect', CountVectorizer(ngram_range=n_range)),
            ('tfidf', TfidfTransformer()),
            ('clf', clf),
        ])

        text_clf.fit(train['text'], train['labels'])

        predicted = text_clf.predict(test['text'])
        acc = np.mean(predicted == expected)

        print(name, acc)

        # Strict '>' keeps the earliest classifier on ties.
        if best_result is None or acc > best_result[1]:
            best_result = (name, acc)

    print('Best result in this run: ', best_result)

In [96]:
# Experiment: train on fake + real amazon reviews, test on the amazon test set.
loader = DataLoader()

path = 'data/gpt/gpt_generated_data.csv'
gpt_data = pd.read_csv(path, sep=',')

# Keep the raw DataFrames under their own names; train_data / test_data are
# reserved for the processed dicts that run_classifier reads.
train_df = loader.load_clean_amazon(test_mode=False)
test_df = loader.load_clean_amazon(test_mode=True)
print("original pd dataframe", len(train_df), len(test_df))


# gpt_type=False: no generated samples are mixed in, so gpt_data is not read.
train_data = df2dict(train_df, gpt_data, gpt_type=False, both_labels=True)
# Pass None explicitly instead of IPython's `_` (the previous cell output),
# which is hidden state and not defined outside IPython.
test_data = df2dict(test_df, None, gpt_type=False, both_labels=True)
print("processed data", len(train_data['text']), len(test_data['text']))  # note the testing data should be the same

print('train: fake amazon + real amazon, test: amazon test set')

run_classifier((1,1), 'ngram range: 1')
run_classifier((1,2), 'ngram range: 1 - 2')
run_classifier((1,3), 'ngram range: 1 - 3')


original pd dataframe 15750 5250
processed data 15750 5250
train: fake amazon + real amazon, test: amazon test set
ngram range: 1
Multi  NB 0.652
Ridge cls 0.6455238095238095
SGD   cls 0.6660952380952381
Best result in this run:  ('SGD   cls', 0.6660952380952381)
ngram range: 1 - 2
Multi  NB 0.6622857142857143
Ridge cls 0.6693333333333333
SGD   cls 0.664952380952381
Best result in this run:  ('Ridge cls', 0.6693333333333333)
ngram range: 1 - 3
Multi  NB 0.6643809523809524
Ridge cls 0.6660952380952381
SGD   cls 0.6651428571428571
Best result in this run:  ('Ridge cls', 0.6660952380952381)


In [97]:
# Experiment: train on GUIDED gpt samples (as fake) + real amazon reviews,
# test on the amazon test set.
loader = DataLoader()

path = 'data/gpt/gpt_generated_data.csv'
gpt_data = pd.read_csv(path, sep=',')

# Keep the raw DataFrames under their own names; train_data / test_data are
# reserved for the processed dicts that run_classifier reads.
train_df = loader.load_clean_amazon(test_mode=False)
test_df = loader.load_clean_amazon(test_mode=True)
print("original pd dataframe", len(train_df), len(test_df))


train_data = df2dict(train_df, gpt_data, gpt_type=['GUIDED'], both_labels=False)
# Pass None explicitly instead of IPython's `_` (the previous cell output),
# which is hidden state; gpt_data is not read when gpt_type=False.
test_data = df2dict(test_df, None, gpt_type=False, both_labels=True)
print("processed data", len(train_data['text']), len(test_data['text']))  # note the testing data should be the same

print('train: fake gpt + real amazon, test: amazon test set')

run_classifier((1,1), 'ngram range: 1')
run_classifier((1,2), 'ngram range: 1 - 2')
run_classifier((1,3), 'ngram range: 1 - 3')


original pd dataframe 15750 5250
processed data 12870 5250
train: fake gpt + real amazon, test: amazon test set
ngram range: 1
Multi  NB 0.4942857142857143
Ridge cls 0.5034285714285714
SGD   cls 0.5055238095238095
Best result in this run:  ('SGD   cls', 0.5055238095238095)
ngram range: 1 - 2
Multi  NB 0.4942857142857143
Ridge cls 0.49676190476190474
SGD   cls 0.49961904761904763
Best result in this run:  ('SGD   cls', 0.49961904761904763)
ngram range: 1 - 3
Multi  NB 0.49447619047619046
Ridge cls 0.4961904761904762
SGD   cls 0.49676190476190474
Best result in this run:  ('SGD   cls', 0.49676190476190474)


In [98]:
# Experiment: train on NON-GUIDED gpt samples (as fake) + real amazon reviews,
# test on the amazon test set.
loader = DataLoader()

path = 'data/gpt/gpt_generated_data.csv'
gpt_data = pd.read_csv(path, sep=',')

# Keep the raw DataFrames under their own names; train_data / test_data are
# reserved for the processed dicts that run_classifier reads.
train_df = loader.load_clean_amazon(test_mode=False)
test_df = loader.load_clean_amazon(test_mode=True)
print("original pd dataframe", len(train_df), len(test_df))


train_data = df2dict(train_df, gpt_data, gpt_type=['NON-GUIDED'], both_labels=False)
# Pass None explicitly instead of IPython's `_` (the previous cell output),
# which is hidden state; gpt_data is not read when gpt_type=False.
test_data = df2dict(test_df, None, gpt_type=False, both_labels=True)
print("processed data", len(train_data['text']), len(test_data['text']))  # note the testing data should be the same

print('train: fake gpt + real amazon, test: amazon test set')

run_classifier((1,1), 'ngram range: 1')
run_classifier((1,2), 'ngram range: 1 - 2')
run_classifier((1,3), 'ngram range: 1 - 3')

original pd dataframe 15750 5250
processed data 12871 5250
train: fake gpt + real amazon, test: amazon test set
ngram range: 1
Multi  NB 0.49257142857142855
Ridge cls 0.4980952380952381
SGD   cls 0.49885714285714283
Best result in this run:  ('SGD   cls', 0.49885714285714283)
ngram range: 1 - 2
Multi  NB 0.4942857142857143
Ridge cls 0.4956190476190476
SGD   cls 0.4956190476190476
Best result in this run:  ('Ridge cls', 0.4956190476190476)
ngram range: 1 - 3
Multi  NB 0.49847619047619046
Ridge cls 0.496
SGD   cls 0.49523809523809526
Best result in this run:  ('Multi  NB', 0.49847619047619046)


In [99]:
# Experiment: train on GUIDED and NON-GUIDED gpt samples (as fake) + real
# amazon reviews, test on the amazon test set.
loader = DataLoader()

path = 'data/gpt/gpt_generated_data.csv'
gpt_data = pd.read_csv(path, sep=',')

# Keep the raw DataFrames under their own names; train_data / test_data are
# reserved for the processed dicts that run_classifier reads.
train_df = loader.load_clean_amazon(test_mode=False)
test_df = loader.load_clean_amazon(test_mode=True)
print("original pd dataframe", len(train_df), len(test_df))


train_data = df2dict(train_df, gpt_data, gpt_type=['GUIDED', 'NON-GUIDED'], both_labels=False)
# Pass None explicitly instead of IPython's `_` (the previous cell output),
# which is hidden state; gpt_data is not read when gpt_type=False.
test_data = df2dict(test_df, None, gpt_type=False, both_labels=True)
print("processed data", len(train_data['text']), len(test_data['text']))  # note the testing data should be the same

print('train: fake gpt + real amazon, test: amazon test set')

run_classifier((1,1), 'ngram range: 1')
run_classifier((1,2), 'ngram range: 1 - 2')
run_classifier((1,3), 'ngram range: 1 - 3')

original pd dataframe 15750 5250
processed data 17825 5250
train: fake gpt + real amazon, test: amazon test set
ngram range: 1
Multi  NB 0.5211428571428571
Ridge cls 0.5049523809523809
SGD   cls 0.5104761904761905
Best result in this run:  ('Multi  NB', 0.5211428571428571)
ngram range: 1 - 2
Multi  NB 0.5363809523809524
Ridge cls 0.49752380952380953
SGD   cls 0.5009523809523809
Best result in this run:  ('Multi  NB', 0.5363809523809524)
ngram range: 1 - 3
Multi  NB 0.5335238095238095
Ridge cls 0.49847619047619046
SGD   cls 0.5013333333333333
Best result in this run:  ('Multi  NB', 0.5335238095238095)


In [100]:
# Experiment: train on NON-GUIDED gpt samples (as fake) + all amazon reviews,
# test on the amazon test set.
loader = DataLoader()

path = 'data/gpt/gpt_generated_data.csv'
gpt_data = pd.read_csv(path, sep=',')

# Keep the raw DataFrames under their own names; train_data / test_data are
# reserved for the processed dicts that run_classifier reads.
train_df = loader.load_clean_amazon(test_mode=False)
test_df = loader.load_clean_amazon(test_mode=True)
print("original pd dataframe", len(train_df), len(test_df))


# NOTE: both_labels=True here, so the training set also keeps the fake amazon
# reviews in addition to the gpt samples; the description printed below does
# not mention them.
train_data = df2dict(train_df, gpt_data, gpt_type=['NON-GUIDED'], both_labels=True)
# Pass None explicitly instead of IPython's `_` (the previous cell output),
# which is hidden state; gpt_data is not read when gpt_type=False.
test_data = df2dict(test_df, None, gpt_type=False, both_labels=True)
print("processed data", len(train_data['text']), len(test_data['text']))  # note the testing data should be the same

print('train: fake gpt + real amazon, test: amazon test set')

run_classifier((1,1), 'ngram range: 1')
run_classifier((1,2), 'ngram range: 1 - 2')
run_classifier((1,3), 'ngram range: 1 - 3')

original pd dataframe 15750 5250
processed data 20705 5250
train: fake gpt + real amazon, test: amazon test set
ngram range: 1
Multi  NB 0.5459047619047619
Ridge cls 0.6310476190476191
SGD   cls 0.6438095238095238
Best result in this run:  ('SGD   cls', 0.6438095238095238)
ngram range: 1 - 2
Multi  NB 0.5089523809523809
Ridge cls 0.6653333333333333
SGD   cls 0.652
Best result in this run:  ('Ridge cls', 0.6653333333333333)
ngram range: 1 - 3
Multi  NB 0.5087619047619047
Ridge cls 0.6657142857142857
SGD   cls 0.6518095238095238
Best result in this run:  ('Ridge cls', 0.6657142857142857)
