In [1]:
from utils.loader import DataLoader
import numpy as np
import pandas as pd
import os
from pathlib import Path
from random import shuffle
import copy

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RidgeClassifier, SGDClassifier
# from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron
# from sklearn.neighbors import KNeighborsClassifier


def shuffle_dict(data):
    """Shuffle the texts and labels of a dataset in unison.

    Args:
        data: mapping whose 'text' and 'labels' entries are iterables of
            the same length; entry i of one belongs to entry i of the other.

    Returns:
        A new dict with the keys 'text' and 'labels', each a tuple. The
        (text, label) pairs are the same as in ``data`` but in random order.
        ``data`` itself is not modified.
    """
    pairs = list(zip(data['text'], data['labels']))

    if not pairs:
        # zip(*[]) yields nothing to unpack into (text, labels), so an empty
        # dataset is returned as two empty tuples instead of raising.
        return {'text': (), 'labels': ()}

    shuffle(pairs)
    text, labels = zip(*pairs)

    # Built inline so the builtin name `dict` is not shadowed by a local.
    return {'text': text,
            'labels': labels}


def finalize_amazon_test(panda_data):
    """Convert the amazon test frame into a single text/labels dictionary.

    Every row contributes its column 8 value as text. A row whose column 1
    equals '__label1__' (the fake data) keeps that label; every other row is
    labelled '__label2__' (the real data).

    Returns:
        dict with the keys 'text' and 'labels', two lists in row order.
    """
    rows = panda_data.to_numpy()

    # the text is taken from column 8 regardless of the label
    text = [row[8] for row in rows]

    # anything that is not explicitly '__label1__' is treated as real
    labels = ['__label1__' if row[1] == '__label1__' else '__label2__'
              for row in rows]

    return {'text': text,
            'labels': labels}


def finalize_amazon(panda_data):
    """Split the amazon frame into a fake and a real text/labels dictionary.

    A row whose column 1 equals '__label1__' goes to the fake dictionary;
    every other row goes to the real dictionary with the label '__label2__'.
    The text of a row is its column 8 value.

    Returns:
        (dict_fake, dict_real), each a dict with the list-valued keys
        'text' and 'labels'.
    """
    dict_fake = {'text': [], 'labels': []}
    dict_real = {'text': [], 'labels': []}

    for row in panda_data.to_numpy():
        # pick the destination and its label first, then append once
        if row[1] == '__label1__':
            target, label = dict_fake, '__label1__'
        else:
            target, label = dict_real, '__label2__'

        target['text'].append(row[8])
        target['labels'].append(label)

    return dict_fake, dict_real


def finalize_gpt(panda_data):
    """Split the gpt frame into guided and non-guided text/labels dictionaries.

    Column 2 of a row selects the group ('GUIDED' or 'NON-GUIDED'); rows with
    any other value are dropped. Column 1 is taken as the text. Both groups
    receive the label '__label1__'.

    Returns:
        (dict_Guide, dict_nonGuide), each a dict with the list-valued keys
        'text' and 'labels'.
    """
    rows = panda_data.to_numpy()

    guided_text = [row[1] for row in rows if row[2] == 'GUIDED']
    non_guided_text = [row[1] for row in rows if row[2] == 'NON-GUIDED']

    # every generated review carries '__label1__', one label per text
    dict_Guide = {'text': guided_text,
                  'labels': ['__label1__'] * len(guided_text)}

    dict_nonGuide = {'text': non_guided_text,
                     'labels': ['__label1__'] * len(non_guided_text)}

    return dict_Guide, dict_nonGuide


def finalize_gold(panda_data):
    """Split the gold frame into a fake and a real text/labels dictionary.

    A row whose first column equals '__label1__' counts as fake; every other
    row counts as real and is labelled '__label2__'. The text of a row is its
    last column.

    Returns:
        (dict_fake, dict_real), each a dict with the list-valued keys
        'text' and 'labels'.
    """
    rows = panda_data.to_numpy()

    # decide once per row, then partition the texts with that decision
    fake_flags = [row[0] == '__label1__' for row in rows]

    text_fake = [row[-1] for row, is_fake in zip(rows, fake_flags) if is_fake]
    text_real = [row[-1] for row, is_fake in zip(rows, fake_flags) if not is_fake]

    dict_fake = {'text': text_fake,
                 'labels': ['__label1__' for _ in text_fake]}

    dict_real = {'text': text_real,
                 'labels': ['__label2__' for _ in text_real]}

    return dict_fake, dict_real


def merge_data(d1, d2):
    """Concatenate two datasets key by key.

    For every key of ``d1`` the value of ``d1`` is joined with the value
    stored under the same key in ``d2`` using ``+``. Keys that exist only in
    ``d2`` are ignored; a key missing from ``d2`` raises ``KeyError``.

    Returns:
        A new dict holding the concatenated values.
    """
    return {key: values + d2[key] for key, values in d1.items()}


def merge_shuffle_data(data1, data2):
    """Merge two datasets, shuffle the result and report its size.

    The datasets are combined with ``merge_data`` and the combined data is
    shuffled with ``shuffle_dict``. The number of texts is printed.

    Returns:
        The merged and shuffled dataset.
    """
    shuffled = shuffle_dict(merge_data(data1, data2))
    total = len(shuffled['text'])
    print('Datasets merged and shuffled, total data: ', total)

    return shuffled

def run_classifier(n_range, info, train_data, test_data):
    """Train several text classifiers and print the accuracy of each.

    Each classifier is wrapped in a CountVectorizer -> TfidfTransformer ->
    classifier pipeline, fitted on ``train_data`` and scored on ``test_data``.

    Args:
        n_range: ``ngram_range`` passed to the CountVectorizer, e.g. (1, 2).
        info: free text printed before the results of this run.
        train_data: dict with 'text' and 'labels' used for fitting.
        test_data: dict with 'text' and 'labels' used for scoring.

    Returns:
        None. The per-classifier accuracy and the best (name, accuracy) pair
        are printed.
    """
    print(info)

    candidates = (
        (LinearSVC(), "LinearSVC"),
        (MultinomialNB(), "Multi  NB"),
        (RidgeClassifier(), "Ridge cls"),
        (SGDClassifier(), "SGD   cls"),
        # the following 3 perform quite badly
        # (PassiveAggressiveClassifier(), "Pass-Aggr"),
        # (Perceptron(), "Perceptro"),
        # (KNeighborsClassifier(), "k - N - N"),
    )

    # Converted once so the element-wise comparison with the predictions is
    # explicit, whether the labels arrive as a list or as a tuple.
    true_labels = np.asarray(test_data['labels'])

    # Starts as None so the summary print below can never hit an unbound
    # name, even when every classifier scores an accuracy of 0.
    best_result = None

    for clf, name in candidates:
        text_clf = Pipeline([
                            ('vect', CountVectorizer(ngram_range=n_range)),
                            ('tfidf', TfidfTransformer()),
                            ('clf', clf),
                            ])

        text_clf.fit(train_data['text'], train_data['labels'])

        predicted = text_clf.predict(test_data['text'])
        acc = np.mean(predicted == true_labels)

        print(name, acc)

        # strict '>' keeps the first classifier on ties, as before
        if best_result is None or acc > best_result[1]:
            best_result = (name, acc)

    print('Best result in this run: ', best_result)

### Load the data and convert them to dictionaries

In [2]:
loader = DataLoader()

# Load the 3 main datasets: gold, amazon, gpt
path = 'data/gpt/gpt_generated_data.csv'
gpt_data = pd.read_csv(path, sep=',')
gold_data = loader.load_gold_txt(deceptive=False, all=True)
train_amazon = loader.load_clean_amazon(test_mode=False)
test_amazon = loader.load_clean_amazon(test_mode=True)

# Convert every dataset into {'text': ..., 'labels': ...} dictionaries,
# the format the classifier helpers expect.
amazon_test = finalize_amazon_test(test_amazon)
print("amazon testing set", len(amazon_test['text']))

amazon_fake, amazon_real = finalize_amazon(train_amazon)
print("amazon fake/real", len(amazon_fake['text']), len(amazon_real['text']))

# 80/20 train/test split of the gold data. A single split index is used for
# both parts: truncating 0.8*n and 0.2*n separately can leave rows out of
# both parts when n is not a multiple of 5.
gold_data_num = len(gold_data)
gold_split = int(gold_data_num * 0.8)

gold_fake, gold_real = finalize_gold(gold_data.iloc[:gold_split])
print("gold fake/real", len(gold_fake['text']), len(gold_real['text']))
gold_fake_test, gold_real_test = finalize_gold(gold_data.iloc[gold_split:])
print("gold test fake/real", len(gold_fake_test['text']), len(gold_real_test['text']))

gpt_Guide, gpt_nonGuide = finalize_gpt(gpt_data)
print("gpt Guide/nonGuide", len(gpt_Guide['text']), len(gpt_nonGuide['text']))

  data = pd.read_csv(data_path, sep='   ')


amazon testing set 5250
amazon fake/real 7834 7916
gold fake/real 651 629
gold test fake/real 149 171
gpt Guide/nonGuide 4954 4955


### Run the classifier with different dataset to compare

+ Train: gold--------Test: gold 
+ basic training
---
+ Train: gold--------Test: amazon 
+ different test set on trained model
---
+ Train: amazon-----Test: amazon 
+ basic training
---
+ Train: amazon-----Test: gold 
+ different test set on trained model
---
+ Train: amazon real + gpt guide----------Test: amazon 
+ Train: amazon real + gpt non-guide------Test: amazon 
+ test gpt data performance, see if gpt can replace real reviews
---
FINAL TEST: LABEL GPT DATA AS REAL!
+ Train: amazon fake + gpt guide (label as real)----------Test: amazon 
---
Added: compare with final test, see if gpt data actually blends in the data
+ Train: gold real + amazon fake --------Test: amazon 
NOTE: Comparing gpt with the gold dataset is meaningless, since we know that a model trained on a specific dataset is only capable of classifying that specific dataset. Since gpt was trained on the amazon dataset, it should only be compared with the amazon dataset.

In [8]:
# Held-out gold split first, then the gold training split.
gold_test = merge_shuffle_data(gold_fake_test, gold_real_test)
gold_mix = merge_shuffle_data(gold_fake, gold_real)

# Train on gold; evaluate on gold, then on the amazon test set.
run_classifier(n_range=(1, 2), info='ngram: 1 - 2',
               train_data=gold_mix, test_data=gold_test)
run_classifier(n_range=(1, 2), info='ngram: 1 - 2',
               train_data=gold_mix, test_data=amazon_test)

Datasets merged and shuffled, total data:  320
Datasets merged and shuffled, total data:  1280
ngram: 1 - 2
LinearSVC 0.909375
Multi  NB 0.73125
Ridge cls 0.9
SGD   cls 0.890625
Best result in this run:  ('LinearSVC', 0.909375)
ngram: 1 - 2
LinearSVC 0.5118095238095238
Multi  NB 0.5499047619047619
Ridge cls 0.5110476190476191
SGD   cls 0.5154285714285715
Best result in this run:  ('Multi  NB', 0.5499047619047619)


In [4]:
amazon_mix = merge_shuffle_data(amazon_fake, amazon_real)

# Train on amazon; evaluate on the amazon test set, then on the gold test set.
run_classifier(n_range=(1, 2), info='ngram: 1 - 2',
               train_data=amazon_mix, test_data=amazon_test)
run_classifier(n_range=(1, 2), info='ngram: 1 - 2',
               train_data=amazon_mix, test_data=gold_test)

Datasets merged and shuffled, total data:  15750
ngram: 1 - 2
LinearSVC 0.6651428571428571
Multi  NB 0.6622857142857143
Ridge cls 0.6693333333333333
SGD   cls 0.6643809523809524
Best result in this run:  ('Ridge cls', 0.6693333333333333)
ngram: 1 - 2
LinearSVC 0.575
Multi  NB 0.6125
Ridge cls 0.58125
SGD   cls 0.5875
Best result in this run:  ('Multi  NB', 0.6125)


In [5]:
# Real amazon reviews combined with each kind of gpt-generated review.
amazonReal_gptGuide = merge_shuffle_data(amazon_real, gpt_Guide)
amazonReal_gptNonGuide = merge_shuffle_data(amazon_real, gpt_nonGuide)

# Both training sets are evaluated on the amazon test set.
run_classifier(n_range=(1, 2), info='ngram: 1 - 2',
               train_data=amazonReal_gptGuide, test_data=amazon_test)
run_classifier(n_range=(1, 2), info='ngram: 1 - 2',
               train_data=amazonReal_gptNonGuide, test_data=amazon_test)

Datasets merged and shuffled, total data:  12870
Datasets merged and shuffled, total data:  12871
ngram: 1 - 2
LinearSVC 0.5001904761904762
Multi  NB 0.4942857142857143
Ridge cls 0.49676190476190474
SGD   cls 0.49942857142857144
Best result in this run:  ('LinearSVC', 0.5001904761904762)
ngram: 1 - 2
LinearSVC 0.4956190476190476
Multi  NB 0.4942857142857143
Ridge cls 0.4956190476190476
SGD   cls 0.49523809523809526
Best result in this run:  ('LinearSVC', 0.4956190476190476)


In [6]:
# Real gold reviews combined with the guided gpt reviews, tested on gold.
goldReal_gptGuide = merge_shuffle_data(gold_real, gpt_Guide)
run_classifier(n_range=(1, 2), info='ngram: 1 - 2',
               train_data=goldReal_gptGuide, test_data=gold_test)

Datasets merged and shuffled, total data:  5583
ngram: 1 - 2
LinearSVC 0.534375
Multi  NB 0.5625
Ridge cls 0.534375
SGD   cls 0.534375
Best result in this run:  ('Multi  NB', 0.5625)


In [7]:
# Work on a copy so the labels of gpt_nonGuide itself stay untouched, then
# relabel every generated review as real ('__label2__').
# NOTE(review): the variable name below and the notebook outline both say
# "gpt guide", but the data relabelled here is gpt_nonGuide - confirm which
# of the two is intended.
gpt_as_real = copy.deepcopy(gpt_nonGuide)
gpt_as_real['labels'] = ['__label2__'] * len(gpt_as_real['labels'])

amazonFake_gptGuide = merge_shuffle_data(gpt_as_real, amazon_fake)
run_classifier(n_range=(1, 2), info='ngram: 1 - 2',
               train_data=amazonFake_gptGuide, test_data=amazon_test)

Datasets merged and shuffled, total data:  12789
ngram: 1 - 2
LinearSVC 0.506095238095238
Multi  NB 0.5104761904761905
Ridge cls 0.5070476190476191
SGD   cls 0.5072380952380953
Best result in this run:  ('Multi  NB', 0.5104761904761905)
