# Libraries

In [1]:
# for local import
# Adds the directory two levels above the working directory to the module
# search path (presumably where the `src` package imported below lives — confirm).
import sys
if "../../" not in sys.path:  # guard: re-running the cell must not append duplicates
    sys.path.append("../../")

In [2]:
# for working with data
import pandas as pd
import numpy as np

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for preprocessing
from src.preprocessing import BasicTextCleaning
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, f_classif, VarianceThreshold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

# for modelling
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier

# for evaluation
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, precision_score

# Load Data

In [3]:
# Read the raw reviews CSV; the path is relative to the notebook's working directory.
osf = pd.read_csv("../../data/fake_reviews_dataset.csv")
print("Shape of OFS: ", osf.shape)
# osf.head()
# Bare last expression: the frame is rendered as the cell output.
osf

Shape of OFS:  (40432, 4)


Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ..."
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...


# Preprocessing

### Basic Text Cleaning

In [4]:
# Project text-cleaning helper (from src.preprocessing); its `text_cleaning`
# method is applied to the raw review texts when the cleaned dataset is built.
cleaner = BasicTextCleaning()

In [5]:
# Load the cached cleaned dataset when it exists; otherwise build it from the
# raw frame `osf` and write the cache for the next run.
try:
    osf_cleaned = pd.read_csv("../../data/cleaned/osf_cleaned.csv")
    # Missing values come back from the CSV as NaN (presumably texts that were
    # empty after cleaning); normalise them to empty strings.
    osf_cleaned = osf_cleaned.replace(np.nan, '')
except FileNotFoundError:
    # Only a missing cache file triggers a rebuild. Any other failure
    # (corrupt file, interrupt, permission error) is surfaced rather than
    # silently overwriting the cache.
    osf_cleaned = pd.DataFrame()
    # Column order matters downstream: the last column is treated as the label.
    osf_cleaned['length'] = osf['text_'].apply(len)
    osf_cleaned['texts'] = cleaner.text_cleaning(osf['text_'])

    # Encode labels as integers: 'OR' -> 0, 'CG' -> 1.
    ordinal = OrdinalEncoder(categories=[['OR', 'CG']], dtype=int)
    osf_cleaned['labels'] = ordinal.fit_transform(osf[['label']])
    osf_cleaned.to_csv("../../data/cleaned/osf_cleaned.csv", index=False)

In [6]:
def avg_word2vec(sentences, w2v_model):
    """Represent each tokenised sentence by the mean of its word vectors.

    Parameters
    ----------
    sentences : iterable of iterables of str
        Tokenised sentences. A falsy entry (e.g. an empty list) is treated as
        a sentence with no tokens.
    w2v_model : object
        Model exposing ``wv.key_to_index``, ``wv.get_vector(word)`` and
        ``vector_size`` (for example a trained gensim ``Word2Vec``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_sentences, vector_size)``. A sentence without any
        in-vocabulary token (empty, or every token unknown to the model) is
        mapped to the zero vector.
    """
    vocabulary = w2v_model.wv.key_to_index
    dim = w2v_model.vector_size

    averaged = []
    for sentence in sentences:
        tokens = sentence if sentence else ()
        known_vectors = [w2v_model.wv.get_vector(word) for word in tokens if word in vocabulary]
        if known_vectors:
            averaged.append(np.mean(known_vectors, axis=0))
        else:
            # Averaging an empty list would yield a scalar NaN and make the
            # stacked result ragged, so fall back to a zero vector instead.
            averaged.append(np.zeros(dim))

    if not averaged:
        # Keep the result two-dimensional even when there are no sentences.
        return np.empty((0, dim))
    return np.array(averaged)

def text_extractor(X_train, X_test, extractor):
    """Turn raw texts into numeric feature frames, fitting on the training split only.

    Parameters
    ----------
    X_train, X_test : sequences of str
        Training and test texts.
    extractor : Word2Vec or vectorizer
        A ``Word2Vec`` instance is used only as a template: its
        ``vector_size``, ``window`` and ``sg`` are copied into a fresh model
        that is trained on the tokenised training texts. Any other object is
        expected to offer ``fit_transform`` / ``transform`` returning
        something with ``.toarray()``, plus ``get_feature_names_out``.

    Returns
    -------
    (X_train, X_test) : tuple of pandas.DataFrame
        Feature frames after dropping the columns rejected by a default
        ``VarianceThreshold`` fitted on the training features.
    """
    if not isinstance(extractor, Word2Vec):
        train_matrix = extractor.fit_transform(X_train).toarray()
        test_matrix = extractor.transform(X_test).toarray()
        X_train = pd.DataFrame(train_matrix, columns=extractor.get_feature_names_out())
        X_test = pd.DataFrame(test_matrix, columns=extractor.get_feature_names_out())
    else:
        # Fresh model per call so nothing learned on a previous split is reused.
        # NOTE(review): seed=42 together with workers=5 may not give fully
        # reproducible vectors — confirm against the gensim documentation.
        w2v = Word2Vec(vector_size=extractor.vector_size,
                       sg=extractor.sg,
                       window=extractor.window,
                       min_count=1, workers=5, seed=42)

        tokenizer = BasicTextCleaning()
        train_tokens = tokenizer.text_cleaning(texts=X_train, methods=['tokenization'])
        test_tokens = tokenizer.text_cleaning(texts=X_test, methods=['tokenization'])

        # Vocabulary and weights come from the training tokens only.
        w2v.build_vocab(train_tokens)
        w2v.train(train_tokens, total_examples=w2v.corpus_count, epochs=30)

        X_train = pd.DataFrame(avg_word2vec(train_tokens, w2v),
                               columns=[str(i) for i in range(w2v.vector_size)])
        X_test = pd.DataFrame(avg_word2vec(test_tokens, w2v),
                              columns=[str(i) for i in range(w2v.vector_size)])

    # Drop features the threshold rejects on the training split, and apply the
    # same column mask to the test split.
    variance = VarianceThreshold()
    train_kept = variance.fit_transform(X_train)
    test_kept = variance.transform(X_test)
    X_train = pd.DataFrame(train_kept, columns=variance.get_feature_names_out())
    X_test = pd.DataFrame(test_kept, columns=variance.get_feature_names_out())

    return X_train, X_test

def feature_selection(X_train, X_test, selector, y_train=None):
    """Fit a feature selector/reducer on the training features and apply it to both splits.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test feature frames.
    selector : object
        Transformer offering ``fit_transform``, ``transform`` and
        ``get_feature_names_out``.
    y_train : array-like, optional
        Training labels. Supervised selectors (e.g. ``SelectKBest``) need
        them to score features; when omitted the selector is fitted on the
        features alone, exactly as before.

    Returns
    -------
    (X_train, X_test) : tuple of pandas.DataFrame
        Transformed frames whose columns are named by
        ``selector.get_feature_names_out()``.
    """
    if y_train is None:
        train_selected = selector.fit_transform(X_train)
    else:
        # Forward the labels so score-based selectors can be fitted.
        train_selected = selector.fit_transform(X_train, y_train)
    test_selected = selector.transform(X_test)

    X_train = pd.DataFrame(train_selected, columns=selector.get_feature_names_out())
    X_test = pd.DataFrame(test_selected, columns=selector.get_feature_names_out())

    return X_train, X_test

In [7]:
def modelling(model, X_train, y_train, X_test, probability=True):
    """Fit ``model`` on the training split and predict the test split.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict`` (and ``predict_proba``
        when ``probability`` is true).
    X_train, y_train : training features and labels.
    X_test : test features.
    probability : bool, default True
        Whether to also return ``model.predict_proba(X_test)``.

    Returns
    -------
    ``(y_pred, y_pred_proba)`` when ``probability`` is true, otherwise just
    ``y_pred``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Hard labels only: skip the probability call entirely.
    if not probability:
        return predictions

    return predictions, model.predict_proba(X_test)

def evaluation(y_true, y_pred, y_pred_prob, scoring=['accuracy', 'f1', 'recall', 'precision', 'roc_auc']):
    """Compute the requested classification metrics.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels, used by every metric except ``'roc_auc'``.
    y_pred_prob : array supporting ``.T``; ``y_pred_prob.T[1]`` (the second
        column of a 2-D array) is passed to ``roc_auc_score``.
    scoring : list of str
        Metric names to compute; each must be one of ``'accuracy'``,
        ``'f1'``, ``'recall'``, ``'precision'``, ``'roc_auc'``.

    Returns
    -------
    dict mapping each requested metric name to its score.
    """
    metric_functions = {'accuracy': accuracy_score,
                        'f1': f1_score,
                        'recall': recall_score,
                        'precision': precision_score,
                        'roc_auc': roc_auc_score}

    def _score(name):
        metric = metric_functions[name]
        # ROC AUC ranks by a continuous score, so it is fed the second
        # probability column instead of the hard predictions.
        if name == 'roc_auc':
            return metric(y_true, y_pred_prob.T[1])
        return metric(y_true, y_pred)

    return {name: _score(name) for name in scoring}

In [8]:
def cross_validation(data, extractor, model=None, selector=None, length_scaler=None,
                     scoring=['accuracy', 'f1', 'recall', 'precision', 'roc_auc'], cv=5,
                     avg_output=True, quiet=True):
    """Run shuffled K-fold cross-validation of a text-classification pipeline.

    For every fold the extractor, optional length scaler and optional
    selector are fitted on the training split only, then ``model`` is fitted
    and scored on the held-out split.

    Parameters
    ----------
    data : pandas.DataFrame
        The last column is the label; a ``'texts'`` column is required, and a
        ``'length'`` column is required when ``length_scaler`` is given.
    extractor : passed to ``text_extractor``.
    model : estimator with ``fit``, ``predict`` and ``predict_proba``. Required.
    selector : optional transformer passed to ``feature_selection``.
    length_scaler : optional scaler applied to the ``'length'`` column, whose
        output is added to the features as a column named ``'length'``.
    scoring : list of str
        Metric names understood by ``evaluation``.
    cv : int
        Number of folds.
    avg_output : bool
        If true, return the mean of each metric over the folds; otherwise
        return the per-fold lists.
    quiet : bool
        If false, print a line after each completed fold.

    Returns
    -------
    dict
        ``{metric: mean score}`` when ``avg_output`` is true, else
        ``{metric: [score per fold]}``.

    Raises
    ------
    ValueError
        If ``model`` is None.
    """
    if model is None:
        # Fail before the expensive feature extraction rather than after it.
        raise ValueError("cross_validation requires a `model` to fit")

    kfolds = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = {method: [] for method in scoring}
    features, target = data.iloc[:, :-1], data.iloc[:, -1]

    for fold, (train_indices, test_indices) in enumerate(kfolds.split(features, target), start=1):
        train_set, test_set = features.iloc[train_indices], features.iloc[test_indices]
        y_train, y_test = target.iloc[train_indices], target.iloc[test_indices]

        X_train, X_test = text_extractor(X_train=train_set['texts'], X_test=test_set['texts'], extractor=extractor)

        if length_scaler is not None:
            # NOTE: an extracted feature that is itself named 'length' (e.g. a
            # vocabulary term) would be overwritten by this column.
            X_train['length'] = length_scaler.fit_transform(train_set[['length']])
            X_test['length'] = length_scaler.transform(test_set[['length']])

        if selector is not None:
            # NOTE: labels are not forwarded on this call, so only selectors
            # that can be fitted without `y` work through this path.
            X_train, X_test = feature_selection(X_train, X_test, selector)

        y_pred, y_pred_prob = modelling(model, X_train, y_train, X_test)

        result = evaluation(y_true=y_test, y_pred=y_pred, y_pred_prob=y_pred_prob, scoring=scoring)
        for method in scoring:
            scores[method].append(result[method])

        if not quiet:
            print(f"round {fold}: done")

    if avg_output:
        return {key: np.mean(values) for key, values in scores.items()}

    # Previously this path raised UnboundLocalError; return the raw fold scores.
    return scores

In [9]:
# Baseline run: TF-IDF unigrams + standardised review length, scored with a
# class-balanced logistic regression.
# Alternative extractor: Word2Vec(window=5, vector_size=50, seed=42, sg=1)
extractor = TfidfVectorizer(min_df=0.001, ngram_range=(1, 1))

# Alternative model: SVC(probability=True, class_weight='balanced')
# (probability=True is required because the pipeline calls predict_proba).
model = LogisticRegression(max_iter=1000, class_weight="balanced")

cross_validation(data=osf_cleaned, length_scaler=StandardScaler(),
                 model=model, extractor=extractor, quiet=False)

round 1: done
round 2: done
round 3: done
round 4: done
round 5: done


{'accuracy': 0.8606548082210462,
 'f1': 0.8603401641065223,
 'recall': 0.8586468095233762,
 'precision': 0.862047397940788,
 'roc_auc': 0.9367511305235844}

In [10]:
# Experiment grid: each top-level key is one axis of the search, mapping a
# human-readable label to the object used for that option.
# NOTE: the selector axis is keyed 'feature selection' (with a space), while
# the results dict uses 'feature_selection' (with an underscore).
# NOTE: `testcases` is reassigned in the later grid-search cell (In [29]).
testcases = {'data': {'osf': osf_cleaned},
             'length_used': {'None': None,
                             'StandardScaler': StandardScaler(),
                             'MinMaxScaler': MinMaxScaler()},
             # Label template only; the Word2Vec objects are built by the search loop.
             'feature_extraction': ['Word2Vec(vector_size={}, window={})'],
             # 'PCA' and 'SelectKBest' map to the classes themselves, not to
             # configured instances.
             'feature selection': {'None': None,
                                   'PCA': PCA,
                                   'SelectKBest(score_func={}, k={})': SelectKBest},
             'model': {'LogisticRegression(max_iter=1000, class_weight="balanced")': LogisticRegression(max_iter=1000, class_weight='balanced')}}

In [11]:
# Column-oriented results accumulator: one independent, initially empty list
# per result column, in the order the columns appear in the results table.
output = {column: [] for column in ('data',
                                    'length_used',
                                    'feature_extraction',
                                    'feature_selection',
                                    'model',
                                    'accuracy',
                                    'f1',
                                    'recall',
                                    'precision',
                                    'roc_auc',
                                    'notes')}

In [29]:
# output = {'data': [],
#           'length_used': [],
#           'feature_extraction': [],
#           'feature_selection': [],
#           'model': [],
#           'accuracy': [],
#           'f1': [],
#           'recall': [],
#           'precision': [],
#           'roc_auc': [],
#           'notes': []}

# testcases = {'data': {'osf': osf_cleaned},
#              'length_used': {'None': None,
#                              'StandardScaler': StandardScaler(),
#                              'MinMaxScaler': MinMaxScaler()},
#              'feature_extraction': ['Word2Vec(vector_size={}, window={})'],
#              'feature selection': {'None': None},
#              'model': {'LogisticRegression(max_iter=1000, class_weight="balanced")': LogisticRegression(max_iter=1000, class_weight='balanced')}}

# for data_name in testcases['data']:
#     for length in testcases['length_used']:
#         if length == 'None':
#             data = testcases['data'][data_name].iloc[:, 1:].copy()
#         else:
#             data = testcases['data'][data_name].copy()
#         for size in np.arange(100, 1100, 100):
#             for window in range(3, 9, 2):
#                 extractor = Word2Vec(vector_size=size, window=window, workers=5, min_count=1, seed=42)
#                 for selector_name in testcases['feature selection']:
#                     selector = testcases['feature selection'][selector_name]
#                     for model_name in testcases['model']:
#                         model = testcases['model'][model_name]
#                         scores = cross_validation(data=data,
#                                                   length_scaler=testcases['length_used'][length],
#                                                   extractor=extractor,
#                                                   selector=selector,
#                                                   model=model)
                        
#                         output['data'].append(data_name)
#                         output['length_used'].append(length)
#                         output['feature_extraction'].append(f'Word2Vec(vector_size={size}, window={window})')
#                         output['feature_selection'].append(selector_name)
#                         output['model'].append(model_name)
#                         for key, values in scores.items():
#                             output[key].append(values)

# Fresh results accumulator: one empty list per result column.
output = {column: [] for column in ('data', 'length_used', 'feature_extraction',
                                    'feature_selection', 'model', 'accuracy', 'f1',
                                    'recall', 'precision', 'roc_auc', 'notes')}

# Grid for this run: no length feature, no feature selection, one model.
testcases = {
    'data': {'osf': osf_cleaned},
    'length_used': {'None': None},
    'feature_extraction': ['Word2Vec(vector_size={}, window={})'],
    'feature selection': {'None': None},
    'model': {
        'LogisticRegression(max_iter=1000, class_weight="balanced")': LogisticRegression(max_iter=1000, class_weight='balanced'),
    },
}

for data_name in testcases['data']:
    for length in testcases['length_used']:
        # Without a length scaler the first column is dropped by position.
        data = testcases['data'][data_name]
        data = data.iloc[:, 1:].copy() if length == 'None' else data.copy()

        # np.arange(300, 310, 100) yields the single vector size 300.
        for size in np.arange(300, 310, 100):
            for window in range(3, 9, 2):
                extractor = Word2Vec(vector_size=size, window=window, workers=5, min_count=1, seed=42, sg=1)

                for selector_name, selector in testcases['feature selection'].items():
                    for model_name, model in testcases['model'].items():
                        scores = cross_validation(data=data,
                                                  length_scaler=testcases['length_used'][length],
                                                  extractor=extractor,
                                                  selector=selector,
                                                  model=model)

                        # Record the configuration of this run ...
                        output['data'].append(data_name)
                        output['length_used'].append(length)
                        output['feature_extraction'].append(f'Word2Vec(vector_size={size}, window={window}, sg=1)')
                        output['feature_selection'].append(selector_name)
                        output['model'].append(model_name)
                        # ... followed by its averaged metrics.
                        for key, values in scores.items():
                            output[key].append(values)

In [30]:
# Shallow copy of the results with the 'notes' column filled in: one empty
# string per recorded run, so every column has the same length.
output_saved = {**output, 'notes': [''] * len(output['data'])}
# # # pd.DataFrame(output_saved).to_csv("data/result/test_result.csv", index=False)

In [31]:
# Tabulate the collected results (one row per run) and show them.
output_saved_df = pd.DataFrame(data=output_saved)
output_saved_df

Unnamed: 0,data,length_used,feature_extraction,feature_selection,model,accuracy,f1,recall,precision,roc_auc,notes
0,osf,,"Word2Vec(vector_size=300, window=3, sg=1)",,"LogisticRegression(max_iter=1000, class_weight...",0.85279,0.853824,0.860172,0.847592,0.916757,
1,osf,,"Word2Vec(vector_size=300, window=5, sg=1)",,"LogisticRegression(max_iter=1000, class_weight...",0.856104,0.856744,0.860741,0.852789,0.918914,
2,osf,,"Word2Vec(vector_size=300, window=7, sg=1)",,"LogisticRegression(max_iter=1000, class_weight...",0.856995,0.857416,0.860076,0.854797,0.918974,


In [32]:
# result = pd.read_csv('data/result/test_result.csv')
# result = pd.concat([result, output_saved_df], axis=0, ignore_index=True)
# # result.sort_values(by='accuracy', ascending=True)
# # result[result['feature_extraction']=='Word2Vec(vector_size=100, window=5)']
# result

Unnamed: 0,data,length_used,feature_extraction,feature_selection,model,accuracy,f1,recall,precision,roc_auc,notes
0,osf,,"Word2Vec(vector_size=100, window=3)",,"LogisticRegression(max_iter=1000, class_weight...",0.842649,0.843850,0.850587,0.837234,0.906563,
1,osf,,"Word2Vec(vector_size=100, window=5)",,"LogisticRegression(max_iter=1000, class_weight...",0.847769,0.848826,0.855068,0.842681,0.911585,
2,osf,,"Word2Vec(vector_size=100, window=7)",,"LogisticRegression(max_iter=1000, class_weight...",0.849649,0.850687,0.856728,0.844732,0.913208,
3,osf,,"Word2Vec(vector_size=200, window=3)",,"LogisticRegression(max_iter=1000, class_weight...",0.852518,0.853635,0.860505,0.846878,0.915526,
4,osf,,"Word2Vec(vector_size=200, window=5)",,"LogisticRegression(max_iter=1000, class_weight...",0.854348,0.855324,0.861271,0.849461,0.917639,
...,...,...,...,...,...,...,...,...,...,...,...
94,osf,,"Word2Vec(vector_size=200, window=5, sg=1)",,"LogisticRegression(max_iter=1000, class_weight...",0.852394,0.852961,0.856477,0.849482,0.916127,
95,osf,,"Word2Vec(vector_size=200, window=7, sg=1)",,"LogisticRegression(max_iter=1000, class_weight...",0.852493,0.852787,0.854665,0.850929,0.916700,
96,osf,,"Word2Vec(vector_size=300, window=3, sg=1)",,"LogisticRegression(max_iter=1000, class_weight...",0.852790,0.853824,0.860172,0.847592,0.916757,
97,osf,,"Word2Vec(vector_size=300, window=5, sg=1)",,"LogisticRegression(max_iter=1000, class_weight...",0.856104,0.856744,0.860741,0.852789,0.918914,


In [33]:
# result.to_csv('data/result/test_result.csv', index=False)

In [12]:
# Display the saved results table; the path is relative to the notebook's working directory.
pd.read_csv(r"../../output/csv/test_result.csv")

Unnamed: 0,data,length_used,feature_extraction,feature_selection,model,accuracy,f1,recall,precision,roc_auc,notes
0,osf,,"Word2Vec(vector_size=100, window=3)",,"LogisticRegression(max_iter=1000, class_weight...",0.842649,0.843850,0.850587,0.837234,0.906563,
1,osf,,"Word2Vec(vector_size=100, window=5)",,"LogisticRegression(max_iter=1000, class_weight...",0.847769,0.848826,0.855068,0.842681,0.911585,
2,osf,,"Word2Vec(vector_size=100, window=7)",,"LogisticRegression(max_iter=1000, class_weight...",0.849649,0.850687,0.856728,0.844732,0.913208,
3,osf,,"Word2Vec(vector_size=200, window=3)",,"LogisticRegression(max_iter=1000, class_weight...",0.852518,0.853635,0.860505,0.846878,0.915526,
4,osf,,"Word2Vec(vector_size=200, window=5)",,"LogisticRegression(max_iter=1000, class_weight...",0.854348,0.855324,0.861271,0.849461,0.917639,
...,...,...,...,...,...,...,...,...,...,...,...
94,osf,,"Word2Vec(vector_size=200, window=5, sg=1)",,"LogisticRegression(max_iter=1000, class_weight...",0.852394,0.852961,0.856477,0.849482,0.916127,
95,osf,,"Word2Vec(vector_size=200, window=7, sg=1)",,"LogisticRegression(max_iter=1000, class_weight...",0.852493,0.852787,0.854665,0.850929,0.916700,
96,osf,,"Word2Vec(vector_size=300, window=3, sg=1)",,"LogisticRegression(max_iter=1000, class_weight...",0.852790,0.853824,0.860172,0.847592,0.916757,
97,osf,,"Word2Vec(vector_size=300, window=5, sg=1)",,"LogisticRegression(max_iter=1000, class_weight...",0.856104,0.856744,0.860741,0.852789,0.918914,
