* [Libraries](#s1)
* [Basic Function](#s2)
* [Initialize](#s3)
* [CountVectorizer](#s4)
* [Results](#sn)

# Libraries <a class="anchor"  id="s1"></a>

In [2]:
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

# text NLP
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords, words
from nltk.stem import PorterStemmer
import string
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from unidecode import unidecode
import contractions


# Preprocessing
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

# model
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve

In [1]:
# Download the complete NLTK data collection ('all').
# NOTE(review): this notebook only appears to use the stopwords, words,
# wordnet and tokenizer resources, so a narrower download may be enough -
# confirm before changing.
nltk.download('all')

# Basic Function <a class="anchor"  id="s2"></a>

In [None]:
class BasicTextCleaning:
    """Configurable pipeline of basic cleaning steps for English text.

    Every step is a method that takes a string and returns the cleaned string.
    The one exception is ``tokenization``, which returns a list of tokens and
    therefore only makes sense as the last step of a pipeline. Steps are looked
    up by name in ``self.methods`` and applied in order by ``text_cleaning``.
    """

    def __init__(self):
        # Resources shared by the individual cleaning steps.
        self.stopwords = set(stopwords.words('english'))
        self.words_corpus = set(words.words())
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

        # Step name -> callable implementing that step.
        self.methods = {'lowercase': self.lowercase,
                        'accent_removal': self.accent_removal,
                        'strip': self.strip_text,
                        'nice_display': self.nice_display,
                        'tokenization': self.tokenization,
                        'stemming': self.stemming,
                        'lemmatization': self.lemmatization,
                        'punctuation_removal': self.punctuation_removal,
                        'stopwords_removal': self.stopwords_removal,
                        'contractions_expand': self.contractions_expand,
                        'nonsense_removal': self.nonsense_removal,
                        'number_removal': self.number_removal}

        # Regex character class matching any ASCII punctuation character.
        self.punctuations = '[%s]' % re.escape(string.punctuation)

    def text_cleaning(self, texts, methods=None):
        """Apply the named cleaning steps, in order, to one or more texts.

        Args:
            texts: A single string or an iterable of strings.
            methods: Ordered step names (keys of ``self.methods``). When empty
                or None, a default pipeline is used.

        Returns:
            A list with one cleaned result per input text.

        Raises:
            Warning: If a step name is not a key of ``self.methods``.
        """
        if not methods:
            methods = ['accent_removal', 'lowercase', 'nice_display', 'punctuation_removal',
                       'stopwords_removal', 'lemmatization', 'stemming']
        # Materialize once so one-shot iterables survive validation and reuse.
        methods = list(methods)

        # Validate every step name up front, before any text is processed.
        for method in methods:
            if method not in self.methods:
                raise Warning('Invalid method "{}". Basic text cleaning methods available: {}'.format(method, ", ".join(self.methods.keys())))

        if isinstance(texts, str):
            texts = [texts]

        steps = [self.methods[method] for method in methods]
        cleaned_texts = []
        for text in texts:
            for step in steps:
                text = step(text)
            cleaned_texts.append(text)
        return cleaned_texts

    def strip_text(self, text):
        """Remove leading and trailing whitespace."""
        return text.strip()

    def lowercase(self, text):
        """Convert the text to lower case."""
        return text.lower()

    def contractions_expand(self, text):
        """Expand contractions using the ``contractions`` package."""
        return contractions.fix(text)

    def number_removal(self, text):
        """Replace every non-letter character with a space.

        Despite the name, this drops digits, punctuation and any other
        character outside ``a-zA-Z``, then collapses whitespace runs.
        """
        text = re.sub(r"[^a-zA-Z]", " ", text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def nice_display(self, text):
        """Insert a space between a punctuation mark and a word glued to it.

        For example ``"Hello,world"`` becomes ``"Hello, world"``. Opening
        brackets and apostrophes are left attached to the following word.
        """
        text = re.sub(r"([^\w\s([{\'])(\w)", r"\1 \2", text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def accent_removal(self, text):
        """Transliterate the text with ``unidecode``."""
        text = unidecode(text)
        return text

    def punctuation_removal(self, text):
        """Replace punctuation with spaces and collapse whitespace."""
        text = re.sub(self.punctuations, ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def stopwords_removal(self, text):
        """Drop whitespace-separated words found in ``self.stopwords``."""
        return " ".join([word for word in text.split() if word not in self.stopwords])

    def stemming(self, text):
        """Stem every whitespace-separated word with ``self.stemmer``."""
        return " ".join([self.stemmer.stem(word) for word in text.split()])

    def lemmatization(self, text):
        """Lemmatize every whitespace-separated word with ``self.lemmatizer``."""
        return " ".join([self.lemmatizer.lemmatize(word) for word in text.split()])

    def tokenization(self, text):
        """Split the text into a list of tokens with ``nltk.word_tokenize``."""
        return nltk.word_tokenize(text)

    def nonsense_removal(self, text):
        """Keep only the words for which WordNet has at least one synset."""
        # Local import: `wordnet` is not among the names imported at module
        # level, so referencing it directly raised NameError.
        from nltk.corpus import wordnet
        return " ".join([word for word in text.split() if wordnet.synsets(word)])

In [3]:
# data_save = pd.DataFrame(columns=['data','length_used', 'feature_extraction', 'feature_selection', 'model', 'accuracy', 'f1', 'recall', 'precision', 'roc_auc', 'notes'])
# data_save.to_csv('results.csv', index=False)
def save_and_print(data, length_used, feature_extraction, feature_selection, model, accuracy, f1, recall, precision, roc_auc, notes=None, results_path='/kaggle/working/results.csv'):
    """Print one experiment's scores and append them to the results CSV.

    Args:
        data: Name of the dataset used.
        length_used: Description of the length feature used (or None).
        feature_extraction: Description of the feature extraction step.
        feature_selection: Description of the feature selection step.
        model: Description of the model.
        accuracy, f1, recall, precision, roc_auc: Scores to record.
        notes: Optional free-text remark stored with the row.
        results_path: CSV file to append to. If it does not exist yet, it is
            created with one column per recorded field.
    """
    print('Accuracy:', accuracy)
    print('F1:', f1)
    print('Recall:', recall)
    print('Precision:', precision)
    print('ROC AUC:', roc_auc)

    new_row = {'data': data, 'length_used': length_used,
               'feature_extraction': feature_extraction,
               'feature_selection': feature_selection,
               'model': model, 'accuracy': accuracy,
               'f1': f1, 'recall': recall,
               'precision': precision, 'roc_auc': roc_auc,
               'notes': notes}

    # Start a fresh table on first use instead of failing on a missing file.
    if os.path.exists(results_path):
        data_save = pd.read_csv(results_path)
    else:
        data_save = pd.DataFrame(columns=list(new_row))

    # Append as the next positional row; keys are aligned to column names.
    data_save.loc[len(data_save)] = new_row
    data_save.to_csv(results_path, index=False)

# Initialize <a class="anchor"  id="s3"></a>

In [4]:
# Load the review dataset; reviews with missing text become empty strings.
data = pd.read_csv('/kaggle/input/fake-review-dataset/data_input.csv')
data['text'] = data['text'].where(data['text'].notna(), '')

In [None]:
# Remove a results file left in the Kaggle working directory, if there is one.
file_path = "/kaggle/working/results.csv"
if os.path.exists(file_path):
    os.remove(file_path)

In [5]:
# Copy the existing results file from the Kaggle input dataset into the
# working directory, which is where save_and_print reads and rewrites it.
src_path = r"/kaggle/input/fake-review-dataset/results.csv"
dst_path = r"/kaggle/working/"
# dst_path is a directory, so the copy keeps the source file name.
shutil.copy(src_path, dst_path)

'/kaggle/working/results.csv'

# CountVectorizer <a class="anchor"  id="s4"></a>

[sklearn.feature_extraction.text.CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html): Convert a collection of text documents to a matrix of token counts.
- binary: `True` (term presence only, 1 vs 0) or `False` (raw term counts)
- ngram_range: (1,1), (1,2),(1,3),(2,2),(2,3),(3,3)
- min_df: 0 or 0.001

In [None]:
# Which scaled review-length feature to use next to the text.
length_used = None  # modify: None, 'MinMaxScaler' or 'StandardScaler'

# The target is always the 1-D label column.
y = data['label']

if length_used is None:
    # A Series of review strings: CountVectorizer iterates over documents,
    # whereas iterating a DataFrame would yield its column labels.
    X = data['text']
elif length_used == 'MinMaxScaler':
    X = data[['text', 'length_minmax']]
elif length_used == 'StandardScaler':
    X = data[['text', 'length_std']]
else:
    raise ValueError(
        "length_used must be None, 'MinMaxScaler' or 'StandardScaler', got {!r}".format(length_used))

In [None]:
# Feature extraction and dimensionality reduction shared by all model cells.
# `min_df` is written as the float 0.0 (a document-frequency proportion). It
# keeps every term, exactly as the count 0 did, but stays valid on recent
# scikit-learn releases, whose parameter validation requires integer `min_df`
# values to be >= 1.
vectorizer = CountVectorizer(binary=False, min_df=0.0, ngram_range=(1, 1))  # modify
f_selection = PCA(n_components=500, random_state=42)  # modify

## Logistic Regression

In [None]:
# Logistic Regression, scored with 5-fold cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy, f1, recall, precision, roc_auc = [], [], [], [], []

# Scaled review-length column that goes with each `length_used` setting.
length_columns = {'MinMaxScaler': 'length_minmax', 'StandardScaler': 'length_std'}

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # Flatten the labels: estimators and metrics expect a 1-D target, and `y`
    # may be a single-column DataFrame.
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    # Hand the vectorizer the review strings themselves. Iterating a DataFrame
    # yields its column labels, so passing `X_train` directly would vectorize
    # the single word "text" instead of the reviews.
    train_docs = X_train['text'] if isinstance(X_train, pd.DataFrame) else X_train
    test_docs = X_test['text'] if isinstance(X_test, pd.DataFrame) else X_test
    train_features = vectorizer.fit_transform(train_docs).toarray()
    test_features = vectorizer.transform(test_docs).toarray()

    # Optionally append the scaled review length as one extra feature column.
    if length_used is not None:
        length_column = length_columns[length_used]
        train_features = np.hstack((train_features, X_train[length_column].values.reshape(-1, 1)))
        test_features = np.hstack((test_features, X_test[length_column].values.reshape(-1, 1)))

    # Fit the reducer on the training fold only, then project the test fold.
    X_train = f_selection.fit_transform(train_features)
    X_test = f_selection.transform(test_features)

    model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    roc_auc.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

# Record the mean score over the folds.
save_and_print(data='fake_reviews_dataset',
               length_used=length_used,
               feature_extraction='CountVectorizer(binary=False, min_df=0, ngram_range=(1,1))',
               feature_selection='PCA(n_components=500, random_state=42)',
               model="LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)",
               accuracy=np.mean(accuracy).round(5),
               f1=np.mean(f1).round(5),
               recall=np.mean(recall).round(5),
               precision=np.mean(precision).round(5),
               roc_auc=np.mean(roc_auc).round(5))

## KNN

In [None]:
# KNeighborsClassifier(n_neighbors=1/3/5,metric='euclidean'/ 'manhattan'/ 'minkowski'/'cosine')
# Every (n_neighbors, metric) combination is scored with 5-fold cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Scores are stored as {n_neighbors: {metric: [one score per fold]}}.
accuracy, f1, recall, precision, roc_auc = {}, {}, {}, {}, {}

neighbor_counts = [1, 3, 5]
distance_metrics = ['euclidean', 'manhattan', 'cosine']

# Scaled review-length column that goes with each `length_used` setting.
length_columns = {'MinMaxScaler': 'length_minmax', 'StandardScaler': 'length_std'}

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # Flatten the labels: estimators and metrics expect a 1-D target, and `y`
    # may be a single-column DataFrame.
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    # Hand the vectorizer the review strings themselves. Iterating a DataFrame
    # yields its column labels, so passing `X_train` directly would vectorize
    # the single word "text" instead of the reviews.
    train_docs = X_train['text'] if isinstance(X_train, pd.DataFrame) else X_train
    test_docs = X_test['text'] if isinstance(X_test, pd.DataFrame) else X_test
    train_features = vectorizer.fit_transform(train_docs).toarray()
    test_features = vectorizer.transform(test_docs).toarray()

    # Optionally append the scaled review length as one extra feature column.
    if length_used is not None:
        length_column = length_columns[length_used]
        train_features = np.hstack((train_features, X_train[length_column].values.reshape(-1, 1)))
        test_features = np.hstack((test_features, X_test[length_column].values.reshape(-1, 1)))

    # Fit the reducer on the training fold only, then project the test fold.
    X_train = f_selection.fit_transform(train_features)
    X_test = f_selection.transform(test_features)

    for n in neighbor_counts:
        for metric in distance_metrics:
            model = KNeighborsClassifier(n_neighbors=n, metric=metric)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            fold_scores = (
                (accuracy, accuracy_score(y_test, y_pred)),
                (f1, f1_score(y_test, y_pred)),
                (recall, recall_score(y_test, y_pred)),
                (precision, precision_score(y_test, y_pred)),
                (roc_auc, roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])),
            )
            # setdefault creates the nested containers the first time a
            # combination is seen.
            for store, score in fold_scores:
                store.setdefault(n, {}).setdefault(metric, []).append(score)

# Record the mean score over the folds for every combination.
for n in neighbor_counts:
    for metric in distance_metrics:
        save_and_print(data='fake_reviews_dataset',
                       length_used=length_used,
                       feature_extraction='CountVectorizer(binary=False, min_df=0, ngram_range=(1,1))',
                       feature_selection='PCA(n_components=500, random_state=42)',
                       model=f"KNeighborsClassifier(n_neighbors={n}, metric='{metric}')",
                       accuracy=np.mean(accuracy[n][metric]).round(5),
                       f1=np.mean(f1[n][metric]).round(5),
                       recall=np.mean(recall[n][metric]).round(5),
                       precision=np.mean(precision[n][metric]).round(5),
                       roc_auc=np.mean(roc_auc[n][metric]).round(5))

## 

## SVC

In [None]:
# Support Vector Classifier, scored with 5-fold cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy, f1, recall, precision, roc_auc = [], [], [], [], []

# Scaled review-length column that goes with each `length_used` setting.
length_columns = {'MinMaxScaler': 'length_minmax', 'StandardScaler': 'length_std'}

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # Flatten the labels: estimators and metrics expect a 1-D target, and `y`
    # may be a single-column DataFrame.
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    # Hand the vectorizer the review strings themselves. Iterating a DataFrame
    # yields its column labels, so passing `X_train` directly would vectorize
    # the single word "text" instead of the reviews.
    train_docs = X_train['text'] if isinstance(X_train, pd.DataFrame) else X_train
    test_docs = X_test['text'] if isinstance(X_test, pd.DataFrame) else X_test
    train_features = vectorizer.fit_transform(train_docs).toarray()
    test_features = vectorizer.transform(test_docs).toarray()

    # Optionally append the scaled review length as one extra feature column.
    if length_used is not None:
        length_column = length_columns[length_used]
        train_features = np.hstack((train_features, X_train[length_column].values.reshape(-1, 1)))
        test_features = np.hstack((test_features, X_test[length_column].values.reshape(-1, 1)))

    # Fit the reducer on the training fold only, then project the test fold.
    X_train = f_selection.fit_transform(train_features)
    X_test = f_selection.transform(test_features)

    # probability=True is needed for predict_proba, used by the ROC AUC below.
    model = SVC(probability=True, class_weight='balanced', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    roc_auc.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

# Record the mean score over the folds. The feature_extraction text now uses
# the same description of the shared vectorizer as the other model cells.
save_and_print(data='fake_reviews_dataset',
                length_used=length_used,
                feature_extraction='CountVectorizer(binary=False, min_df=0, ngram_range=(1,1))',
                feature_selection='PCA(n_components=500, random_state=42)',
                model="SVC(probability=True, class_weight='balanced', random_state=42)",
                accuracy=np.mean(accuracy).round(5),
                f1=np.mean(f1).round(5),
                recall=np.mean(recall).round(5),
                precision=np.mean(precision).round(5),
                roc_auc=np.mean(roc_auc).round(5))

## Gaussian NB

In [None]:
# Gaussian Naive Bayes, scored with 5-fold cross-validation.

kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy, f1, recall, precision, roc_auc = [], [], [], [], []

# Scaled review-length column that goes with each `length_used` setting.
length_columns = {'MinMaxScaler': 'length_minmax', 'StandardScaler': 'length_std'}

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # Flatten the labels: estimators and metrics expect a 1-D target, and `y`
    # may be a single-column DataFrame.
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    # Hand the vectorizer the review strings themselves. Iterating a DataFrame
    # yields its column labels, so passing `X_train` directly would vectorize
    # the single word "text" instead of the reviews.
    train_docs = X_train['text'] if isinstance(X_train, pd.DataFrame) else X_train
    test_docs = X_test['text'] if isinstance(X_test, pd.DataFrame) else X_test
    train_features = vectorizer.fit_transform(train_docs).toarray()
    test_features = vectorizer.transform(test_docs).toarray()

    # Optionally append the scaled review length as one extra feature column.
    if length_used is not None:
        length_column = length_columns[length_used]
        train_features = np.hstack((train_features, X_train[length_column].values.reshape(-1, 1)))
        test_features = np.hstack((test_features, X_test[length_column].values.reshape(-1, 1)))

    # Fit the reducer on the training fold only, then project the test fold.
    X_train = f_selection.fit_transform(train_features)
    X_test = f_selection.transform(test_features)

    model = GaussianNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    roc_auc.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

# Record the mean score over the folds.
save_and_print(data='fake_reviews_dataset',
                length_used=length_used,
                feature_extraction='CountVectorizer(binary=False, min_df=0, ngram_range=(1,1))',
                feature_selection='PCA(n_components=500, random_state=42)',
                model="GaussianNB()",
                accuracy=np.mean(accuracy).round(5),
                f1=np.mean(f1).round(5),
                recall=np.mean(recall).round(5),
                precision=np.mean(precision).round(5),
                roc_auc=np.mean(roc_auc).round(5))

## Multinomial NB

In [None]:
# Multinomial Naive Bayes, scored with 5-fold cross-validation.
from sklearn.preprocessing import MinMaxScaler

kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy, f1, recall, precision, roc_auc = [], [], [], [], []

# Scaled review-length column that goes with each `length_used` setting.
length_columns = {'MinMaxScaler': 'length_minmax', 'StandardScaler': 'length_std'}

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # Flatten the labels: estimators and metrics expect a 1-D target, and `y`
    # may be a single-column DataFrame.
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    # Hand the vectorizer the review strings themselves. Iterating a DataFrame
    # yields its column labels, so passing `X_train` directly would vectorize
    # the single word "text" instead of the reviews.
    train_docs = X_train['text'] if isinstance(X_train, pd.DataFrame) else X_train
    test_docs = X_test['text'] if isinstance(X_test, pd.DataFrame) else X_test
    train_features = vectorizer.fit_transform(train_docs).toarray()
    test_features = vectorizer.transform(test_docs).toarray()

    # Optionally append the scaled review length as one extra feature column.
    if length_used is not None:
        length_column = length_columns[length_used]
        train_features = np.hstack((train_features, X_train[length_column].values.reshape(-1, 1)))
        test_features = np.hstack((test_features, X_test[length_column].values.reshape(-1, 1)))

    # MultinomialNB rejects negative feature values, and PCA components are
    # centred around zero. Rescale every component to [0, 1] using the
    # training fold's range; clip=True keeps out-of-range test values in [0, 1].
    non_negative = MinMaxScaler(clip=True)
    X_train = non_negative.fit_transform(f_selection.fit_transform(train_features))
    X_test = non_negative.transform(f_selection.transform(test_features))

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    roc_auc.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

# Record the mean score over the folds; the extra scaling step goes in `notes`.
save_and_print(data='fake_reviews_dataset',
                length_used=length_used,
                feature_extraction='CountVectorizer(binary=False, min_df=0, ngram_range=(1,1))',
                feature_selection='PCA(n_components=500, random_state=42)',
                model="MultinomialNB()",
                accuracy=np.mean(accuracy).round(5),
                f1=np.mean(f1).round(5),
                recall=np.mean(recall).round(5),
                precision=np.mean(precision).round(5),
                roc_auc=np.mean(roc_auc).round(5),
                notes='MinMaxScaler(clip=True) applied after PCA (MultinomialNB requires non-negative features)')

## Bernoulli NB

In [None]:
# Bernoulli Naive Bayes, scored with 5-fold cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy, f1, recall, precision, roc_auc = [], [], [], [], []

# Scaled review-length column that goes with each `length_used` setting.
length_columns = {'MinMaxScaler': 'length_minmax', 'StandardScaler': 'length_std'}

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # Flatten the labels: estimators and metrics expect a 1-D target, and `y`
    # may be a single-column DataFrame.
    y_train, y_test = np.ravel(y.iloc[train_index]), np.ravel(y.iloc[test_index])

    # Hand the vectorizer the review strings themselves. Iterating a DataFrame
    # yields its column labels, so passing `X_train` directly would vectorize
    # the single word "text" instead of the reviews.
    train_docs = X_train['text'] if isinstance(X_train, pd.DataFrame) else X_train
    test_docs = X_test['text'] if isinstance(X_test, pd.DataFrame) else X_test
    train_features = vectorizer.fit_transform(train_docs).toarray()
    test_features = vectorizer.transform(test_docs).toarray()

    # Optionally append the scaled review length as one extra feature column.
    if length_used is not None:
        length_column = length_columns[length_used]
        train_features = np.hstack((train_features, X_train[length_column].values.reshape(-1, 1)))
        test_features = np.hstack((test_features, X_test[length_column].values.reshape(-1, 1)))

    # Fit the reducer on the training fold only, then project the test fold.
    X_train = f_selection.fit_transform(train_features)
    X_test = f_selection.transform(test_features)

    model = BernoulliNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    roc_auc.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

# Record the mean score over the folds.
save_and_print(data='fake_reviews_dataset',
                length_used=length_used,
                feature_extraction='CountVectorizer(binary=False, min_df=0, ngram_range=(1,1))',
                feature_selection='PCA(n_components=500, random_state=42)',
                model="BernoulliNB()",
                accuracy=np.mean(accuracy).round(5),
                f1=np.mean(f1).round(5),
                recall=np.mean(recall).round(5),
                precision=np.mean(precision).round(5),
                roc_auc=np.mean(roc_auc).round(5))

# Results <a class="anchor"  id="sn"></a>

In [6]:
# Display the accumulated results table (one row per save_and_print call).
pd.read_csv('/kaggle/working/results.csv')

Unnamed: 0,data,length_used,feature_extraction,feature_selection,model,accuracy,f1,recall,precision,roc_auc,notes
0,fake_reviews_dataset,,"CountVectorizer(binary=False, min_df=0, ngram_...","PCA(n_components=500, random_state=42)","LogisticRegression(max_iter=1000, class_weight...",0.82855,0.82873,0.82988,0.82759,0.91569,
1,fake_reviews_dataset,,"CountVectorizer(binary=False, min_df=0, ngram_...","PCA(n_components=500, random_state=42)","KNeighborsClassifier(n_neighbors=1, metric='eu...",0.66299,0.69726,0.77629,0.63289,0.66301,
2,fake_reviews_dataset,,"CountVectorizer(binary=False, min_df=0, ngram_...","PCA(n_components=500, random_state=42)","KNeighborsClassifier(n_neighbors=1, metric='ma...",0.65913,0.69170,0.76482,0.63139,0.65915,
3,fake_reviews_dataset,,"CountVectorizer(binary=False, min_df=0, ngram_...","PCA(n_components=500, random_state=42)","KNeighborsClassifier(n_neighbors=1, metric='mi...",0.66299,0.69726,0.77629,0.63289,0.66301,
4,fake_reviews_dataset,,"CountVectorizer(binary=False, min_df=0, ngram_...","PCA(n_components=500, random_state=42)","KNeighborsClassifier(n_neighbors=1, metric='co...",0.67711,0.72248,0.84062,0.63349,0.67711,
...,...,...,...,...,...,...,...,...,...,...,...
270,fake_reviews_dataset,,"CountVectorizer(binary=True, min_df=0.001, ngr...","PCA(n_components=500, random_state=42)","KNeighborsClassifier(n_neighbors=3, metric='ma...",0.71364,0.67571,0.59876,0.77727,0.74053,
271,fake_reviews_dataset,,"CountVectorizer(binary=True, min_df=0.001, ngr...","PCA(n_components=500, random_state=42)","KNeighborsClassifier(n_neighbors=3, metric='co...",0.73798,0.71670,0.66286,0.78009,0.75273,
272,fake_reviews_dataset,,"CountVectorizer(binary=True, min_df=0.001, ngr...","PCA(n_components=500, random_state=42)","KNeighborsClassifier(n_neighbors=5, metric='eu...",0.73716,0.70838,0.63835,0.79620,0.75573,
273,fake_reviews_dataset,,"CountVectorizer(binary=True, min_df=0.001, ngr...","PCA(n_components=500, random_state=42)","KNeighborsClassifier(n_neighbors=5, metric='ma...",0.70499,0.65627,0.56464,0.78494,0.74067,


In [None]:
# # Drop sth
# data_save = pd.read_csv('/kaggle/working/results.csv')
# data_save = data_save.drop(0)
# data_save.to_csv('/kaggle/working/results.csv', index=False)