# Fake Reviews - Data Cleaning

---

### Import Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from nltk.corpus import stopwords
# resources used further down: the stop-word list, the POS tagger (used by
# get_wordnet_pos_func) and WordNet (used by WordNetLemmatizer for the
# 'lem', 'lem_v', 'lem_a' and 'lem_pos' normalisation options)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)
# NOTE(review): some NLTK releases also need the Open Multilingual WordNet
# data before WordNet can be loaded - confirm against the installed version
nltk.download('omw-1.4', quiet=True)

import spacy
nlp = spacy.load('en_core_web_sm')

import re, contractions, string, unicodedata
from num2words import num2words

from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest, chi2
from scipy.spatial.distance import cdist
import networkx as nx
from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec

---
### Load Dataset

In [None]:
# read the raw reviews and lower-case every column label
df = pd.read_csv('deceptive.csv').rename(columns=str.lower)

# keep only the review text and its target class
df = df.loc[:, ['text', 'class']]

# preview the first two rows
df.head(2)

---
### Data Cleaning

- [Source for Chat Expressions](https://raw.githubusercontent.com/MFuchs1989/Datasets-and-Miscellaneous/main/datasets/NLP/Text%20Pre-Processing%20VII%20(Special%20Cases)/chat_expressions.csv)

In [None]:
# nltk stop words
# NOTE: `stopwords` is rebound to the combined set below, which shadows the
# `nltk.corpus.stopwords` reader imported at the top of the notebook. The
# reader is therefore reached through the `nltk` package so this cell can be
# executed more than once without failing.
nltk_stopwords = set(nltk.corpus.stopwords.words('english'))

# spacy stop words, taken from the pipeline that is already loaded as `nlp`
# instead of loading the model a second time
spacy_stopwords = set(nlp.Defaults.stop_words)

# union between the two sets of stop words (name kept: clean_text reads it)
stopwords = nltk_stopwords | spacy_stopwords

# dictionary of short forms to original forms
chat_expressions = pd.read_csv('chat_expressions.csv', on_bad_lines='error')
chat_expressions_dict = dict(zip(chat_expressions.Chat_Words, chat_expressions.Chat_Words_Extended))

In [None]:
def get_wordnet_pos_func(word):
    '''
    Looks up the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with nltk's `pos_tag`; the upper-cased first
    letter of that tag selects adjective, noun, verb or adverb. Every other
    tag falls back to noun.

    Args:
        word (str): Word to tag

    Returns:
        WordNet POS constant that can be passed to the WordNet lemmatizer
    '''
    _, tagged_as = pos_tag([word])[0]
    initial = tagged_as[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV

    # "N" and every unmapped tag resolve to noun
    return wordnet.NOUN

In [None]:
def clean_text(
    text: str,
    normalization: List[str]=None,
    stop: str='',
    expand_contractions=False,
    numbers_to_words=False,
    expand_short_forms=False,
    remove_special_characters=False,
    remove_punctuation=False,
    seperate_numbers_from_text=False,
    convert_numbers_to_text=False,
    only_alphabets=False,
    only_alphabets_numbers=False,
    custom_words=None
    )-> str:
    '''
    Goes through a series of preprocessing steps to clean the supplied text.

    Always applied: tokens containing '@' and web addresses are removed,
    accented characters are folded to ASCII, the text is lower-cased, runs of
    '?', '.' and '!' are reduced to their last character and whitespace is
    collapsed and trimmed. Every other step is opt-in through the arguments.

    Args:
        text (str): Text to be cleaned. None / NaN yields an empty string.
        normalization (List[str], optional): Normalization steps to apply, in order
                                             [stem|lan|snow|lem|lem_v|lem_a|lem_pos|spacy].
                                             A single string is treated as a one-item list.
                                             Defaults to None (no normalization).
        stop (str, optional): Choice of stop words dictionary to use [nltk|spacy|both|custom]. Defaults to ''.
        expand_contractions (bool, optional): Expand contractions. Defaults to False.
        numbers_to_words (bool, optional): Convert numbers to words; same effect as
                                           convert_numbers_to_text. Defaults to False.
        expand_short_forms (bool, optional): Expand short forms. Defaults to False.
        remove_special_characters (bool, optional): Accepted for compatibility; currently has no effect.
                                                    Defaults to False.
        remove_punctuation (bool, optional): Replace punctuation with spaces. Defaults to False.
        seperate_numbers_from_text (bool, optional): Seperate numbers from text. Defaults to False.
        convert_numbers_to_text (bool, optional): Convert numbers to text. Defaults to False.
        only_alphabets (bool, optional): Remove all characters except alphabets (apostrophes are kept).
                                         Defaults to False.
        only_alphabets_numbers (bool, optional): Remove all characters except alphabets and numbers;
                                                 ignored when only_alphabets is set. Defaults to False.
        custom_words (list, optional): Extra stop words, used when stop='custom'. Defaults to None.

    Returns:
        str: Cleaned text

    Raises:
        ValueError: If `stop` or an entry of `normalization` is not a supported option.
    '''

    # missing values (None / NaN coming out of a DataFrame) clean to nothing
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if normalization is None:
        normalization = []
    elif isinstance(normalization, str):
        normalization = [normalization]

    # fail loudly on typos instead of silently returning unprocessed or garbled text
    if stop not in ('', 'nltk', 'spacy', 'both', 'custom'):
        raise ValueError(f"Unknown stop option: {stop!r}")
    known_normalization = ('stem', 'lan', 'snow', 'lem', 'lem_v', 'lem_a', 'lem_pos', 'spacy')
    unknown = [setting for setting in normalization if setting not in known_normalization]
    if unknown:
        raise ValueError(f"Unknown normalization option(s): {unknown!r}")

    # remove emails (any token containing '@'); split/join also turns every
    # whitespace run, line breaks included, into a single space
    text = ' '.join(token for token in text.split() if '@' not in token)

    # remove web address
    text = re.sub(r'http[s]?://\S+', '', text)

    # removing accented characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # expand contractions
    if expand_contractions:
        text = contractions.fix(text)

    # short forms to normal forms (dictionary is looked up with the upper-cased token)
    if expand_short_forms:
        text = re.sub(r'\S+', lambda m: chat_expressions_dict.get(m.group().upper(), m.group()), text)

    # make text lower
    text = text.lower()

    # replace every punctuation character with a space
    if remove_punctuation:
        translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
        text = text.translate(translator)

    # collapse runs of ? . ! down to the last character of the run
    text = re.sub(r'[\?\.\!]+(?=[\?\.\!])', '', text)

    # separate numbers from text
    if seperate_numbers_from_text:
        text = re.sub(r"([0-9]+(\.[0-9]+)?)", r" \1 ", text).strip()

    # convert numbers to textual representation (whole-number tokens only)
    if convert_numbers_to_text or numbers_to_words:
        text = ' '.join(
            num2words(int(token)) if token.isdigit() else token
            for token in text.split()
        )

    # filter to allow only alphabets
    if only_alphabets:
        text = re.sub(r'[^a-zA-Z\']', ' ', text)

    # filter to allow numbers and alphabets
    elif only_alphabets_numbers:
        text = re.sub(r'[^a-zA-Z0-9 ]', ' ', text)

    # remove unicode characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # collapse repeated whitespace and trim the ends, so the result looks the
    # same whether or not the text is tokenised below
    text = re.sub(r'\s+', ' ', text).strip()

    # stop words [options: nltk, spacy, both, custom]
    tokens = None
    if stop == 'nltk':
        tokens = [word for word in text.split() if word not in nltk_stopwords]
    elif stop == 'spacy':
        tokens = [word for word in text.split() if word not in spacy_stopwords]
    elif stop == 'both':
        tokens = [word for word in text.split() if word not in stopwords]
    elif stop == 'custom':
        # a set keeps the membership test O(1) per word
        blocked = set(stopwords).union(custom_words or ())
        tokens = [word for word in text.split() if word not in blocked]

    # stemming & lemmatization work on words, so tokenise first when stop-word
    # removal has not already done it
    if normalization and tokens is None:
        tokens = text.split()

    for setting in normalization:
        if setting == 'stem':
            stemmer = PorterStemmer()
            tokens = [stemmer.stem(word) for word in tokens]
        elif setting == 'lan':
            stemmer = LancasterStemmer()
            tokens = [stemmer.stem(word) for word in tokens]
        elif setting == 'snow':
            stemmer = EnglishStemmer()
            tokens = [stemmer.stem(word) for word in tokens]
        elif setting == 'lem':
            lem = WordNetLemmatizer()
            tokens = [lem.lemmatize(word) for word in tokens]
        elif setting == 'lem_v':
            lem = WordNetLemmatizer()
            tokens = [lem.lemmatize(word, pos='v') for word in tokens]
        elif setting == 'lem_a':
            lem = WordNetLemmatizer()
            tokens = [lem.lemmatize(word, pos='a') for word in tokens]
        elif setting == 'lem_pos':
            lem = WordNetLemmatizer()
            tokens = [lem.lemmatize(word, get_wordnet_pos_func(word)) for word in tokens]
        elif setting == 'spacy':
            tokens = [token.lemma_ for token in nlp(' '.join(tokens))]

    if tokens is None:
        return text
    return ' '.join(tokens)

### Run Data Cleaning

- We create a series of various pre-processed text to assess which pre-processing steps provide the best performance.
- Please refer to the parameters of the `clean_text` function to see which pre-processing steps are applied.
- Certain pre-processing steps inside the `clean_text` function are always applied, those are the ones without `if-else` statements.
- `text` column is the original review, `tN` where `N` is the code to distinguish between different pre-processed texts.

In [None]:
# do not truncate long review texts when a frame is displayed
pd.set_option('display.max_colwidth', 0)

# t01: contractions and short forms expanded, punctuation replaced, letters only
df['t01'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
)
df[['text', 't01', 'class']].head(2)

In [None]:
# t02: contractions and short forms expanded, punctuation and non-letters kept
df['t02'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=False,
    only_alphabets=False,
)
df[['text', 't02', 'class']].head(2)

In [None]:
# t03: as t01, with numbers split from words and spelled out
df['t03'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
)
df[['text', 't03', 'class']].head(2)

In [None]:
# t04: as t03, with the nltk stop words removed
df['t04'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='nltk',
)
df[['text', 't04', 'class']].head(2)

In [None]:
# t05: as t03, with the spacy stop words removed
df['t05'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='spacy',
)
df[['text', 't05', 'class']].head(2)

In [None]:
# t06: as t03, with the combined nltk + spacy stop words removed
df['t06'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='both',
)
df[['text', 't06', 'class']].head(2)

In [None]:
# t07: as t06, with extra custom stop words added to the combined set
df['t07'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='custom',
    custom_words=[
        'hotel', 'room', 'chicago', 'great', 'stay',
        'hard', 'rock', 'fairmont', 'millennium',
        'sheraton', 'towers', 'sofitel', 'knickerbocker',
        'water',
    ],
)
df[['text', 't07', 'class']].head(2)

In [None]:
# t08: custom stop words removed, then Porter stemming ('stem')
df['t08'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='custom',
    normalization=['stem'],
    custom_words=[
        'hotel', 'room', 'chicago', 'great', 'stay',
        'hard', 'rock', 'fairmont', 'millennium',
        'sheraton', 'towers', 'sofitel', 'knickerbocker',
        'water',
    ],
)
df[['text', 't08', 'class']].head(2)

In [None]:
# t09: custom stop words removed, then WordNet lemmatization ('lem')
df['t09'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='custom',
    normalization=['lem'],
    custom_words=[
        'hotel', 'room', 'chicago', 'great', 'stay',
        'hard', 'rock', 'fairmont', 'millennium',
        'sheraton', 'towers', 'sofitel', 'knickerbocker',
        'water',
    ],
)
df[['text', 't09', 'class']].head(2)

In [None]:
# t10: custom stop words removed, then POS-aware WordNet lemmatization ('lem_pos')
df['t10'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='custom',
    normalization=['lem_pos'],
    custom_words=[
        'hotel', 'room', 'chicago', 'great', 'stay',
        'hard', 'rock', 'fairmont', 'millennium',
        'sheraton', 'towers', 'sofitel', 'knickerbocker',
        'water',
    ],
)
df[['text', 't10', 'class']].head(2)

In [None]:
# t11: custom stop words removed, then spaCy lemmatization ('spacy')
df['t11'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='custom',
    normalization=['spacy'],
    custom_words=[
        'hotel', 'room', 'chicago', 'great', 'stay',
        'hard', 'rock', 'fairmont', 'millennium',
        'sheraton', 'towers', 'sofitel', 'knickerbocker',
        'water',
    ],
)
df[['text', 't11', 'class']].head(2)

In [None]:
# t12: custom stop words removed, then 'lem' followed by 'stem'
df['t12'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='custom',
    normalization=['lem', 'stem'],
    custom_words=[
        'hotel', 'room', 'chicago', 'great', 'stay',
        'hard', 'rock', 'fairmont', 'millennium',
        'sheraton', 'towers', 'sofitel', 'knickerbocker',
        'water',
    ],
)
df[['text', 't12', 'class']].head(2)

In [None]:
# t13: custom stop words removed, then 'lem_pos' followed by 'stem'
df['t13'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='custom',
    normalization=['lem_pos', 'stem'],
    custom_words=[
        'hotel', 'room', 'chicago', 'great', 'stay',
        'hard', 'rock', 'fairmont', 'millennium',
        'sheraton', 'towers', 'sofitel', 'knickerbocker',
        'water',
    ],
)
df[['text', 't13', 'class']].head(2)

In [None]:
# t14: custom stop words removed, then Lancaster stemming ('lan')
df['t14'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='custom',
    normalization=['lan'],
    custom_words=[
        'hotel', 'room', 'chicago', 'great', 'stay',
        'hard', 'rock', 'fairmont', 'millennium',
        'sheraton', 'towers', 'sofitel', 'knickerbocker',
        'water',
    ],
)
df[['text', 't14', 'class']].head(2)

In [None]:
# t15: custom stop words removed, then Snowball (English) stemming ('snow')
df['t15'] = df['text'].apply(
    clean_text,
    expand_contractions=True,
    expand_short_forms=True,
    remove_punctuation=True,
    only_alphabets=True,
    seperate_numbers_from_text=True,
    convert_numbers_to_text=True,
    stop='custom',
    normalization=['snow'],
    custom_words=[
        'hotel', 'room', 'chicago', 'great', 'stay',
        'hard', 'rock', 'fairmont', 'millennium',
        'sheraton', 'towers', 'sofitel', 'knickerbocker',
        'water',
    ],
)
df[['text', 't15', 'class']].head(2)

### Save Pre-Processed Dataset

In [None]:
# write the frame (original text, class and every tNN column) to disk without the row index
df.to_csv('deceptive-cleaned.csv', index=False)

---
### Feature Extraction

In [None]:
# read the cleaned dataset
df = pd.read_csv("deceptive-cleaned.csv")

# drop duplicates and keep the last one
df = df.drop_duplicates(subset=['t10'], keep='last')

# drop rows with empty text
# a review that was cleaned down to an empty string is written as an empty
# CSV field and comes back as NaN, so missing values must count as empty
# here (na=True); otherwise the NaN row reaches the tf-idf vectoriser
df = df[~df['t10'].str.contains(r'^\s*$', na=True)]

# reset the index so row labels run 0..n-1 again
df = df.reset_index(drop=True)

In [None]:
# features are the t10 texts, the target is the review class
X, y = df['t10'], df['class']

# fit the tf-idf vocabulary on the texts and vectorise them in one pass
tfid_vec = TfidfVectorizer()
corpus = tfid_vec.fit_transform(X)

# dense frame of the tf-idf weights with one named column per vocabulary term
feature_names = tfid_vec.get_feature_names_out()
corpus_arr = pd.DataFrame(data=corpus.toarray(), columns=feature_names)

In [None]:
# reduce the dimensionality of the tf-idf matrix to 2-D for plotting
# random_state is fixed so the layout is reproducible between runs
tsne = TSNE(n_components=2, init='pca', learning_rate='auto', n_jobs=-1, random_state=10)
tfidf_2d = tsne.fit_transform(corpus_arr)

In [None]:
# transparency of the plotted points
alpha = 0.5

# integer colour code for every distinct class label (labels in sorted order)
label_map = {label: code for code, label in enumerate(np.unique(df['class']))}

# colour code of each review, in row order
node_colours = df['class'].map(label_map).tolist()

# scatter the 2-D projection, coloured by class
plt.figure(figsize=(8, 8))
plt.scatter(x=tfidf_2d[:, 0], y=tfidf_2d[:, 1], c=node_colours, cmap="jet", alpha=alpha)

---
### Feature Selection

In [None]:
def select_best_features(X, y, k = 2000, convert_to_frame=False):
    '''
    Selects the best features using the chi-squared test

    Args:
        X: Feature matrix (chi-squared requires non-negative values); a
           DataFrame, array or sparse matrix with one column per feature
        y: Target labels, one per row of X
        k (int, optional): Number of features to select. Values larger than
                           the number of available features are reduced to
                           that number. Defaults to 2000.
        convert_to_frame (bool, optional): Return a DataFrame whose columns
                           are the selected column labels of X (or their
                           integer positions when X has no column labels).
                           Defaults to False.

    Returns:
        Array of selected features, or a DataFrame when convert_to_frame is True
    '''

    # asking for more features than exist is rejected (or warned about) by
    # SelectKBest, so cap integer k at the width of X
    if isinstance(k, int):
        k = min(k, X.shape[1])

    chi2_features = SelectKBest(chi2, k=k)
    X_new = chi2_features.fit_transform(X, y)

    if convert_to_frame:
        chi_support = chi2_features.get_support()
        if hasattr(X, 'columns'):
            chi_feature = X.columns[chi_support].tolist()
        else:
            # no labels available: fall back to the selected column positions
            chi_feature = np.flatnonzero(chi_support).tolist()
        if hasattr(X_new, 'toarray'):
            # sparse input gives sparse output; densify before building the frame
            X_new = X_new.toarray()
        X_new = pd.DataFrame(X_new, columns=chi_feature)

    return X_new

# keep the highest-scoring chi-squared features (function default k=2000) as a DataFrame
X_new = select_best_features(X=corpus_arr, y=y, convert_to_frame=True)
print(f'Before feature selection: {corpus_arr.shape}\nAfter feature selection: {X_new.shape}')

---

**Distance measures tested:**
- Euclidean distance
- Cosine similarity
- Hamming distance
- Chebyshev distance
- Jaccard distance
- Cityblock distance
- Minkowski distance
- Canberra distance
- Braycurtis distance

In [None]:
# pairwise distance matrix between all documents
# exactly one metric is active; uncomment another line to switch

adj_X = cdist(X_new, X_new, metric='hamming')
#adj_X = cdist(X_new, X_new, 'cosine')
#adj_X = cdist(X_new, X_new, 'chebyshev')
#adj_X = cdist(X_new, X_new, 'jaccard')
#adj_X = cdist(X_new, X_new, 'cityblock')
#adj_X = cdist(X_new, X_new, 'minkowski')
#adj_X = cdist(X_new, X_new, 'canberra')
#adj_X = cdist(X_new, X_new, 'braycurtis')
#adj_X = cdist(X_new, X_new, 'euclidean')

# mean of all pairwise distances, used as the threshold below
avg_adj_X = adj_X.mean()

# binarise: 1 where the distance is at least the mean, 0 otherwise,
# stored as int8 to reduce memory usage
adj_X = (adj_X >= avg_adj_X).astype('int8')

In [None]:
# generate the networkx graph from the binarized adjacency matrix
# (nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is its
# replacement and is also available in NetworkX 2.x)
X_netwrk = nx.from_numpy_array(adj_X)

In [None]:
# wrap the networkx graph in a StellarGraph, the graph type handed to BiasedRandomWalk below
X_graph = StellarGraph.from_networkx(X_netwrk)
# print the summary StellarGraph reports for the graph
print(X_graph.info())

In [None]:
# random-walk generator bound to the graph
brw = BiasedRandomWalk(X_graph)

# start walks from every node of the graph and report how many were produced
walks = brw.run(
    nodes=list(X_graph.nodes()),  # root nodes
    n=10,  # number of random walks per root node
    length=100,  # maximum length of a random walk
    p=0.5,  # defines the (unnormalised) probability, 1/p, of returning to the source node
    q=1.0,  # defines the (unnormalised) probability, 1/q, of moving away from the source node
            # -> compare with the t-SNE figure below; if the classes are not well separated, adjust p & q
)
print("Number of random walks: {}".format(len(walks)))

In [None]:
# Word2Vec works on string tokens, so turn every node id in every walk into a string
str_walks = [list(map(str, walk)) for walk in walks]

# skip-gram model (sg=1) trained on the walks; min_count=0 keeps every node
model = Word2Vec(sentences=str_walks, window=5, min_count=0, sg=1, workers=5, seed=10)

In [None]:
# node ids in the order the model stores them
node_ids = model.wv.index_to_key

# embedding matrix: one row per node id, in the same order as node_ids
node_embeddings = model.wv.vectors

# class of each node, looked up by row label and kept in node_ids order
node_targets = df.loc[[int(node_id) for node_id in node_ids], 'class']

In [None]:
# apply t-SNE transformation on node embeddings
# random_state is fixed so the layout is reproducible between runs
tsne = TSNE(n_components=2, init='pca', learning_rate='auto', n_jobs=-1, random_state=10)
node_embeddings_2d = tsne.fit_transform(node_embeddings)

# draw the points
alpha = 0.7
label_map = {l: i for i, l in enumerate(np.unique(node_targets))}

# the embedding rows follow node_ids order, so the colours must come from
# node_targets (same order) and not from df['class'] (row order); using
# df['class'] would paint points with the class of a different review
node_colours = [label_map[target] for target in node_targets]

# generate the plot
plt.figure(figsize=(10, 8))
plt.scatter(
    node_embeddings_2d[:, 0],
    node_embeddings_2d[:, 1],
    c=node_colours,
    cmap="jet",
    alpha=alpha,
)

In [None]:
# create a DataFrame of node embeddings and node target classes
node_embeddings_df = pd.DataFrame(node_embeddings)
node_targets_df = pd.DataFrame(node_targets)

# attach the classes by position: row i of the embeddings belongs to
# node_ids[i], and node_targets is in that same order. Assigning the
# DataFrame/Series directly would align on index labels (node ids against
# 0..n-1 positions) and pair each embedding with another review's class.
node_embeddings_df['class'] = node_targets.to_numpy()

# save as pickle to preserve the data as it is
node_embeddings_df.to_pickle('t10.pkl')

---
### **Basic Modeling to Test after Feature Extraction**

In [None]:
# features: every embedding dimension; target: the class column
X = node_embeddings_df.drop(columns=['class'])
y = node_embeddings_df.loc[:, 'class']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score
from sklearn.metrics import confusion_matrix, recall_score, precision_score,f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# hold out 20% of the rows for testing; the split is stratified on the class and seeded
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7, stratify=y)

In [None]:
# baseline 1: logistic regression on the node embeddings
log = LogisticRegression(max_iter=10000).fit(x_train, y_train)
y_pred = log.predict(x_test)

# report confusion matrix, per-class metrics and overall accuracy on the held-out rows
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print('accuracy_score: ', accuracy_score(y_test, y_pred))

In [None]:
# baseline 2: linear support vector classifier on the node embeddings
lsvm = LinearSVC().fit(x_train, y_train)
y_pred = lsvm.predict(x_test)

# report confusion matrix, per-class metrics and overall accuracy on the held-out rows
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print('accuracy_score: ', accuracy_score(y_test, y_pred))