# Preprocessing

In [2]:
import pandas as pd
from datasets import load_dataset
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('words')
nltk.download('wordnet')
import string

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Tokenization (after Lowercasing, removing extra Whitespace)

In [3]:
from datasets import load_dataset

op_dataset_raw = load_dataset("csv" , data_files=r"D:\Virtual Environments\Thesis\datasets\OpSpam dataset.csv")
op_dataset_raw = op_dataset_raw["train"].remove_columns(['hotel' , 'source'])

In [4]:
def combine_labels(record):
    record["label"] = record["deceptive"] + "_" + record["polarity"]
    return record
op_dataset_raw = op_dataset_raw.map(combine_labels)
op_dataset_raw = op_dataset_raw.remove_columns(["deceptive" , "polarity"])

unique_classes = list(set(op_dataset_raw["label"]))
unique_classes.sort(reverse=True)
print(unique_classes)

def one_hot_encoding(record):
    record["label"] = unique_classes.index(record["label"])
    return record

op_dataset_raw = op_dataset_raw.map(one_hot_encoding)

['truthful_positive', 'truthful_negative', 'deceptive_positive', 'deceptive_negative']


In [5]:
tokenized_text_1 = []
for text in op_dataset_raw['text']:
  text = text.lower()
  text = text.strip()
  tokenized_text_1 += [nltk.word_tokenize(text)]
tokenized_text_1[:1]

[['we',
  'stayed',
  'for',
  'a',
  'one',
  'night',
  'getaway',
  'with',
  'family',
  'on',
  'a',
  'thursday',
  '.',
  'triple',
  'aaa',
  'rate',
  'of',
  '173',
  'was',
  'a',
  'steal',
  '.',
  '7th',
  'floor',
  'room',
  'complete',
  'with',
  '44in',
  'plasma',
  'tv',
  'bose',
  'stereo',
  ',',
  'voss',
  'and',
  'evian',
  'water',
  ',',
  'and',
  'gorgeous',
  'bathroom',
  '(',
  'no',
  'tub',
  'but',
  'was',
  'fine',
  'for',
  'us',
  ')',
  'concierge',
  'was',
  'very',
  'helpful',
  '.',
  'you',
  'can',
  'not',
  'beat',
  'this',
  'location',
  '...',
  'only',
  'flaw',
  'was',
  'breakfast',
  'was',
  'pricey',
  'and',
  'service',
  'was',
  'very',
  'very',
  'slow',
  '(',
  '2hours',
  'for',
  'four',
  'kids',
  'and',
  'four',
  'adults',
  'on',
  'a',
  'friday',
  'morning',
  ')',
  'even',
  'though',
  'there',
  'were',
  'only',
  'two',
  'other',
  'tables',
  'in',
  'the',
  'restaurant',
  '.',
  'food',
  'was

## Punctuation and Stop word Removal

In [6]:
tokenized_text_2 = []
for text in tokenized_text_1:
  filtered_tokens = []
  for token in text:
    if (token not in string.punctuation) and (token not in nltk.corpus.stopwords.words('english')):
      filtered_tokens += [token]
  tokenized_text_2 += [filtered_tokens]

tokenized_text_2[:2]

[['stayed',
  'one',
  'night',
  'getaway',
  'family',
  'thursday',
  'triple',
  'aaa',
  'rate',
  '173',
  'steal',
  '7th',
  'floor',
  'room',
  'complete',
  '44in',
  'plasma',
  'tv',
  'bose',
  'stereo',
  'voss',
  'evian',
  'water',
  'gorgeous',
  'bathroom',
  'tub',
  'fine',
  'us',
  'concierge',
  'helpful',
  'beat',
  'location',
  '...',
  'flaw',
  'breakfast',
  'pricey',
  'service',
  'slow',
  '2hours',
  'four',
  'kids',
  'four',
  'adults',
  'friday',
  'morning',
  'even',
  'though',
  'two',
  'tables',
  'restaurant',
  'food',
  'good',
  'worth',
  'wait',
  'would',
  'return',
  'heartbeat',
  'gem',
  'chicago',
  '...'],
 ['triple',
  'rate',
  'upgrade',
  'view',
  'room',
  'less',
  '200',
  'also',
  'included',
  'breakfast',
  'vouchers',
  'great',
  'view',
  'river',
  'lake',
  'wrigley',
  'bldg',
  'tribune',
  'bldg',
  'major',
  'restaurants',
  'shopping',
  'sightseeing',
  'attractions',
  'within',
  'walking',
  'distan

## Frequent Word Removal

In [7]:
raw_text = []
for sentence in tokenized_text_2:
  for token in sentence:
    raw_text += [token]

frequency_distribution = nltk.FreqDist(text)

frequency_distribution.most_common(10) # 10 most common words
most_common_words = [item[0] for item in frequency_distribution.most_common(10)]  # extracting from tuples
print(most_common_words)

tokenized_text_3 = []
for text in tokenized_text_2:
  uncommon_words = []
  for token in text:
    if token not in most_common_words:
      uncommon_words += [token]
  tokenized_text_3 += [uncommon_words]

tokenized_text_3[:2]

['i', 'the', ',', '.', 'was', 'to', 'of', 'a', 'and', 'my']


[['stayed',
  'one',
  'night',
  'getaway',
  'family',
  'thursday',
  'triple',
  'aaa',
  'rate',
  '173',
  'steal',
  '7th',
  'floor',
  'room',
  'complete',
  '44in',
  'plasma',
  'tv',
  'bose',
  'stereo',
  'voss',
  'evian',
  'water',
  'gorgeous',
  'bathroom',
  'tub',
  'fine',
  'us',
  'concierge',
  'helpful',
  'beat',
  'location',
  '...',
  'flaw',
  'breakfast',
  'pricey',
  'service',
  'slow',
  '2hours',
  'four',
  'kids',
  'four',
  'adults',
  'friday',
  'morning',
  'even',
  'though',
  'two',
  'tables',
  'restaurant',
  'food',
  'good',
  'worth',
  'wait',
  'would',
  'return',
  'heartbeat',
  'gem',
  'chicago',
  '...'],
 ['triple',
  'rate',
  'upgrade',
  'view',
  'room',
  'less',
  '200',
  'also',
  'included',
  'breakfast',
  'vouchers',
  'great',
  'view',
  'river',
  'lake',
  'wrigley',
  'bldg',
  'tribune',
  'bldg',
  'major',
  'restaurants',
  'shopping',
  'sightseeing',
  'attractions',
  'within',
  'walking',
  'distan

## Lemmatization

In [8]:
lemmatizer = nltk.stem.WordNetLemmatizer()

tokenized_text_4 = []
for sentence in tokenized_text_3:
  lemmatized_tokens = []
  for token in sentence:
    lemmatized_tokens += [lemmatizer.lemmatize(token)]
  tokenized_text_4 += [lemmatized_tokens]

tokenized_text_4[:2]

[['stayed',
  'one',
  'night',
  'getaway',
  'family',
  'thursday',
  'triple',
  'aaa',
  'rate',
  '173',
  'steal',
  '7th',
  'floor',
  'room',
  'complete',
  '44in',
  'plasma',
  'tv',
  'bose',
  'stereo',
  'voss',
  'evian',
  'water',
  'gorgeous',
  'bathroom',
  'tub',
  'fine',
  'u',
  'concierge',
  'helpful',
  'beat',
  'location',
  '...',
  'flaw',
  'breakfast',
  'pricey',
  'service',
  'slow',
  '2hours',
  'four',
  'kid',
  'four',
  'adult',
  'friday',
  'morning',
  'even',
  'though',
  'two',
  'table',
  'restaurant',
  'food',
  'good',
  'worth',
  'wait',
  'would',
  'return',
  'heartbeat',
  'gem',
  'chicago',
  '...'],
 ['triple',
  'rate',
  'upgrade',
  'view',
  'room',
  'less',
  '200',
  'also',
  'included',
  'breakfast',
  'voucher',
  'great',
  'view',
  'river',
  'lake',
  'wrigley',
  'bldg',
  'tribune',
  'bldg',
  'major',
  'restaurant',
  'shopping',
  'sightseeing',
  'attraction',
  'within',
  'walking',
  'distance',
  

# Feature Extraction

## TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

recompiled_sentences = []
for sentence in tokenized_text_4:
  recompiled_sentences += [' '.join(sentence)]

recompiled_sentences[:2]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(recompiled_sentences)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.toarray()


## Sentence Vectorization

In [10]:
df_tfidf = pd.DataFrame(tfidf_scores, columns=tfidf_feature_names )
df_tfidf['label'] = op_dataset_raw['label']
df_tfidf.head()


Unnamed: 0,00,000,00a,00am,00pm,03,04,05,06,07,...,yummo,yummy,yunan,yup,zagat,zest,zipped,zone,zoo,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


# Training and Evaluation of ML models

## Splitting Dataset

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming you already have a DataFrame df_tfidf with TF-IDF features and a 'label' column
X = df_tfidf.drop("label", axis=1)
y = df_tfidf["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Evaluation Function

In [12]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def evaluate_model(model, X_test, y_test):
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='macro')
    precision = precision_score(y_test, preds, average='macro')
    recall = recall_score(y_test, preds, average='macro')
    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)

## Linear Support Vector Machine

In [13]:
from sklearn.svm import LinearSVC

LSVM = LinearSVC(random_state=42, max_iter=10000)
LSVM.fit(X_train, y_train)

print("Linear SVM Performance:")
evaluate_model(LSVM, X_test, y_test)

Linear SVM Performance:
Accuracy: 0.828125
F1 Score: 0.8285604350536871
Precision: 0.8284853422844995
Recall: 0.8317986745606937


## Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(max_iter=1000, random_state=42)
LR.fit(X_train, y_train)

print("Logistic Regression Performance:")
evaluate_model(LR, X_test, y_test)

Logistic Regression Performance:
Accuracy: 0.840625
F1 Score: 0.8406648144426555
Precision: 0.8408320155020395
Recall: 0.8422208863980669


## Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train, y_train)

print("Random Forest Performance:")
evaluate_model(RF, X_test, y_test)

Random Forest Performance:
Accuracy: 0.771875
F1 Score: 0.7730836597871628
Precision: 0.7739543252681654
Recall: 0.7760885342512527


## Light Gradient Boosting Machine

In [32]:
import lightgbm as lgb

LGB = lgb.LGBMClassifier( n_estimators=1000,
                          boosting_type ='gbdt',
                          num_leaves = 31,                
                          max_depth = -1,       
                          learning_rate = 0.003, 
                          objective = 'multiclass', 
                          num_class = 4, 
                          random_state = 42)
LGB.fit(X_train, y_train)

print("LightGBM Performance:")
evaluate_model(LGB, X_test, y_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006551 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20235
[LightGBM] [Info] Number of data points in the train set: 1280, number of used features: 818
[LightGBM] [Info] Start training from score -1.414822
[LightGBM] [Info] Start training from score -1.383174
[LightGBM] [Info] Start training from score -1.392564
[LightGBM] [Info] Start training from score -1.355523
LightGBM Performance:
Accuracy: 0.7125
F1 Score: 0.7133595755889404
Precision: 0.7141694248702308
Recall: 0.7132990311200225


## Gradient Boosting

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

GB = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)
GB.fit(X_train, y_train)

print("Gradient Boosting Performance:")
evaluate_model(GB, X_test, y_test)

Gradient Boosting Performance:
Accuracy: 0.665625
F1 Score: 0.6654964267814825
Precision: 0.6695928347781916
Recall: 0.6666243866758313


## Passive Aggressive Classifier

In [18]:
from sklearn.linear_model import PassiveAggressiveClassifier

PAC = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
PAC.fit(X_train, y_train)

print("Passive Aggressive Classifier Performance:")
evaluate_model(PAC, X_test, y_test)

Passive Aggressive Classifier Performance:
Accuracy: 0.828125
F1 Score: 0.8287702588889704
Precision: 0.827764413019831
Recall: 0.830607954717213
