# Installing and importing necessary libraries - experiment 2

In [1]:
!pip install transformers datasets torch scikit-learn


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import get_scheduler


# Data Ingestion and train-valid-test split - experiment 2

In [4]:
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/datasets/deceptive-opinion.csv')

print("Dataset Head:")
print(df.head())

# Label encoding (truthful -> 1, deceptive -> 0)
df['deceptive'] = df['deceptive'].apply(lambda x: 1 if x == 'truthful' else 0)

# Split data into train, validation, and test sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['text'], df['deceptive'], test_size=0.3, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)


Dataset Head:
  deceptive   hotel  polarity       source  \
0  truthful  conrad  positive  TripAdvisor   
1  truthful   hyatt  positive  TripAdvisor   
2  truthful   hyatt  positive  TripAdvisor   
3  truthful    omni  positive  TripAdvisor   
4  truthful   hyatt  positive  TripAdvisor   

                                                text  
0  We stayed for a one night getaway with family ...  
1  Triple A rate with upgrade to view room was le...  
2  This comes a little late as I'm finally catchi...  
3  The Omni Chicago really delivers on all fronts...  
4  I asked for a high floor away from the elevato...  


# Tokenization using BERT tokenizer - experiment 2

In [5]:

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(texts):
    return tokenizer(list(texts), padding="max_length", truncation=True, max_length=128, return_tensors="pt")

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)
test_encodings = tokenize_function(test_texts)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Create Dataset and DataLoader - experiment 2

In [6]:
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = ReviewDataset(train_encodings, train_labels.tolist())
val_dataset = ReviewDataset(val_encodings, val_labels.tolist())
test_dataset = ReviewDataset(test_encodings, test_labels.tolist())

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


# Initialize BERT Model - experiment 2

In [7]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Optimizer and Scheduler - experiment 2

In [8]:
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)




# Training Loop - experiment 2

In [9]:
epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch+1} completed.")


Epoch 1 completed.
Epoch 2 completed.
Epoch 3 completed.


# Evaluation(Valid) - experiment 2

In [10]:
# Evaluate on the validation set
model.eval()
val_preds, val_labels_list = [], []
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        val_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        val_labels_list.extend(batch['labels'].cpu().numpy())

# Validation metrics
print("Validation Classification Report:")
print(classification_report(val_labels_list, val_preds, target_names=['Deceptive', 'Truthful']))

val_accuracy = accuracy_score(val_labels_list, val_preds)
print(f"Validation Accuracy: {val_accuracy}")


Validation Classification Report:
              precision    recall  f1-score   support

   Deceptive       0.85      0.92      0.88       122
    Truthful       0.91      0.83      0.87       118

    accuracy                           0.88       240
   macro avg       0.88      0.87      0.87       240
weighted avg       0.88      0.88      0.87       240

Validation Accuracy: 0.875


# Evaluation(Test) - experiment 2


In [11]:
# Evaluate on the test set
model.eval()
test_preds, test_labels_list = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        test_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        test_labels_list.extend(batch['labels'].cpu().numpy())

# Test metrics
print("Test Classification Report:")
print(classification_report(test_labels_list, test_preds, target_names=['Deceptive', 'Truthful']))

test_accuracy = accuracy_score(test_labels_list, test_preds)
print(f"Test Accuracy: {test_accuracy}")


Test Classification Report:
              precision    recall  f1-score   support

   Deceptive       0.79      0.90      0.84       125
    Truthful       0.88      0.74      0.80       115

    accuracy                           0.82       240
   macro avg       0.83      0.82      0.82       240
weighted avg       0.83      0.82      0.82       240

Test Accuracy: 0.825


# **# Experiment - 3**

# Data Ingestion and train-valid-test split - experiment 3

In [44]:
# Load dataset
df = pd.read_csv("/content/drive/MyDrive/datasets/deceptive-opinion.csv")

# Split the data into training, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Extract text and labels from the datasets
train_texts = train_df['text'].tolist()
train_labels = train_df['deceptive'].apply(lambda x: 1 if x == 'deceptive' else 0).tolist()

val_texts = val_df['text'].tolist()
val_labels = val_df['deceptive'].apply(lambda x: 1 if x == 'deceptive' else 0).tolist()

test_texts = test_df['text'].tolist()
test_labels = test_df['deceptive'].apply(lambda x: 1 if x == 'deceptive' else 0).tolist()


# Data Modification - experiment 3

In [45]:
# Add polarity to the text data for training, validation, and test sets
train_texts_with_polarity = [text + " [SEP] Polarity: " + polarity for text, polarity in zip(train_texts, train_df['polarity'])]
val_texts_with_polarity = [text + " [SEP] Polarity: " + polarity for text, polarity in zip(val_texts, val_df['polarity'])]
test_texts_with_polarity = [text + " [SEP] Polarity: " + polarity for text, polarity in zip(test_texts, test_df['polarity'])]

print(train_texts_with_polarity[:5])


["when i first checked the hotel's website and reviews i was completely sure that it would be a great hotel and i would have a great time there. but i was totally disappointed once i got there, first of all i asked for a nonsmoking room because i hate the odor , and in fact they gave me a non smoking room which smelled worse than a smoking one, it was smelly in a bad way it smelled like sweat and dirty laundry or something like that. then when i was trying to enjoy the view, they windows were all dusty and dirty. so i decided to get out of the room , and i went to the pool and it wasnt what i expected either it has nothing to do with the picture . so i went back to my room and called room service for them to complain about the smell and dirt so they gave me another room which smelled even worse!. i was about to freak out so they gave me some deodorant and it was tolerable; the bed was uncomfortable and the bathroom eww!, its all i can say about this hotel.\n [SEP] Polarity: negative", 

# Tokenize the Data - experiment 3

In [46]:
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings_with_polarity = tokenizer(train_texts_with_polarity, padding=True, truncation=True, max_length=128, return_tensors="pt")
val_encodings_with_polarity = tokenizer(val_texts_with_polarity, padding=True, truncation=True, max_length=128, return_tensors="pt")
test_encodings_with_polarity = tokenizer(test_texts_with_polarity, padding=True, truncation=True, max_length=128, return_tensors="pt")

print(train_encodings_with_polarity['input_ids'][:5])  # Printing the tokenized ids for first 5 examples


tensor([[  101,  2043,  1045,  2034,  7039,  1996,  3309,  1005,  1055,  4037,
          1998,  4391,  1045,  2001,  3294,  2469,  2008,  2009,  2052,  2022,
          1037,  2307,  3309,  1998,  1045,  2052,  2031,  1037,  2307,  2051,
          2045,  1012,  2021,  1045,  2001,  6135,  9364,  2320,  1045,  2288,
          2045,  1010,  2034,  1997,  2035,  1045,  2356,  2005,  1037,  2512,
         25855,  6834,  2282,  2138,  1045,  5223,  1996, 19255,  1010,  1998,
          1999,  2755,  2027,  2435,  2033,  1037,  2512,  9422,  2282,  2029,
          9557,  4788,  2084,  1037,  9422,  2028,  1010,  2009,  2001,  5437,
          2100,  1999,  1037,  2919,  2126,  2009,  9557,  2066,  7518,  1998,
          6530, 14533,  2030,  2242,  2066,  2008,  1012,  2059,  2043,  1045,
          2001,  2667,  2000,  5959,  1996,  3193,  1010,  2027,  3645,  2020,
          2035, 12727,  1998,  6530,  1012,  2061,  1045,  2787,  2000,  2131,
          2041,  1997,  1996,  2282,  1010,  1998,  

# Create a Dataset Class - Experiment 3

In [47]:
import torch
from torch.utils.data import Dataset, DataLoader


class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset_with_polarity = ReviewDataset(train_encodings_with_polarity, train_labels)
val_dataset_with_polarity = ReviewDataset(val_encodings_with_polarity, val_labels)
test_dataset_with_polarity = ReviewDataset(test_encodings_with_polarity, test_labels)

# Create DataLoaders
train_loader_with_polarity = DataLoader(train_dataset_with_polarity, batch_size=16, shuffle=True)
val_loader_with_polarity = DataLoader(val_dataset_with_polarity, batch_size=16)
test_loader_with_polarity = DataLoader(test_dataset_with_polarity, batch_size=16)


# Defining the BERT Model for Training - expereiment 3

In [48]:
from transformers import BertForSequenceClassification, AdamW
from transformers import get_scheduler

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader_with_polarity) * 3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training the Model(training loop) - experimemnt 3

In [49]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_loader_with_polarity:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Validation loop
    model.eval()
    val_preds, val_labels_list = [], []
    with torch.no_grad():
        for batch in val_loader_with_polarity:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            val_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            val_labels_list.extend(batch['labels'].cpu().numpy())

    # Calculate and print evaluation metrics
    precision, recall, f1, _ = precision_recall_fscore_support(val_labels_list, val_preds, average='binary')
    accuracy = accuracy_score(val_labels_list, val_preds)
    print(f"Epoch {epoch + 1}:")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1-Score: {f1:.4f}")
    print(confusion_matrix(val_labels_list, val_preds))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1:
Validation Accuracy: 0.8708
Validation Precision: 0.8527
Validation Recall: 0.9016
Validation F1-Score: 0.8765
[[ 99  19]
 [ 12 110]]


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2:
Validation Accuracy: 0.9000
Validation Precision: 0.8551
Validation Recall: 0.9672
Validation F1-Score: 0.9077
[[ 98  20]
 [  4 118]]


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3:
Validation Accuracy: 0.8875
Validation Precision: 0.8862
Validation Recall: 0.8934
Validation F1-Score: 0.8898
[[104  14]
 [ 13 109]]


# Test Evaluation - experiment 3

In [50]:
# Test Evaluation
model.eval()
test_preds, test_labels_list = [], []
with torch.no_grad():
    for batch in test_loader_with_polarity:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        test_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        test_labels_list.extend(batch['labels'].cpu().numpy())

# Calculate evaluation metrics for the test set
test_accuracy = accuracy_score(test_labels_list, test_preds)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(test_labels_list, test_preds, average='binary')

# Print results
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1-Score: {test_f1:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(test_labels_list, test_preds))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.8333
Test Precision: 0.8295
Test Recall: 0.8560
Test F1-Score: 0.8425
Confusion Matrix:
[[ 93  22]
 [ 18 107]]
