In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
!ln -s "/content/drive/My Drive/Borderline/" "/content/"

In [3]:
%cd Borderline

/content/drive/My Drive/Borderline


# Classification

## Imports

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import Adam
from torch.utils.data import Subset, DataLoader, random_split, Dataset
from torch.optim import lr_scheduler
import torch.utils.data as data
from torchvision import transforms
from transformers import BertTokenizer, AdamW, BertForSequenceClassification
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import numpy as np
import os
import time
import copy
import random
import pandas as pd

#Dataset

In [5]:
class TweetsDataset(Dataset):
    def __init__(self, filename, tokenizer, max_len):
        self.data = pd.read_csv(filename)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.LABEL_MAP = {'hate': 1, 'nothate': 0}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        tweet = self.data.loc[index, 'text']
        label = self.LABEL_MAP[self.data.loc[index, 'label']]

        encodings = self.tokenizer(tweet, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
        return {
            'input_ids': encodings['input_ids'].squeeze(),  # Use squeeze to remove batch dimension
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)  # Use torch.long for labels
        }


#Hyperparameters

In [6]:
MAX_LEN = 128
BATCH_SIZE = 72
LEARNING_RATE = 2e-5
NUM_EPOCHS = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset init and Dataloaders

In [7]:
csv_path = 'data/data.csv'
tokenizer = tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = TweetsDataset(csv_path, tokenizer, MAX_LEN)

dataset_size = int(len(dataset)/8)
dataset_indices = list(range(dataset_size))
train_indices, temp_indices = train_test_split(dataset_indices, test_size=0.2, random_state=42)
val_indices, test_indices = train_test_split(temp_indices, test_size=0.5, random_state=42)


train_subset = Subset(dataset, train_indices)
val_subset = Subset(dataset, val_indices)
test_subset = Subset(dataset, test_indices)

train_loader = DataLoader(train_subset, BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_subset, BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_subset, BATCH_SIZE, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

#Model

In [None]:
classifier = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
classifier.to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Optimizier and Learning Rate Scheduler

In [None]:
optimizer = AdamW(classifier.parameters(), lr=LEARNING_RATE)
scheduler = lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.1)



In [None]:
classifier.train()

for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    for idx, batch in enumerate(train_loader):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = classifier(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()
        epoch_loss += loss.item()

    epoch_loss = epoch_loss/dataset_size
    print(f'Epoch {epoch}', 'Epoch loss :', epoch_loss )


In [None]:
from sklearn.metrics import classification_report
classifier.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = classifier(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch['labels'].cpu().numpy())

print(classification_report(true_labels, predictions))


              precision    recall  f1-score   support

           0       0.78      0.81      0.79       242
           1       0.82      0.79      0.81       272

    accuracy                           0.80       514
   macro avg       0.80      0.80      0.80       514
weighted avg       0.80      0.80      0.80       514



In [None]:
torch.save(classifier.state_dict(), 'classifier.pth')

# Opposite Generator

In [15]:
import json

def convert_and_save_json(input_filename, output_filename):
    # Load the original JSON data from the file
    with open(input_filename, 'r') as file:
        original_data = json.load(file)

    # Transform the data into a list of dictionaries with 'Sentence' and 'Opposite' keys
    transformed_data = [{'Sentence': k, 'Opposite': v} for k, v in original_data.items()]

    # Save the transformed data to a new JSON file
    with open(output_filename, 'w') as outfile:
        json.dump(transformed_data, outfile, indent=4)

# Specify the input and output file names
input_filename = '/content/Borderline/data/dataset.json'
output_filename = '/content/Borderline/data/transformed_dataset.json'

# Run the conversion function
convert_and_save_json(input_filename, output_filename)

print(f"Data has been successfully converted and saved to {output_filename}")


Data has been successfully converted and saved to /content/Borderline/data/transformed_dataset.json


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch.optim.lr_scheduler import StepLR
import pandas as pd

In [18]:
class PhraseDataset(Dataset):
    def __init__(self, filename, tokenizer, max_len):
        # Load data from a JSON file. It's directly usable since the format matches DataFrame expectations
        self.data = pd.read_json(filename)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        phrase = self.data.loc[index, 'Sentence']
        target = self.data.loc[index, 'Opposite']

        # Tokenize the phrase and its opposite using the provided tokenizer
        source_encoding = self.tokenizer(phrase, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
        target_encoding = self.tokenizer(target, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")

        return {
            'input_ids': source_encoding['input_ids'].squeeze(),  # Remove the batch dimension
            'attention_mask': source_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }



In [19]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
dataset_path = '/content/Borderline/data/transformed_dataset.json'
oppo_dataset = PhraseDataset(dataset_path, tokenizer, max_len=512)
train_loader = DataLoader(oppo_dataset, batch_size=2, shuffle=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
opposite_maker = T5ForConditionalGeneration.from_pretrained('t5-base')
opposite_maker.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [21]:
optimizer = AdamW(opposite_maker.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=1000, gamma=0.95)



In [22]:
len(train_loader)

229

In [23]:
opposite_maker.train()
for epoch in range(4):  # number of epochs
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(opposite_maker.device)
        attention_mask = batch['attention_mask'].to(opposite_maker.device)
        labels = batch['labels'].to(opposite_maker.device)
        outputs = opposite_maker(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")


Epoch: 0, Loss: 22.639623641967773
Epoch: 0, Loss: 20.804317474365234
Epoch: 0, Loss: 19.313417434692383
Epoch: 0, Loss: 16.17634391784668
Epoch: 0, Loss: 15.057358741760254
Epoch: 0, Loss: 14.785515785217285
Epoch: 0, Loss: 13.262310981750488
Epoch: 0, Loss: 12.998199462890625
Epoch: 0, Loss: 10.261573791503906
Epoch: 0, Loss: 13.279834747314453
Epoch: 0, Loss: 11.648869514465332
Epoch: 0, Loss: 8.512600898742676
Epoch: 0, Loss: 9.447626113891602
Epoch: 0, Loss: 8.993467330932617
Epoch: 0, Loss: 7.436216831207275
Epoch: 0, Loss: 7.776683807373047
Epoch: 0, Loss: 6.737668037414551
Epoch: 0, Loss: 8.097447395324707
Epoch: 0, Loss: 6.072287082672119
Epoch: 0, Loss: 6.433104038238525
Epoch: 0, Loss: 4.63501501083374
Epoch: 0, Loss: 5.075361728668213
Epoch: 0, Loss: 5.452223777770996
Epoch: 0, Loss: 5.049422264099121
Epoch: 0, Loss: 3.7162294387817383
Epoch: 0, Loss: 3.645092725753784
Epoch: 0, Loss: 2.668123483657837
Epoch: 0, Loss: 3.298931360244751
Epoch: 0, Loss: 1.9298179149627686
Epo

In [24]:
torch.save(opposite_maker.state_dict(), 'opposite_maker.pth')

## Test Opposite Maker

In [25]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [26]:
# Initialize the tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Initialize the model with the same configuration it was trained with
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model.load_state_dict(torch.load('opposite_maker.pth', map_location=DEVICE))
model.to(DEVICE)
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [27]:
def get_opposite_phrase(model, tokenizer, phrase, max_length=512):
    try:
        # Encode the input phrase to tensor of IDs
        input_ids = tokenizer.encode(phrase, return_tensors="pt", max_length=max_length, truncation=True).to(DEVICE)


        # Generate output IDs from the model
        outputs = model.generate(input_ids, max_length=max_length)


        # Decode the generated IDs back to a string
        opposite_phrase = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return opposite_phrase
    except Exception as e:
        print(f"An error occurred: {e}")
        return ""

In [28]:
print("Type 'quit' to exit.")
while True:
    input_phrase = input("Enter a phrase: ")
    if input_phrase.lower() == 'quit':
        break

    # Get the opposite phrase
    opposite_phrase = get_opposite_phrase(opposite_maker, tokenizer, input_phrase)
    print("Opposite Phrase:", opposite_phrase)


Type 'quit' to exit.
Enter a phrase: I am a boy
Opposite Phrase: i am a girl
Enter a phrase: I am happy
Opposite Phrase: I am sad I am apprehensive
Enter a phrase: I am an only child
Opposite Phrase: I am a skeptic
Enter a phrase: I go to school 
Opposite Phrase: a   Cess to
Enter a phrase: I am not a student
Opposite Phrase: a student. I am not a student.
Enter a phrase: I hate cows.
Opposite Phrase: I love cows
Enter a phrase: I love eating pancakes
Opposite Phrase: I hate pancakes


KeyboardInterrupt: Interrupted by user