In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
!ln -s "/content/drive/My Drive/Borderline/" "/content/"

In [3]:
%cd Borderline

/content/drive/My Drive/Borderline


# Classification

In [9]:
pip install SpeechRecognition



In [10]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

## Imports

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import Adam
from torch.utils.data import Subset, DataLoader, random_split, Dataset
from torch.optim import lr_scheduler
import torch.utils.data as data
from torchvision import transforms
from transformers import BertTokenizer, AdamW, BertForSequenceClassification
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from trl import SFTTrainer
from transformers import TrainingArguments

import speech_recognition as sr
import moviepy.audio as audio
import moviepy.editor as mp

from unsloth import FastLanguageModel

import json
from transformers import AutoTokenizer
from datasets import Dataset
import numpy as np
import os
import time
import copy
import random
import pandas as pd

In [None]:
import json

def convert_to_alpaca_format(input_filename, output_filename):
    # Load the original JSON data from the file
    with open(input_filename, 'r') as file:
        data = json.load(file)

    # Prepare data in the new Alpaca prompt format
    alpaca_data = []
    for entry in data:
        alpaca_prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Turn the input phrase into its opposite phrase

### Input:
{entry['Sentence']}

### Response:
{entry['Opposite']}
"""
        alpaca_data.append(alpaca_prompt)

    # Save the transformed data to a new JSON file
    with open(output_filename, 'w') as outfile:
        json.dump(alpaca_data, outfile, indent=4)

# Specify the input and output file names
input_filename = 'data/dataset.json'
output_filename = 'final_dataset.json'

# Run the conversion function
convert_to_alpaca_format(input_filename, output_filename)

print(f"Data has been successfully converted and saved to {output_filename}")

Data has been successfully converted and saved to final_dataset.json


#Dataset

In [None]:
import pandas as pd
from torch.utils.data import Dataset
from transformers import BertTokenizer

class TweetsDataset(Dataset):
    def __init__(self, filename, tokenizer, max_len):
        self.data = pd.read_csv(filename)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.LABEL_MAP = {'hate': 1, 'nothate': 0}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        tweet = self.data.loc[index, 'text']
        label = self.LABEL_MAP[self.data.loc[index, 'label']]

        encodings = self.tokenizer(tweet, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
        return {
            'input_ids': encodings['input_ids'].squeeze(),  # Use squeeze to remove batch dimension
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)  # Use torch.long for labels
        }

#Hyperparameters

In [None]:
MAX_LEN = 128
BATCH_SIZE = 32
LEARNING_RATE = 1e-5
NUM_EPOCHS = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset init and Dataloaders

In [None]:
csv_path = 'data/data.csv'
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
dataset = TweetsDataset(csv_path, tokenizer, MAX_LEN)

dataset_size = int(len(dataset)/8)
dataset_indices = list(range(dataset_size))
train_indices, test_indices = train_test_split(dataset_indices, test_size=0.2, random_state=42)

train_subset = Subset(dataset, train_indices)
test_subset = Subset(dataset, test_indices)

train_loader = DataLoader(train_subset, BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_subset, BATCH_SIZE, shuffle=False)

#Pre-trained Model

In [None]:
classifier = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
classifier.to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Optimizier and Learning Rate Scheduler

In [None]:
optimizer = AdamW(classifier.parameters(), lr=LEARNING_RATE)
scheduler = lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.1)




In [None]:
classifier.train()

for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    for idx, batch in enumerate(train_loader):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = classifier(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()
        epoch_loss += loss.item()

    epoch_loss = epoch_loss/BATCH_SIZE
    print(f'Epoch {epoch}', 'Epoch loss :', epoch_loss )


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report
classifier.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = classifier(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch['labels'].cpu().numpy())

print(classification_report(true_labels, predictions))


              precision    recall  f1-score   support

           0       0.79      0.82      0.80       494
           1       0.82      0.80      0.81       535

    accuracy                           0.81      1029
   macro avg       0.81      0.81      0.81      1029
weighted avg       0.81      0.81      0.81      1029



In [None]:
torch.save(classifier.state_dict(), 'classifier_v4.pth')

#Testing Classifier

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
model.load_state_dict(torch.load('classifier_v4.pth', map_location=device))
model.to(device)
model.eval()

def predict_hate_speech(text):
    # Preprocess text
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Predict
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]  # Get the index of the max logit
        return "Hate Speech" if prediction == 1 else "Not Hate Speech"

# Running the prediction loop
print("Hate Speech Classification Model")
print("Type 'quit' to exit.\n")

while True:
    user_input = input("Enter a phrase to classify: ")
    if user_input.lower() == 'quit':
        break

    prediction = predict_hate_speech(user_input)
    print(f"The phrase is classified as: {prediction}\n")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Hate Speech Classification Model
Type 'quit' to exit.



KeyboardInterrupt: Interrupted by user