In [6]:
print(df['label'].isnull().sum())


0


In [13]:
import re
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch


def clean_text(text):

    text = BeautifulSoup(text, "html.parser").get_text()


    text = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", text)


    text = re.sub(r"[^a-zA-Z0-9]", " ", text)



    return text


from google.colab import drive
drive.mount('/content/drive')


file_path = '/content/drive/MyDrive/admin.csv'  # Update with the correct path
df = pd.read_csv(file_path)


print(df.head())


df['cleaned_text'] = df['text'].apply(clean_text)


label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['label'])  # Assuming 'class' is the column with your classes


train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))


train_encodings = tokenizer(list(train_df['cleaned_text']), truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(list(test_df['cleaned_text']), truncation=True, padding=True, return_tensors='pt')


train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(train_df['labels'].values)
)

test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    torch.tensor(test_df['labels'].values)
)


train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])


train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
     id                                               text   label  Comments
0  7789  @AAlwuhaib1977 Muslim mob violence against Hin...  racism       NaN
1  7790             @Te4m_NiGhtM4Re http://t.co/5Ih7MkDbQG  normal       NaN
2  7791  @jncatron @isra_jourisra @AMPalestine Islamoph...  racism       NaN
3  7792  Finally I'm all caught up, and that sudden dea...  normal       NaN
4  7793             @carolinesinders @herecomesfran *hugs*  normal       NaN


  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [15]:
num_epochs = 70

optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs)


for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()


model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in tqdm(val_loader, desc='Evaluating'):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f'Validation Accuracy: {accuracy * 100:.2f}%')


model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in tqdm(test_loader, desc='Evaluating'):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')


Epoch 1/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 2/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 3/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 4/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 5/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 6/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 7/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 8/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 9/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 10/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 11/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 12/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 13/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 14/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 15/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 16/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 17/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 18/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 19/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 20/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 21/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 22/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 23/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 24/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 25/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 26/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 27/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 28/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 29/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 30/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 31/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 32/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 33/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 34/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 35/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 36/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 37/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 38/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 39/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 40/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 41/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 42/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 43/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 44/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 45/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 46/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 47/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 48/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 49/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 50/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 51/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 52/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 53/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 54/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 55/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 56/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 57/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 58/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 59/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 60/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 61/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 62/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 63/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 64/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 65/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 66/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 67/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 68/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 69/70:   0%|          | 0/80 [00:00<?, ?it/s]

Epoch 70/70:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

Validation Accuracy: 88.75%


Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

Test Accuracy: 73.00%


In [12]:
# Load the CSV file into a Pandas DataFrame
file_path = '/content/drive/MyDrive/admin.csv'  # Update with your actual file path
df = pd.read_csv(file_path)

# Assuming your class column is named 'class' (replace with the actual column name)
class_column = 'label'

# Count the unique values for each class
class_counts = df[class_column].value_counts()

# Display the counts
print(class_counts)

normal    263
toxic     252
sexism    244
racism    237
Name: label, dtype: int64
