In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by later cells.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_union
import pickle

In [None]:
  # Step 1: set up Google Drive access so the CSV file can be read from Colaboratory.
  # Install (or upgrade) PyDrive quietly in the current runtime.
  !pip install -U -q PyDrive
  from pydrive.auth import GoogleAuth
  from pydrive.drive import GoogleDrive
  from google.colab import auth
  from oauth2client.client import GoogleCredentials

  # Authenticate the Colab user, then build the PyDrive client from the
  # application-default credentials.
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)



In [None]:
# Step 2: fetch the training file from Google Drive and save it locally as train.csv.
# Upload all your data files to your Google Drive first and set sharing to
# Share -> Advanced -> Change -> "Anyone with the link can view".
downloaded = drive.CreateFile({'id':'1b0BcOqd8kv_6qhtKdBEeVPt7XjoqSego'}) # replace the id with the id of the file you want to access
downloaded.GetContentFile('train.csv')

In [None]:
# Load the downloaded training set and preview its first rows.
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Label columns are everything after the first two columns (id, comment_text).
targets = list(df.columns[2:])
df_targets = df[targets].copy()

# For every label, record how many rows carry it ('total') and how many of
# those rows also carry each of the other labels.
count_dic = {}
for comment_type in targets:
    # Rows flagged with the current label (toxic, severe_toxic, ...).
    df_selection = df_targets[df_targets[comment_type] == 1]
    counts = [('total', len(df_selection))]
    counts.extend(
        (other, df_selection[other].sum())
        for other in targets
        if other != comment_type
    )
    print(f"Counts of other type of statements with comment type {comment_type} is {counts}")
    count_dic[comment_type] = counts

# totals = []
# for key, value in count_dic.items():
#     totals.append(value[0][1])
#     print('\n%d %s comments. (%.2f%% of all data.)' % (value[0][1], key, (value[0][1]/len(df))*100))
#     for cnt in value[1:]:
#         print('- %d or %.2f%% were also %s.' % (cnt[1], (cnt[1]/value[0][1])*100, cnt[0]))



Counts of other type of statements with comment type toxic is [('total', 15294), ('severe_toxic', 1595), ('obscene', 7926), ('threat', 449), ('insult', 7344), ('identity_hate', 1302)]
Counts of other type of statements with comment type severe_toxic is [('total', 1595), ('toxic', 1595), ('obscene', 1517), ('threat', 112), ('insult', 1371), ('identity_hate', 313)]
Counts of other type of statements with comment type obscene is [('total', 8449), ('toxic', 7926), ('severe_toxic', 1517), ('threat', 301), ('insult', 6155), ('identity_hate', 1032)]
Counts of other type of statements with comment type threat is [('total', 478), ('toxic', 449), ('severe_toxic', 112), ('obscene', 301), ('insult', 307), ('identity_hate', 98)]
Counts of other type of statements with comment type insult is [('total', 7877), ('toxic', 7344), ('severe_toxic', 1371), ('obscene', 6155), ('threat', 307), ('identity_hate', 1160)]
Counts of other type of statements with comment type identity_hate is [('total', 1405), ('t

In [None]:
def feature_engineering(df, sparse=0):
    """Add hand-crafted text statistics to ``df`` and strip IP addresses.

    All statistics are computed from the raw ``comment_text`` before any IP
    address is removed. ``df`` is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``comment_text`` column of strings.
    sparse : int, optional
        Unused; kept only so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The same frame with the columns ``length``, ``caps``, ``word_length``,
        ``exclamation`` and ``question`` added, and with every IPv4-looking
        token in ``comment_text`` replaced by a single space.
    """
    comments = df.comment_text

    # Comment length in characters
    df['length'] = comments.apply(len)

    # Capitalization percentage (+1 in the denominator avoids division by zero
    # for comments without any letters)
    def pct_caps(s):
        upper = sum(1 for c in s if c.isupper())
        alpha = sum(1 for c in s if c.isalpha())
        return upper / (alpha + 1)
    df['caps'] = comments.apply(pct_caps)

    # Mean length of the purely alphabetic, space-separated words
    def word_length(s):
        lengths = [len(w) for w in s.split(' ') if w.isalpha()]
        # np.mean([]) yields NaN via a RuntimeWarning; return NaN explicitly
        # so comments without alphabetic words stay NaN without the warning.
        return np.mean(lengths) if lengths else np.nan
    df['word_length'] = comments.apply(word_length)

    # Number of exclamation points
    df['exclamation'] = comments.apply(lambda s: s.count('!'))

    # Number of question marks
    df['question'] = comments.apply(lambda s: s.count('?'))

    # Strip IP addresses (raw strings keep the `\.` escapes valid)
    ip = re.compile(r'(([2][5][0-5]\.)|([2][0-4][0-9]\.)|([0-1]?[0-9]?[0-9]\.)){3}'
                    r'(([2][5][0-5])|([2][0-4][0-9])|([0-1]?[0-9]?[0-9]))')

    def strip_ip(s, ip):
        # Leave non-string values untouched instead of swallowing every error.
        if not isinstance(s, str):
            return s
        # Replace every address, not only the first one found.
        return ip.sub(' ', s)

    df['comment_text'] = comments.apply(lambda x: strip_ip(x, ip))

    return df

def merge_features(comment_text, data, engineered_features):
    """Append engineered feature columns to a sparse text-feature matrix.

    ``data[engineered_features]`` is converted to a CSR matrix, any NaN among
    its stored values is replaced via ``np.nan_to_num``, and the result is
    stacked to the right of ``comment_text``.
    """
    extra = sparse.csr_matrix(data[engineered_features].values)
    stored = extra.data
    if np.any(np.isnan(stored)):
        extra.data = np.nan_to_num(stored)
    return sparse.hstack([comment_text, extra])

In [None]:
# Add the engineered columns; feature_engineering modifies df in place and returns it.
df = feature_engineering(df)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,length,caps,word_length,exclamation,question
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,264,0.083333,4.5,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,112,0.108108,6.25,1,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,233,0.02139,4.638889,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,622,0.022587,4.212766,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,67,0.039216,4.125,0,1


In [None]:
# The label columns proper. 'any_label' (added below) is excluded so that
# re-running this cell gives the same result as the first run.
label_columns = [col for col in df_targets.columns if col != 'any_label']

print('Training labels:')
print(label_columns)
print(df_targets[label_columns].shape)

print('\nTraining data')
# Keep only the text and the engineered features. errors='ignore' makes the
# drop a no-op when the columns are already gone (cell re-run).
df = df.drop(columns=label_columns + ['id'], errors='ignore')
print(list(df.columns))
print(df.shape)

# any_label is 1 when a comment carries at least one of the labels, else 0.
toxic_rows = df_targets[label_columns].sum(axis=1) > 0
if 'any_label' not in targets:
    targets.append('any_label')
df_targets['any_label'] = toxic_rows.astype(int)

# Every column after the first one (comment_text) is an engineered feature.
new_features = list(df.columns[1:])

Training labels:
['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
(159571, 6)

Training data
['comment_text', 'length', 'caps', 'word_length', 'exclamation', 'question']
(159571, 6)


In [None]:
# Preview the feature frame after the label and id columns were dropped.
df.head()

Unnamed: 0,comment_text,length,caps,word_length,exclamation,question
0,Explanation\nWhy the edits made under my usern...,264,0.083333,4.5,0,1
1,D'aww! He matches this background colour I'm s...,112,0.108108,6.25,1,0
2,"Hey man, I'm really not trying to edit war. It...",233,0.02139,4.638889,0,0
3,"""\nMore\nI can't make any real suggestions on ...",622,0.022587,4.212766,0,0
4,"You, sir, are my hero. Any chance you remember...",67,0.039216,4.125,0,1


In [None]:
# Preview the label frame, including the derived any_label column.
df_targets.head()


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,any_label
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.optim.lr_scheduler import StepLR
import numpy as np
from sklearn.metrics import classification_report

# Fetch the pretrained tokenizer and a BERT sequence classifier with six outputs.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# Only parameters whose name contains 'classifier' or 'layer.11' stay trainable;
# every other parameter is frozen.
for name, param in model.named_parameters():
    if 'classifier' in name or 'layer.11' in name:
        continue
    param.requires_grad = False

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Custom dataset class
class TextDataset(Dataset):
    """Tokenized texts paired with their multi-hot label vectors.

    Parameters
    ----------
    texts : list of str
        Raw texts; all of them are tokenized once in ``__init__``.
    labels : array-like
        One row of 0/1 flags per text.
    tokenizer : callable
        Called as ``tokenizer(texts, truncation=True, padding=True,
        max_length=max_len)`` and expected to return a mapping from field
        name to per-text sequences.
    max_len : int, optional
        Maximum token length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields and the float label vector for row ``idx``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Multi-hot targets must be float: with integer labels and num_labels > 1,
        # Transformers sequence classifiers pick single-label cross-entropy,
        # which cannot consume one flag per class.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Inputs: the comment texts from df and the six label columns from df_targets
# (row order is assumed to match between the two frames).
texts = df['comment_text'].tolist()
labels = df_targets[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

# Create datasets (TextDataset tokenizes every text up front in its constructor)
dataset = TextDataset(texts, labels, tokenizer)

# Splitting the dataset: random 80% / 20% train/validation split.
# NOTE(review): no generator/seed is passed, so the split differs between runs.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# DataLoader: batches of 16; only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Optimizer, plus a schedule that multiplies the learning rate by 0.1 on every
# scheduler.step() call (step_size=1, gamma=0.1).
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
from torch.utils.data import DataLoader, random_split, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

# Assuming df and df_targets as shown in your data
class TextDataset(Dataset):
    """Pre-tokenized encodings paired with their multi-hot label vectors.

    Parameters
    ----------
    encodings : mapping
        Field name -> per-example sequences; each value may be a tensor
        (e.g. tokenizer output with ``return_tensors='pt'``) or nested lists.
    labels : array-like
        One row of 0/1 flags per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor(existing_tensor) emits a copy-construct warning;
        # clone().detach() is the form recommended by PyTorch for that case.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the encoded fields and the float label vector for row ``idx``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)  # float targets for the multi-label loss
        return item

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# Freeze all layers except classifier and last layer
for name, param in model.named_parameters():
    if 'classifier' not in name and 'layer.11' not in name:
        param.requires_grad = False

# Tokenize text data
texts = df['comment_text'].tolist()
labels = df_targets.iloc[:, :6].values  # first six columns of df_targets are used as the labels
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

# Create dataset
dataset = TextDataset(encodings, labels)

# Split dataset into training and validation (random 80% / 20%)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Prepare for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
# num_epochs must be defined before it is used to size the schedule.
num_epochs = 3
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # The schedule is sized in optimizer steps (batches), so it advances
        # once per batch rather than once per epoch.
        scheduler.step()
        optimizer.zero_grad()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Evaluation on validation set
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # Multi-label task: every logit is an independent yes/no decision, so
        # threshold the per-label sigmoid at 0.5 instead of taking one argmax.
        preds = (torch.sigmoid(logits) > 0.5).int()
        all_preds.append(preds.cpu().numpy())
        all_labels.append(batch['labels'].int().cpu().numpy())

# Stack the per-batch results into 2-D indicator arrays (rows x labels) so the
# true and predicted targets have the same multi-label format.
all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)

print(classification_report(all_labels, all_preds))