## Download and Install Required Libraries

In [None]:
!pip install scikit-learn
!pip install nltk



In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

import nltk
from nltk.corpus import stopwords
import nltk
from nltk.corpus import stopwords
from collections import defaultdict

## Loading the Dataset

In [None]:
cd jigsaw-toxic-comment-classification-challenge/jigsaw-toxic-comment-classification-challenge/

/app/DL/Question2/jigsaw-toxic-comment-classification-challenge/jigsaw-toxic-comment-classification-challenge


In [None]:
# Load the dataset
df = pd.read_csv(r"train/train.csv")

In [None]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [None]:
df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


We can see that the dataset has two object-type columns: the comment id and the comment text. The labels are stored as six independent binary indicator columns (a multi-label encoding, since a single comment can belong to several classes at once): `toxic`, `severe_toxic`, `obscene`, `threat`, `insult` and `identity_hate`.

 There are no missing values in the dataset but most comments don't seem to belong to any of the 6 classes

## Data Preprocessing Steps

In the following code we preprocess textual comments stored in a DataFrame. We begin by converting the text to lowercase and splitting it into individual words. Then, we filter out common English stopwords.

In [None]:
# Lowercase every comment and split it on whitespace; after this cell the
# 'comment_text' column holds a list of tokens per row instead of a raw string.
# NOTE(review): the raw text is overwritten in place, so this cell is not idempotent —
# reload `df` before running it a second time.
df['comment_text'] = df['comment_text'].str.lower().str.split()

In [None]:
# Fetch the NLTK stop-word corpus (nothing is downloaded when it is already cached)
nltk.download('stopwords')

# English stop words, kept in a set for constant-time membership checks
stop_words = set(stopwords.words('english'))

# Vocabulary state: per-token frequency plus the two lookup tables
word_counts = defaultdict(int)  # token -> number of occurrences
word_to_index = {}              # token -> index, assigned in first-seen order
index_to_word = {}              # inverse of word_to_index

# Walk every tokenised comment, skipping stop words
for comment in df['comment_text']:
    for word in comment:
        if word.lower() in stop_words:
            continue  # stop words are neither counted nor indexed
        word_counts[word] += 1
        if word in word_to_index:
            continue  # already has an index; only the count changes
        index = len(word_to_index)
        word_to_index[word] = index
        index_to_word[index] = word

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
len(word_to_index ) # Number of distinct non-stop-word tokens (before frequency filtering)

470169

Next, we build a vocabulary of words based on their frequency, discarding those that occur infrequently. Finally, we convert the text into sequences of indices corresponding to words in the filtered vocabulary and pad these sequences to a uniform length so that they can be batched for the model.

In [None]:
# Filter out words with low frequency
min_word_frequency = 5  # Adjust as needed
word_to_index = {word: index for word, index in word_to_index.items() if word_counts[word] >= min_word_frequency}

# Reindex the surviving words contiguously (0..V-1). Dicts preserve insertion order,
# so enumerating the filtered dict keeps the original first-seen ordering.
word_to_index = {word: idx for idx, word in enumerate(word_to_index)}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Convert text to index sequences using the filtered vocabulary. Out-of-vocabulary
# tokens are dropped, and each sequence is truncated to `max_sequence_length` so that
# a single very long comment cannot force every row to be padded to its length.
max_sequence_length = 100  # You can adjust this based on your data
sequences = [
    [word_to_index[word] for word in comment if word in word_to_index][:max_sequence_length]
    for comment in df['comment_text']
]

# Right-pad to a uniform length. The explicit dtype keeps comments that end up empty
# (every token filtered out) as int64 tensors instead of torch's float default.
# NOTE(review): padding_value=0 is also the index of the first vocabulary word, so
# padding and that word are indistinguishable to the model.
padded_sequences = pad_sequence(
    [torch.tensor(seq, dtype=torch.long) for seq in sequences],
    batch_first=True,
    padding_value=0,
)

In [None]:
len(word_to_index ) #Unique words vocabulary

65068

## Creating Training and Validation Datasets

In [None]:
# Hold out 20% of the comments; the six label columns start at column position 2
label_frame = df.iloc[:, 2:]
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    label_frame,
    test_size=0.2,
    random_state=42,
)

# Labels become float tensors
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float)

# Wrap features and labels together; only the training loader is shuffled
train_dataset = TensorDataset(X_train, y_train_tensor)
test_dataset = TensorDataset(X_test, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
len(train_loader)

3990

In [None]:
len(test_loader)

998

## Building the LSTM Model

This code defines an LSTM-based neural network model for text classification. The model consists of an embedding layer, an LSTM layer, and a fully connected layer with a sigmoid activation function. During the forward pass, input sequences are embedded, passed through the LSTM layer, and then the output is fed into the fully connected layer for classification.

In [None]:
# Define LSTM model
class LSTMModel(nn.Module):
    """Embedding -> single LSTM -> linear head -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of sigmoid outputs produced per sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of token-index sequences to per-label values in (0, 1).

        The classifier reads the LSTM output at the final time step only.
        NOTE(review): if the batch is right-padded, that final step is a padding
        position for every sequence shorter than the batch width — confirm this is intended.
        """
        token_vectors = self.embedding(x)
        step_outputs, _ = self.lstm(token_vectors)
        final_step = step_outputs[:, -1, :]
        return self.sigmoid(self.fc(final_step))


The model is instantiated with adjustable parameters like vocabulary size, embedding dimension, hidden dimension, and output dimension, along with loss and optimizer functions.

In [None]:
# Hyper-parameters: the embedding table has one row per vocabulary entry,
# and the head emits one value per label column.
vocab_size = len(word_to_index)
embedding_dim, hidden_dim = 200, 128
output_dim = 6  # Number of labels

model = LSTMModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

## Model Training and Evaluation

This LSTM model is trained for 20 epochs and prints out the loss for every 100 batches and the average loss for the entire epoch. After training, it evaluates the model's performance on the test data, calculating the test loss and accuracy for each label.

In [None]:
num_epochs = 20
log_every = 100  # print the running batch loss once per this many batches

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        # Standard step: clear gradients, forward, backward, update
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        if not batch_idx % log_every:
            print(f'Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item()}')

    # Mean of the per-batch losses over the epoch
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / len(train_loader)}')


Epoch 1/20, Batch 0/3990, Loss: 0.0643635168671608
Epoch 1/20, Batch 100/3990, Loss: 0.19180144369602203
Epoch 1/20, Batch 200/3990, Loss: 0.05892128869891167
Epoch 1/20, Batch 300/3990, Loss: 0.08162010461091995
Epoch 1/20, Batch 400/3990, Loss: 0.1019001230597496
Epoch 1/20, Batch 500/3990, Loss: 0.05386796593666077
Epoch 1/20, Batch 600/3990, Loss: 0.2185239940881729
Epoch 1/20, Batch 700/3990, Loss: 0.12495443969964981
Epoch 1/20, Batch 800/3990, Loss: 0.16791123151779175
Epoch 1/20, Batch 900/3990, Loss: 0.17504119873046875
Epoch 1/20, Batch 1000/3990, Loss: 0.15867926180362701
Epoch 1/20, Batch 1100/3990, Loss: 0.20147164165973663
Epoch 1/20, Batch 1200/3990, Loss: 0.14217238128185272
Epoch 1/20, Batch 1300/3990, Loss: 0.21017779409885406
Epoch 1/20, Batch 1400/3990, Loss: 0.07023362070322037
Epoch 1/20, Batch 1500/3990, Loss: 0.19596214592456818
Epoch 1/20, Batch 1600/3990, Loss: 0.12837444245815277
Epoch 1/20, Batch 1700/3990, Loss: 0.1533288210630417
Epoch 1/20, Batch 1800/399

In [None]:
# Evaluate on the held-out split: mean batch loss plus one accuracy value per label
model.eval()
total_loss = 0
correct = np.zeros(6)        # per-label count of predictions that match the target
total_samples = np.zeros(6)  # per-label count of evaluated samples

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        total_loss += criterion(outputs, labels).item()
        # Rounding the sigmoid outputs gives the hard 0/1 prediction per label
        matches = torch.round(outputs).eq(labels)
        correct += matches.sum(dim=0).cpu().numpy()  # reduce over the batch dimension
        total_samples += labels.shape[0]

test_loss = total_loss / len(test_loader)
test_accuracy = correct / total_samples
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')


Test Loss: 0.12937157309132744, Test Accuracy: [0.93808554 0.98759204 0.97161209 0.99733668 0.96352812 0.98909604]


## Saving Model for Inferencing

In [None]:
torch.save(model.state_dict(), 'model_weights_20epoch.pth')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Restore the trained weights from the checkpoint stored on Google Drive.
# NOTE(review): the save cell writes 'model_weights_20epoch.pth' to the current working
# directory, while this path points into Drive — confirm the file was copied there.
model.load_state_dict(torch.load('/content/drive/MyDrive/DeepLearning/Lab 2/model_weights_20epoch.pth'))

# Ensure the model is in evaluation mode
model.eval()

LSTMModel(
  (embedding): Embedding(65068, 200)
  (lstm): LSTM(200, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=6, bias=True)
  (sigmoid): Sigmoid()
)

## Making Predictions using Test Data

The following steps load the test dataset and preprocess it to match the input format the model expects.

In [None]:
# Read the unlabeled test comments and the separately published label file
test_df_labels = pd.read_csv(r"/content/drive/MyDrive/DeepLearning/Lab 2/test_labels.csv")
test_df = pd.read_csv(r"/content/drive/MyDrive/DeepLearning/Lab 2/test.csv")

# Keep only the ROWS in which no column equals -1
has_no_minus_one = test_df_labels.ne(-1).all(axis=1)
cleaned_df = test_df_labels.loc[has_no_minus_one]
print("Length of test_df after dropping -1 :", len(cleaned_df))

# Restrict the test comments to the ids that survived the filter
cleaned_ids = cleaned_df['id']
test_df = test_df.loc[test_df['id'].isin(cleaned_ids)].reset_index(drop=True)

# Show the filtered frame and its row count
print(test_df)
print(len(test_df))

Length of test_df after dropping -1 : 63978
                     id                                       comment_text
0      0001ea8717f6de06  Thank you for understanding. I think very high...
1      000247e83dcc1211                   :Dear god this site is horrible.
2      0002f87b16116a7f  "::: Somebody will invariably try to add Relig...
3      0003e1cccfd5a40a  " \n\n It says it right there that it IS a typ...
4      00059ace3e3e9a53  " \n\n == Before adding a new product to the l...
...                 ...                                                ...
63973  fff8f64043129fa2  :Jerome, I see you never got around to this…! ...
63974  fff9d70fe0722906  ==Lucky bastard== \n http://wikimediafoundatio...
63975  fffa8a11c4378854  ==shame on you all!!!== \n\n You want to speak...
63976  fffac2a094c8e0e2  MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...
63977  fffb5451268fb5ba  " \n\n == Unicorn lair discovery == \n\n Suppo...

[63978 rows x 2 columns]
63978


In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63978 entries, 0 to 63977
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            63978 non-null  object
 1   comment_text  63978 non-null  object
dtypes: object(2)
memory usage: 999.8+ KB


In [None]:
# Same tokenisation as the training text: lowercase, then split on whitespace.
# After this cell 'comment_text' holds a list of tokens per row.
# NOTE(review): overwrites the raw text in place — reload `test_df` before re-running.
test_df['comment_text'] = test_df['comment_text'].str.lower().str.split()

In [None]:
# Encode the test comments with the vocabulary built from the TRAINING data.
# The model's embedding rows are keyed by those training indices; a vocabulary rebuilt
# from the test set would assign different indices to the same words (and overwrite the
# training `word_to_index`), so the model would look up unrelated embeddings.
#
# Stop words and rare words were never added to `word_to_index`, so the membership test
# below drops them without a separate stop-word pass.
#
# NOTE: `word_to_index` must still be the mapping produced in the "Data Preprocessing
# Steps" section. If inference runs in a separate session, persist that mapping next to
# the model weights and reload it here.
max_sequence_length = 100
sequences = [
    [word_to_index[word] for word in comment if word in word_to_index][:max_sequence_length]
    for comment in test_df['comment_text']
]

# Right-pad to a uniform length. The explicit dtype keeps comments that end up empty
# as int64 tensors instead of torch's float default.
padded_sequences = pad_sequence(
    [torch.tensor(seq, dtype=torch.long) for seq in sequences],
    batch_first=True,
    padding_value=0,
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Making test dataloaders

In [None]:
test = padded_sequences
# `test` is already a tensor. Copy it with clone().detach() rather than torch.tensor(),
# which emits a UserWarning when handed an existing tensor; then ensure the integer
# dtype that nn.Embedding requires.
X_test_tensor = test.clone().detach().to(torch.long)
test_dataset = TensorDataset(X_test_tensor)
batch_size = 32
# shuffle=False keeps predictions in the same row order as `test_df`
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  X_test_tensor = torch.tensor(test, dtype=torch.long)  # Assuming 'test' contains your padded sequences


Put the model in inferencing mode and display predicted results

In [None]:
# Put the model in evaluation mode
model.eval()

# Per-batch predicted probabilities, concatenated after the loop
all_probabilities = []

# No gradients are needed for inference
with torch.no_grad():
    for inputs in test_loader:
        # Each batch is a 1-tuple because the dataset holds features only.
        # LSTMModel.forward already ends with a sigmoid, so the outputs ARE the
        # probabilities. Applying torch.sigmoid again would squash every value
        # into [0.5, 0.73] and destroy the ranking information.
        probabilities = model(inputs[0])
        all_probabilities.append(probabilities)

# Concatenate probabilities from all batches
all_probabilities = torch.cat(all_probabilities, dim=0)

# Convert probabilities to numpy array
probabilities_array = all_probabilities.cpu().numpy()

# Print the shape of the probabilities array
print("Shape of predicted probabilities array:", probabilities_array.shape)

Shape of predicted probabilities array: (63978, 6)


In [None]:
for data in test_loader:
  print(data)
  break

[tensor([[  0,   1,   2,  ...,   0,   0,   0],
        [  8,   9,  10,  ...,   0,   0,   0],
        [ 12,  13,  14,  ...,   0,   0,   0],
        ...,
        [590, 591, 592,  ...,   0,   0,   0],
        [594,   0,   0,  ...,   0,   0,   0],
        [ 41, 595, 596,  ...,   0,   0,   0]])]


In [None]:
# Assemble the submission table: one row per test comment, id first, then one score per label
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']  # Adjust column names as per your labels
probabilities_array = all_probabilities.cpu().numpy()
test_ids = test_df['id']

probabilities_df = pd.DataFrame(probabilities_array, columns=label_columns)
probabilities_df['id'] = test_ids  # aligned on the shared 0..N-1 index
probabilities_df = probabilities_df[['id'] + label_columns]

probabilities_df.to_csv('/content/drive/MyDrive/DeepLearning/Lab 2/submission_final_1.csv', index=False)

In [None]:
probabilities_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,0.500623,0.5,0.500027,0.5,0.500002,0.5
1,000247e83dcc1211,0.500000,0.5,0.500000,0.5,0.500001,0.5
2,0002f87b16116a7f,0.500003,0.5,0.500000,0.5,0.500000,0.5
3,0003e1cccfd5a40a,0.500002,0.5,0.500000,0.5,0.500000,0.5
4,00059ace3e3e9a53,0.500001,0.5,0.500000,0.5,0.500000,0.5
...,...,...,...,...,...,...,...
63973,fff8f64043129fa2,0.500016,0.5,0.500003,0.5,0.500000,0.5
63974,fff9d70fe0722906,0.500000,0.5,0.500000,0.5,0.500001,0.5
63975,fffa8a11c4378854,0.500079,0.5,0.500012,0.5,0.500020,0.5
63976,fffac2a094c8e0e2,0.500000,0.5,0.500000,0.5,0.500000,0.5


In [None]:
df_sub = pd.read_csv("/content/drive/MyDrive/DeepLearning/Lab 2/submission_final_1.csv")
display(df_sub)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,0.500623,0.5,0.500027,0.5,0.500002,0.5
1,000247e83dcc1211,0.500000,0.5,0.500000,0.5,0.500001,0.5
2,0002f87b16116a7f,0.500003,0.5,0.500000,0.5,0.500000,0.5
3,0003e1cccfd5a40a,0.500002,0.5,0.500000,0.5,0.500000,0.5
4,00059ace3e3e9a53,0.500001,0.5,0.500000,0.5,0.500000,0.5
...,...,...,...,...,...,...,...
63973,fff8f64043129fa2,0.500016,0.5,0.500003,0.5,0.500000,0.5
63974,fff9d70fe0722906,0.500000,0.5,0.500000,0.5,0.500001,0.5
63975,fffa8a11c4378854,0.500079,0.5,0.500012,0.5,0.500020,0.5
63976,fffac2a094c8e0e2,0.500000,0.5,0.500000,0.5,0.500000,0.5


In [None]:
# Turn the stored scores into hard 0/1 predictions: strictly above the cut-off -> 1, else 0.
# NOTE(review): the cut-off is 0.50002, not 0.5 — the scores in the submission file
# cluster at 0.5, so this value was presumably hand-tuned; revisit it if the scores change.
decision_threshold = 0.50002
score_columns = df_sub.columns[1:]  # every column after 'id'
for column in score_columns:
    df_sub[column] = (df_sub[column] > decision_threshold).astype('int64')

display(df_sub)
df_sub.to_csv('/content/drive/MyDrive/DeepLearning/Lab 2/result_1.csv', index=False)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,1,0,1,0,0,0
1,000247e83dcc1211,0,0,0,0,0,0
2,0002f87b16116a7f,0,0,0,0,0,0
3,0003e1cccfd5a40a,0,0,0,0,0,0
4,00059ace3e3e9a53,0,0,0,0,0,0
...,...,...,...,...,...,...,...
63973,fff8f64043129fa2,0,0,0,0,0,0
63974,fff9d70fe0722906,0,0,0,0,0,0
63975,fffa8a11c4378854,1,0,0,0,0,0
63976,fffac2a094c8e0e2,0,0,0,0,0,0


In [None]:
# Ground-truth labels restricted to the retained ids, re-indexed from 0
test_lab = (
    pd.read_csv(r"/content/drive/MyDrive/DeepLearning/Lab 2/test_labels.csv")
    .loc[lambda frame: frame['id'].isin(cleaned_ids)]
    .reset_index(drop=True)
)

In [None]:
print(len(test_lab))
display(test_lab)

63978


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,0,0,0,0,0,0
1,000247e83dcc1211,0,0,0,0,0,0
2,0002f87b16116a7f,0,0,0,0,0,0
3,0003e1cccfd5a40a,0,0,0,0,0,0
4,00059ace3e3e9a53,0,0,0,0,0,0
...,...,...,...,...,...,...,...
63973,fff8f64043129fa2,0,0,0,0,0,0
63974,fff9d70fe0722906,0,0,0,0,0,0
63975,fffa8a11c4378854,0,0,0,0,0,0
63976,fffac2a094c8e0e2,1,0,1,0,1,0


In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Per-label scores; rows of `test_lab` (truth) and `df_sub` (predictions) are compared
# position by position, so both frames must be in the same row order.
precision_values = []
recall_values = []
accuracy_values = []
f1_values = []

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for label in labels:
    y_true = test_lab[label]
    y_pred = df_sub[label]
    precision_values.append(precision_score(y_true, y_pred))
    recall_values.append(recall_score(y_true, y_pred))
    accuracy_values.append(accuracy_score(y_true, y_pred))
    f1_values.append(f1_score(y_true, y_pred))

# Unweighted (macro) average over the six labels
average_precision = sum(precision_values) / len(precision_values)
average_recall = sum(recall_values) / len(recall_values)
average_accuracy = sum(accuracy_values) / len(accuracy_values)
average_f1 = sum(f1_values) / len(f1_values)

print("Average Precision:", average_precision)
print("Average Recall:", average_recall)
print("Average Accuracy:", average_accuracy)
# F1 was computed but never reported
print("Average F1:", average_f1)

Average Precision: 0.041547921101409556
Average Recall: 0.21325125553660665
Average Accuracy: 0.7930877280731918


In [None]:
df_sub.columns = ['id','toxic_pred', 'severe_toxic_pred', 'obscene_pred', 'threat_pred', 'insult_pred', 'identity_hate_pred']

In [None]:
df_sub

Unnamed: 0,id,toxic_pred,severe_toxic_pred,obscene_pred,threat_pred,insult_pred,identity_hate_pred
0,0001ea8717f6de06,1,0,1,0,0,0
1,000247e83dcc1211,0,0,0,0,0,0
2,0002f87b16116a7f,0,0,0,0,0,0
3,0003e1cccfd5a40a,0,0,0,0,0,0
4,00059ace3e3e9a53,0,0,0,0,0,0
...,...,...,...,...,...,...,...
63973,fff8f64043129fa2,0,0,0,0,0,0
63974,fff9d70fe0722906,0,0,0,0,0,0
63975,fffa8a11c4378854,1,0,0,0,0,0
63976,fffac2a094c8e0e2,0,0,0,0,0,0


In [None]:
test_lab.columns = ['id','toxic_actual', 'severe_toxic_actual', 'obscene_actual', 'threat_actual', 'insult_actual', 'identity_hate_actual']

In [None]:
test_lab

Unnamed: 0,id,toxic_actual,severe_toxic_actual,obscene_actual,threat_actual,insult_actual,identity_hate_actual
0,0001ea8717f6de06,0,0,0,0,0,0
1,000247e83dcc1211,0,0,0,0,0,0
2,0002f87b16116a7f,0,0,0,0,0,0
3,0003e1cccfd5a40a,0,0,0,0,0,0
4,00059ace3e3e9a53,0,0,0,0,0,0
...,...,...,...,...,...,...,...
63973,fff8f64043129fa2,0,0,0,0,0,0
63974,fff9d70fe0722906,0,0,0,0,0,0
63975,fffa8a11c4378854,0,0,0,0,0,0
63976,fffac2a094c8e0e2,1,0,1,0,1,0


In [None]:
combined_df = pd.merge(test_lab,df_sub, on= 'id')

In [None]:
print(combined_df)

                     id  toxic_actual  severe_toxic_actual  obscene_actual  \
0      0001ea8717f6de06             0                    0               0   
1      000247e83dcc1211             0                    0               0   
2      0002f87b16116a7f             0                    0               0   
3      0003e1cccfd5a40a             0                    0               0   
4      00059ace3e3e9a53             0                    0               0   
...                 ...           ...                  ...             ...   
63973  fff8f64043129fa2             0                    0               0   
63974  fff9d70fe0722906             0                    0               0   
63975  fffa8a11c4378854             0                    0               0   
63976  fffac2a094c8e0e2             1                    0               1   
63977  fffb5451268fb5ba             0                    0               0   

       threat_actual  insult_actual  identity_hate_actual  toxi

In [None]:
combined_df.to_csv('/content/drive/MyDrive/DeepLearning/Lab 2/labels_compare.csv')