In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Import the models, scorers, cross-validation helpers, pipeline and scalers
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import make_scorer

from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the Quora question-pairs CSV into a pandas DataFrame.
df = pd.read_csv("quora.csv")
# Preview the first five rows to inspect the available columns.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
# Drop the identifier columns (id, qid1, qid2); only the question texts
# and the duplicate flag are kept.
df1 = df.drop(["id", "qid1", "qid2"], axis=1)
df1

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...
404285,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,What is one coin?,What's this coin?,0
404288,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [17]:
# Work on an independent copy of the first 1000 rows so df1 is left untouched.
df2 = df1.head(1000).copy()
# Small hand-picked set of lower-case stopwords to strip from the questions.
stop_word = set("a an the and or in on at to of is it".split())
# Function to remove stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def remove_stopwords(text):
    """Return ``text`` lower-cased and tokenized, without the tokens in ``stop_word``.

    The remaining tokens are joined with single spaces. Anything that is not
    a ``str`` (for example a missing value) yields an empty string.
    """
    if isinstance(text, str):
        tokens = word_tokenize(text.lower())
        kept = (token for token in tokens if token not in stop_word)
        return ' '.join(kept)
    return ""

# Strip the stopwords from each of the two question columns in turn.
for column in ('question1', 'question2'):
    df2[column] = df2[column].apply(remove_stopwords)
df2

Unnamed: 0,question1,question2,is_duplicate
0,what step by step guide invest share market in...,what step by step guide invest share market ?,0
1,what story kohinoor ( koh-i-noor ) diamond ?,what would happen if indian government stole k...,0
2,how can i increase speed my internet connectio...,how can internet speed be increased by hacking...,0
3,why am i mentally very lonely ? how can i solve ?,find remainder when [ math ] 23^ { 24 } [ /mat...,0
4,"which one dissolve water quikly sugar , salt ,...",which fish would survive salt water ?,0
...,...,...,...
995,i am straight student but have no motivation w...,"my fiancée died recently pains my heart , how ...",0
996,which best shares purchase sale daily trading ?,"sydney , which company would be best get advic...",0
997,i my girlfriends private partstouched each oth...,why most cosmetic products do n't have price t...,0
998,could we use cherenkov atmosphere radiation ( ...,can we map surface ( subsurface ) planet using...,1


In [7]:
pip install transformers datasets scikit-learn torch


Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# 1. Load and preprocess data
# Fill missing questions with "" *before* casting to str: astype(str) applied
# first would turn NaN into the literal text "nan", leaving fillna nothing to fill.
df1['question1'] = df1['question1'].fillna("").astype(str)
df1['question2'] = df1['question2'].fillna("").astype(str)
# Keep only the model inputs and the target; drop rows whose label is missing.
df1 = df1[['question1', 'question2', 'is_duplicate']].dropna()

# 2. Split dataset
# Hold out 20% of the pairs for validation; the fixed seed makes the split repeatable.
# train_labels / val_labels stay pandas Series (with their original, shuffled index).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    list(zip(df1['question1'], df1['question2'])), df1['is_duplicate'], test_size=0.2, random_state=42)

# 3. Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 4. Dataset class
class QuestionPairDataset(Dataset):
    """Torch dataset that tokenizes (question1, question2) pairs on access.

    Args:
        question_pairs: Indexable sequence of ``(q1, q2)`` string pairs.
        labels: Labels aligned positionally with ``question_pairs``. May be a
            pandas Series (read with ``.iloc``) or any plain sequence.
        tokenizer: Callable invoked as ``tokenizer(q1, q2, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``.
            It is expected to return a mapping of tensors shaped
            ``(1, max_len)``.
        max_len: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, question_pairs, labels, tokenizer, max_len=128):
        self.pairs = question_pairs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def _label_at(self, idx):
        """Return the label at *position* ``idx`` for Series and plain sequences alike."""
        labels = self.labels
        # A Series coming out of train_test_split keeps its shuffled original
        # index, so it must be read positionally through .iloc.
        if hasattr(labels, 'iloc'):
            return labels.iloc[idx]
        return labels[idx]

    def __getitem__(self, idx):
        """Encode pair ``idx`` and return a dict of 1-D tensors plus its label."""
        q1, q2 = self.pairs[idx]
        encoding = self.tokenizer(
            q1, q2,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        # squeeze(0) removes only the leading batch axis; a bare squeeze()
        # would also collapse the sequence axis when max_len == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self._label_at(idx), dtype=torch.long)
        }
        # Segment ids mark which tokens belong to q1 and which to q2; pass them
        # through when the tokenizer produces them so the model can use them.
        if 'token_type_ids' in encoding:
            item['token_type_ids'] = encoding['token_type_ids'].squeeze(0)
        return item

# 5. Create datasets and dataloaders
train_dataset = QuestionPairDataset(train_texts, train_labels, tokenizer)
val_dataset = QuestionPairDataset(val_texts, val_labels, tokenizer)

# Only the training batches are shuffled; validation is read in order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# 6. Load model
# Binary classifier head (duplicate / not duplicate) on top of pretrained BERT.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 7. Optimizer
# Use PyTorch's own AdamW: the AdamW class exported by transformers is deprecated.
# eps and weight_decay are pinned to the values the transformers implementation
# used by default, so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# 8. Training loop (1 epoch for simplicity)
# One pass over train_loader; the progress bar counts batches, not samples.
model.train()
for batch in tqdm(train_loader, desc="Training"):
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    # Segment ids distinguish question 1 from question 2. Forward them when the
    # dataset supplies them; None leaves the model's default behaviour in place.
    token_type_ids = batch.get('token_type_ids')
    if token_type_ids is not None:
        token_type_ids = token_type_ids.to(device)

    # Passing labels makes the model compute the classification loss itself.
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        labels=labels,
    )
    loss = outputs.loss
    loss.backward()
    optimizer.step()

import torch.nn.functional as F
from sklearn.metrics import log_loss

# Evaluation: collect class probabilities for every validation pair and score
# them with log loss. eval() + no_grad() disable dropout and gradient tracking.
model.eval()
all_probs = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating Log Loss"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only needed on the host for scoring, so they stay on the CPU.
        labels = batch['labels']
        # Forward segment ids when the dataset supplies them (None = model default).
        token_type_ids = batch.get('token_type_ids')
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        logits = outputs.logits
        # Softmax over the class axis turns the logits into probabilities.
        probs = F.softmax(logits, dim=1)

        all_probs.extend(probs.cpu().numpy())
        all_labels.extend(labels.numpy())

# Compute log loss
# labels=[0, 1] names both classes explicitly so scoring does not fail when the
# evaluated labels happen to contain a single class.
loss = log_loss(all_labels, all_probs, labels=[0, 1])
print(f"Log Loss: {loss:.4f}")

  torch.utils._pytree._register_pytree_node(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training:   0%|          | 30/20215 [05:23<60:25:43, 10.78s/it]


KeyboardInterrupt: 

In [16]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from tqdm import tqdm
import torch.nn.functional as F
import random

# 1. Load and preprocess data
# Fill missing questions with "" *before* casting to str: astype(str) applied
# first would turn NaN into the literal text "nan", leaving fillna nothing to fill.
df1['question1'] = df1['question1'].fillna("").astype(str)
df1['question2'] = df1['question2'].fillna("").astype(str)
# Keep only the model inputs and the target; drop rows whose label is missing.
df1 = df1[['question1', 'question2', 'is_duplicate']].dropna()

# Number of question pairs (rows). Note that a tqdm bar over a DataLoader counts
# batches (samples / batch_size), so its total is much smaller than this figure.
print("✅ Total samples:", len(df1))

# 2. Split dataset (convert labels to list to avoid .iloc issues)
all_pairs = list(zip(df1['question1'], df1['question2']))
all_labels = df1['is_duplicate'].tolist()

# Hold out 20% of the pairs for validation; the fixed seed makes the split repeatable.
train_pairs, val_pairs, train_labels, val_labels = train_test_split(
    all_pairs, all_labels, test_size=0.2, random_state=42
)

# ✅ Limit validation to 1000 samples
VAL_SUBSET_SIZE = 1000
VAL_SUBSET_SEED = 42
# A dedicated, seeded RNG keeps the validation subset (and therefore the reported
# log loss) identical between runs without touching the global random state.
# The size is capped so a dataset with fewer validation rows does not raise.
val_rng = random.Random(VAL_SUBSET_SEED)
val_subset = val_rng.sample(
    list(zip(val_pairs, val_labels)),
    min(VAL_SUBSET_SIZE, len(val_pairs)),
)
val_pairs, val_labels = zip(*val_subset)
val_pairs = list(val_pairs)
val_labels = list(val_labels)

print(f"✅ Train samples: {len(train_pairs)}")  # 80% of the rows
print(f"✅ Val samples: {len(val_pairs)}")      # at most VAL_SUBSET_SIZE

# 3. Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 4. Dataset class
class QuestionPairDataset(Dataset):
    """Torch dataset that tokenizes (question1, question2) pairs on access.

    Args:
        question_pairs: Indexable sequence of ``(q1, q2)`` string pairs.
        labels: Labels aligned positionally with ``question_pairs``. May be any
            plain sequence or a pandas Series (read with ``.iloc``).
        tokenizer: Callable invoked as ``tokenizer(q1, q2, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``.
            It is expected to return a mapping of tensors shaped
            ``(1, max_len)``.
        max_len: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, question_pairs, labels, tokenizer, max_len=128):
        self.pairs = question_pairs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def _label_at(self, idx):
        """Return the label at *position* ``idx`` for Series and plain sequences alike."""
        labels = self.labels
        # Plain [] on a Series is label-based, which is wrong once the index has
        # been shuffled by a train/validation split; .iloc is always positional.
        if hasattr(labels, 'iloc'):
            return labels.iloc[idx]
        return labels[idx]

    def __getitem__(self, idx):
        """Encode pair ``idx`` and return a dict of 1-D tensors plus its label."""
        q1, q2 = self.pairs[idx]
        encoding = self.tokenizer(
            q1, q2,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        # squeeze(0) removes only the leading batch axis; a bare squeeze()
        # would also collapse the sequence axis when max_len == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self._label_at(idx), dtype=torch.long)
        }
        # Segment ids mark which tokens belong to q1 and which to q2; pass them
        # through when the tokenizer produces them so the model can use them.
        if 'token_type_ids' in encoding:
            item['token_type_ids'] = encoding['token_type_ids'].squeeze(0)
        return item

# 5. Create datasets and dataloaders
train_dataset = QuestionPairDataset(train_pairs, train_labels, tokenizer)
val_dataset = QuestionPairDataset(val_pairs, val_labels, tokenizer)

# These are sample counts; the number of batches is samples / batch_size.
print("✅ train_dataset length:", len(train_dataset))  # 80% of the rows
print("✅ val_dataset length:", len(val_dataset))      # size of the validation subset

# Only the training batches are shuffled; validation is read in order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# 6. Load model
# Binary classifier head (duplicate / not duplicate) on top of pretrained BERT.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 7. Optimizer
# Use PyTorch's own AdamW: the AdamW class exported by transformers is deprecated.
# eps and weight_decay are pinned to the values the transformers implementation
# used by default, so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# 8. Training loop
# One pass over train_loader; the progress bar counts batches, not samples.
model.train()
for batch in tqdm(train_loader, desc="Training"):
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    # Segment ids distinguish question 1 from question 2. Forward them when the
    # dataset supplies them; None leaves the model's default behaviour in place.
    token_type_ids = batch.get('token_type_ids')
    if token_type_ids is not None:
        token_type_ids = token_type_ids.to(device)

    # Passing labels makes the model compute the classification loss itself.
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        labels=labels,
    )
    loss = outputs.loss
    loss.backward()
    optimizer.step()

# 9. Evaluation
# Collect class probabilities for every validation pair and score them with
# log loss. eval() + no_grad() disable dropout and gradient tracking.
model.eval()
all_probs = []
all_true = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating Log Loss"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only needed on the host for scoring, so they stay on the CPU.
        labels = batch['labels']
        # Forward segment ids when the dataset supplies them (None = model default).
        token_type_ids = batch.get('token_type_ids')
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Softmax over the class axis turns the logits into probabilities.
        probs = torch.softmax(outputs.logits, dim=1)

        all_probs.extend(probs.cpu().numpy())
        all_true.extend(labels.numpy())

# labels=[0, 1] names both classes explicitly so scoring does not fail when the
# sampled validation labels happen to contain a single class.
loss = log_loss(all_true, all_probs, labels=[0, 1])
print(f"✅ Log Loss on 1000 validation samples: {loss:.4f}")

✅ Total samples: 404290
✅ Train samples: 323432
✅ Val samples: 1000




✅ train_dataset length: 323432
✅ val_dataset length: 1000


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training:   0%|          | 2/20215 [00:24<68:01:28, 12.12s/it]


KeyboardInterrupt: 

In [18]:
# Number of individual samples behind the loader (not the number of batches).
n_train_samples = len(train_loader.dataset)
print(f"✅ Length of train_loader.dataset: {n_train_samples}")


✅ Length of train_loader.dataset: 323432
