In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ===============================================================
# 📦 Imports & GPU setup
# ===============================================================
!pip install transformers xgboost beautifulsoup4 --quiet

import numpy as np
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas()

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("‚úÖ Using device:", device)


In [None]:
# ===============================================================
# 🧹 1. Text Preprocessing
# ===============================================================
def preprocess_text(q):
    """Normalise one raw question string before it is tokenised.

    Steps, in order: lower-case and trim; spell out ``%``, ``$``, ``₹``,
    ``€`` and ``@``; drop the literal ``[math]`` tag; abbreviate round
    thousands / millions / billions (``1,000 `` -> ``1k ``, ``1000000`` ->
    ``1m``); expand a fixed set of English contractions; strip HTML markup;
    replace every non-word character with a space.

    Args:
        q: The question text. Any value is accepted because it is passed
            through ``str()`` first.

    Returns:
        The cleaned, lower-cased string.
    """
    q = str(q).lower().strip()

    # Spell out symbols that the final \W pass would otherwise erase.
    # The currency keys must be the real symbols: a mis-encoded key never
    # matches, so "₹" / "€" would silently be dropped instead of spelled out.
    for symbol, spoken in (
        ('%', ' percent'), ('$', ' dollar '), ('₹', ' rupee '),
        ('€', ' euro '), ('@', ' at '),
    ):
        q = q.replace(symbol, spoken)
    q = q.replace('[math]', '')

    # Comma-grouped numbers first, then bare digit runs (largest unit first so
    # that nine zeros become "b" rather than three "k" steps).
    q = q.replace(',000,000,000 ', 'b ').replace(',000,000 ', 'm ').replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    contractions = {
        "can't": "cannot", "won't": "will not", "i'm": "i am", "you're": "you are",
        "he's": "he is", "she's": "she is", "it's": "it is", "that's": "that is",
        "they're": "they are", "isn't": "is not", "aren't": "are not", "wasn't": "was not",
        "weren't": "were not", "haven't": "have not", "hasn't": "has not", "didn't": "did not"
    }

    # Treat the typographic apostrophe like the ASCII one so "can’t" is recognised.
    q = q.replace('\u2019', "'")
    # Match on word boundaries rather than on whole whitespace-separated
    # tokens: a contraction followed by punctuation ("can't?") used to be
    # skipped and was later mangled into "can t" by the \W pass.
    contraction_re = r"\b(?:" + "|".join(map(re.escape, contractions)) + r")\b"
    q = re.sub(contraction_re, lambda match: contractions[match.group(0)], q)
    # Collapse whitespace runs to single spaces.
    q = ' '.join(q.split())

    q = BeautifulSoup(q, 'html.parser').get_text()
    q = re.sub(r'\W', ' ', q).strip()
    return q


In [None]:
# Read the question pairs and discard rows where either question is missing.
df = (
    pd.read_csv("/kaggle/input/test-csv/test.csv")  # adjust path
    .dropna(subset=['question1', 'question2'])
)

# Normalise both question columns with the same cleaning routine.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(preprocess_text)

print(df.head(2))
print("‚úÖ Preprocessing done.")


In [None]:
!ping -c 2 huggingface.co


In [None]:
# Pick the compute device first, then load the pretrained tokenizer and encoder onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
# Switch to evaluation mode; the model is only used for inference below.
model.eval()


In [None]:
def get_sentence_embedding(sentence):
    """Encode ``sentence`` with the module-level tokenizer and model.

    The input is tokenised with truncation at 64 tokens, run through the
    model without gradient tracking, and the last hidden state is averaged
    over the token axis (``dim=1``).

    Args:
        sentence: Text passed straight to ``tokenizer``.

    Returns:
        The mean-pooled hidden state as a NumPy array on the CPU
        (presumably one row per input sentence - confirm).
    """
    encoded = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=64)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state
    pooled = torch.mean(hidden_states, dim=1)
    return pooled.cpu().numpy()


In [None]:
# Embed every question one at a time and stack the per-question rows into one matrix per column.
q1_embeddings = np.vstack(list(map(get_sentence_embedding, df['question1'])))
q2_embeddings = np.vstack(list(map(get_sentence_embedding, df['question2'])))

# Persist both matrices so a later session can skip the encoding step.
for out_path, matrix in (
    ("/kaggle/working/q1_embeddings.npy", q1_embeddings),
    ("/kaggle/working/q2_embeddings.npy", q2_embeddings),
):
    np.save(out_path, matrix)
print("‚úÖ Embeddings saved in /kaggle/working/")

In [None]:
 X_pair = np.hstack((q1_embeddings, q2_embeddings, np.abs(q1_embeddings - q2_embeddings))).reshape(1, -1)
#X = np.abs(q1_embeddings - q2_embeddings)
y = df['is_duplicate'].values


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from numpy.linalg import norm
import os

# -------------------------------
# 1. Load DistilBERT embeddings
# -------------------------------
q1_embeddings = np.load("/kaggle/input/que1-2-embeds/q1_embeddings.npy")
q2_embeddings = np.load("/kaggle/input/que1-2-embeds/q2_embeddings.npy")

print("‚úÖ Q1 embeddings:", q1_embeddings.shape)
print("‚úÖ Q2 embeddings:", q2_embeddings.shape)

# Every feature below compares row i of q1 with row i of q2, so the two
# matrices must have exactly the same shape.
assert q1_embeddings.shape == q2_embeddings.shape, "q1/q2 embedding matrices differ in shape"

# -------------------------------
# 2. Feature engineering
# -------------------------------

# Absolute difference
feat_abs_diff = np.abs(q1_embeddings - q2_embeddings)

# Element-wise product
feat_product = q1_embeddings * q2_embeddings

# Cosine similarity (one scalar per pair), computed for all rows at once
# instead of one library call per row. A zero-length vector gets norm 1 so
# its similarity is 0 rather than NaN.
q1_norms = norm(q1_embeddings, axis=1)
q2_norms = norm(q2_embeddings, axis=1)
q1_norms[q1_norms == 0] = 1.0
q2_norms[q2_norms == 0] = 1.0
# The row sum of the element-wise product is the row-wise dot product.
cos_sim = (feat_product.sum(axis=1) / (q1_norms * q2_norms)).reshape(-1, 1)

# Euclidean distance (one scalar per pair)
euclid_dist = norm(q1_embeddings - q2_embeddings, axis=1).reshape(-1, 1)

# Concatenate both question embeddings
feat_concat = np.concatenate([q1_embeddings, q2_embeddings], axis=1)

# -------------------------------
# 3. Combine all features
# -------------------------------
# Column order matters: the inference helpers further down rebuild the row
# in this same order.
X_final = np.concatenate([
    feat_concat,      # [q1 | q2]
    feat_abs_diff,    # |q1 - q2|
    feat_product,     # q1 * q2
    cos_sim,          # 1 column
    euclid_dist       # 1 column
], axis=1)

print("‚úÖ Final feature matrix shape:", X_final.shape)

# -------------------------------
# 4. Save for future use
# -------------------------------
save_path = "/kaggle/working/X_final.npy"
np.save(save_path, X_final)
print(f"üíæ Saved final combined features to: {save_path}")


In [None]:
# Reload the engineered feature matrix from the attached Kaggle dataset; this replaces any X_final already in memory.
X_final=np.load("/kaggle/input/text-feat-embeds/X_final.npy")

In [None]:
import pandas as pd
import numpy as np

# Load your original train.csv
df = pd.read_csv("/kaggle/input/quora-duplicates/train.csv")

# Drop rows with missing questions - must match the preprocessing used for embeddings
df = df[['question1', 'question2', 'is_duplicate']].dropna().reset_index(drop=True)

# Ensure same number of samples as embeddings
y = df['is_duplicate'].values

print("‚úÖ After cleaning:")
print("Labels shape:", y.shape)
# Report X_final, the matrix loaded above, as the label says. X_combined is
# only built in a later cell, so referencing it here fails on a fresh kernel.
print("X_final shape:", X_final.shape)


In [None]:
# Cut both arrays down to the shorter of the two so they have the same number of rows.
# NOTE(review): this only trims the tail; it presumes row i of X_final already
# belongs to label i - confirm.
min_len = min(len(X_final), len(y))
X_final, y = X_final[:min_len], y[:min_len]
print("‚úÖ Shapes aligned:", X_final.shape, y.shape)


In [None]:
df = pd.read_csv("/kaggle/input/quora-duplicates/train.csv")  # adjust path
# Apply the same null filter as the other train.csv loads in this notebook.
# Taking labels from the unfiltered frame can leave y with rows that have no
# matching feature row.
df = df[['question1', 'question2', 'is_duplicate']].dropna().reset_index(drop=True)

y = df['is_duplicate'].values
y.shape

In [None]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz

# -------------------------------
# Load CSV
# -------------------------------
# Read the training pairs, keep only the three columns used below, drop rows
# with any missing value, and renumber the index from 0.
df = (
    pd.read_csv("/kaggle/input/quora-duplicates/train.csv")
    .loc[:, ['question1','question2','is_duplicate']]
    .dropna()
    .reset_index(drop=True)
)

# -------------------------------
# Handcrafted + Fuzzy Features
# -------------------------------
def extract_features(row):
    """Compute the 12 handcrafted similarity features for one question pair.

    Args:
        row: A record (e.g. a DataFrame row) with ``question1`` and
            ``question2`` entries; both are coerced with ``str()``.

    Returns:
        ``pd.Series`` of 12 values, in this order: character length of q1,
        of q2, and their absolute difference; word count of q1, of q2, and
        their absolute difference; number of distinct lower-cased words the
        two share; that number divided by the summed word counts; then
        ``fuzz.ratio``, ``fuzz.partial_ratio``, ``fuzz.token_sort_ratio``
        and ``fuzz.token_set_ratio`` of the two strings.
    """
    text_a, text_b = str(row['question1']), str(row['question2'])

    n_chars_a, n_chars_b = len(text_a), len(text_b)
    n_words_a, n_words_b = len(text_a.split()), len(text_b.split())

    # Overlap is measured on lower-cased, de-duplicated words.
    shared = len(set(text_a.lower().split()).intersection(text_b.lower().split()))

    values = [
        n_chars_a, n_chars_b, abs(n_chars_a - n_chars_b),
        n_words_a, n_words_b, abs(n_words_a - n_words_b),
        shared,
        shared / (n_words_a + n_words_b + 1e-5),  # epsilon avoids division by zero
    ]
    # The four fuzzy scores are appended in a fixed order.
    fuzzy_scorers = (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio)
    values.extend(scorer(text_a, text_b) for scorer in fuzzy_scorers)

    return pd.Series(values)

# Column labels, in the same order in which extract_features returns its values.
feature_names = [
    'len_q1','len_q2','char_count_diff',
    'wc_q1','wc_q2','word_count_diff',
    'common_words','common_word_ratio',
    'fuzz_ratio','fuzz_partial_ratio','token_sort_ratio','token_set_ratio'
]

# One feature row per question pair.
handcrafted_feats = df.apply(extract_features, axis=1)
handcrafted_feats.columns = feature_names

print("‚úÖ Handcrafted features shape:", handcrafted_feats.shape)

# -------------------------------
# Combine with the embedding-derived features
# -------------------------------
# X_final.npy is described by the author as DistilBERT + engineered features (3074 dims).
X_final = np.load("/kaggle/input/text-feat-embeds/X_final.npy")

# Trim features and labels to a common row count before stacking them side by side.
# NOTE(review): truncation assumes the rows already correspond one-to-one - confirm.
min_len = min(len(X_final), len(handcrafted_feats))
X_final, handcrafted_feats = X_final[:min_len], handcrafted_feats.iloc[:min_len]
y = df['is_duplicate'].values[:min_len]

# Stack column-wise: embedding features first, handcrafted features last.
X_combined = np.hstack([X_final, handcrafted_feats.values])
print("‚úÖ Final combined feature shape:", X_combined.shape)

import numpy as np
save_path = "/kaggle/working/X_combined.npy"
np.save(save_path, X_combined)
print(f"üíæ Saved final combined features to: {save_path}")


In [None]:
# Replace the in-memory matrix with the copy stored in the attached "combined-embeds" dataset.
# NOTE(review): `y` is not reloaded here - presumably this file has the same rows, in the same order, as the labels already in memory; confirm.
X_combined=np.load("/kaggle/input/combined-embeds/X_combined.npy")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for evaluation. The split is stratified on y and
# seeded so it is the same on every run.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, **split_options)

print("‚úÖ Train shape:", X_train.shape)
print("‚úÖ Test shape:", X_test.shape)


In [None]:
# Quick check of how many training labels the split produced.
y_train.shape

In [None]:

# Train on the GPU when one is visible; otherwise fall back to the CPU
# histogram builder so the cell still runs on a CPU-only session (the rest of
# the notebook already picks its device this way).
# NOTE(review): newer XGBoost releases spell the GPU option as
# tree_method='hist' with device='cuda' - confirm against the installed version.
xgb_tree_method = 'gpu_hist' if torch.cuda.is_available() else 'hist'

model_xgb = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method=xgb_tree_method,
    # Row/column subsampling is random; fix the seed (same value as the
    # train/test split) so repeated runs give the same model.
    random_state=42,
)
model_xgb.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Take column 1 of predict_proba and threshold it at 0.5 to get hard 0/1 labels.
duplicate_proba = model_xgb.predict_proba(X_test)[:, 1]
y_pred = (duplicate_proba > 0.5).astype(int)

print("‚úÖ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
import joblib

# Persist the fitted classifier to the session's working directory.
# NOTE(review): check that the inference cells further down load this same filename.
joblib.dump(model_xgb , "/kaggle/working/xgb_quora_model_combined.pkl")
print("üíæ Model saved successfully at /kaggle/working/xgb_quora_model_combined.pkl")


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from numpy.linalg import norm
import joblib

# -----------------------------------------------------------
# 1Ô∏è‚É£ Load saved model and tokenizer
# -----------------------------------------------------------
# NOTE(review): nothing in this notebook writes "xgb_quora_model.pkl" (the
# training cell saves "xgb_quora_model_combined.pkl"). Presumably this file
# comes from an earlier model trained on the embedding-only features built
# below - confirm before relying on it.
xgb_model = joblib.load("/kaggle/working/xgb_quora_model.pkl")

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
# Use the GPU when one is available instead of assuming it, matching the
# device fallback used when the embeddings were first computed.
bert_model.to('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.eval()

# -----------------------------------------------------------
# 2️⃣ Preprocessing function (intended to mirror training; NOTE: it currently differs from preprocess_text defined earlier in this notebook)
# -----------------------------------------------------------
import re
from bs4 import BeautifulSoup

def preprocess(q):
    """Normalise a question string before it is embedded.

    Lower-cases and trims the text, spells out ``%``, ``$``, ``₹``, ``€`` and
    ``@``, strips HTML markup, replaces every character that is not an ASCII
    letter, digit or whitespace with a space, and collapses whitespace runs.

    NOTE(review): this is not the same normalisation as ``preprocess_text``
    (no number abbreviation, ``[math]`` removal or contraction expansion).
    If the training embeddings were produced with ``preprocess_text`` the two
    should be reconciled - confirm.

    Args:
        q: The question text; coerced with ``str()``.

    Returns:
        The cleaned string.
    """
    q = str(q).lower().strip()
    # The currency keys must be the real symbols: a mis-encoded key never
    # matches, so "₹" / "€" would be blanked out below instead of spelled out.
    for symbol, spoken in (
        ('%', ' percent'), ('$', ' dollar '), ('₹', ' rupee '),
        ('€', ' euro '), ('@', ' at '),
    ):
        q = q.replace(symbol, spoken)
    q = BeautifulSoup(q, 'html.parser').get_text()
    q = re.sub(r'[^a-zA-Z0-9\s]', ' ', q)
    q = re.sub(r'\s+', ' ', q).strip()
    return q

# -----------------------------------------------------------
# 3Ô∏è‚É£ Generate DistilBERT embeddings
# -----------------------------------------------------------
def get_bert_embedding(text):
    """Return the mean-pooled DistilBERT embedding of ``text``.

    The text is tokenised with truncation at 64 tokens, run through
    ``bert_model`` without gradient tracking, and the last hidden state is
    averaged over the token axis (``dim=1``).

    Args:
        text: Text passed straight to ``tokenizer``.

    Returns:
        NumPy array on the CPU (presumably shape (1, hidden_size) for a
        single string - confirm).
    """
    # Send the inputs to whichever device the model lives on rather than
    # assuming 'cuda', so the function also works on a CPU-only session.
    model_device = next(bert_model.parameters()).device
    tokens = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=64
    ).to(model_device)

    with torch.no_grad():
        outputs = bert_model(**tokens)
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # mean pooling
    return embedding

  # Concatenate both question embeddings
  
# -----------------------------------------------------------
# 4Ô∏è‚É£ Feature Engineering for single pair
# -----------------------------------------------------------
def make_features(q1_emb, q2_emb, q1, q2):
    """Assemble the embedding-based feature row for one question pair.

    Args:
        q1_emb, q2_emb: 2-D embedding arrays for the two questions
            (presumably shape (1, 768) as produced by get_bert_embedding -
            confirm).
        q1, q2: The raw question strings. Accepted but not used here.

    Returns:
        2-D array formed by joining, along axis 1: ``q1_emb``, ``q2_emb``,
        ``|q1_emb - q2_emb|``, ``q1_emb * q2_emb``, the cosine similarity
        and the Euclidean distance (one column each).
    """
    delta = q1_emb - q2_emb

    # Scalar similarity measures, wrapped as 1x1 arrays so they can be concatenated.
    cosine_col = np.array([[cosine_similarity(q1_emb, q2_emb)[0][0]]])
    distance_col = np.array([[norm(delta)]])

    pieces = [
        q1_emb,           # first question embedding
        q2_emb,           # second question embedding
        np.abs(delta),    # absolute difference
        q1_emb * q2_emb,  # element-wise product
        cosine_col,
        distance_col,
    ]
    return np.concatenate(pieces, axis=1)


# -----------------------------------------------------------
# 5Ô∏è‚É£ Prediction function
# -----------------------------------------------------------
def predict_duplicate(q1, q2):
    """Classify one question pair and print the verdict.

    Each question is cleaned with ``preprocess`` and embedded with
    ``get_bert_embedding``; the feature row from ``make_features`` is then
    scored by ``xgb_model``.

    Args:
        q1, q2: The two raw question strings.

    Returns:
        None. The label and the probability of class 1 are printed.
    """
    q1_emb, q2_emb = (get_bert_embedding(preprocess(text)) for text in (q1, q2))
    X_input = make_features(q1_emb, q2_emb, q1, q2)

    pred = xgb_model.predict(X_input)[0]
    prob = xgb_model.predict_proba(X_input)[0][1]

    if pred == 1:
        label = "‚úÖ Duplicate"
    else:
        label = "‚ùå Not Duplicate"
    print(f"\nPrediction: {label} (Confidence: {prob:.2f})")

# -----------------------------------------------------------
# üî• Example test
# ----------------------------------------------------------
# Keep asking for question pairs until the user answers "y" to the exit prompt.
# NOTE: this cell blocks on input(), so it needs a person at the keyboard.
game = True
while game:
    end = input("DO YOU WANNA END?,type Y for yes and N for NO").lower()
    if end != 'y':
        que1 = input("enter question 1:")
        que2 = input("enter question 2:")
        predict_duplicate(que1, que2)
    else:
        game = False
    

    
    

In [None]:
from sklearn.preprocessing import StandardScaler
import joblib


X_combined = np.load("/kaggle/working/X_combined.npy")

# Learn the per-column statistics, then standardise the matrix with them.
# NOTE(review): the scaler is fitted on every row, including rows that are
# held out for evaluation elsewhere - confirm that is intended.
scaler = StandardScaler()
X_combined_scaled = scaler.fit(X_combined).transform(X_combined)

# Save the fitted scaler so inference can apply the same transformation.
joblib.dump(scaler, "/kaggle/working/feature_scaler.pkl")

np.save("/kaggle/working/X_combined_scaled.npy", X_combined_scaled)
print("‚úÖ Scaled & saved features + scaler.")


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from numpy.linalg import norm
import joblib
from fuzzywuzzy import fuzz
import re
from bs4 import BeautifulSoup

# -------------------------------
# Load saved XGBoost model and tokenizer
# -------------------------------
# Load the model that the training section of this notebook actually writes.
# The feature row built below (embedding features + 12 handcrafted features)
# has the same layout as X_combined, the matrix that model was fitted on;
# "xgb_quora_model2.pkl" is never created in this notebook.
xgb_model = joblib.load("/kaggle/working/xgb_quora_model_combined.pkl")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Use the GPU when one is available instead of assuming it.
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(
    'cuda' if torch.cuda.is_available() else 'cpu'
)
bert_model.eval()

# -------------------------------
# Preprocessing function
# -------------------------------
def preprocess(q):
    """Normalise a question string before it is embedded.

    Lower-cases and trims the text, spells out ``%``, ``$``, ``₹``, ``€`` and
    ``@``, strips HTML markup, replaces every character that is not an ASCII
    letter, digit or whitespace with a space, and collapses whitespace runs.

    NOTE(review): this differs from ``preprocess_text`` defined earlier in
    the notebook (no number abbreviation, ``[math]`` removal or contraction
    expansion) - confirm which one the training embeddings used.

    Args:
        q: The question text; coerced with ``str()``.

    Returns:
        The cleaned string.
    """
    q = str(q).lower().strip()
    # Real currency symbols are required here: a mis-encoded key never
    # matches, so "₹" / "€" would be blanked out below instead of spelled out.
    symbol_words = (
        ('%',' percent'), ('$',' dollar '), ('₹',' rupee '),
        ('€',' euro '), ('@',' at '),
    )
    for symbol, spoken in symbol_words:
        q = q.replace(symbol, spoken)
    q = BeautifulSoup(q,'html.parser').get_text()
    q = re.sub(r'[^a-zA-Z0-9\s]',' ',q)
    q = re.sub(r'\s+',' ',q).strip()
    return q

# -------------------------------
# Generate BERT embedding
# -------------------------------
def get_bert_embedding(text):
    """Return the mean-pooled DistilBERT embedding of ``text``.

    Tokenises with truncation at 64 tokens, runs ``bert_model`` without
    gradient tracking and averages the last hidden state over ``dim=1``.

    Args:
        text: Text passed straight to ``tokenizer``.

    Returns:
        NumPy array on the CPU (presumably shape (1, hidden_size) for a
        single string - confirm).
    """
    # Follow the model's own device instead of hard-coding 'cuda' so the
    # function also works when the model was loaded on the CPU.
    model_device = next(bert_model.parameters()).device
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=64).to(model_device)
    with torch.no_grad():
        outputs = bert_model(**tokens)
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embedding

# -------------------------------
# Handcrafted + Fuzzy Features
# -------------------------------
def extract_features(q1, q2):
    """Compute the 12 handcrafted similarity features for one question pair.

    Args:
        q1, q2: The two question strings.

    Returns:
        NumPy array with a single row of 12 values: character lengths of q1
        and q2 and their absolute difference; word counts and their absolute
        difference; number of distinct lower-cased shared words; that number
        divided by the summed word counts; then ``fuzz.ratio``,
        ``fuzz.partial_ratio``, ``fuzz.token_sort_ratio`` and
        ``fuzz.token_set_ratio``.
    """
    n_chars = (len(q1), len(q2))
    n_words = (len(q1.split()), len(q2.split()))

    # Overlap is counted on lower-cased, de-duplicated words.
    overlap = len(set(q1.lower().split()) & set(q2.lower().split()))

    row = [
        n_chars[0], n_chars[1], abs(n_chars[0] - n_chars[1]),
        n_words[0], n_words[1], abs(n_words[0] - n_words[1]),
        overlap, overlap / (n_words[0] + n_words[1] + 1e-5),  # epsilon avoids division by zero
        fuzz.ratio(q1, q2), fuzz.partial_ratio(q1, q2),
        fuzz.token_sort_ratio(q1, q2), fuzz.token_set_ratio(q1, q2),
    ]
    return np.array([row])

# -------------------------------
# Create features exactly as training
# -------------------------------
def make_features(q1, q2):
    """Build the full feature row for one pair of raw question strings.

    Each question is cleaned with ``preprocess`` and embedded with
    ``get_bert_embedding``; the handcrafted features are computed from the
    raw (uncleaned) strings.

    Args:
        q1, q2: The two raw question strings.

    Returns:
        2-D array formed by joining, along axis 1: both embeddings, their
        absolute difference, their element-wise product, the cosine
        similarity, the Euclidean distance, and the output of
        ``extract_features(q1, q2)``.
    """
    q1_emb = get_bert_embedding(preprocess(q1))
    q2_emb = get_bert_embedding(preprocess(q2))
    delta = q1_emb - q2_emb

    pieces = [
        q1_emb,
        q2_emb,
        np.abs(delta),
        q1_emb * q2_emb,
        # Scalars are wrapped as 1x1 arrays so they can be concatenated.
        np.array([[cosine_similarity(q1_emb, q2_emb)[0][0]]]),
        np.array([[norm(delta)]]),
        extract_features(q1, q2),
    ]
    return np.concatenate(pieces, axis=1)

# -----------------------------------------------------------
# 6Ô∏è‚É£ Prediction Function with adjustable threshold
# -----------------------------------------------------------
def predict_duplicate(q1, q2, threshold=0.48):
    """Score one question pair and print whether it is judged a duplicate.

    Args:
        q1, q2: The two raw question strings.
        threshold: Minimum probability of class 1 for the pair to be
            labelled a duplicate.

    Returns:
        None. The label, the probability and the threshold are printed.
    """
    # make_features(q1, q2) takes the raw strings and does the preprocessing
    # and embedding itself. Passing it pre-computed embeddings (as before)
    # would run the text cleaning on NumPy arrays instead of on the questions.
    X_input = make_features(q1, q2)

    # Probability of class 1 for the single input row.
    prob = xgb_model.predict_proba(X_input)[0][1]

    # Apply threshold
    label = "‚úÖ Duplicate" if prob >= threshold else "‚ùå Not Duplicate"

    # Display
    print(f"\nPrediction: {label} (Confidence: {prob:.2f}, Threshold: {threshold})")

# -------------------------------
# Interactive Testing
# -------------------------------
# Keep asking for question pairs until the user answers "y" to the exit prompt.
# NOTE: this cell blocks on input(), so it needs a person at the keyboard.
keep_going = True
while keep_going:
    end = input("DO YOU WANNA END? Type Y for yes and N for NO: ").lower()
    keep_going = end != 'y'
    if keep_going:
        q1 = input("Enter Question 1: ")
        q2 = input("Enter Question 2: ")
        predict_duplicate(q1, q2)


In [None]:
# Smaller of the two row counts (combined feature matrix vs. handcrafted feature table).
min_len = min(len(X_combined), len(handcrafted_feats))


In [None]:
def make_features(q1_emb, q2_emb, q1, q2):
    """Build the combined feature row for one question pair.

    The previous body ignored its arguments: it read the dataset-wide
    ``q1_embeddings`` / ``q2_embeddings`` globals and referenced the undefined
    names ``feat_concat`` and ``hand_craft``. It now works only from what is
    passed in.

    Args:
        q1_emb, q2_emb: Embeddings of the two questions; promoted to 2-D
            (presumably (1, 768) as produced by get_bert_embedding - confirm).
        q1, q2: The raw question strings, used for the handcrafted features.

    Returns:
        2-D array with one row, formed by joining along axis 1: both
        embeddings, their absolute difference, their element-wise product,
        the cosine similarity, the Euclidean distance, and the values
        returned by ``extract_features(q1, q2)``.
    """
    q1_emb = np.atleast_2d(q1_emb)
    q2_emb = np.atleast_2d(q2_emb)

    # Concatenate both question embeddings
    feat_concat = np.concatenate([q1_emb, q2_emb], axis=1)

    # Absolute difference
    feat_abs_diff = np.abs(q1_emb - q2_emb)

    # Element-wise product
    feat_product = q1_emb * q2_emb

    # Scalar features, wrapped as 1x1 arrays so they can be concatenated.
    cos_sim = np.array([[cosine_similarity(q1_emb, q2_emb)[0][0]]])
    euclid_dist = np.array([[norm(q1_emb - q2_emb)]])

    # extract_features is defined more than once in this notebook (one version
    # returns a NumPy row, another a pandas Series); coerce either into a
    # single float row.
    hand_feat = np.asarray(extract_features(q1, q2), dtype=float).reshape(1, -1)

    X_final = np.concatenate([
        feat_concat,
        feat_abs_diff,
        feat_product,
        cos_sim,
        euclid_dist,
        hand_feat,
    ], axis=1)

    return X_final

In [None]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz

In [None]:
# -------------------------------
# Handcrafted + Fuzzy Features
# -------------------------------
def extract_features(que1,que2):
    """Compute the 12 handcrafted similarity features for one question pair.

    Args:
        que1, que2: The two question strings.

    Returns:
        ``pd.Series`` of 12 values, in this order: character length of each
        question and their absolute difference; word count of each question
        and their absolute difference; number of distinct lower-cased shared
        words; that number divided by the summed word counts; then
        ``fuzz.ratio``, ``fuzz.partial_ratio``, ``fuzz.token_sort_ratio``
        and ``fuzz.token_set_ratio``.
    """
    n_chars_1, n_chars_2 = len(que1), len(que2)
    n_words_1, n_words_2 = len(que1.split()), len(que2.split())

    # Overlap is counted on lower-cased, de-duplicated words.
    shared = len(set(que1.lower().split()).intersection(que2.lower().split()))

    values = [
        n_chars_1, n_chars_2, abs(n_chars_1 - n_chars_2),
        n_words_1, n_words_2, abs(n_words_1 - n_words_2),
        shared,
        shared / (n_words_1 + n_words_2 + 1e-5),  # avoid div 0
    ]
    # Append the four fuzzy scores in a fixed order.
    for scorer in (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
        values.append(scorer(que1, que2))

    return pd.Series(values)

# Apply features

# -------------------------------