In [5]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

#Load dataset

df = pd.read_csv('train.csv')

#Drop rows with missing values

df = df.dropna()

df = df.head(100000)

#Create engineered features

df['q1_length'] = df['question1'].apply(len)
df['q2_length'] = df['question2'].apply(len)
df['length_diff'] = abs(df['q1_length'] - df['q2_length'])

chunk_size = 5000
similarities = []
q1_embeddings_list = []
q2_embeddings_list = []

def process_chunk(chunk):
    # Encode questions into embeddings
    q1_embeddings = model.encode(chunk['question1'].tolist(), convert_to_tensor=True)
    q2_embeddings = model.encode(chunk['question2'].tolist(), convert_to_tensor=True)
    q1_embeddings_list.append(q1_embeddings.cpu().numpy())
    q2_embeddings_list.append(q2_embeddings.cpu().numpy())
    # Compute cosine similarity
    return util.cos_sim(q1_embeddings, q2_embeddings).cpu().numpy()

# Iterate over chunks
for start in range(0, len(df), chunk_size):
    end = min(start + chunk_size, len(df))
    chunk = df.iloc[start:end]
    chunk_similarities = process_chunk(chunk)
    similarities.append(np.diagonal(chunk_similarities))

# Combine all similarities
df['cosine_similarity'] = np.concatenate(similarities)

#Display basic information about the dataset

#print(df.info())

#Check for missing values

#print(df.isnull().sum())

#Summary statistics of selected numerical features

features = ['q1_length', 'q2_length', 'length_diff', 'cosine_similarity']

for col in features:
    print(f"Feature: {col}")
    print(f"Mean: {df[col].mean():.2f}")
    print(f"Std Dev: {df[col].std():.2f}")
    print(f"Min: {df[col].min()}")
    print(f"Max: {df[col].max()}")
    print("-")

#Check for potential transformations

for col in ['q1_length', 'q2_length', 'length_diff']:
    if (df[col] > 0).all():  # Log transformation works only for positive values
        df[f'{col}_log'] = np.log1p(df[col])
        print(f"Log transformation applied to {col}")

#Display updated dataset

print(df.head())

Feature: q1_length
Mean: 59.39
Std Dev: 29.95
Min: 1
Max: 623
-
Feature: q2_length
Mean: 60.02
Std Dev: 34.05
Min: 1
Max: 1169
-
Feature: length_diff
Mean: 20.16
Std Dev: 25.76
Min: 0
Max: 1041
-
Feature: cosine_similarity
Mean: 0.67
Std Dev: 0.27
Min: -0.26585355401039124
Max: 1.0000004768371582
-
Log transformation applied to q1_length
Log transformation applied to q2_length
   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  q1_length  \
0  What is the step by step guide to invest in sh...             0         66   
1  What would happen if t