In [2]:
!pip install sentence_transformers --quiet

In [4]:
import pandas as pd

# Load Dataset
df = pd.read_csv("train.csv")

In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# compute length of questions
df["q1_length"] = df["question1"].astype(str).apply(len)
df["q2_length"] = df["question2"].astype(str).apply(len)
df["length_diff"] = abs(df["q1_length"] - df["q2_length"])

# compute semantic similarity using sentence embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
q1_embeddings = model.encode(df["question1"].astype(str).tolist(), convert_to_tensor=True)
q2_embeddings = model.encode(df["question2"].astype(str).tolist(), convert_to_tensor=True)

# Comput cosine similarity
df["cosine_similarity"] = [cosine_similarity([q1], [q2])[0][0] for q1, q2 in zip(q1_embeddings, q2_embeddings)]

In [7]:
# Select features and target variable
features = ["q1_length", "q2_length", "length_diff"]
target = "cosine_similarity"

from sklearn.model_selection import train_test_split

#Split data
X_train, X_temp, y_train, y_temp = train_test_split(df[features], df[target], test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [8]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

# Decision Tree
dt = DecisionTreeRegressor(max_depth=5, random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_val)
print("Decision Tree R²:", r2_score(y_val, y_pred_dt))

# KNN
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_val)
print("KNN R²:", r2_score(y_val, y_pred_knn))

Decision Tree R²: 0.21107602879919862
KNN R²: 0.06305843591690063


In [10]:
!pip install xgboost

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.1.4-py3-none-manylinux2014_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m111.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
Successfully installed xgboost-2.1.4


In [11]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Random Forest
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)
print("Random Forest R²:", r2_score(y_val, y_pred_rf))

# XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_val)
print("XGBoost R²:", r2_score(y_val, y_pred_xgb))

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


Random Forest R²: 0.21794970238601918
XGBoost R²: 0.21928000450134277


In [13]:
!pip install tensorflow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.1.24-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolor-2.5.0-py3-none-any.whl.metadata (6.1 kB)
Collecting grpcio<2.0,>=1.24.3 (from t

In [14]:
import tensorflow as tf
from tensorflow import keras

# Define Neural Network
nn = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(len(features),)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)  # Output layer
])

nn.compile(optimizer='adam', loss='mse', metrics=['mae'])
nn.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val), batch_size=16, verbose=1)

# Evaluate
y_pred_nn = nn.predict(X_val).flatten()
print("Neural Network R²:", r2_score(y_val, y_pred_nn))

2025-02-10 20:38:44.166160: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739219924.186814   20541 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739219924.192720   20541 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-10 20:38:44.211897: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Matplotlib is building the font cache; this may take a moment.


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism 

[1m17688/17688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 1ms/step - loss: 3.6229 - mae: 0.4354 - val_loss: 0.0616 - val_mae: 0.2036
Epoch 2/50
[1m17688/17688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 1ms/step - loss: 0.0639 - mae: 0.1960 - val_loss: 0.0571 - val_mae: 0.1864
Epoch 3/50
[1m17688/17688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 1ms/step - loss: 0.0579 - mae: 0.1875 - val_loss: 0.0576 - val_mae: 0.1922
Epoch 4/50
[1m17688/17688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 1ms/step - loss: 0.0574 - mae: 0.1865 - val_loss: 0.0568 - val_mae: 0.1866
Epoch 5/50
[1m17688/17688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 1ms/step - loss: 0.0575 - mae: 0.1867 - val_loss: 0.0569 - val_mae: 0.1848
Epoch 6/50
[1m17688/17688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 1ms/step - loss: 0.0573 - mae: 0.1866 - val_loss: 0.0571 - val_mae: 0.1824
Epoch 7/50
[1m17688/17688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [15]:
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Define a function to train and evaluate models
def train_and_evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test, model_name):
    # Train the model
    model.fit(X_train, y_train)
   
    # Predict on training, validation, and test sets
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
   
    # Compute R² scores
    r2_train = r2_score(y_train, y_train_pred)
    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)
   
    # Print the results
    print(f"{model_name} Performance:")
    print(f"Training R²: {r2_train:.4f}")
    print(f"Validation R²: {r2_val:.4f}")
    print(f"Test R²: {r2_test:.4f}")
    print("="*40)
   
    return model, r2_train, r2_val, r2_test

# Initialize models
models = {
    "K-Nearest Neighbors": KNeighborsRegressor(n_neighbors=5),  # Default: 5 neighbors
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Neural Network (MLP)": MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
}

# Train and evaluate each model
results = {}
for model_name, model in models.items():
    trained_model, r2_train, r2_val, r2_test = train_and_evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test, model_name)
    results[model_name] = {"train": r2_train, "val": r2_val, "test": r2_test}

# Print summary of all model performances
print("Model Comparison Summary:")
for model_name, scores in results.items():
    print(f"{model_name}: Train R²: {scores['train']:.4f}, Val R²: {scores['val']:.4f}, Test R²: {scores['test']:.4f}")

K-Nearest Neighbors Performance:
Training R²: 0.1544
Validation R²: 0.0631
Test R²: 0.0592
Decision Tree Performance:
Training R²: 0.2855
Validation R²: 0.1329
Test R²: 0.1293
Random Forest Performance:
Training R²: 0.2818
Validation R²: 0.1600
Test R²: 0.1547
Neural Network (MLP) Performance:
Training R²: 0.1997
Validation R²: 0.2085
Test R²: 0.2008
Model Comparison Summary:
K-Nearest Neighbors: Train R²: 0.1544, Val R²: 0.0631, Test R²: 0.0592
Decision Tree: Train R²: 0.2855, Val R²: 0.1329, Test R²: 0.1293
Random Forest: Train R²: 0.2818, Val R²: 0.1600, Test R²: 0.1547
Neural Network (MLP): Train R²: 0.1997, Val R²: 0.2085, Test R²: 0.2008
