## CSC 575 HW#4 Learning To Rank

## 1 Loading Libraries and Dataset

In [4]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import math
import warnings
warnings.filterwarnings('ignore')

# File paths
pkl_train_x = "C:/Users/nachi/Desktop/Info_Retrieval/Info_Assign_4/train_x.pkl"
pkl_train_y = "C:/Users/nachi/Desktop/Info_Retrieval/Info_Assign_4/train_y.pkl"
pkl_test = "C:/Users/nachi/Desktop/Info_Retrieval/Info_Assign_4/test.pkl"

# Load pickle files
train_x = pd.read_pickle(pkl_train_x)
train_y = pd.read_pickle(pkl_train_y)
test_x = pd.read_pickle(pkl_test)


In [5]:
train_x.columns

Index(['id', 'product_uid', 'product_title', 'search_term',
       'product_description', 'attributes'],
      dtype='object')

In [6]:
train_x.shape

(74067, 6)

In [7]:
test_x.shape

(112067, 6)

In [8]:
train_x.head() 

Unnamed: 0,id,product_uid,product_title,search_term,product_description,attributes
0,2,100001,"[simpson, strongti, 12gaug, angl]","[angl, bracket]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
1,3,100001,"[simpson, strongti, 12gaug, angl]","[l, bracket]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
2,9,100002,"[behr, premium, textur, deckov, 1gal, sc141, t...",[deck],"[behr, premium, textur, deckov, innov, solid, ...","[applic, method, brushrollerspray, assembl, de..."
3,16,100005,"[delta, vero, 1handl, shower, faucet, trim, ki...","[rain, shower, head]","[updat, bathroom, delta, vero, singlehandl, sh...","[bath, faucet, type, combo, tub, shower, built..."
4,17,100005,"[delta, vero, 1handl, shower, faucet, trim, ki...","[shower, faucet]","[updat, bathroom, delta, vero, singlehandl, sh...","[bath, faucet, type, combo, tub, shower, built..."


In [9]:
train_y.head()

0    3.00
1    2.50
2    3.00
3    2.33
4    2.67
Name: relevance, dtype: float64

In [11]:
test_x.shape

(112067, 6)

In [12]:
test_x.head()

Unnamed: 0,id,product_uid,product_title,search_term,product_description,attributes
0,4,100001,"[simpson, strongti, 12gaug, angl]","[metal, l, bracket]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
1,5,100001,"[simpson, strongti, 12gaug, angl]","[simpson, sku, abl]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
2,6,100001,"[simpson, strongti, 12gaug, angl]","[simpson, strong, tie]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
3,7,100001,"[simpson, strongti, 12gaug, angl]","[simpson, strong, tie, hcc668]","[angl, make, joint, stronger, also, provid, co...","[bullet01, versatil, connector, variou, 90, co..."
4,10,100003,"[sterl, ensembl, 3314, x, 60, x, 7514, bath, s...","[bath, shower, kit]","[classic, architectur, meet, contemporari, des...","[builtin, flang, ye, bullet01, slightli, narro..."


## 2. TFxIDF  Vectorizing the Data

In [13]:

train_x['product_title'] = train_x['product_title'].apply(lambda x: ' '.join(x))
train_x['search_term'] = train_x['search_term'].apply(lambda x: ' '.join(x))
train_x['product_description'] = train_x['product_description'].apply(lambda x: ' '.join(x))
train_x['attributes'] = train_x['attributes'].apply(lambda x: ' '.join(x))

test_x['product_title'] = test_x['product_title'].apply(lambda x: ' '.join(x))
test_x['search_term'] = test_x['search_term'].apply(lambda x: ' '.join(x))
test_x['product_description'] = test_x['product_description'].apply(lambda x: ' '.join(x))
test_x['attributes'] = test_x['attributes'].apply(lambda x: ' '.join(x))

#We use sklearn TfidfVectorizer instance to create a vectorizer and transform both training and testing data on it. As the number of unique words is approximately around 250000, we use max features as 3000 to reduce the complexity of the problem.

vect = TfidfVectorizer(max_features=1000)
vect.fit(train_x['product_title'] + train_x['search_term'] + train_x['product_description'] + train_x['attributes'])

vect_product_title = vect.transform(train_x['product_title'])
vect_search_term = vect.transform(train_x['search_term'])
vect_product_description = vect.transform(train_x['product_description'])
vect_attributes = vect.transform(train_x['attributes'])

#convert to array from sparse matrices
vect_product_title = vect_product_title.toarray()
vect_search_term = vect_search_term.toarray()
vect_product_description = vect_product_description.toarray()
vect_attributes = vect_attributes.toarray()

#converting test data to tfidf vectors

test_vect_product_title = vect.transform(test_x['product_title'])
test_vect_search_term = vect.transform(test_x['search_term'])
test_vect_product_description = vect.transform(test_x['product_description'])
test_vect_attributes = vect.transform(test_x['attributes'])

test_vect_product_title = test_vect_product_title.toarray()
test_vect_search_term = test_vect_search_term.toarray()
test_vect_product_description = test_vect_product_description.toarray()
test_vect_attributes = test_vect_attributes.toarray()



## 3. Defining Similarity Measures

##  A. Cosine Similarity
##  B. Jaccard Similarity
##  C. Euclidean Distance

In [14]:
def cosine_similarity(vect1,vect2):
  """
  returns the cosine similarity between two vectors
  """
  numerator = sum([a * b for a, b in zip(vect1, vect2)])
  denominator = math.sqrt(sum([a ** 2 for a in vect1])) * math.sqrt(sum([b ** 2 for b in vect2]))
  return numerator / denominator if denominator != 0 else 0

def jaccard_similarity(x, y):
    """Returns the Jaccard similarity between two lists"""
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    if union_cardinality == 0:
        return 0
    else:
        return intersection_cardinality / float(union_cardinality)

def euclidean_distance(x,y):
  """ return euclidean distance between two lists """
  return math.sqrt(sum(math.pow(a-b,2) for a, b in zip(x, y)))


df = pd.DataFrame(columns=['cosine_pt_st','cosine_pt_pd','cosine_pt_a','jaccard_pt_st','jaccard_pt_pd','jaccard_pt_a','euclidean_pt_st','euclidean_pt_pd','euclidean_pt_a'])

## 4. Applying Similarity Measures on Train Dataset

In [15]:
chunk_size = 5000

# Create an empty dataframe to store the results
train_features = pd.DataFrame(columns=['cosine_st_pt','cosine_st_pd','cosine_st_a','jaccard_st_pt','jaccard_st_pd','jaccard_st_a','euclidean_st_pt','euclidean_st_pd','euclidean_st_a'])

# Process the training data in chunks
for chunk_start in range(0, len(train_x), chunk_size):
    chunk_end = min(chunk_start + chunk_size, len(train_x))
    chunk_train_x = train_x.iloc[chunk_start:chunk_end]
    
    # Compute TF-IDF vectors for the chunk
    chunk_vect_search_term = vect.transform(chunk_train_x['search_term'])
    chunk_vect_product_title = vect.transform(chunk_train_x['product_title'])
    chunk_vect_product_description = vect.transform(chunk_train_x['product_description'])
    chunk_vect_attributes = vect.transform(chunk_train_x['attributes'])
    
    chunk_vect_search_term = chunk_vect_search_term.toarray()
    chunk_vect_product_title = chunk_vect_product_title.toarray()
    chunk_vect_product_description = chunk_vect_product_description.toarray()
    chunk_vect_attributes = chunk_vect_attributes.toarray()
    
    # Initialize a temporary dataframe to store features for this chunk
    chunk_features = pd.DataFrame(columns=train_features.columns)
    
    # Compute features for each row in the chunk
    for i in tqdm(range(len(chunk_train_x))):
        cosine_st_pt = cosine_similarity(chunk_vect_search_term[i], chunk_vect_product_title[i])
        cosine_st_pd = cosine_similarity(chunk_vect_search_term[i], chunk_vect_product_description[i])
        cosine_st_a = cosine_similarity(chunk_vect_search_term[i], chunk_vect_attributes[i])
        
        jaccard_st_pt = jaccard_similarity(chunk_train_x['search_term'].iloc[i].split(), chunk_train_x['product_title'].iloc[i].split())
        jaccard_st_pd = jaccard_similarity(chunk_train_x['search_term'].iloc[i].split(), chunk_train_x['product_description'].iloc[i].split())
        jaccard_st_a = jaccard_similarity(chunk_train_x['search_term'].iloc[i].split(), chunk_train_x['attributes'].iloc[i].split())
        
        euclidean_st_pt = euclidean_distance(chunk_vect_search_term[i], chunk_vect_product_title[i])
        euclidean_st_pd = euclidean_distance(chunk_vect_search_term[i], chunk_vect_product_description[i])
        euclidean_st_a = euclidean_distance(chunk_vect_search_term[i], chunk_vect_attributes[i])
        
        chunk_features.loc[i] = [cosine_st_pt,cosine_st_pd,cosine_st_a,jaccard_st_pt,jaccard_st_pd,jaccard_st_a,euclidean_st_pt,euclidean_st_pd,euclidean_st_a]
    
    # Concatenate the features for this chunk with the overall features dataframe
    train_features = pd.concat([train_features, chunk_features], axis=0)

# Create an empty dataframe to store the results
test_features = pd.DataFrame(columns=['cosine_st_pt','cosine_st_pd','cosine_st_a','jaccard_st_pt','jaccard_st_pd','jaccard_st_a','euclidean_st_pt','euclidean_st_pd','euclidean_st_a'])

100%|██████████| 5000/5000 [00:17<00:00, 290.41it/s]
100%|██████████| 5000/5000 [00:16<00:00, 295.04it/s]
100%|██████████| 5000/5000 [00:17<00:00, 293.80it/s]
100%|██████████| 5000/5000 [00:16<00:00, 300.36it/s]
100%|██████████| 5000/5000 [00:16<00:00, 296.97it/s]
100%|██████████| 5000/5000 [00:16<00:00, 294.25it/s]
100%|██████████| 5000/5000 [00:16<00:00, 294.74it/s]
100%|██████████| 5000/5000 [00:17<00:00, 286.37it/s]
100%|██████████| 5000/5000 [00:17<00:00, 282.49it/s]
100%|██████████| 5000/5000 [00:17<00:00, 288.10it/s]
100%|██████████| 5000/5000 [00:16<00:00, 296.30it/s]
100%|██████████| 5000/5000 [00:17<00:00, 290.96it/s]
100%|██████████| 5000/5000 [00:16<00:00, 294.75it/s]
100%|██████████| 5000/5000 [00:17<00:00, 293.41it/s]
100%|██████████| 4067/4067 [00:14<00:00, 289.14it/s]


## 5. Applying Similarity Measures on Test Dataset

In [16]:
# Process the test data in chunks
for chunk_start in range(0, len(test_x), chunk_size):
    chunk_end = min(chunk_start + chunk_size, len(test_x))
    chunk_test_x = test_x.iloc[chunk_start:chunk_end]
    
    # Compute TF-IDF vectors for the chunk
    chunk_test_vect_search_term = vect.transform(chunk_test_x['search_term'])
    chunk_test_vect_product_title = vect.transform(chunk_test_x['product_title'])
    chunk_test_vect_product_description = vect.transform(chunk_test_x['product_description'])
    chunk_test_vect_attributes = vect.transform(chunk_test_x['attributes'])
    
    chunk_test_vect_search_term = chunk_test_vect_search_term.toarray()
    chunk_test_vect_product_title = chunk_test_vect_product_title.toarray()
    chunk_test_vect_product_description = chunk_test_vect_product_description.toarray()
    chunk_test_vect_attributes = chunk_test_vect_attributes.toarray()
    
    # Initialize a temporary dataframe to store features for this chunk
    chunk_features = pd.DataFrame(columns=test_features.columns)
    
    # Compute features for each row in the chunk
    for i in tqdm(range(len(chunk_test_x))):
        cosine_st_pt = cosine_similarity(chunk_test_vect_search_term[i], chunk_test_vect_product_title[i])
        cosine_st_pd = cosine_similarity(chunk_test_vect_search_term[i], chunk_test_vect_product_description[i])
        cosine_st_a = cosine_similarity(chunk_test_vect_search_term[i], chunk_test_vect_attributes[i])
        
        jaccard_st_pt = jaccard_similarity(chunk_test_x['search_term'].iloc[i].split(), chunk_test_x['product_title'].iloc[i].split())
        jaccard_st_pd = jaccard_similarity(chunk_test_x['search_term'].iloc[i].split(), chunk_test_x['product_description'].iloc[i].split())
        jaccard_st_a = jaccard_similarity(chunk_test_x['search_term'].iloc[i].split(), chunk_test_x['attributes'].iloc[i].split())
        
        euclidean_st_pt = euclidean_distance(chunk_test_vect_search_term[i], chunk_test_vect_product_title[i])
        euclidean_st_pd = euclidean_distance(chunk_test_vect_search_term[i], chunk_test_vect_product_description[i])
        euclidean_st_a = euclidean_distance(chunk_test_vect_search_term[i], chunk_test_vect_attributes[i])
        
        chunk_features.loc[i] = [cosine_st_pt,cosine_st_pd,cosine_st_a,jaccard_st_pt,jaccard_st_pd,jaccard_st_a,euclidean_st_pt,euclidean_st_pd,euclidean_st_a]
    
    # Concatenate the features for this chunk with the overall features dataframe
    test_features = pd.concat([test_features, chunk_features], axis=0)

100%|██████████| 5000/5000 [00:17<00:00, 292.81it/s]
100%|██████████| 5000/5000 [00:16<00:00, 297.05it/s]
100%|██████████| 5000/5000 [00:16<00:00, 297.59it/s]
100%|██████████| 5000/5000 [00:16<00:00, 300.43it/s]
100%|██████████| 5000/5000 [00:16<00:00, 297.92it/s]
100%|██████████| 5000/5000 [00:16<00:00, 296.81it/s]
100%|██████████| 5000/5000 [00:16<00:00, 300.00it/s]
100%|██████████| 5000/5000 [00:17<00:00, 289.90it/s]
100%|██████████| 5000/5000 [00:16<00:00, 294.95it/s]
100%|██████████| 5000/5000 [00:16<00:00, 301.04it/s]
100%|██████████| 5000/5000 [00:16<00:00, 296.01it/s]
100%|██████████| 5000/5000 [00:16<00:00, 300.42it/s]
100%|██████████| 5000/5000 [00:16<00:00, 299.65it/s]
100%|██████████| 5000/5000 [00:16<00:00, 295.09it/s]
100%|██████████| 5000/5000 [00:16<00:00, 301.01it/s]
100%|██████████| 5000/5000 [00:16<00:00, 298.02it/s]
100%|██████████| 5000/5000 [00:16<00:00, 295.32it/s]
100%|██████████| 5000/5000 [00:16<00:00, 299.20it/s]
100%|██████████| 5000/5000 [00:17<00:00, 286.5

In [17]:
train_features.head()

Unnamed: 0,cosine_st_pt,cosine_st_pd,cosine_st_a,jaccard_st_pt,jaccard_st_pd,jaccard_st_a,euclidean_st_pt,euclidean_st_pd,euclidean_st_a
0,0.709927,0.261701,0.115939,0.2,0.015385,0.015625,0.761673,1.215153,1.329708
1,0.0,0.0,0.0,0.0,0.0,0.0,1.414214,1.414214,1.414214
2,0.0,0.273221,0.251018,0.0,0.010526,0.007634,1.414214,1.205636,1.223913
3,0.287774,0.112145,0.067727,0.083333,0.016393,0.020408,1.193504,1.332558,1.365483
4,0.511535,0.199345,0.389144,0.2,0.033898,0.020619,0.988397,1.265429,1.105311


In [18]:
test_features.head()

Unnamed: 0,cosine_st_pt,cosine_st_pd,cosine_st_a,jaccard_st_pt,jaccard_st_pd,jaccard_st_a,euclidean_st_pt,euclidean_st_pd,euclidean_st_a
0,0.0,0.0,0.0,0.0,0.0,0.0,1.414214,1.414214,1.414214
1,0.0,0.0,0.0,0.166667,0.015152,0.015385,1.0,1.0,1.0
2,0.0,0.0,0.155892,0.166667,0.015152,0.03125,1.414214,1.414214,1.299314
3,0.0,0.0,0.155892,0.142857,0.014925,0.030769,1.414214,1.414214,1.299314
4,0.733134,0.079048,0.250267,0.25,0.017544,0.020202,0.73057,1.357167,1.224527


In [29]:
# Extract the 'id' column from test_x
test_id = test_x['id']

## 6.1 Linear Regression Model

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Step 2: Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features, train_y, test_size=0.2, random_state=42)

# Step 3: Train a regression model
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

# Step 4: Evaluate the model on the validation set
val_predictions = regression_model.predict(X_val)
val_rmse = mean_squared_error(y_val, val_predictions, squared=False)
val_mse = mean_squared_error(y_val, val_predictions)
print("Validation RMSE:", val_rmse)
print("Validation MSE:", val_mse)
# Step 5: Make predictions on the test set
test_predictions = regression_model.predict(test_features)


Validation RMSE: 0.5051452522698641
Validation MSE: 0.2551717258907846


In [74]:

# Display the predicted relevance scores for the test dataset
print("Predicted relevance scores (test_y):")
print(test_predictions)


Predicted relevance scores (test_y):
[2.11061212 2.34838588 2.28487573 ... 2.91172535 2.34097374 2.31147913]


In [75]:
predicted_df = pd.DataFrame(columns=['id', 'relevance'])

# Fill in the 'id' column with the extracted 'id' values
predicted_df['id'] = test_id

# Fill in the 'predicted_relevance' column with the predicted values
predicted_df['relevance'] = test_predictions


predicted_df.to_csv('Info_Sub_1.csv', index=False)

## 6.2 Support Vector Regressor Model

In [76]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Step 3: Train a Support Vector Regression model
svr_model = SVR()
svr_model.fit(X_train, y_train)

# Step 4: Evaluate the model on the validation set
val_predictions = svr_model.predict(X_val)
val_rmse = mean_squared_error(y_val, val_predictions, squared=False)
val_mse = mean_squared_error(y_val, val_predictions)
print("Validation RMSE:", val_rmse)
print("Validation MSE:", val_mse)

# Step 5: Make predictions on the test set
test_predictions = svr_model.predict(test_features)

# Display the predicted relevance scores for the test dataset
print("Predicted relevance scores (test_y):")
print(test_predictions)

# Create a DataFrame to store the predictions and other relevant columns
predicted_df = pd.DataFrame(columns=['id', 'relevance'])

# Fill in the 'id' column with the extracted 'id' values
predicted_df['id'] = test_id

# Fill in the 'relevance' column with the predicted values
predicted_df['relevance'] = test_predictions

# Save the DataFrame to a CSV file
predicted_df.to_csv('Info_Sub_2.csv', index=False)


Validation RMSE: 0.5064921094962158
Validation MSE: 0.25653425698192667
Predicted relevance scores (test_y):
[2.10000573 2.54867674 2.38205306 ... 2.79468371 2.45881095 2.35327494]


## 6.3 Gradient Boosting Regressor Model

In [77]:
from sklearn.ensemble import GradientBoostingRegressor

# Step 3: Train a Gradient Boosting Regression model
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)

# Step 4: Evaluate the model on the validation set
val_predictions_gb = gb_model.predict(X_val)
val_rmse_gb = mean_squared_error(y_val, val_predictions_gb, squared=False)
val_mse = mean_squared_error(y_val, val_predictions_gb)
print("Validation RMSE:", val_rmse_gb)
print("Validation MSE:", val_mse)

Validation RMSE: 0.49945415707486435
Validation MSE: 0.24945445501936328


In [78]:
# Step 5: Make predictions on the test set
test_predictions_gb = gb_model.predict(test_features)

# Display the predicted relevance scores for the test dataset
print("Predicted relevance scores using Gradient Boosting:")
print(test_predictions_gb)

# Create a DataFrame to store the predictions and other relevant columns
predicted_df = pd.DataFrame(columns=['id', 'relevance'])

# Fill in the 'id' column with the extracted 'id' values
predicted_df['id'] = test_id

# Fill in the 'relevance' column with the predicted values
predicted_df['relevance'] = test_predictions_gb

# Save the DataFrame to a CSV file
predicted_df.to_csv('Info_Sub_3.csv', index=False)

Predicted relevance scores using Gradient Boosting:
[1.98612298 2.4643741  2.32976787 ... 2.68401974 2.32771558 2.29819585]


## 6.4 Random Forest Regressor Model

In [79]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Step 3: Train a Random Forest Regression model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

# Step 4: Evaluate the model on the validation set
val_predictions_rf = rf_model.predict(X_val)
val_rmse_rf = mean_squared_error(y_val, val_predictions_rf, squared=False)
val_mse = mean_squared_error(y_val, val_predictions_rf)
print("Validation RMSE:", val_rmse_rf)
print("Validation MSE", val_mse)
      
test_predictions_rf = rf_model.predict(test_features)

# Display the predicted relevance scores for the test dataset
print("Predicted relevance scores using Random Forest Regressor:")
print(test_predictions_rf)

# Create a DataFrame to store the predictions and other relevant columns
predicted_df = pd.DataFrame(columns=['id', 'relevance'])

# Fill in the 'id' column with the extracted 'id' values
predicted_df['id'] = test_id

# Fill in the 'relevance' column with the predicted values
predicted_df['relevance'] = test_predictions_rf

# Save the DataFrame to a CSV file
predicted_df.to_csv('Info_Sub_4.csv', index=False)

Validation RMSE: 0.5111435574936236
Validation MSE 0.2612677363672373
Predicted relevance scores using Random Forest Regressor:
[1.98074005 2.6632     2.6839     ... 2.66536667 2.4139     2.2197    ]


## 6.5 Decision Tree Regressor Model

In [58]:
from sklearn.tree import DecisionTreeRegressor

# Create the model
dt_model = DecisionTreeRegressor(random_state=42)

# Train the model
dt_model.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = dt_model.predict(X_val)
# Calculate RMSE and MSE on the validation set
val_rmse = mean_squared_error(y_val, val_predictions, squared=False)
val_mse = mean_squared_error(y_val, val_predictions)
print("Validation RMSE:", val_rmse)
print("Validation MSE:", val_mse)
# Make predictions
dt_predictions = dt_model.predict(test_features)

# Create a DataFrame to store the predictions and other relevant columns
predicted_df = pd.DataFrame(columns=['id', 'relevance'])

# Fill in the 'id' column with the extracted 'id' values
predicted_df['id'] = test_id

# Fill in the 'relevance' column with the predicted values
predicted_df['relevance'] = dt_predictions

# Save the DataFrame to a CSV file
predicted_df.to_csv('Info_Sub_5.csv', index=False)

Validation RMSE: 0.6818088512402047
Validation MSE: 0.4648633096294877


## 6.6 KNeighnors Regressor Model

In [59]:
from sklearn.neighbors import KNeighborsRegressor

# Create the model
knn_model = KNeighborsRegressor(n_neighbors=5)

# Train the model
knn_model.fit(X_train, y_train)

val_predictions = knn_model.predict(X_val)
# Calculate RMSE and MSE on the validation set
val_rmse = mean_squared_error(y_val, val_predictions, squared=False)
val_mse = mean_squared_error(y_val, val_predictions)
print("Validation RMSE:", val_rmse)
print("Validation MSE:", val_mse)

# Make predictions
knn_predictions = knn_model.predict(test_features)

# Create a DataFrame to store the predictions and other relevant columns
predicted_df = pd.DataFrame(columns=['id', 'relevance'])

# Fill in the 'id' column with the extracted 'id' values
predicted_df['id'] = test_id

# Fill in the 'relevance' column with the predicted values
predicted_df['relevance'] = knn_predictions

# Save the DataFrame to a CSV file
predicted_df.to_csv('Info_Sub_6.csv', index=False)

Validation RMSE: 0.5422044426424125
Validation MSE: 0.2939856576211692


## 6.7 XGBoost Regressor Model

In [71]:
import xgboost as xgb

# Convert all columns to float
# Convert all columns to float
cols_to_convert = ['cosine_st_pt', 'cosine_st_pd', 'cosine_st_a', 
                   'jaccard_st_pt', 'jaccard_st_pd', 'jaccard_st_a', 
                   'euclidean_st_pt', 'euclidean_st_pd', 'euclidean_st_a']

train_features[cols_to_convert] = train_features[cols_to_convert].astype(float)
# Convert train_y to float
train_y = train_y.astype(float)
test_features[cols_to_convert] = test_features[cols_to_convert].astype(float)

X_train_xg, X_val_xg, y_train_xg, y_val_xg = train_test_split(train_features, train_y, test_size=0.2, random_state=42)

# Define the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Train the model
xgb_model.fit(X_train_xg, y_train_xg)

# Make predictions on the validation set
val_predictions = xgb_model.predict(X_val_xg)

# Calculate RMSE and MSE on the validation set
val_rmse = mean_squared_error(y_val_xg, val_predictions, squared=False)
val_mse = mean_squared_error(y_val_xg, val_predictions)

print("Validation RMSE:", val_rmse)
print("Validation MSE:", val_mse)

xgb_predictions = xgb_model.predict(test_features)

# Create a DataFrame to store the predictions and other relevant columns
predicted_df = pd.DataFrame(columns=['id', 'relevance'])

# Fill in the 'id' column with the extracted 'id' values
predicted_df['id'] = test_id

# Fill in the 'relevance' column with the predicted values
predicted_df['relevance'] = xgb_predictions

# Save the DataFrame to a CSV file
predicted_df.to_csv('xgb_predictions.csv', index=False)


Validation RMSE: 0.5046074768620744
Validation MSE: 0.254628705705109
