In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
df = pd.read_csv('final_data.csv')

In [4]:
df.head()

Unnamed: 0,property_type,area_type,availability,location,bedroom,area,bath,balcony,price_per_sqft,price,built_up_area
0,flat,Super built-up Area,Ready To Move,somasundara palya,2,1140.0,2,2,4035.0,0.46,969.0
1,flat,Super built-up Area,Under Construction,marathahalli,3,1305.0,2,1,5287.0,0.69,1109.25
2,flat,Super built-up Area,Ready To Move,chikkalasandra,3,1270.0,2,3,4331.0,0.55,1079.5
3,flat,Super built-up Area,Ready To Move,harlur,2,1290.0,2,1,6589.0,0.85,1096.5
4,flat,Built-up Area,Ready To Move,other,3,1503.0,2,2,6188.0,0.93,1503.0


In [5]:
df = df.drop(columns=['price_per_sqft', 'area_type', 'area'])

In [6]:
df.head()

Unnamed: 0,property_type,availability,location,bedroom,bath,balcony,price,built_up_area
0,flat,Ready To Move,somasundara palya,2,2,2,0.46,969.0
1,flat,Under Construction,marathahalli,3,2,1,0.69,1109.25
2,flat,Ready To Move,chikkalasandra,3,2,3,0.55,1079.5
3,flat,Ready To Move,harlur,2,2,1,0.85,1096.5
4,flat,Ready To Move,other,3,2,2,0.93,1503.0


In [7]:
X = df.drop(columns=['price'])
y = df['price']

In [8]:
y_transformed = np.log1p(y)

In [9]:
columns_to_encode = ['property_type','location', 'availability']

Ordinal Encoding

In [10]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedroom', 'bath', 'built_up_area', 'balcony']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

In [11]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [12]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [13]:
scores.mean(),scores.std()

(np.float64(0.4728443568051849), np.float64(0.020390324741189774))

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [15]:
pipeline.fit(X_train,y_train)

In [16]:
y_pred = pipeline.predict(X_test)

In [17]:
y_pred = np.expm1(y_pred)

In [18]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.3500976430527737

In [20]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [21]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

In [22]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}

In [24]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [25]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [26]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
8,xgboost,0.802002,0.211113
5,random forest,0.763515,0.233532
6,gradient boosting,0.756953,0.249153
4,decision tree,0.590762,0.305257
7,adaboost,0.639009,0.305723
2,ridge,0.472845,0.350095
0,linear_reg,0.472844,0.350098
1,svr,0.405132,0.362475
3,LASSO,-0.000291,0.488547


One Hot Encoding

In [27]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedroom', 'bath', 'built_up_area', 'balcony']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['location'])
    ],
    remainder='passthrough'
)

In [28]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output


In [29]:
  model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}

In [30]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [31]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [32]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
8,xgboost,0.808968,0.207278
5,random forest,0.788912,0.211817
6,gradient boosting,0.771941,0.239889
4,decision tree,0.661277,0.271623
0,linear_reg,0.618178,0.286936
2,ridge,0.618369,0.287434
7,adaboost,0.627172,0.332891
1,svr,0.403692,0.362832
3,LASSO,-0.000291,0.488547


Target Encoding

In [33]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [34]:
import category_encoders as ce

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedroom', 'bath', 'built_up_area', 'balcony']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['location']),
        ('target_enc', ce.TargetEncoder(), ['availability'])
    ],
    remainder='passthrough'
)

In [35]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output


In [36]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}

In [37]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [38]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [39]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
8,xgboost,0.808879,0.207266
5,random forest,0.789022,0.212766
6,gradient boosting,0.771802,0.240038
4,decision tree,0.661787,0.271456
0,linear_reg,0.618317,0.286534
2,ridge,0.61838,0.287342
7,adaboost,0.602327,0.352364
1,svr,0.403707,0.362837
3,LASSO,-0.000291,0.488547


In [40]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for XGBoost
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 5, 7],
    'regressor__subsample': [0.8, 1.0],
    'regressor__colsample_bytree': [0.8, 1.0]
}

# Create the XGBoost regressor
xgb = XGBRegressor(random_state=42)

# Create the pipeline with the XGBoost regressor
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb)
])

# Perform GridSearchCV
grid_search = GridSearchCV(pipeline_xgb, param_grid, cv=kfold, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score (R2): ", grid_search.best_score_)

Best parameters:  {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.2, 'regressor__max_depth': 7, 'regressor__n_estimators': 300, 'regressor__subsample': 1.0}
Best cross-validation score (R2):  0.8150724490011058


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Define parameter grid for XGBoost
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__subsample': [0.8, 1.0],
    'regressor__colsample_bytree': [0.8, 1.0]
}

# Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42))
])

# Grid Search CV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X, y_transformed)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

# Predict with best model
best_model = grid_search.best_estimator_


In [41]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import numpy as np

# Parameter distribution for random search
param_dist = {
    'regressor__n_estimators': [100, 200, 300, 500],
    'regressor__max_depth': [3, 4, 5, 6, 7, 8],
    'regressor__learning_rate': np.linspace(0.01, 0.3, 10),
    'regressor__subsample': [0.6, 0.8, 1.0],
    'regressor__colsample_bytree': [0.6, 0.8, 1.0],
    'regressor__gamma': [0, 0.1, 0.2, 0.3],
    'regressor__min_child_weight': [1, 3, 5, 7]
}

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42))
])

# Randomized Search CV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,                  # number of random combinations to try
    scoring='r2',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit search
random_search.fit(X, y_transformed)

# Best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best R2 Score:", random_search.best_score_)

# Best model
best_model_random = random_search.best_estimator_


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'regressor__subsample': 1.0, 'regressor__n_estimators': 200, 'regressor__min_child_weight': 1, 'regressor__max_depth': 8, 'regressor__learning_rate': np.float64(0.1711111111111111), 'regressor__gamma': 0, 'regressor__colsample_bytree': 0.6}
Best R2 Score: 0.8222376923442445


In [42]:
best_model_random.fit(X,y_transformed)

In [43]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedroom', 'bath', 'built_up_area', 'balcony']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['location']),
        ('target_enc', ce.TargetEncoder(), ['availability'])
    ],
    remainder='passthrough'
)

In [44]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42))
])

In [45]:
pipeline.fit(X,y_transformed)

In [46]:
X.columns

Index(['property_type', 'availability', 'location', 'bedroom', 'bath',
       'balcony', 'built_up_area'],
      dtype='object')

In [47]:
X.iloc[0].values

array(['flat', 'Ready To Move', 'somasundara palya', np.int64(2),
       np.int64(2), np.int64(2), np.float64(969.0)], dtype=object)

In [48]:
data = [['house', 'Ready To Move', 'electronic city phase ii', 3, 2, 1, 1050.0]]
columns = ['property_type', 'availability', 'location', 'bedroom', 'bath',
       'balcony', 'built_up_area']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,availability,location,bedroom,bath,balcony,built_up_area
0,house,Ready To Move,electronic city phase ii,3,2,1,1050.0


In [49]:
np.expm1(pipeline.predict(one_df))

array([0.6665821], dtype=float32)

In [50]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

features_preprocessed = preprocessor.transform(X)  # Transform the features using the existing preprocessor

recommender_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)
recommender_model.fit(features_preprocessed)

def get_recommendations(user_input, top_k=5, df=df, preprocessor=preprocessor, model=recommender_model):

    # Convert user_input to DataFrame if it's a dict
    if isinstance(user_input, dict):
        user_df = pd.DataFrame([user_input])
    else:
        user_df = pd.DataFrame(user_input).iloc[[0]]

    # Preprocess the user input
    user_preprocessed = preprocessor.transform(user_df)

    # Find nearest neighbors (distances and indices)
    distances, indices = model.kneighbors(user_preprocessed, n_neighbors=top_k + 1)

    # Get indices of recommendations
    rec_indices = indices.flatten()[1:]

    # Return the recommended properties from the original df
    recommendations = df.iloc[rec_indices]

    return recommendations

In [53]:
 user_preferences = {
    'property_type': 'flat',
    'availability': 'Ready To Move',
    'location': 'rajaji nagar',
    'bedroom': 3,
    'bath': 2,
    'balcony': 1,
    'built_up_area': 1200.0}

In [54]:
recommended_houses = get_recommendations(user_preferences, top_k=5)
print(recommended_houses)

     property_type   availability            location  bedroom  bath  balcony  \
4401          flat  Ready To Move        rajaji nagar        2     2        1   
2757          flat  Ready To Move        rajaji nagar        2     2        1   
7581          flat  Ready To Move        rajaji nagar        3     2        2   
1115          flat  Ready To Move        rajaji nagar        3     2        2   
6731          flat  Ready To Move  yelahanka new town        3     2        1   

      price  built_up_area  
4401   1.70         1164.5  
2757   0.70         1020.0  
7581   0.81         1028.5  
1115   2.50         1530.0  
6731   0.48          850.0  


In [55]:
from sklearn.metrics import precision_score, recall_score

X_train, X_test, y_train, y_test = train_test_split(X, df['price'], test_size=0.2, random_state=42)

X_train_preprocessed = preprocessor.transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

recommender_model.fit(X_train_preprocessed)

def is_relevant(pred_index, true_index, df_train, df_test, threshold=0.9):
    pred_features = preprocessor.transform(df_train.iloc[[pred_index]])
    true_features = preprocessor.transform(df_test.iloc[[true_index]])
    similarity = cosine_similarity(pred_features, true_features)[0][0]
    return similarity >= threshold

def precision_recall_at_k(model, X_test_preprocessed, X_train, X_test, df_train, df_test, k=5, threshold=0.9):
    """Compute Precision@k and Recall@k."""
    precisions, recalls = [], []

    for i in range(X_test_preprocessed.shape[0]):
        distances, indices = model.kneighbors(X_test_preprocessed[i:i+1], n_neighbors=k)
        recommended_indices = indices.flatten()

        relevant = [i]
        predicted_relevant = [idx for idx in recommended_indices if is_relevant(idx, i, df_train, df_test, threshold)]

        precision = len(predicted_relevant) / k if k > 0 else 0
        recall = len(predicted_relevant) / len(relevant) if len(relevant) > 0 else 0

        precisions.append(precision)
        recalls.append(recall)

    return np.mean(precisions), np.mean(recalls)

In [56]:
precision, recall = precision_recall_at_k(recommender_model, X_test_preprocessed, X_train, X_test, X_train.join(y_train), X_test.join(y_test), k=5)

# Print results
print(f"Precision@5: {precision:.4f}")

Precision@5: 0.9961


In [59]:
print(f"Recall@5: {recall:.4f}")

Recall@5: 4.9807


In [60]:
def mrr_at_k(model, X_test_preprocessed, X_train, X_test, df_train, df_test, k=5, threshold=0.9):
    """Compute Mean Reciprocal Rank."""
    mrr_scores = []

    for i in range(X_test_preprocessed.shape[0]):
        distances, indices = model.kneighbors(X_test_preprocessed[i:i+1], n_neighbors=k)
        recommended_indices = indices.flatten()

        # Find the rank of the first relevant item
        for rank, idx in enumerate(recommended_indices, 1):
            if is_relevant(idx, i, df_train, df_test, threshold):
                mrr_scores.append(1 / rank)
                break
        else:
            mrr_scores.append(0)  # No relevant item found in top-k

    return np.mean(mrr_scores)

In [61]:
mrr = mrr_at_k(recommender_model, X_test_preprocessed, X_train, X_test, X_train.join(y_train), X_test.join(y_test), k=5)

print(f"MRR@5: {mrr:.4f}")



MRR@5: 0.9967


In [62]:
def ndcg_at_k(model, X_test_preprocessed, X_train, X_test, df_train, df_test, k=5, threshold=0.9):
    """Compute NDCG@k."""
    def dcg(relevances, k):
        return sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevances[:k]))

    ndcg_scores = []

    for i in range(X_test_preprocessed.shape[0]):
        distances, indices = model.kneighbors(X_test_preprocessed[i:i+1], n_neighbors=k)
        recommended_indices = indices.flatten()

        # Relevance scores: 1 if relevant, 0 otherwise
        relevances = [1 if is_relevant(idx, i, df_train, df_test, threshold) else 0 for idx in recommended_indices]

        # DCG for recommendations
        dcg_score = dcg(relevances, k)
        # Ideal DCG: assumes all top-k are relevant
        ideal_relevances = [1] * min(k, 1)  # Simplified: one relevant item
        idcg_score = dcg(ideal_relevances, k)

        # NDCG
        ndcg = dcg_score / idcg_score if idcg_score > 0 else 0
        ndcg_scores.append(ndcg)

    return np.mean(ndcg_scores)

In [63]:
ndcg = ndcg_at_k(recommender_model, X_test_preprocessed, X_train, X_test, X_train.join(y_train), X_test.join(y_test), k=5)

print(f"NDCG@5: {ndcg:.4f}")

NDCG@5: 2.9375


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Step 1: Split dataset into train and test to simulate ground truth
# Assuming df is your DataFrame and X is df.drop('price', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, df['price'], test_size=0.2, random_state=42)

# Preprocess training and test sets
X_train_preprocessed = preprocessor.transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Refit the recommender model on training data
recommender_model.fit(X_train_preprocessed)

# Step 2: Define relevance criteria
# Example: A property is relevant if its features are similar (cosine similarity > threshold) or price is within budget
def is_relevant(pred_index, true_index, df_train, df_test, threshold=0.9):
    """Check if recommended property is relevant based on cosine similarity."""
    pred_features = preprocessor.transform(df_train.iloc[[pred_index]])
    true_features = preprocessor.transform(df_test.iloc[[true_index]])
    similarity = cosine_similarity(pred_features, true_features)[0][0]
    return similarity >= threshold  # Adjust threshold based on domain knowledge

# Step 3: Evaluation functions
def precision_recall_at_k(model, X_test_preprocessed, X_train, X_test, df_train, df_test, k=5, threshold=0.9):
    """Compute Precision@k and Recall@k."""
    precisions, recalls = [], []

    for i in range(X_test_preprocessed.shape[0]):
        # Get recommendations for test instance
        distances, indices = model.kneighbors(X_test_preprocessed[i:i+1], n_neighbors=k)
        recommended_indices = indices.flatten()

        # Ground truth: Assume the test instance itself is the relevant item
        # In practice, you might have a list of relevant indices per user
        relevant = [i]  # Simplified: test instance is relevant to itself
        predicted_relevant = [idx for idx in recommended_indices if is_relevant(idx, i, df_train, df_test, threshold)]

        # Precision: proportion of recommended items that are relevant
        precision = len(predicted_relevant) / k if k > 0 else 0
        # Recall: proportion of relevant items that were recommended
        recall = len(predicted_relevant) / len(relevant) if len(relevant) > 0 else 0

        precisions.append(precision)
        recalls.append(recall)

    return np.mean(precisions), np.mean(recalls)

# def mrr_at_k(model, X_test_preprocessed, X_train, X_test, df_train, df_test, k=5, threshold=0.9):
#     """Compute Mean Reciprocal Rank."""
#     mrr_scores = []

#     for i in range(X_test_preprocessed.shape[0]):
#         distances, indices = model.kneighbors(X_test_preprocessed[i:i+1], n_neighbors=k)
#         recommended_indices = indices.flatten()

#         # Find the rank of the first relevant item
#         for rank, idx in enumerate(recommended_indices, 1):
#             if is_relevant(idx, i, df_train, df_test, threshold):
#                 mrr_scores.append(1 / rank)
#                 break
#         else:
#             mrr_scores.append(0)  # No relevant item found in top-k

#     return np.mean(mrr_scores)

# def ndcg_at_k(model, X_test_preprocessed, X_train, X_test, df_train, df_test, k=5, threshold=0.9):
#     """Compute NDCG@k."""
#     def dcg(relevances, k):
#         return sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevances[:k]))

#     ndcg_scores = []

#     for i in range(X_test_preprocessed.shape[0]):
#         distances, indices = model.kneighbors(X_test_preprocessed[i:i+1], n_neighbors=k)
#         recommended_indices = indices.flatten()

#         # Relevance scores: 1 if relevant, 0 otherwise
#         relevances = [1 if is_relevant(idx, i, df_train, df_test, threshold) else 0 for idx in recommended_indices]

#         # DCG for recommendations
#         dcg_score = dcg(relevances, k)
#         # Ideal DCG: assumes all top-k are relevant
#         ideal_relevances = [1] * min(k, 1)  # Simplified: one relevant item
#         idcg_score = dcg(ideal_relevances, k)

#         # NDCG
#         ndcg = dcg_score / idcg_score if idcg_score > 0 else 0
#         ndcg_scores.append(ndcg)

#     return np.mean(ndcg_scores)

# def coverage(model, X_train_preprocessed, df_train, k=5):
#     """Compute catalog coverage."""
#     recommended_items = set()
#     for i in range(X_train_preprocessed.shape[0]):
#         distances, indices = model.kneighbors(X_train_preprocessed[i:i+1], n_neighbors=k)
#         recommended_items.update(indices.flatten())

#     return len(recommended_items) / len(df_train)

# def diversity(model, X_train_preprocessed, k=5):
#     """Compute average intra-list diversity (average pairwise cosine distance)."""
#     diversities = []

#     for i in range(X_train_preprocessed.shape[0]):
#         distances, indices = model.kneighbors(X_train_preprocessed[i:i+1], n_neighbors=k)
#         recommended_indices = indices.flatten()
#         recommended_features = X_train_preprocessed[recommended_indices]

#         # Compute pairwise cosine distances
#         if len(recommended_indices) > 1:
#             sim_matrix = cosine_similarity(recommended_features)
#             # Cosine distance = 1 - cosine similarity
#             dist_matrix = 1 - sim_matrix
#             # Average pairwise distance (excluding self-similarities)
#             avg_distance = np.mean(dist_matrix[np.triu_indices(len(recommended_indices), k=1)])
#             diversities.append(avg_distance)

#     return np.mean(diversities)

# Step 4: Compute metrics
# Assuming df_train = X_train.join(y_train), df_test = X_test.join(y_test)
precision, recall = precision_recall_at_k(recommender_model, X_test_preprocessed, X_train, X_test, X_train.join(y_train), X_test.join(y_test), k=5)
# mrr = mrr_at_k(recommender_model, X_test_preprocessed, X_train, X_test, X_train.join(y_train), X_test.join(y_test), k=5)
# ndcg = ndcg_at_k(recommender_model, X_test_preprocessed, X_train, X_test, X_train.join(y_train), X_test.join(y_test), k=5)
# cov = coverage(recommender_model, X_train_preprocessed, X_train.join(y_train), k=5)
# div = diversity(recommender_model, X_train_preprocessed, k=5)

# Print results
print(f"Precision@5: {precision:.4f}")
# print(f"Recall@5: {recall:.4f}")
# print(f"MRR@5: {mrr:.4f}")
# print(f"NDCG@5: {ndcg:.4f}")
# print(f"Coverage: {cov:.4f}")
# print(f"Diversity: {div:.4f}")