### Function definitions
This section defines the functions that calculate the similarity metrics.

In [None]:
import os

import numpy as np
import openai
from Levenshtein import distance as lev_distance
from scipy.spatial.distance import cosine
from scipy.stats import randint, uniform
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Configure the OpenAI API key. The OPENAI_API_KEY environment variable is
# preferred so that real credentials never have to be written into the source;
# the placeholder is kept only as a fallback when the variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY", "my_api_key")

def calculate_embedding_similarity(description1: str, description2: str) -> float:
    """
    Return the cosine similarity of the OpenAI embeddings of two descriptions.

    Each description is embedded with its own request to the
    ``text-embedding-ada-002`` model, and the two resulting vectors are
    compared.

    Args:
        description1 (str): First room description.
        description2 (str): Second room description.

    Returns:
        float: Cosine similarity of the two embedding vectors (value between -1 and 1).
    """

    def _embed(text: str) -> np.ndarray:
        """Request the embedding of ``text`` and return it as a NumPy array."""
        reply = openai.Embedding.create(
            input=text,
            model="text-embedding-ada-002"
        )
        return np.array(reply['data'][0]['embedding'])

    vector_a = _embed(description1)
    vector_b = _embed(description2)

    # cos(a, b) = (a . b) / (|a| * |b|)
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    return np.dot(vector_a, vector_b) / norm_product

def compute_numeric_similarity(room_1, room_2, numeric_columns=None):
    """Return a scale-independent Euclidean distance between two rooms.

    Every numeric feature contributes its relative difference
    ``|a - b| / max(|a|, |b|)`` and the result is the Euclidean norm of those
    per-feature values. The result is 0.0 when the rooms are numerically
    identical and grows as they diverge (it is a distance, not a similarity
    in [0, 1]).

    A ``StandardScaler`` fitted on only the two rows being compared is
    deliberately not used: with two samples every differing column is mapped
    to -1/+1, so the distance would depend only on how many columns differ
    and not on how much they differ.

    Args:
        room_1: First room, either a mapping (e.g. dict) of feature name to
            value or a sequence of numeric values.
        room_2: Second room, in the same form as ``room_1``.
        numeric_columns: Optional keys (or indices) of the features to
            compare. When omitted and both rooms are mappings, every key
            whose value is numeric in both rooms is used; when omitted and
            the rooms are sequences, all of their values are used.

    Returns:
        float: Euclidean norm of the per-feature relative differences.

    Raises:
        ValueError: If the two rooms yield a different number of values.
    """

    def _is_number(value):
        # bool is a subclass of int but is not a meaningful magnitude here.
        return isinstance(value, (int, float, np.number)) and not isinstance(value, bool)

    if numeric_columns is None and hasattr(room_1, "keys") and hasattr(room_2, "keys"):
        numeric_columns = [
            key for key in room_1.keys()
            if key in room_2 and _is_number(room_1[key]) and _is_number(room_2[key])
        ]

    if numeric_columns is not None:
        values_1 = [room_1[col] for col in numeric_columns]
        values_2 = [room_2[col] for col in numeric_columns]
    else:
        values_1, values_2 = room_1, room_2

    a = np.asarray(values_1, dtype=float).ravel()
    b = np.asarray(values_2, dtype=float).ravel()
    if a.shape != b.shape:
        raise ValueError(
            f"Rooms have a different number of numeric values: {a.size} vs {b.size}"
        )

    # Features that are 0 in both rooms are identical; leave their relative
    # difference at 0 instead of dividing by zero.
    scale = np.maximum(np.abs(a), np.abs(b))
    relative_diff = np.divide(np.abs(a - b), scale, out=np.zeros_like(a), where=scale > 0)
    return float(np.linalg.norm(relative_diff))

def compute_categorical_similarity(room_1, room_2, categorical_columns):
    """Return the fraction of categorical columns on which two rooms agree.

    This is the Hamming similarity over the selected columns: 1.0 when every
    column has the same value in both rooms, 0.0 when none does. Values are
    compared directly with ``==``; no label encoding is needed to test
    equality.

    Args:
        room_1: First room; must support ``room_1[col]`` for every column.
        room_2: Second room; must support ``room_2[col]`` for every column.
        categorical_columns: Iterable of column keys to compare.

    Returns:
        float: Share of matching columns in [0, 1], or NaN when
        ``categorical_columns`` is empty (nothing to compare).
    """
    matches = [room_1[col] == room_2[col] for col in categorical_columns]
    if not matches:
        # Undefined without columns; return NaN explicitly rather than let
        # np.mean([]) emit a RuntimeWarning.
        return float("nan")
    return float(np.mean(matches))

def compute_levenshtein_similarity(description_1, description_2):
    """Return the normalized Levenshtein similarity of two strings.

    The edit distance is divided by the length of the longer string and
    subtracted from 1, so identical strings score 1.0 and strings with
    nothing in common score 0.0.

    Args:
        description_1: First text.
        description_2: Second text.

    Returns:
        float: Similarity in [0, 1]. Two empty strings are identical and
        score 1.0.
    """
    longest = max(len(description_1), len(description_2))
    if longest == 0:
        # Both strings are empty: avoid dividing by zero.
        return 1.0
    return 1 - lev_distance(description_1, description_2) / longest



### Model Training

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

# Prepare data for training the model.
# X and y must end up with the same number of entries: one row of similarity
# metrics and one label per room pair.
X = []  # Here the calculated similarity metrics should be added
y = []  # Here the labels should be added (0 or 1, depending on whether there is a match or not)

# Example data for two rooms
room_1 = {'price': 100, 'size': 25, 'city': 'Paris', 'description': 'A spacious room in the center of Paris'}
room_2 = {'price': 110, 'size': 28, 'city': 'Paris', 'description': 'A large room located in downtown Paris'}

# Similarity metrics calculation.
# Only the numeric fields are passed to the numeric metric; the room dicts
# also hold strings, which cannot be used in a numeric distance.
numeric_columns = ['price', 'size']
numeric_similarity = compute_numeric_similarity(
    [room_1[col] for col in numeric_columns],
    [room_2[col] for col in numeric_columns],
)
categorical_similarity = compute_categorical_similarity(room_1, room_2, categorical_columns=['city'])
text_similarity = calculate_embedding_similarity(room_1['description'], room_2['description'])
lev_similarity = compute_levenshtein_similarity(room_1['description'], room_2['description'])

X.append([numeric_similarity, categorical_similarity, text_similarity, lev_similarity])

X = np.array(X)
y = np.array(y)

# Divide into training and test data. With stratify=y, we ensure that the proportion of classes is the same in both sets.
# The split is seeded so that it is reproducible, like the hyperparameter search below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Define the XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

# Define the hyperparameter search with RandomizedSearchCV.
# scipy.stats.randint(low, high) excludes `high`, and
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    'max_depth': randint(3, 11),            # Depth between 3 and 10 (inclusive)
    'learning_rate': uniform(0.01, 0.29),   # Learning rate between 0.01 and 0.3
    'n_estimators': randint(50, 201),       # Number of trees between 50 and 200 (inclusive)
    'subsample': uniform(0.6, 0.4),         # Subsampling between 0.6 and 1.0
    'colsample_bytree': uniform(0.6, 0.4),  # Subsampling of columns between 0.6 and 1.0
}

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,               # Number of combinations to try
    scoring='f1',            # F1 score; `scoring` takes a scorer name or scorer callable, not a raw metric function
    cv=3,                    # 3-fold cross-validation
    n_jobs=-1,               # Use all available cores
    random_state=42          # Set the seed for reproducibility
)


# Fit the search (and, by default, refit the best model) on the training data
random_search.fit(X_train, y_train)

# Best model after searching for hyperparameters
best_model = random_search.best_estimator_

# Prediction and model evaluation
y_pred = best_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Calculate AUC-ROC to evaluate performance in terms of binary classification
y_prob = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print(f"ROC-AUC: {roc_auc}")


### Store the model

In [None]:
import pickle

# `best_model` is `random_search.best_estimator_`. RandomizedSearchCV (with its
# default refit=True) has already retrained it on the full training set using
# the best hyperparameters found, so it is saved as-is. Fitting it again would
# repeat that work and, if training is not fully deterministic, could store a
# model that differs from the one evaluated above.

# Save the trained model to be used in the room matching system.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a trusted location.
with open('room_match_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

print("Model trained and saved successfully.")