### Function definitions

In [1]:
import os

import numpy as np
import openai
from Levenshtein import distance as lev_distance
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Configure the OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable so that the
# secret is never stored in (or committed with) the notebook. If the variable
# is unset the value is None and the first OpenAI request fails with an
# authentication error.
openai.api_key = os.environ.get("OPENAI_API_KEY")

def calculate_embedding_similarity(description1: str, description2: str) -> float:
    """
    Score how close two room descriptions are using OpenAI embeddings.

    Each description is embedded with the ``text-embedding-ada-002`` model
    (one request per description, in argument order) and the cosine
    similarity of the two resulting vectors is returned.

    Args:
        description1 (str): Description of the first room.
        description2 (str): Description of the second room.

    Returns:
        float: Cosine similarity of the two embeddings (value between -1 and 1).
    """
    vectors = []
    for text in (description1, description2):
        # One embeddings request per description.
        response = openai.Embedding.create(
            input=text,
            model="text-embedding-ada-002"
        )
        vectors.append(np.array(response['data'][0]['embedding']))

    first, second = vectors

    # Cosine similarity: dot product divided by the product of the L2 norms.
    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    return np.dot(first, second) / norm_product

def compute_numeric_similarity(room_1, room_2, numeric_columns=None, scaler=None):
    """
    Compute the Euclidean distance between the numeric features of two rooms.

    Despite the name, the returned value is a *distance*: 0.0 means the
    numeric features are identical and larger values mean less similar rooms.

    Fitting a scaler on only the two rows being compared maps every differing
    feature to -1 and +1, so the distance would no longer depend on how far
    apart the values are. Instead, either a scaler fitted on the whole data
    set is supplied by the caller, or each feature is compared through its
    relative difference ``|a - b| / max(|a|, |b|)``, which is scale-free.

    Args:
        room_1: First room. Either a sequence of numbers, or a mapping
            (for example a dict) from feature name to value.
        room_2: Second room, in the same form as ``room_1``.
        numeric_columns (optional): Keys to read from both rooms. When omitted
            and both rooms are dicts, every key present in both rooms whose
            two values are numeric (booleans excluded) is used.
        scaler (optional): An already fitted object with a ``transform``
            method (for example a scikit-learn scaler fitted on the full data
            set). When given, both vectors are transformed with it before the
            distance is taken.

    Returns:
        float: Euclidean distance between the scaled feature vectors
        (0.0 when there are no numeric features to compare).

    Raises:
        ValueError: If the two rooms do not yield the same number of features.
    """
    def _is_number(value):
        # bool is a subclass of int but is not a numeric feature.
        return (isinstance(value, (int, float, np.integer, np.floating))
                and not isinstance(value, bool))

    if numeric_columns is None and isinstance(room_1, dict) and isinstance(room_2, dict):
        numeric_columns = [
            key for key, value in room_1.items()
            if key in room_2 and _is_number(value) and _is_number(room_2[key])
        ]

    if numeric_columns is not None:
        values_1 = [room_1[col] for col in numeric_columns]
        values_2 = [room_2[col] for col in numeric_columns]
    else:
        values_1, values_2 = room_1, room_2

    vector_1 = np.asarray(values_1, dtype=float).ravel()
    vector_2 = np.asarray(values_2, dtype=float).ravel()
    if vector_1.shape != vector_2.shape:
        raise ValueError(
            "Both rooms must provide the same number of numeric features "
            f"(got {vector_1.size} and {vector_2.size})."
        )
    if vector_1.size == 0:
        return 0.0

    if scaler is not None:
        scaled = np.asarray(scaler.transform([vector_1, vector_2]), dtype=float)
        return float(np.linalg.norm(scaled[0] - scaled[1]))

    # Relative difference per feature; a feature that is 0 in both rooms
    # contributes 0 instead of dividing by zero.
    denominator = np.maximum(np.abs(vector_1), np.abs(vector_2))
    relative_diff = np.divide(
        np.abs(vector_1 - vector_2),
        denominator,
        out=np.zeros_like(vector_1),
        where=denominator > 0,
    )
    return float(np.linalg.norm(relative_diff))

def compute_categorical_similarity(room_1, room_2, categorical_columns):
    """
    Compute the fraction of categorical features on which two rooms agree.

    Each column contributes 1.0 when both rooms hold the same value and 0.0
    otherwise (that is, 1 minus the Hamming distance over the selected
    columns). The values are compared directly; no label encoding is needed
    for an equality check.

    Args:
        room_1: First room, indexable by column name (for example a dict).
        room_2: Second room, indexable by column name.
        categorical_columns: Iterable of column names to compare.

    Returns:
        float: Share of matching columns, between 0.0 and 1.0. ``nan`` when
        ``categorical_columns`` is empty, because there is nothing to compare.
    """
    columns = list(categorical_columns)
    if not columns:
        # Same value np.mean([]) would produce, without the RuntimeWarning.
        return float("nan")

    matches = [1.0 if room_1[col] == room_2[col] else 0.0 for col in columns]
    return float(np.mean(matches))

def compute_levenshtein_similarity(description_1, description_2):
    """
    Compute a normalized Levenshtein similarity between two strings.

    The edit distance is divided by the length of the longer string and
    subtracted from 1, so identical strings score 1.0.

    Args:
        description_1 (str): First text.
        description_2 (str): Second text.

    Returns:
        float: ``1 - distance / max(len(description_1), len(description_2))``;
        1.0 when both strings are empty.
    """
    longest = max(len(description_1), len(description_2))
    if longest == 0:
        # Two empty strings are identical; without this guard the
        # normalization below would divide by zero.
        return 1.0
    return 1 - lev_distance(description_1, description_2) / longest



['model.pkl']

### Model Training

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# --- Build the training set ---------------------------------------------------
# Each row of X holds the similarity metrics of one room pair; y holds the
# class label (1 = the two rooms match, 0 = they do not).
SEED = 42        # fixes the synthetic data, the train/test split and the model
N_SAMPLES = 100  # number of simulated room pairs

rng = np.random.default_rng(SEED)

NUMERIC_COLUMNS = ['price', 'size']
CATEGORICAL_COLUMNS = ['city']

# Simulated data for the example: a reference room, a near-duplicate of it and
# an unrelated room. Replace this with real labelled pairs.
reference_room = {'price': 100, 'size': 25, 'city': 'Paris', 'description': 'A spacious room in the center of Paris'}
matching_room = {'price': 110, 'size': 28, 'city': 'Paris', 'description': 'A large room located in downtown Paris'}
unrelated_room = {'price': 240, 'size': 60, 'city': 'Rome', 'description': 'A family suite with garden view near the Colosseum'}

# Every embedding similarity costs two OpenAI requests, so each distinct pair
# of descriptions is computed only once and then reused.
text_similarity_cache = {}

X = []
y = []
for i in range(N_SAMPLES):
    # Alternate matching / non-matching pairs so both classes are present;
    # a single-class target cannot be stratified, tuned on F1 or scored with ROC-AUC.
    is_match = i % 2 == 0
    room_1 = reference_room
    room_2 = dict(matching_room if is_match else unrelated_room)
    for col in NUMERIC_COLUMNS:
        # +/-10% jitter so the samples are not all identical.
        room_2[col] = float(room_2[col] * rng.uniform(0.9, 1.1))

    # Numeric and categorical features. Only the numeric columns are passed to
    # the numeric metric: the room dicts also contain text fields.
    numeric_similarity = compute_numeric_similarity(
        [room_1[col] for col in NUMERIC_COLUMNS],
        [room_2[col] for col in NUMERIC_COLUMNS],
    )
    categorical_similarity = compute_categorical_similarity(room_1, room_2, categorical_columns=CATEGORICAL_COLUMNS)

    description_pair = (room_1['description'], room_2['description'])
    if description_pair not in text_similarity_cache:
        text_similarity_cache[description_pair] = calculate_embedding_similarity(*description_pair)
    text_similarity = text_similarity_cache[description_pair]
    lev_similarity = compute_levenshtein_similarity(*description_pair)

    X.append([numeric_similarity, categorical_similarity, text_similarity, lev_similarity])
    # Dummy label taken from how the pair was built, not from a threshold on
    # one of the features (which would leak that feature into the target).
    y.append(int(is_match))

X = np.array(X)
y = np.array(y)

# Split into training and test data (seeded so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)

# Define the XGBoost model.
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=SEED)

# Hyper-parameter grid explored by GridSearchCV.
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 150],
    'scale_pos_weight': [1, 2, 5]  # weighting for imbalanced classes
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='f1', n_jobs=-1)

# Fit the search on the training data.
grid_search.fit(X_train, y_train)

# Best model found by the search.
best_model = grid_search.best_estimator_

# Predictions and evaluation on the held-out test split.
y_pred = best_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ROC-AUC computed from the predicted probability of the positive class.
y_prob = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print(f"ROC-AUC: {roc_auc}")


### Store the model

In [None]:
import joblib

# Persist the tuned model.
# GridSearchCV was created with its default refit behaviour, so best_model
# (grid_search.best_estimator_) is already fitted on the whole training split
# with the best hyper-parameters; fitting it again here would be redundant.
# The file name matches the path the serving cell passes to joblib.load.
joblib.dump(best_model, 'model.pkl')

print("Modelo entrenado y guardado correctamente.")

In [2]:
from fastapi import FastAPI
import joblib
import numpy as np
from pydantic import BaseModel

# Load the trained model from disk.
# NOTE(review): joblib files are pickle-based, so only load files produced by
# a trusted source — confirm "model.pkl" is the artifact written by the
# training notebook.
model = joblib.load("model.pkl")

# Create the FastAPI application that the endpoint below is registered on.
app = FastAPI()

# Request body schema for the prediction endpoint, declared with Pydantic.
class InputData(BaseModel):
    # Feature values of a single sample, as a flat list of floats.
    features: list[float]

# Prediction endpoint.
@app.post("/predict")
def predict(input_data: InputData):
    """
    Predict the class of a single feature vector with the loaded model.

    Args:
        input_data (InputData): Request body whose ``features`` list holds the
            feature values of one sample.

    Returns:
        dict: ``{"prediction": <int>}`` with the class predicted by the model.

    Raises:
        HTTPException: 422 when ``features`` is empty or its length differs
            from the number of features the model reports it was trained on.
    """
    # Local import: HTTPException is only needed by this endpoint.
    from fastapi import HTTPException

    received = len(input_data.features)
    # n_features_in_ follows the scikit-learn estimator convention; the check
    # is skipped when the loaded model does not expose it.
    expected = getattr(model, "n_features_in_", None)
    if received == 0 or (expected is not None and received != expected):
        # Reject bad input with a client error instead of letting the model
        # fail with an unhandled exception (HTTP 500).
        wanted = expected if expected is not None else "at least 1"
        raise HTTPException(
            status_code=422,
            detail=f"Expected {wanted} feature value(s), got {received}.",
        )

    # Convert the input into a NumPy array with a single row.
    features = np.array(input_data.features).reshape(1, -1)

    # Run the prediction.
    prediction = model.predict(features)
    return {"prediction": int(prediction[0])}


In [None]:
import requests

# Call the room_match service with one reference property and one supplier catalog.
url = "https://room-match.cupid.travel/room_match"

payload = {
    "debug": True,
    "run_ner": True,
    # Reference rooms of the property that supplier rooms are mapped onto.
    "referenceCatalog": [
        {
            "propertyName": "Pestana Park Avenue",
            "propertyId": "5122906",
            "referenceRoomInfo": [
                {"roomId": "512290602", "roomName": "Classic Room"},
                {"roomId": "512290603", "roomName": "Superior Room"},
                {"roomId": "512290604", "roomName": "Superior Room with City View"},
                {"roomId": "512290605", "roomName": "Balcony Room"},
                {"roomId": "512290608", "roomName": "Classic Room - Disability Access"},
                {"roomId": "512290609", "roomName": "Superior Room - Disability Access"},
                {"roomId": "512290610", "roomName": "Junior Suite - Disability Access"},
            ],
        }
    ],
    # Supplier rooms to be matched against the reference catalog.
    "inputCatalog": [
        {
            "supplierId": "nuitee",
            "supplierRoomInfo": [
                {"supplierRoomId": "2", "supplierRoomName": "Classic Room - Olympic Queen Bed - ROOM ONLY"},
                {"supplierRoomId": "3", "supplierRoomName": "CLASSIC ROOM ADA - ROOM ONLY"},
                {"supplierRoomId": "5", "supplierRoomName": "SUPERIOR ROOM ADA - ROOM ONLY"},
                {"supplierRoomId": "10", "supplierRoomName": "Superior Room - Olympic Queen Bed - ROOM ONLY"},
                {"supplierRoomId": "6", "supplierRoomName": "Superior City View - Olympic Queen Bed - ROOM ONLY"},
                {"supplierRoomId": "7", "supplierRoomName": "Balcony Room - Olympic Queen Bed - ROOM ONLY"},
            ],
        }
    ],
}
headers = {
    "accept": "application/json",
    "content-type": "application/json",
    # The API key is a secret: it is read from the ROOM_MATCH_API_KEY
    # environment variable instead of being stored in the notebook.
    # A missing variable raises KeyError before any request is sent.
    "X-API-Key": os.environ["ROOM_MATCH_API_KEY"],
}

# requests has no default timeout; set one so the cell cannot hang forever
# when the service does not answer.
response = requests.post(url, json=payload, headers=headers, timeout=30)

print(response.text)

In [None]:
# Sample room-matching result kept for reference.
# NOTE(review): this appears to be the response body of the room_match request
# above (the property, room and supplier IDs match that payload) — confirm.
# Evaluating the cell only displays the dict; nothing is assigned.
{
  "Results": [
    {
      "cleanRoomName": "classic room",
      "mappedRooms": [
        {
          "cleanSupplierRoomName": "classic room accessible r.o.",
          "roomDescription": "double-person//CLASS//roomOccupancy||classic//CLASS//roomClass||one-double-bed//CLASS//bedType||double-room//CLASS//roomType||+any_view+//CLASS//roomView||accessible//CLASS//roomAccessibility||non-smoking//CLASS//roomSmoking||room-only//CLASS//boardType||non-refundable//CLASS//cancellationPolicy||with-windows//CLASS//windows",
          "score": 1,
          "supplierId": "nuitee",
          "supplierRoomId": "3",
          "supplierRoomName": "CLASSIC ROOM ADA - ROOM ONLY"
        }
      ],
      "propertyId": "5122906",
      "propertyName": "Pestana Park Avenue",
      "roomDescription": "double-person//CLASS//roomOccupancy||classic//CLASS//roomClass||+any_bed+//CLASS//bedType||double-room//CLASS//roomType||+any_view+//CLASS//roomView||non-smoking//CLASS//roomSmoking||+any_food+//CLASS//boardType||+any-refundable+//CLASS//cancellationPolicy||with-windows//CLASS//windows",
      "roomId": "512290602",
      "roomName": "Classic Room"
    },
    {
      "cleanRoomName": "superior room",
      "mappedRooms": [
        {
          "cleanSupplierRoomName": "superior room accessible r.o.",
          "roomDescription": "double-person//CLASS//roomOccupancy||superior//CLASS//roomClass||one-double-bed//CLASS//bedType||double-room//CLASS//roomType||+any_view+//CLASS//roomView||accessible//CLASS//roomAccessibility||non-smoking//CLASS//roomSmoking||room-only//CLASS//boardType||non-refundable//CLASS//cancellationPolicy||with-windows//CLASS//windows",
          "score": 1,
          "supplierId": "nuitee",
          "supplierRoomId": "5",
          "supplierRoomName": "SUPERIOR ROOM ADA - ROOM ONLY"
        }
      ],
      "propertyId": "5122906",
      "propertyName": "Pestana Park Avenue",
      "roomDescription": "double-person//CLASS//roomOccupancy||superior//CLASS//roomClass||+any_bed+//CLASS//bedType||double-room//CLASS//roomType||+any_view+//CLASS//roomView||non-smoking//CLASS//roomSmoking||+any_food+//CLASS//boardType||+any-refundable+//CLASS//cancellationPolicy||with-windows//CLASS//windows",
      "roomId": "512290603",
      "roomName": "Superior Room"
    }
  ],
  "UnmappedRooms": [
    {
      "cleanSupplierRoomName": "classic room olympic queen bed r.o.",
      "roomDescription": "double-person//CLASS//roomOccupancy||classic//CLASS//roomClass||one-queen-bed//CLASS//bedType||queen-room//CLASS//roomType||+any_view+//CLASS//roomView||non-smoking//CLASS//roomSmoking||room-only//CLASS//boardType||non-refundable//CLASS//cancellationPolicy||with-windows//CLASS//windows",
      "supplierId": "nuitee",
      "supplierRoomId": "2",
      "supplierRoomName": "Classic Room - Olympic Queen Bed - ROOM ONLY"
    },
    {
      "cleanSupplierRoomName": "superior room olympic queen bed r.o.",
      "roomDescription": "double-person//CLASS//roomOccupancy||superior//CLASS//roomClass||one-queen-bed//CLASS//bedType||queen-room//CLASS//roomType||+any_view+//CLASS//roomView||non-smoking//CLASS//roomSmoking||room-only//CLASS//boardType||non-refundable//CLASS//cancellationPolicy||with-windows//CLASS//windows",
      "supplierId": "nuitee",
      "supplierRoomId": "10",
      "supplierRoomName": "Superior Room - Olympic Queen Bed - ROOM ONLY"
    },
    {
      "cleanSupplierRoomName": "superior city view olympic queen bed r.o.",
      "roomDescription": "double-person//CLASS//roomOccupancy||superior//CLASS//roomClass||one-queen-bed//CLASS//bedType||queen-room//CLASS//roomType||city-view//CLASS//roomView||non-smoking//CLASS//roomSmoking||room-only//CLASS//boardType||non-refundable//CLASS//cancellationPolicy||with-windows//CLASS//windows",
      "supplierId": "nuitee",
      "supplierRoomId": "6",
      "supplierRoomName": "Superior City View - Olympic Queen Bed - ROOM ONLY"
    },
    {
      "cleanSupplierRoomName": "balcony room olympic queen bed r.o.",
      "roomDescription": "double-person//CLASS//roomOccupancy||standard//CLASS//roomClass||one-queen-bed//CLASS//bedType||queen-room//CLASS//roomType||+any_view+//CLASS//roomView||balcony//CLASS//roomAmenities||non-smoking//CLASS//roomSmoking||room-only//CLASS//boardType||non-refundable//CLASS//cancellationPolicy||with-windows//CLASS//windows",
      "supplierId": "nuitee",
      "supplierRoomId": "7",
      "supplierRoomName": "Balcony Room - Olympic Queen Bed - ROOM ONLY"
    }
  ]
}