# Model evaluation

## Initialization

In [2]:
!pip install catboost
!pip install optuna
!pip install dill
!pip install psutil
!pip install scikit-learn



In [2]:
from catboost import CatBoostClassifier, Pool
import numpy as np
import time
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import optuna
import json
import psutil
from datetime import datetime
import csv
from difflib import SequenceMatcher
from collections import defaultdict
import joblib


  from .autonotebook import tqdm as notebook_tqdm


## Batch prediction

In [3]:
# Root folder holding every serialized artifact used below.
models_dir = "../../artifacts"

# Attributes predicted for each rate name; one classifier and one label
# encoder are loaded per entry.
categories = [
    "capacity",
    "quality",
    "view",
    "bedding",
    "balcony",
    "bedrooms",
    "club",
    "floor",
    "bathroom",
    "class",
]

# NOTE: joblib.load and np.load(allow_pickle=True) both unpickle data, which
# can execute arbitrary code. Only point models_dir at trusted artifacts.
tfidf = joblib.load(f"{models_dir}/tfidf/tfidf_vectorizer.joblib")

models, label_encoders = {}, {}

for category in categories:
    # CatBoost classifier stored as <models_dir>/cbm/catboost_model_<category>.cbm
    model_path = os.path.join(f"{models_dir}/cbm", f"catboost_model_{category}.cbm")
    model = CatBoostClassifier()
    model.load_model(model_path)
    models[category] = model

    # Rebuild the label encoder from its saved classes_ array instead of
    # refitting it.
    classes_path = f"{models_dir}/labels/npy/label_encoder_{category}.npy"
    le = LabelEncoder()
    le.classes_ = np.load(classes_path, allow_pickle=True)
    label_encoders[category] = le


def batch_predict(rate_names):
    """Predict every category for a batch of rate names.

    Args:
        rate_names: Sequence of rate-name strings; it is converted to a NumPy
            array and passed through the module-level ``tfidf`` vectorizer.

    Returns:
        A list with one dict per input entry, in input order. Each dict maps
        every name in ``categories`` to the decoded label for that entry.
    """
    features = tfidf.transform(np.array(rate_names))

    def decode(name):
        # The model output is flattened and used as positions into the
        # matching encoder's classes_ array to recover the label values.
        raw = models[name].predict(features)
        return label_encoders[name].classes_[raw.ravel()]

    per_category = [decode(name) for name in categories]

    # per_category holds one array per category; zip(*...) regroups the values
    # so that each tuple carries all labels of a single input entry.
    return [dict(zip(categories, labels)) for labels in zip(*per_category)]


# Smoke test: send one hand-written rate name through batch_predict and
# pretty-print the decoded attributes as JSON.
batch_examples = ["King Premium Mountain View no balcony"]

print("\nBatch prediction:")
batch_results = batch_predict(batch_examples)
for result in batch_results:
    pretty = json.dumps(result, indent=2)
    print(pretty)


Batch prediction:
{
  "capacity": "undefined",
  "quality": "premium",
  "view": "mountain view",
  "bedding": "undefined",
  "balcony": "balcony",
  "bedrooms": "undefined",
  "club": "not club",
  "floor": "undefined",
  "bathroom": "private bathroom",
  "class": "room"
}


## Batch prediction to a file

In [8]:
input_file = "../../inputs/rates_dirty.csv"  # Replace with your input CSV file path
output_file = "../../outputs/batch_prediction_results.csv"
df_input = pd.read_csv(input_file)

# Fail fast: validate the schema before doing any other work on the frame.
if "rate_name" not in df_input.columns:
    raise ValueError("The input CSV must contain a 'rate_name' column")

# Missing cells (in any column) become the literal string "undefined".
df_input = df_input.fillna("undefined")

# Process in batches
batch_size = 5500
# read_csv infers a numeric dtype for a purely numeric column; cast so that
# every entry handed to batch_predict is a str.
rate_names = df_input["rate_name"].astype(str).tolist()
results = []

for i in range(0, len(rate_names), batch_size):
    batch = rate_names[i : i + batch_size]
    batch_results = batch_predict(batch)
    results.extend(batch_results)

# Passing columns= keeps the expected schema even when the input has no rows
# (pd.DataFrame([]) would otherwise have no columns and the reorder below
# would raise KeyError).
df_results = pd.DataFrame(results, columns=categories)
# Assign by position: predictions are in input order, so this must not depend
# on df_input's index labels.
df_results["rate_name"] = df_input["rate_name"].to_numpy()

# Reorder columns to have 'rate_name' first
columns = ["rate_name"] + categories
df_results = df_results[columns]

# Create the output folder up front so a missing directory does not discard
# the predictions computed above.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df_results.to_csv(output_file, index=False)

print("Results have been saved to batch_prediction_results.csv in the outputs directory")
print(f"Processed {len(df_input)} entries")

Results have been saved to batch_prediction_results.csv in the outputs directory
Processed 180443 entries


## Find mismatches for given CSV

In [6]:
def calculate_similarity(a, b):
    """Return the difflib similarity ratio of two sequences.

    The value is ``SequenceMatcher.ratio()``: a float in [0, 1], where 1.0
    means the sequences are identical and 0.0 means they share nothing.
    """
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()


def compare_csv_files(
    file1_path, file2_path, threshold=0.8, output_path="../../outputs/mismatches.json"
):
    """Compare the category columns of two CSV files row by row.

    Rows are paired by position; if one file is longer, its extra rows are
    ignored. For every name in the module-level ``categories`` list the two
    cell values are compared with ``calculate_similarity`` after mapping
    "undefined" and missing cells to the empty string. A cell pair whose
    similarity is below ``threshold`` counts as a mismatch.

    Summary statistics are printed, and the per-row mismatches are written as
    JSON to ``output_path``. In that report, values from ``file1_path`` are
    labelled "received" and values from ``file2_path`` are labelled "expected".

    Args:
        file1_path: Path of the first CSV file (reported as "received").
        file2_path: Path of the second CSV file (reported as "expected").
        threshold: Minimum similarity ratio for two values to be considered
            matching.
        output_path: Destination of the JSON mismatch report. Its parent
            directory is created if it does not exist.

    Returns:
        None.
    """

    def _normalized(row, column):
        # DictReader stores None for cells missing from a short row, and
        # .get(column, "") would not replace that None; `or ""` does.
        value = row.get(column) or ""
        return "" if value == "undefined" else value

    total_rows = 0
    error_rows = 0
    column_errors = defaultdict(int)
    mismatches = []

    # newline="" lets the csv module do its own newline handling, as its
    # documentation requires (matters for quoted fields containing newlines).
    with open(file1_path, "r", newline="") as file1, open(
        file2_path, "r", newline=""
    ) as file2:
        reader1 = csv.DictReader(file1)
        reader2 = csv.DictReader(file2)

        for row_number, (row1, row2) in enumerate(zip(reader1, reader2), start=1):
            total_rows = row_number
            row_mismatches = {}

            for cat in categories:
                val1 = _normalized(row1, cat)
                val2 = _normalized(row2, cat)

                if calculate_similarity(val1, val2) < threshold:
                    column_errors[cat] += 1
                    row_mismatches[cat] = {"expected": val2, "received": val1}

            if row_mismatches:
                error_rows += 1
                mismatches.append(
                    {
                        "row": row_number,
                        "rate_name": row1.get("rate_name", "N/A"),
                        "mismatches": row_mismatches,
                    }
                )

    # Files with a header but no data rows would otherwise divide by zero.
    overall_error_rate = error_rows / total_rows if total_rows else 0.0
    print(f"\nOverall error rate: {overall_error_rate:.2f}")
    print(f"Total rows: {total_rows}")
    print(f"Rows with errors: {error_rows}")

    print("\nColumn Error Statistics:")
    for cat, error_count in column_errors.items():
        column_error_rate = error_count / total_rows
        print(
            f"Column {cat}: Error rate = {column_error_rate:.2f}, Errors = {error_count}"
        )

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(mismatches, f, indent=2)

    print("\nMismatches have been saved to 'mismatches.json'")


# Run the comparison: file1 values are reported as "received" and file2
# values as "expected" in the mismatch report.
file1_path, file2_path = "../../inputs/rates_dirty.csv", "../../outputs/o.csv"

compare_csv_files(file1_path, file2_path)


Overall error rate: 0.09
Total rows: 180443
Rows with errors: 16638

Column Error Statistics:
Column capacity: Error rate = 0.04, Errors = 7854
Column bedding: Error rate = 0.04, Errors = 7701
Column view: Error rate = 0.02, Errors = 3902
Column bathroom: Error rate = 0.01, Errors = 2508
Column class: Error rate = 0.02, Errors = 2790
Column quality: Error rate = 0.03, Errors = 4644
Column bedrooms: Error rate = 0.01, Errors = 2214
Column club: Error rate = 0.01, Errors = 1988
Column floor: Error rate = 0.01, Errors = 2012

Mismatches have been saved to 'mismatches.json'
