# Deps


In [1]:
!pip install catboost
!pip install optuna
!pip install dill
!pip install psutil
!pip install scikit-learn



In [2]:
from catboost import CatBoostClassifier
import numpy as np
import time
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, Pool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import optuna
import os
import json
import psutil
from datetime import datetime
import csv
from difflib import SequenceMatcher
from collections import defaultdict
import joblib

  from .autonotebook import tqdm as notebook_tqdm


# Preparation


In [4]:
# Load the cleaned, labelled rates. Missing values become the explicit
# "undefined" class so every target column is fully populated.
df = pd.read_csv("../data/rates_clean.csv")
df = df.fillna("undefined")

# Target columns; one classifier is trained per category.
categories = [
    "capacity",
    "quality",
    "view",
    "bedding",
    "balcony",
    "bedrooms",
    "club",
    "floor",
    "bathroom",
    "class",
]

# Cast every label column to str so each encoder sees a single type.
for cat in categories:
    df[cat] = df[cat].astype(str)

df["rate_name"] = df["rate_name"].astype(str)

X = df["rate_name"]
y = df[categories]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The label-encoding loop below assigns columns into these frames. Work on
# explicit copies so the writes are unambiguous and cannot trigger pandas'
# chained-assignment (SettingWithCopy) warning on the split results.
y_train = y_train.copy()
y_test = y_test.copy()

# Improved TF-IDF vectorizer
tfidf = TfidfVectorizer(
    analyzer="char_wb",  # Character n-grams, including word boundaries
    ngram_range=(1, 3),  # Character 1-, 2- and 3-grams
    max_features=10000,  # Increased to capture more features
    min_df=2,  # Ignore terms that appear in less than 2 documents
    max_df=0.95,  # Ignore terms that appear in more than 95% of the documents
    sublinear_tf=True,  # Apply sublinear tf scaling
    lowercase=True,  # Convert all characters to lowercase
    strip_accents="unicode",  # Remove accents
    norm="l2",  # L2 normalization of the vectors
    use_idf=True,  # Enable inverse-document-frequency reweighting
    smooth_idf=True,  # Smooth idf weights by adding one to document frequencies
)

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Label encoding: replace the string labels with integer codes in place.
# NOTE(review): each encoder is fitted on the training split only, so
# `le.transform` will raise if the test split contains a label that never
# occurs in training -- confirm this is acceptable for rare classes.
label_encoders = {}
for category in categories:
    le = LabelEncoder()
    y_train[category] = le.fit_transform(y_train[category])
    y_test[category] = le.transform(y_test[category])
    label_encoders[category] = le

# Train


In [None]:
def objective(trial, X_train, y_train, X_test, y_test, category):
    """Optuna objective: fit one CatBoost classifier and score it.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_train: Training feature matrix.
        y_train: Training targets; ``y_train[category]`` is the fitted column.
        X_test: Evaluation feature matrix (also used for early stopping).
        y_test: Evaluation targets; ``y_test[category]`` is the scored column.
        category: Name of the target column to train on.

    Returns:
        The weighted-average F1 score of the model on ``X_test``.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 100, 500),
        "depth": trial.suggest_int("depth", 1, 5),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "verbose": 0,
        "task_type": "CPU",
        "thread_count": psutil.cpu_count(logical=False),
        # Cap CatBoost at roughly 80% of the RAM that is free right now.
        "used_ram_limit": f"{int(psutil.virtual_memory().available / (1024 * 1024 * 1024) * 0.8)}GB",
    }

    grow_policy = trial.suggest_categorical(
        "grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]
    )
    params["grow_policy"] = grow_policy
    # CatBoost documents min_data_in_leaf as usable only with the Depthwise and
    # Lossguide policies, so it is only sampled (and passed) for those.
    if grow_policy != "SymmetricTree":
        params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 1, 100)

    params["leaf_estimation_method"] = trial.suggest_categorical(
        "leaf_estimation_method", ["Newton", "Gradient"]
    )

    bootstrap_type = trial.suggest_categorical(
        "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
    )
    params["bootstrap_type"] = bootstrap_type
    # subsample is not sampled for Bayesian bootstrap; leave the key out
    # entirely instead of passing an explicit None.
    if bootstrap_type != "Bayesian":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1.0)

    model = CatBoostClassifier(**params)
    model.fit(
        X_train,
        y_train[category],
        eval_set=(X_test, y_test[category]),
        early_stopping_rounds=50,
        verbose=0,
    )

    y_pred = model.predict(X_test)
    report = classification_report(y_test[category], y_pred, output_dict=True)

    return report["weighted avg"]["f1-score"]


# Training and evaluation
# Every run writes into its own timestamped directory.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
models_dir = f"models_{timestamp}"
os.makedirs(models_dir, exist_ok=True)

models = {}
for category in categories:
    print(f"Training model for {category}")

    # Bind the current category as a default so the callback is self-contained.
    def _run_trial(trial, category=category):
        return objective(
            trial, X_train_tfidf, y_train, X_test_tfidf, y_test, category
        )

    study = optuna.create_study(direction="maximize")
    study.optimize(_run_trial, n_trials=100)

    # Refit on the training split with the best sampled parameters.
    best_params = study.best_params
    best_params.update({"verbose": 100, "task_type": "CPU"})
    model = CatBoostClassifier(**best_params)

    model.fit(X_train_tfidf, y_train[category])

    model_path = os.path.join(models_dir, f"catboost_model_{category}.cbm")
    model.save_model(model_path)

    models[category] = model

    # Keep the parameters next to the model file.
    params_path = os.path.join(models_dir, f"best_parameters_{category}.json")
    with open(params_path, "w") as f:
        json.dump(best_params, f, indent=2)

    # Score on the held-out split and store the text report.
    y_pred = model.predict(X_test_tfidf)
    classification_rep = classification_report(y_test[category], y_pred)

    report_path = os.path.join(models_dir, f"classification_report_{category}.txt")
    with open(report_path, "w") as f:
        f.write(classification_rep)

print(f"Training completed. Models and results saved in {models_dir}")

#### Saving label encoders


In [41]:
# Make sure the target directory exists before anything is written to it.
os.makedirs("python/models_better", exist_ok=True)

for category in categories:
    # Fit on the original string labels of the training rows (looked up in
    # `df` through the split's index). Fitting on `y_train[category]` would be
    # wrong once that column already holds integer codes: the saved classes
    # would then be the integers themselves rather than the label text.
    le = LabelEncoder()
    le.fit(df.loc[y_train.index, category])
    label_encoders[category] = le

    # Save the encoder
    np.save(f"python/models_better/label_encoder_{category}.npy", le.classes_)

# Re-fit the vectorizer on the training split so the saved object is fitted.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Save the TF-IDF vectorizer
joblib.dump(tfidf, "python/models_better/tfidf_vectorizer.joblib")


print("Success")

Success


#### JSON export for Go


In [44]:
# Fit the vectorizer on the training split so vocabulary_ and idf_ are available.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

tfidf_vocab = tfidf.vocabulary_
idf_values = tfidf.idf_

# Convert NumPy values to plain Python types so they are JSON-serializable.
tfidf_data = {
    "vocabulary": {k: int(v) for k, v in tfidf_vocab.items()},
    "idf_values": idf_values.tolist(),
}

# Create the target directory first; open() does not create parent folders.
os.makedirs("models_better/tfidf", exist_ok=True)
with open("models_better/tfidf/tfidf_data.json", "w") as f:
    json.dump(tfidf_data, f)

print("TF-IDF data exported to tfidf_data.json")

# Rebuild the encoders from the original string labels of the training rows.
# Fitting on `y_train[category]` would be wrong once that column already holds
# integer codes: the encoder classes would be integers, not label text.
label_encoders = {}
for category in categories:
    le = LabelEncoder()
    le.fit(df.loc[y_train.index, category])
    label_encoders[category] = le

labels_dir = "models_better/labels"
os.makedirs(labels_dir, exist_ok=True)

for category in categories:
    # Export the encoder's own sorted class list: position i in this list is
    # exactly the label that integer code i stands for.
    unique_labels = label_encoders[category].classes_.tolist()

    with open(os.path.join(labels_dir, f"labels_{category}.json"), "w") as f:
        json.dump(unique_labels, f)

print("Labels exported")

TF-IDF data exported to tfidf_data.json
Labels exported


In [50]:
# Target columns, in the order they appear in prediction output.
categories = [
    "capacity",
    "quality",
    "view",
    "bedding",
    "balcony",
    "bedrooms",
    "club",
    "floor",
    "bathroom",
    "class",
]
models_dir = "models_better"

models = {}
label_encoders = {}

# Restore the fitted TF-IDF vectorizer.
tfidf = joblib.load(f"{models_dir}/tfidf_vectorizer.joblib")

# Restore one CatBoost model and one label encoder per category.
cbm_dir = f"{models_dir}/cbm"
for category in categories:
    model_path = os.path.join(cbm_dir, f"catboost_model_{category}.cbm")
    model = CatBoostClassifier()
    model.load_model(model_path)
    models[category] = model

    # Only the class array is stored on disk, so attach it to a fresh encoder.
    # NOTE(review): allow_pickle=True unpickles the file contents; only load
    # encoder files that come from a trusted source.
    encoder_path = f"{models_dir}/label_text/label_encoder_{category}.npy"
    le = LabelEncoder()
    le.classes_ = np.load(encoder_path, allow_pickle=True)
    label_encoders[category] = le


def batch_predict(rate_names):
    """Classify a batch of rate names across every category.

    Uses the module-level ``tfidf`` vectorizer, ``models``, ``label_encoders``
    and ``categories``.

    Args:
        rate_names: Iterable of rate-name strings.

    Returns:
        A list with one dict per input name, mapping each category to its
        decoded label. An empty input yields an empty list.
    """
    rate_names = list(rate_names)
    # Nothing to classify: skip the vectorizer and the models entirely.
    if not rate_names:
        return []

    input_tfidf = tfidf.transform(pd.Series(rate_names))

    decoded_columns = []
    for category in categories:
        # Flatten so inverse_transform always receives a 1-D array of codes.
        predictions = np.ravel(models[category].predict(input_tfidf))
        decoded_columns.append(
            label_encoders[category].inverse_transform(predictions)
        )

    # Transpose the per-category arrays into per-row dicts and unwrap any NumPy
    # scalar into a native Python value so the result is JSON-serializable.
    return [
        {
            category: value.item() if isinstance(value, np.generic) else value
            for category, value in zip(categories, row)
        }
        for row in zip(*decoded_columns)
    ]


# Example usage
# One sample rate name run through the batch API.
batch_examples = ["King Premium Mountain View no balcony"]

print("\nBatch prediction:")
batch_results = batch_predict(batch_examples)
for result in batch_results:
    formatted = json.dumps(result, indent=2)
    print(formatted)


Batch prediction:
{
  "capacity": "undefined",
  "quality": "premium",
  "view": "mountain view",
  "bedding": "undefined",
  "balcony": "balcony",
  "bedrooms": "undefined",
  "club": "not club",
  "floor": "undefined",
  "bathroom": "private bathroom",
  "class": "room"
}


In [73]:
# Load the CSV file
input_file = "rates_dirty.csv"  # Replace with your input CSV file path
df_input = pd.read_csv(input_file)
df_input = df_input.fillna("undefined")


# Ensure 'rate_name' column exists
if "rate_name" not in df_input.columns:
    raise ValueError("The input CSV must contain a 'rate_name' column")

# The vectorizer works on text, so cast the names to str exactly as the
# training data was cast; this keeps non-string cells from breaking it.
rate_names = df_input["rate_name"].astype(str).tolist()

# Process in batches
batch_size = 5500
results = []

for i in range(0, len(rate_names), batch_size):
    batch = rate_names[i : i + batch_size]
    batch_results = batch_predict(batch)
    results.extend(batch_results)

# Create DataFrame from results; naming the columns explicitly keeps the frame
# well-formed even when the input file has no rows.
df_results = pd.DataFrame(results, columns=categories)

# Add the original rate names as the first column (positional, so it does not
# depend on the two frames sharing an index).
df_results.insert(0, "rate_name", rate_names)

# Reorder columns to have 'rate_name' first
columns = ["rate_name"] + categories
df_results = df_results[columns]

# Save the results as a CSV file in the models_dir, creating the output folder
# first because to_csv does not create missing directories.
os.makedirs(f"{models_dir}/output", exist_ok=True)
output_file = os.path.join(f"{models_dir}/output", "batch_prediction_results.csv")
df_results.to_csv(output_file, index=False)

print(f"\nResults have been saved to {output_file}")
print(f"Processed {len(df_input)} entries")


Results have been saved to python/models_better/output/batch_prediction_results.csv
Processed 180443 entries


# Mismatches


In [74]:
def calculate_similarity(a, b):
    """Return the ``difflib.SequenceMatcher`` ratio of *a* and *b* (0.0 to 1.0)."""
    return SequenceMatcher(None, a, b).ratio()


def _normalize_label(value):
    """Map a missing cell or the "undefined" placeholder to an empty string."""
    if value is None or value == "undefined":
        return ""
    return value


def compare_csv_files(
    file1_path,
    file2_path,
    threshold=0.8,
    columns=None,
    output_path="python/models_better/output/mismatches.json",
):
    """Compare two CSV files row by row and report per-column mismatches.

    Rows are paired positionally; if the files differ in length, the extra
    rows of the longer file are ignored. For each compared column, two values
    mismatch when their similarity ratio is below ``threshold``; "undefined"
    and missing cells are treated as empty strings.

    Args:
        file1_path: CSV whose values are reported as "received".
        file2_path: CSV whose values are reported as "expected".
        threshold: Minimum similarity ratio for two values to count as equal.
        columns: Column names to compare. Defaults to the module-level
            ``categories`` list.
        output_path: Where the JSON list of mismatching rows is written.

    Returns:
        None. Statistics are printed and mismatches are written to
        ``output_path``.
    """
    if columns is None:
        columns = categories

    total_rows = 0
    error_rows = 0
    column_errors = defaultdict(int)
    mismatches = []

    # newline="" is what the csv module requires for correct handling of
    # newlines embedded in quoted fields.
    with open(file1_path, "r", newline="", encoding="utf-8") as file1, open(
        file2_path, "r", newline="", encoding="utf-8"
    ) as file2:
        reader1 = csv.DictReader(file1)
        reader2 = csv.DictReader(file2)

        for row1, row2 in zip(reader1, reader2):
            total_rows += 1
            row_mismatches = {}

            for cat in columns:
                val1 = _normalize_label(row1.get(cat, ""))
                val2 = _normalize_label(row2.get(cat, ""))

                # Identical strings always have ratio 1.0; skip the matcher.
                if val1 == val2:
                    similarity = 1.0
                else:
                    similarity = calculate_similarity(val1, val2)

                if similarity < threshold:
                    column_errors[cat] += 1
                    row_mismatches[cat] = {"expected": val2, "received": val1}

            if row_mismatches:
                error_rows += 1
                mismatches.append(
                    {
                        "row": total_rows,
                        "rate_name": row1.get("rate_name", "N/A"),
                        "mismatches": row_mismatches,
                    }
                )

    # Guard against header-only / empty files to avoid dividing by zero.
    overall_error_rate = error_rows / total_rows if total_rows else 0.0
    print(f"\nOverall error rate: {overall_error_rate:.2f}")
    print(f"Total rows: {total_rows}")
    print(f"Rows with errors: {error_rows}")

    print("\nColumn Error Statistics:")
    for cat, error_count in column_errors.items():
        column_error_rate = error_count / total_rows
        print(
            f"Column {cat}: Error rate = {column_error_rate:.2f}, Errors = {error_count}"
        )

    # Create the output folder if needed; open() does not create it.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(mismatches, f, indent=2)

    print(f"\nMismatches have been saved to '{os.path.basename(output_path)}'")


# Usage
# First file: the input CSV; second file: the batch prediction output.
file1_path = "rates_dirty.csv"
file2_path = "python/models_better/output/batch_prediction_results.csv"
# Alternative second file: 'models/classified_rates.csv'
compare_csv_files(file1_path=file1_path, file2_path=file2_path)


Overall error rate: 0.09
Total rows: 180443
Rows with errors: 16646

Column Error Statistics:
Column capacity: Error rate = 0.04, Errors = 7858
Column bedding: Error rate = 0.04, Errors = 7704
Column view: Error rate = 0.02, Errors = 3902
Column bathroom: Error rate = 0.01, Errors = 2509
Column class: Error rate = 0.02, Errors = 2790
Column quality: Error rate = 0.03, Errors = 4647
Column bedrooms: Error rate = 0.01, Errors = 2214
Column club: Error rate = 0.01, Errors = 1988
Column floor: Error rate = 0.01, Errors = 2012

Mismatches have been saved to 'mismatches.json'


# Predict (old)


In [None]:
# Prediction function
def predict(rate_name):
    """Classify a single rate name across every category.

    Uses the module-level ``tfidf`` vectorizer, ``models``, ``label_encoders``
    and ``categories``.

    Args:
        rate_name: The rate-name string to classify.

    Returns:
        A dict mapping each category to its decoded label.
    """
    input_tfidf = tfidf.transform(pd.Series([rate_name]))
    result = {}

    for category in categories:
        # The model output may be column-shaped; flatten it (as batch_predict
        # does) so inverse_transform receives a 1-D array with one code.
        prediction = np.ravel(models[category].predict(input_tfidf))[:1]
        label = label_encoders[category].inverse_transform(prediction)[0]
        # Unwrap NumPy scalars so the result can be passed to json.dumps.
        result[category] = label.item() if isinstance(label, np.generic) else label

    return result


# Example predictions
example1 = "deluxe triple room"
example2 = "Premium Two Queen Room with Living Area High Floor non-smoking"

# Print the prediction for each sample as indented JSON.
for example in (example1, example2):
    print(json.dumps(predict(example), indent=2))

# Benchmark (old)


In [None]:
# Target columns handled by the old model set.
categories = [
    "class",
    "quality",
    "bathroom",
    "bedding",
    "capacity",
    "club",
    "bedrooms",
    "balcony",
    "view",
    "floor",
]

models_dir = "models"

models = {}
label_encoders = {}

# Restore one CatBoost model and one label encoder per category.
for category in categories:
    model_path = os.path.join(models_dir, f"catboost_model_{category}.cbm")
    encoder_path = os.path.join(models_dir, f"label_encoder_{category}.npy")

    model = CatBoostClassifier()
    model.load_model(model_path)
    models[category] = model

    # Only the class array is stored on disk, so attach it to a fresh encoder.
    # NOTE(review): allow_pickle=True unpickles the file contents; only load
    # encoder files that come from a trusted source.
    le = LabelEncoder()
    le.classes_ = np.load(encoder_path, allow_pickle=True)
    label_encoders[category] = le


def preprocess_input(input_strings):
    """Wrap the inputs in a one-column ``rate_name`` DataFrame cast to str."""
    frame = pd.DataFrame({"rate_name": input_strings})
    return frame.astype({"rate_name": str})


def run_batch_prediction(model, label_encoder, input_strings):
    """Return the most probable label for each input string.

    The inputs are wrapped by ``preprocess_input``, scored with
    ``model.predict_proba``, and the highest-probability column index of each
    row is looked up in ``label_encoder.classes_``.
    """
    frame = preprocess_input(input_strings)
    best_class_index = model.predict_proba(frame).argmax(axis=1)
    return label_encoder.classes_[best_class_index]


def classify_rates(rate_names: list) -> list:
    """Classify every rate name for every category.

    Returns one dict per input name, keyed by category, built from the
    module-level ``models`` and ``label_encoders``.
    """
    # One array of predicted labels per category.
    per_category = {}
    for category in categories:
        per_category[category] = run_batch_prediction(
            models[category], label_encoders[category], rate_names
        )

    # Turn the per-category arrays into per-row dicts.
    rows = []
    for position in range(len(rate_names)):
        rows.append({cat: per_category[cat][position] for cat in categories})
    return rows


df_dirty = pd.read_csv("rates_dirty.csv")
df_dirty = df_dirty.fillna("undefined")


batch_size = 5500
results = []
benchmark_results = []
total_time = 0

for i in range(0, len(df_dirty), batch_size):
    batch = df_dirty.iloc[i : i + batch_size]
    rate_names = batch["rate_name"].tolist()

    # perf_counter is monotonic and high-resolution, which suits interval
    # timing better than the wall clock.
    iteration_start_time = time.perf_counter()
    batch_classifications = classify_rates(rate_names)
    iteration_end_time = time.perf_counter()

    iteration_time = iteration_end_time - iteration_start_time
    total_time += iteration_time
    benchmark_results.append(
        {
            "batch_size": len(batch),
            "iteration_time": iteration_time,
        }
    )

    for rate_name, classifications in zip(rate_names, batch_classifications):
        results.append({"rate_name": rate_name, **classifications})

print(f"Total time taken: {total_time:.10f} seconds")


# Naming the columns keeps both frames well-formed when the input has no rows.
df_results = pd.DataFrame(results, columns=["rate_name"] + categories)
df_benchmark = pd.DataFrame(benchmark_results, columns=["batch_size", "iteration_time"])

avg_iteration_time = df_benchmark["iteration_time"].mean()
max_iteration_time = df_benchmark["iteration_time"].max()
min_iteration_time = df_benchmark["iteration_time"].min()

# Timings are measured in seconds; multiply by 1000 to report milliseconds.
print("\nBenchmark Statistics:")
print(f"Average iteration time: {avg_iteration_time * 1000:.4f} ms")
print(f"Maximum iteration time: {max_iteration_time * 1000:.4f} ms")
print(f"Minimum iteration time: {min_iteration_time * 1000:.4f} ms")

output_file = "classified_rates.csv"
df_results.to_csv(output_file, index=False)

print(f"\nResults have been saved to {output_file}")

print("\nFirst few rows of the results:")
print(df_results[["rate_name"] + categories].head())

# Classify (old)


In [50]:
# Target columns handled by the old model set.
categories = [
    "class",
    "quality",
    "bathroom",
    "bedding",
    "capacity",
    "club",
    "bedrooms",
    "balcony",
    "view",
    "floor",
]

models_dir = "models"

models = {}
label_encoders = {}

# Restore one CatBoost model and one label encoder per category.
for category in categories:
    model_path = os.path.join(models_dir, f"catboost_model_{category}.cbm")
    encoder_path = os.path.join(models_dir, f"label_encoder_{category}.npy")

    model = CatBoostClassifier()
    model.load_model(model_path)
    models[category] = model

    # Only the class array is stored on disk, so attach it to a fresh encoder.
    # NOTE(review): allow_pickle=True unpickles the file contents; only load
    # encoder files that come from a trusted source.
    le = LabelEncoder()
    le.classes_ = np.load(encoder_path, allow_pickle=True)
    label_encoders[category] = le


def preprocess_input(input_strings):
    """Wrap the inputs in a one-column ``rate_name`` DataFrame cast to str."""
    frame = pd.DataFrame({"rate_name": input_strings})
    return frame.astype({"rate_name": str})


def run_batch_prediction(model, label_encoder, input_strings):
    """Return the most probable label for each input string.

    The inputs are wrapped by ``preprocess_input``, scored with
    ``model.predict_proba``, and the highest-probability column index of each
    row is looked up in ``label_encoder.classes_``.
    """
    frame = preprocess_input(input_strings)
    best_class_index = model.predict_proba(frame).argmax(axis=1)
    return label_encoder.classes_[best_class_index]


def classify_rates(rate_names: list) -> list:
    """Classify every rate name for every category.

    Returns one dict per input name, keyed by category, built from the
    module-level ``models`` and ``label_encoders``.
    """
    # One array of predicted labels per category.
    per_category = {}
    for category in categories:
        per_category[category] = run_batch_prediction(
            models[category], label_encoders[category], rate_names
        )

    # Turn the per-category arrays into per-row dicts.
    rows = []
    for position in range(len(rate_names)):
        rows.append({cat: per_category[cat][position] for cat in categories})
    return rows


# Read the raw rates and replace missing cells with the "undefined" placeholder.
df_dirty = pd.read_csv("rates_dirty.csv").fillna("undefined")


batch_size = 5500
results = []

# Classify in fixed-size batches and collect one flat record per rate name.
for start in range(0, len(df_dirty), batch_size):
    batch = df_dirty.iloc[start : start + batch_size]
    rate_names = batch["rate_name"].tolist()

    batch_classifications = classify_rates(rate_names)

    results.extend(
        {"rate_name": rate_name, **classifications}
        for rate_name, classifications in zip(rate_names, batch_classifications)
    )

output_file = "new_classified_rates.csv"
df_results = pd.DataFrame(results)
df_results.to_csv(output_file, index=False)