# Data Analysis

## Initialization



In [None]:
!pip install catboost
!pip install optuna
!pip install dill
!pip install psutil
!pip install scikit-learn

In [None]:
from catboost import CatBoostClassifier, Pool
import numpy as np
import time
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import optuna
import json
import psutil
from datetime import datetime
import csv
from difflib import SequenceMatcher
from collections import defaultdict
import joblib


## Preparation

In [None]:
# Load the cleaned rates table; every missing cell becomes the literal
# string "undefined" so that it is treated as a regular label/value.
df = pd.read_csv("rates_clean.csv").fillna("undefined")

# Target attributes: one classifier is trained per entry.
categories = [
    "capacity",
    "quality",
    "view",
    "bedding",
    "balcony",
    "bedrooms",
    "club",
    "floor",
    "bathroom",
    "class",
]

# Force the targets and the input text to plain strings so that numeric
# looking values (e.g. a bedroom count) are handled as labels/text.
for column in categories + ["rate_name"]:
    df[column] = df[column].astype(str)

# Input is the free-text rate name; output is the frame of target columns.
X, y = df["rate_name"], df[categories]

# Fixed seed keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Character-level TF-IDF features for the rate names.
tfidf_settings = {
    # Character n-grams built inside word boundaries only.
    "analyzer": "char_wb",
    # Character 1-grams up to 3-grams.
    "ngram_range": (1, 3),
    # Cap on the vocabulary size.
    "max_features": 10000,
    # Drop n-grams seen in fewer than 2 documents ...
    "min_df": 2,
    # ... or in more than 95% of the documents.
    "max_df": 0.95,
    # Sublinear term-frequency scaling.
    "sublinear_tf": True,
    # Lowercase the text before tokenizing.
    "lowercase": True,
    # Strip accents via unicode normalization.
    "strip_accents": "unicode",
    # L2-normalize each output row.
    "norm": "l2",
    # Apply inverse-document-frequency reweighting, with smoothing.
    "use_idf": True,
    "smooth_idf": True,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# The vocabulary and idf weights are learned from the training split only;
# the test split is projected with the already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Label encoding
#
# The split frames are copied before columns are assigned into them, so the
# writes below never go through a view of `y` (avoids pandas'
# SettingWithCopyWarning).
y_train = y_train.copy()
y_test = y_test.copy()

label_encoders = {}
for category in categories:
    # Fit on the full column rather than on the training split only: with a
    # random split a rare label can end up exclusively in the test split, and
    # LabelEncoder.transform raises ValueError on labels it has not seen.
    le = LabelEncoder().fit(y[category])

    # Always encode from the original string labels in `y` (selected by the
    # split's index) instead of from the current column contents, so that
    # re-running this cell does not try to encode already-encoded integers.
    y_train[category] = le.transform(y.loc[y_train.index, category])
    y_test[category] = le.transform(y.loc[y_test.index, category])

    label_encoders[category] = le

## Training





In [None]:
def objective(trial, X_train, y_train, X_test, y_test, category):
    """Optuna objective: fit one CatBoost classifier and score it.

    Samples a CatBoost hyper-parameter set from ``trial``, fits a
    ``CatBoostClassifier`` on ``X_train`` / ``y_train[category]`` with
    early stopping against ``(X_test, y_test[category])``, and returns the
    weighted-average F1 score of its predictions on ``X_test``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_train: Training feature matrix.
        y_train: Training targets, indexable by ``category``.
        X_test: Evaluation feature matrix.
        y_test: Evaluation targets, indexable by ``category``.
        category: Name of the target column to train for.

    Returns:
        The "weighted avg" F1 score from ``classification_report``.

    NOTE(review): the same split is used for early stopping and for the
    score that drives the search, so the reported F1 is optimistic.
    A separate validation split would be cleaner - confirm with the caller.
    """
    # Leave 20% headroom. The limit is expressed in MB because truncating to
    # whole GB yields "0GB" on machines with less than 1.25 GiB available.
    available_mb = psutil.virtual_memory().available / (1024 * 1024)
    ram_limit = f"{max(1, int(available_mb * 0.8))}MB"

    # psutil may return None when the physical core count is undetermined;
    # fall back to the logical count in that case.
    thread_count = psutil.cpu_count(logical=False) or psutil.cpu_count()

    params = {
        "iterations": trial.suggest_int("iterations", 100, 500),
        "depth": trial.suggest_int("depth", 1, 5),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "verbose": 0,
        "task_type": "CPU",
        "thread_count": thread_count,
        "used_ram_limit": ram_limit,
    }

    params["grow_policy"] = trial.suggest_categorical(
        "grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]
    )
    # CatBoost documents min_data_in_leaf as usable only with the Depthwise
    # and Lossguide policies, so it is sampled only for those.
    if params["grow_policy"] != "SymmetricTree":
        params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 1, 100)

    params["leaf_estimation_method"] = trial.suggest_categorical(
        "leaf_estimation_method", ["Newton", "Gradient"]
    )
    params["bootstrap_type"] = trial.suggest_categorical(
        "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
    )
    # subsample is not sampled for the Bayesian bootstrap; the key is left
    # out entirely instead of being passed as None.
    if params["bootstrap_type"] != "Bayesian":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1.0)

    model = CatBoostClassifier(**params)
    model.fit(
        X_train,
        y_train[category],
        eval_set=(X_test, y_test[category]),
        early_stopping_rounds=50,
        verbose=0,
    )

    y_pred = model.predict(X_test)
    report = classification_report(y_test[category], y_pred, output_dict=True)
    weighted_f1 = report["weighted avg"]["f1-score"]

    return weighted_f1


# Training and evaluation
#
# Every run writes into its own timestamped directory so that earlier
# results are never overwritten.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
models_dir = f"models_{timestamp}"
os.makedirs(models_dir, exist_ok=True)

models = {}
for category in categories:
    print(f"Training model for {category}")

    # Output locations for this category.
    model_path = os.path.join(models_dir, f"catboost_model_{category}.cbm")
    params_path = os.path.join(models_dir, f"best_parameters_{category}.json")
    report_path = os.path.join(models_dir, f"classification_report_{category}.txt")

    def _objective_for_category(trial, category=category):
        # Bind the current category so Optuna can call this with just a trial.
        return objective(
            trial, X_train_tfidf, y_train, X_test_tfidf, y_test, category
        )

    # Hyper-parameter search: maximize the weighted F1 returned by objective().
    study = optuna.create_study(direction="maximize")
    study.optimize(_objective_for_category, n_trials=100)

    # Refit a final model with the winning parameters, logging every 100
    # iterations.
    best_params = study.best_params
    best_params.update({"verbose": 100, "task_type": "CPU"})
    model = CatBoostClassifier(**best_params)
    model.fit(X_train_tfidf, y_train[category])

    # Persist the model and keep it in memory.
    model.save_model(model_path)
    models[category] = model

    # Persist the parameters the final model was built with.
    with open(params_path, "w") as f:
        json.dump(best_params, f, indent=2)

    # Evaluate the final model on the held-out split and store the report.
    y_pred = model.predict(X_test_tfidf)
    classification_rep = classification_report(y_test[category], y_pred)
    with open(report_path, "w") as f:
        f.write(classification_rep)

print(f"Training completed. Models and results saved in {models_dir}")

### Saving label encoders for Python

In [None]:
# Export the fitted preprocessing artifacts for the Python inference code.
python_models_dir = "python/models_better"
os.makedirs(python_models_dir, exist_ok=True)

for category in categories:
    # Reuse the encoder fitted during preparation. By this point
    # y_train[category] already holds integer codes, so fitting a fresh
    # LabelEncoder on it would store the codes themselves as classes_
    # and lose the original label names.
    le = label_encoders[category]

    # Save the encoder
    np.save(f"{python_models_dir}/label_encoder_{category}.npy", le.classes_)

# Save the TF-IDF vectorizer
# The vectorizer is dumped as fitted during preparation (the one the models
# were trained with); it is intentionally not re-fitted here.
joblib.dump(tfidf, f"{python_models_dir}/tfidf_vectorizer.joblib")


print("Success")

### JSON export for Go

In [None]:
# Export the TF-IDF vocabulary/idf weights and the label lists as JSON.
#
# The vectorizer is exported as fitted during preparation (the one the models
# were trained with); it is intentionally not re-fitted here.
tfidf_vocab = tfidf.vocabulary_
idf_values = tfidf.idf_

tfidf_data = {
    # Cast numpy integers to plain ints so that json can serialize them.
    "vocabulary": {k: int(v) for k, v in tfidf_vocab.items()},
    "idf_values": idf_values.tolist(),
}

# Create the target directory first; open() does not create parents.
tfidf_dir = "models_better/tfidf"
os.makedirs(tfidf_dir, exist_ok=True)

with open("models_better/tfidf/tfidf_data.json", "w") as f:
    json.dump(tfidf_data, f)

print("TF-IDF data exported to tfidf_data.json")

# The label encoders are deliberately left untouched here: the target columns
# of y_train / y_test are already integer-encoded, so re-fitting encoders on
# them would replace the label names with their integer codes.

labels_dir = "models_better/labels"
os.makedirs(labels_dir, exist_ok=True)

for category in categories:
    # Sorted unique labels of the full column; the position in the list is
    # presumably meant to match the class index produced by the label
    # encoder - verify against the consumer.
    unique_labels = sorted(df[category].unique().tolist())

    with open(os.path.join(labels_dir, f"labels_{category}.json"), "w") as f:
        json.dump(unique_labels, f)

print("Labels exported")