In [None]:
!pip install catboost
!pip install scikit-learn
!pip install optuna

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, Pool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import optuna
import os
import json
import psutil
from datetime import datetime

In [None]:
# Read the cleaned rate table; every missing cell becomes the literal label 'undefined'.
df = pd.read_csv('rates_clean.csv').fillna('undefined')

# Attribute columns to predict (one classifier is trained per entry).
categories = ['class', 'quality', 'bathroom', 'bedding', 'capacity', 'club', 'bedrooms', 'balcony', 'view', 'floor']

# Cast every target column, then the free-text input column, to plain strings.
for column in [*categories, 'rate_name']:
    df[column] = df[column].astype(str)

X = df['rate_name']
y = df[categories]

# 80/20 hold-out split; the fixed random_state keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF over character n-grams. Settings are gathered in one mapping so they
# can be inspected or tuned in a single place.
tfidf_settings = dict(
    analyzer='char_wb',        # character n-grams built inside word boundaries
    ngram_range=(1, 3),        # 1- to 3-character grams
    max_features=10000,        # keep at most this many features
    min_df=2,                  # drop n-grams seen in fewer than 2 documents
    max_df=0.95,               # drop n-grams seen in more than 95% of documents
    sublinear_tf=True,         # sublinear term-frequency scaling
    lowercase=True,            # lowercase text before tokenizing
    strip_accents='unicode',   # strip accents
    norm='l2',                 # L2-normalize each row vector
    use_idf=True,              # apply inverse-document-frequency weighting
    smooth_idf=True,           # smooth IDF by adding one to document frequencies
)
tfidf = TfidfVectorizer(**tfidf_settings)

# Fit vocabulary and IDF weights on the training text only, then apply the
# fitted transform to the held-out text.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Export the fitted vectorizer state (vocabulary, IDF weights, document
# frequencies, corpus size) to JSON.
vocabulary = tfidf.vocabulary_

# Number of documents the vectorizer was fitted on = rows of the training
# matrix (the length of idf_ is the vocabulary size, not the corpus size).
num_docs = int(X_train_tfidf.shape[0])

# Document frequency of a feature = number of training rows that have a stored
# entry in that feature's column. In CSC layout, consecutive indptr values
# delimit one column, so their differences are the per-column entry counts.
doc_counts = np.diff(X_train_tfidf.tocsc().indptr)
doc_freq = doc_counts.tolist()
doc_freq_dict = {term: int(doc_counts[idx]) for term, idx in vocabulary.items()}

tfidf_data = {
    # vocabulary_ indices can be numpy integers (e.g. after min_df/max_df/
    # max_features pruning), which the json module cannot serialize; cast to
    # built-in int/float before dumping.
    "vocabulary": {term: int(idx) for term, idx in vocabulary.items()},
    "idf_values": {term: float(tfidf.idf_[idx]) for term, idx in vocabulary.items()},
    "doc_freq": doc_freq_dict,
    "num_docs": num_docs
}

with open("tfidf_data.json", "w") as f:
    json.dump(tfidf_data, f, indent=2)

print("TF-IDF data exported to tfidf_data.json")

# The split outputs are slices taken from `y`; assigning columns on them
# directly triggers pandas' SettingWithCopyWarning. Take explicit copies so the
# encoded columns are written to frames we own.
y_train = y_train.copy()
y_test = y_test.copy()

# One LabelEncoder per target column, fitted on the training labels only so the
# encoded ids are contiguous (0..n_classes-1) and line up with the exported
# label lists below.
# NOTE(review): le.transform raises ValueError if the test split contains a
# label that never occurs in the training split — confirm rare classes are
# handled upstream.
label_encoders = {}
for category in categories:
    le = LabelEncoder()
    y_train[category] = le.fit_transform(y_train[category])
    y_test[category] = le.transform(y_test[category])
    label_encoders[category] = le

# Persist each encoder's class list; position i in the list is the string for
# encoded id i.
labels_dir = "labels"
os.makedirs(labels_dir, exist_ok=True)

for category in categories:
    with open(os.path.join(labels_dir, f"labels_{category}.json"), "w") as f:
        json.dump(label_encoders[category].classes_.tolist(), f)

print("Labels exported")

def objective(trial, X_train, y_train, X_test, y_test, category):
    """Optuna objective: train one CatBoost classifier and return its weighted F1.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train: Feature matrix used for fitting.
        y_train: Frame of encoded targets; column ``category`` is the label.
        X_test: Feature matrix used for early stopping and for scoring.
        y_test: Frame of encoded targets matching ``X_test``.
        category: Name of the target column to model.

    Returns:
        Weighted-average F1 score of the fitted model on ``X_test``.

    Note:
        The same ``X_test``/``y_test`` drive both early stopping and the
        returned score, so the score is optimistic for whatever set the caller
        passes here.
    """
    # Use MB so the limit cannot truncate to "0GB" on machines with less than
    # ~1.25 GB free; keep at least 1 MB so the value is always valid.
    ram_limit_mb = max(1, int(psutil.virtual_memory().available / (1024 * 1024) * 0.8))

    params = {
        'iterations': trial.suggest_int('iterations', 100, 500),
        'depth': trial.suggest_int('depth', 1, 5),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'verbose': 0,
        'task_type': 'CPU',
        'thread_count': psutil.cpu_count(logical=False),
        'used_ram_limit': f'{ram_limit_mb}MB',
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'leaf_estimation_method': trial.suggest_categorical('leaf_estimation_method', ['Newton', 'Gradient']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
    }

    # min_data_in_leaf is only accepted by CatBoost for the Depthwise and
    # Lossguide growing policies; passing it with SymmetricTree raises.
    if params['grow_policy'] != 'SymmetricTree':
        params['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 1, 100)

    # subsample is not applicable to the Bayesian bootstrap, so only sample
    # (and pass) it for the other bootstrap types.
    if params['bootstrap_type'] != 'Bayesian':
        params['subsample'] = trial.suggest_float('subsample', 0.1, 1.0)

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train[category], eval_set=(X_test, y_test[category]), early_stopping_rounds=50, verbose=0)

    y_pred = model.predict(X_test)
    report = classification_report(y_test[category], y_pred, output_dict=True)
    weighted_f1 = report['weighted avg']['f1-score']

    return weighted_f1

# Training and evaluation
N_TRIALS = 100     # Optuna trials per target column
SEARCH_SEED = 42   # seeds the Optuna sampler; same value as the train/test split

# Each run writes its artifacts to a fresh timestamped directory.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
models_dir = f"models_{timestamp}"
os.makedirs(models_dir, exist_ok=True)

models = {}
for category in categories:
    print(f"Training model for {category}")

    # Seed the sampler explicitly so the hyperparameter search is repeatable;
    # TPESampler is Optuna's default sampler, so the search algorithm is unchanged.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=SEARCH_SEED),
    )
    study.optimize(
        lambda trial: objective(trial, X_train_tfidf, y_train, X_test_tfidf, y_test, category),
        n_trials=N_TRIALS,
    )

    # study.best_params contains only the values Optuna sampled; the fixed
    # settings needed for the final fit are added back here.
    best_params = study.best_params
    best_params['verbose'] = 100
    best_params['task_type'] = 'CPU'
    model = CatBoostClassifier(**best_params)

    # Refit on the full training split with the best configuration.
    model.fit(X_train_tfidf, y_train[category])

    model_path = os.path.join(models_dir, f"catboost_model_{category}.cbm")
    model.save_model(model_path)

    models[category] = model

    with open(os.path.join(models_dir, f"best_parameters_{category}.json"), "w") as f:
        json.dump(best_params, f, indent=2)

    # Held-out evaluation, stored next to the model as a plain-text report.
    y_pred = model.predict(X_test_tfidf)
    classification_rep = classification_report(y_test[category], y_pred)

    with open(os.path.join(models_dir, f"classification_report_{category}.txt"), "w") as f:
        f.write(classification_rep)

print(f"Training completed. Models and results saved in {models_dir}")

TypeError: Object of type int64 is not JSON serializable

In [None]:
# Prediction function
def predict(rate_name):
    """Predict every attribute in ``categories`` for a single rate name.

    Args:
        rate_name: Free-text rate/room name.

    Returns:
        Dict mapping each category name to its decoded (string) label.
    """
    input_tfidf = tfidf.transform(pd.Series([rate_name]))
    result = {}

    for category in categories:
        raw_prediction = models[category].predict(input_tfidf)
        # CatBoost returns shape (n, 1) for multiclass models and (n,) for
        # binary ones; flatten so the encoder always receives a 1-D array
        # holding the single encoded id.
        encoded = np.asarray(raw_prediction).ravel()[:1].astype(int)
        decoded = label_encoders[category].inverse_transform(encoded)[0]
        # Plain str (not numpy.str_) keeps the result trivially JSON-friendly.
        result[category] = str(decoded)

    return result

# Example predictions: run two sample rate names through the per-category
# models and pretty-print each result as JSON.
example1 = "deluxe triple room"
example2 = "Premium Two Queen Room with Living Area High Floor non-smoking"

for sample_name in (example1, example2):
    print(json.dumps(predict(sample_name), indent=2))

In [None]:
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import joblib

# Artifact locations, relative to the working directory.
# NOTE(review): the training cell saves models under a timestamped
# `models_<timestamp>/` directory, and no visible cell writes
# `tfidf_vectorizer.joblib` — confirm how these files are produced and placed
# next to the notebook.
MODEL_PATH = 'catboost_model_quality.cbm'
VECTORIZER_PATH = 'tfidf_vectorizer.joblib'

# Load the saved CatBoost model
model = CatBoostClassifier()
model.load_model(MODEL_PATH)

# SECURITY: joblib.load unpickles arbitrary Python objects and can execute
# code embedded in the file. Only load vectorizer files from a trusted source.
tfidf_vectorizer = joblib.load(VECTORIZER_PATH)

def preprocess_input(text):
    """Hook for text cleaning applied before vectorization.

    Currently a pass-through: the input is returned unchanged. Add
    normalization here (e.g. lowercasing, punctuation removal) if needed.

    Args:
        text: Raw input text.

    Returns:
        The text to hand to the vectorizer (presently identical to ``text``).
    """
    cleaned_text = text
    return cleaned_text

def vectorize_input(text):
    """Convert one text string into a single-row TF-IDF matrix.

    Uses the module-level ``tfidf_vectorizer`` loaded from disk.
    """
    single_document = [text]
    return tfidf_vectorizer.transform(single_document)

def predict(text):
    """Run the loaded single-category model on one text string.

    NOTE: this redefines the multi-category ``predict(rate_name)`` from the
    earlier cell; after this cell runs, ``predict`` refers to this
    single-model version.

    Args:
        text: Raw input text.

    Returns:
        Whatever ``model.predict`` returns for the single vectorized row
        (the model's raw predicted label, not decoded to a string).
    """
    # Preprocess, then vectorize with the loaded TF-IDF vectorizer. The sparse
    # matrix is intentionally not printed: dumping it floods the cell output.
    vectorized_text = vectorize_input(preprocess_input(text))

    # Make prediction
    return model.predict(vectorized_text)

# Example usage: classify one sample rate name with the loaded model and show
# the raw prediction returned by predict().
input_text = "King Premium Mountain View no balcony"
result = predict(input_text)
print(f"Prediction: {result}")

# To inspect class probabilities instead of the predicted label, vectorize the
# text at this level (the matrix built inside predict() is local to it):
# probability = model.predict_proba(vectorize_input(preprocess_input(input_text)))
# print(f"Prediction probabilities: {probability}")

  (0, 133)	0.05159103942290583
  (0, 134)	0.11686431517097241
  (0, 219)	0.08611483890237456
  (0, 223)	0.08701424709009065
  (0, 236)	0.13769439337993772
  (0, 241)	0.18478591666814925
  (0, 245)	0.12032907478739258
  (0, 250)	0.12241903445421606
  (0, 264)	0.08896645384864628
  (0, 271)	0.11488159657231892
  (0, 319)	0.0751312934772496
  (0, 322)	0.07541877184237387
  (0, 857)	0.07376255595637449
  (0, 916)	0.1391520143352493
  (0, 921)	0.1634984753876855
  (0, 928)	0.09723173060848084
  (0, 932)	0.12963105107316736
  (0, 1043)	0.043513119802300396
  (0, 1054)	0.11254126515953183
  (0, 1058)	0.12969669888726051
  (0, 1115)	0.0614275419321618
  (0, 1178)	0.09924562953013805
  (0, 1186)	0.11373091457124099
  (0, 1450)	0.12112889668753928
  (0, 1453)	0.12476419997129916
  :	:
  (0, 2591)	0.07887086645582893
  (0, 2609)	0.12989443459511288
  (0, 2663)	0.06636099323255523
  (0, 2669)	0.14286117005301907
  (0, 2692)	0.06279961347242877
  (0, 2760)	0.11321009245762832
  (0, 2761)	0.11926023