In [None]:
import csv
import datetime
import re
import pickle
import joblib
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, early_stopping

import optuna


In [None]:
# Load the pickled training DataFrame from disk.
train_pickle_path = 'C:\\diplomka_work\\python_code\\data_pickles\\train_cely.pkl'
df = pd.read_pickle(train_pickle_path)

In [None]:


# Load the data (substitute your own data)
# df = pd.read_csv('df_vzorek.csv')

# Target column: for every row, the product name found on the following row
# of the same `doklad_id_int` group (NaN for the last row of each group).
next_in_order = df.groupby('doklad_id_int')['produkt_nazev'].shift(-1)
df['next_product_name'] = next_in_order

# Flag the rows that close an order: they have no "next product".
df['is_last'] = next_in_order.isna()

# Keep only the rows that do have a target.
df = df.loc[~df['is_last']].copy()


# Drop target classes that occur only once (fewer than 2 samples);
# presumably needed so the stratified split further down can place each class.
class_counts = df['next_product_name'].value_counts()
valid_classes = class_counts.index[(class_counts > 1).to_numpy()]
df = df.loc[df['next_product_name'].isin(valid_classes)].copy()


In [None]:
# One-hot encode the current product name; the listed column is replaced by
# one indicator column per distinct value.
column_names_dumm = ['produkt_nazev']
df = pd.get_dummies(data=df, columns=column_names_dumm)


In [None]:
# Every column is a feature except identifiers, helper columns and the target.
exclude_columns = [
    'doklad_id', 'produkt_id', 'item_sequence', 'poradi',
    'next_product_name', 'next_product_id_encoded', 'is_last',
    'otevreni_datum_cas', 'produkt_nazev', 'produkt_id_int', 'doklad_id_int',
]
features = df.columns[~df.columns.isin(exclude_columns)].tolist()

# Model inputs and the (still string-valued) target.
X = df.loc[:, features]
y = df.loc[:, 'next_product_name']

In [None]:
# Inspect the dtype of every column currently in df.
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Split into training and test sets: 20 % of the rows are held out, the split
# is stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:


# Number of rows in each of the four split objects.
sizes = {
    name: len(part)
    for name, part in (
        ("X_train", X_train),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_test", y_test),
    )
}

# Bar chart comparing the sizes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(list(sizes.keys()), list(sizes.values()))
ax.set_xlabel("Dataset")
ax.set_ylabel("Počet vzoriek")
ax.set_title("Porovnanie veľkostí datasetov")
plt.show()


In [None]:
def sanitize_feature_names(names):
    """Return column names limited to ``[A-Za-z0-9_]`` and guaranteed unique.

    Every character outside ``[A-Za-z0-9_]`` is replaced by ``_``. Two
    different raw names can collapse to the same sanitised name (for example
    names that differ only in an accented letter or a space), which would
    leave the frame with duplicate feature names. When that happens a numeric
    suffix (``_1``, ``_2``, ...) is appended until the name is unused.

    Parameters
    ----------
    names : iterable
        Original column labels, in order.

    Returns
    -------
    list of str
        Sanitised labels, same length and order as ``names``.
    """
    used = set()
    cleaned = []
    for name in names:
        base = re.sub(r'[^A-Za-z0-9_]', '_', str(name))
        candidate = base
        suffix = 1
        # Keep bumping the suffix until the candidate collides with nothing
        # produced so far.
        while candidate in used:
            candidate = f"{base}_{suffix}"
            suffix += 1
        used.add(candidate)
        cleaned.append(candidate)
    return cleaned


# Fix the column names (strip special characters) so they are safe to use as
# model feature names; `re` is already imported in the import cell.
X.columns = sanitize_feature_names(X.columns)

In [None]:
# Check the dtypes of the model inputs.
# The check runs on X (the feature matrix), not on df: df still holds the
# string target and the excluded identifier columns, which never reach the
# model and would make this check report a problem on every run.
print("Dátové typy v DataFrame:")
print(X.dtypes)

# Anything that is neither numeric (any int/float width, including the uint8
# indicator columns some pandas versions produce) nor bool is suspicious.
problematic_columns = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()

# Report the outcome.
if problematic_columns:
    print("\n⚠️  Stĺpce s nesprávnym typom (môžu spôsobiť chybu v modelovaní):")
    print(problematic_columns)
else:
    print("\n✅  Všetky stĺpce sú v správnom formáte pre modelovanie.")

In [None]:

# --- Encode the target (label encoding) ---
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# --- Train / test split (stratified, seeded) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# --- Validation split carved out of the training rows ---
# Hyperparameter search and early stopping are driven by this validation set
# so that the test set stays untouched until the final evaluation; selecting
# parameters on the test set would make the reported test accuracy optimistic.
# The split is not stratified because a class may be left with a single
# training row after the split above.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
# Drop validation rows whose class never occurs in the fitting part: the
# classifier cannot score labels it has not been trained on.
known_label_mask = np.isin(y_val, np.unique(y_fit))
X_val, y_val = X_val.loc[known_label_mask], y_val[known_label_mask]

# Settings shared by every trial and by the final model.
# `subsample_freq` must be non-zero, otherwise LightGBM ignores `subsample`
# (bagging is disabled) and tuning it would have no effect.
FIXED_PARAMS = {
    "n_jobs": -1,          # use all CPU cores inside LightGBM
    "random_state": 42,
    "subsample_freq": 1,
}


def objective(trial):
    """Optuna objective: validation accuracy of one LightGBM configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy on the validation split (higher is better).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),  # boosting rounds (upper bound)
        "max_depth": trial.suggest_int("max_depth", 3, 12),  # tree depth
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),  # leaves per tree
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),  # min rows in a leaf
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10),  # L1 regularisation
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10),  # L2 regularisation
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),  # row bagging fraction
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),  # feature fraction
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 0.2),  # split threshold
        **FIXED_PARAMS,
    }

    # Train with early stopping monitored on the validation split.
    model = LGBMClassifier(**params)
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(50, verbose=False)],
    )

    # Score on the same validation split.
    return accuracy_score(y_val, model.predict(X_val))


# --- Run the search ---
# Trials run one at a time: each LightGBM fit already uses every core, so
# parallel trials would only oversubscribe the CPU, and a seeded sampler with
# sequential trials keeps the search repeatable.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, n_jobs=1)  # 50 trials

# --- Best parameters ---
# `study.best_params` holds only the sampled values; merge the fixed settings
# back in so the final model is configured exactly like the winning trial.
best_params = {**study.best_params, **FIXED_PARAMS}
print("Najlepšie parametre:", best_params)

# --- Train the final model with the best parameters ---
final_model = LGBMClassifier(**best_params)
final_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(50, verbose=True)],
)

# --- Predict on the test set (first and only use of the test rows) ---
y_pred_final = final_model.predict(X_test)

# --- Evaluate the final model ---
final_accuracy = accuracy_score(y_test, y_pred_final)
print(f"Optimized LightGBM Test Accuracy: {final_accuracy:.4f}")



In [None]:
    # Write the best trial's sampled parameters and its score to a text file.
    # UTF-8 is requested explicitly because the text contains non-ASCII
    # characters and the platform default encoding varies.
    # NOTE(review): this cell starts one indentation level in; IPython appears
    # to strip a common leading indent, but a plain-script export would not run.
    with open("best_params.txt", "w", encoding="utf-8") as f:
        f.write("Najlepšie parametre:\n")
        for key, val in study.best_trial.params.items():
            f.write(f"{key}: {val}\n")
        f.write(f"\nNajlepšia presnosť: {study.best_value:.4f}\n")

    # Only 'best_params.txt' is written here, so the message names only it.
    print("Výsledky boli uložené do 'best_params.txt'.")

# --- Echo the best result to the console as well ---
print("Najlepšie parametre:", study.best_params, sep="\n")
best_score = study.best_value
print(f"Najlepšia presnosť: {best_score:.4f}")


In [None]:
# --- Label-encode the target: fit the encoder on y, then map y to integer ids ---
encoder = LabelEncoder().fit(y)
y_encoded = encoder.transform(y)


In [None]:

# # --- Use the best hyperparameters (uncomment the dict below to reuse these values instead of the Optuna result) ---

# best_params = {
#     'n_estimators': 712,
#     'max_depth': 5,
#     'learning_rate': 0.23786612527806789,
#     'num_leaves': 45,
#     'min_child_samples': 39,
#     'reg_alpha': 2.260779731734915,
#     'reg_lambda': 7.276727287983203,
#     'subsample': 0.5971143224408253,
#     'colsample_bytree': 0.9359953488545416,
#     'min_split_gain': 0.0009650122115177073,
#     'n_jobs': -1,
#     'random_state': 42
# }



In [None]:

# --- Build the final model ---
# Optuna's best_params holds only the sampled hyperparameters, so the fixed
# settings every trial ran with (n_jobs=-1, random_state=42) are supplied here
# as defaults. Values already present in best_params take precedence.
final_model = LGBMClassifier(**{"n_jobs": -1, "random_state": 42, **best_params})

In [None]:
# --- Fit the final model ---
# Early stopping picks the number of boosting rounds from the monitored set.
# Monitoring the test set would tune the model on the rows that are later used
# to report accuracy, so a validation split is taken from the training rows.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
# Drop validation rows whose class never occurs in the fitting part: the
# classifier cannot score labels it has not been trained on.
known_label_mask = np.isin(y_val, np.unique(y_fit))
X_val, y_val = X_val.loc[known_label_mask], y_val[known_label_mask]

final_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(80, verbose=True)]
)


In [None]:
# --- Prediction: class ids predicted by the final model for the test rows ---
y_pred_final = final_model.predict(X_test)

In [None]:
# --- Evaluation: share of test rows whose predicted class matches the true one ---
final_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_final)
print(f"Final LightGBM Accuracy: {final_accuracy:.4f}")

In [None]:
# Accuracy on the training rows, printed next to the test accuracy computed
# earlier so the two can be compared.
y_train_pred = final_model.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Test Accuracy: {final_accuracy:.4f}",
    sep="\n",
)

In [None]:
 
 # Save the fitted label encoder to disk with pickle.
 with open('C:\\diplomka_work\\python_code\\data_pickles\\LGBM\\text_LGBM_label_encoder.pkl', 'wb') as encoder_file:
     pickle.dump(encoder, encoder_file)


In [None]:
# Save the trained model to a pickle file.
model_pickle_path = 'C:\\diplomka_work\\python_code\\data_pickles\\LGBM\\text_LGBM_saved_model.pkl'
with open(model_pickle_path, 'wb') as model_file:
    pickle.dump(final_model, model_file)

In [None]:
# --- Save the ordered list of feature (column) names used for training ---
feature_names = list(X_train.columns)
with open('C:\\diplomka_work\\python_code\\data_pickles\\LGBM\\text_LGBM_uložene_feature.pkl', 'wb') as features_file:
    pickle.dump(feature_names, features_file)