In [None]:
import pickle
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Masking
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
import os



In [None]:
# === Load data ===
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
train_pickle_path = r'C:\diplomka_work\python_code\diploma_kod_text\RNN\na_predanie\train_cely.pkl'
df = pd.read_pickle(train_pickle_path)


In [None]:



# === Sequence preparation ===
# Sort by document and by position inside the document so that every grouped
# list below is in the recorded order.
df = df.sort_values(by=['doklad_id_int', 'poradi'])

# Label-encode product names to integer ids 0 .. n_products-1.
encoder = LabelEncoder()
df['produkt_nazev_encoded'] = encoder.fit_transform(df['produkt_nazev'])

# One list of encoded products per document (doklad_id_int).
grouped = df.groupby('doklad_id_int')['produkt_nazev_encoded'].apply(list)

# pad_sequences fills with 0 and the model's Embedding layer uses
# mask_zero=True, so token 0 means "padding" and is skipped by the LSTM.
# LabelEncoder also assigns id 0 to a real product, which would therefore be
# silently masked out of every input sequence. Input tokens are shifted by +1
# so that 0 is reserved for padding (valid tokens become 1 .. n_products, which
# fits an Embedding with input_dim = n_products + 1).
# Targets are NOT shifted: they stay in the encoder's 0 .. n_products-1 space,
# so predicted class indices map directly through encoder.inverse_transform.
# Any code that feeds this model later must apply the same +1 shift to inputs.
PAD_OFFSET = 1

# Build (prefix -> next product) training pairs.
X_sequences = []
y_labels = []

for seq in grouped:
    input_tokens = [code + PAD_OFFSET for code in seq]
    for i in range(1, len(seq)):
        X_sequences.append(input_tokens[:i])  # shifted prefix up to step i
        y_labels.append(seq[i])               # target = next product (unshifted)

if not X_sequences:
    # Without this guard max() below fails with an unhelpful message.
    raise ValueError("No training samples: every document contains fewer than 2 products.")

# Left-pad with zeros so that all sequences have the same length.
max_seq_len = max(len(seq) for seq in X_sequences)
X_padded = pad_sequences(X_sequences, maxlen=max_seq_len, padding='pre')

# Convert targets to a numpy array.
y_labels = np.array(y_labels)

# === Filter out samples whose target label is too rare ===
# Keep only targets that occur at least twice (presumably because the
# stratified split in the next step cannot handle single-member classes).
label_counts = Counter(y_labels)
valid_labels = {label for label, count in label_counts.items() if count >= 2}

# Vectorised selection with a boolean mask instead of a Python-level loop over
# rows: avoids building a list of per-row arrays and copying it again.
keep_mask = np.isin(y_labels, list(valid_labels))
X_filtered = X_padded[keep_mask]
y_filtered = y_labels[keep_mask]

# === Train/test split ===
# 80/20 split with a fixed seed, stratified on the target label.
split_parts = train_test_split(
    X_filtered,
    y_filtered,
    stratify=y_filtered,
    random_state=42,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = split_parts

# === Model definition ===
# Number of distinct products = number of target classes (labels 0 .. n-1).
num_products = df['produkt_nazev_encoded'].nunique()
# Embedding table size: one row more than the number of products.
vocab_size = num_products + 1
embedding_dim = 128

model = Sequential()
# mask_zero=True makes downstream layers skip every timestep whose token is 0
# (the value pad_sequences pads with), so 0 must not be used as a real product
# id in the model inputs.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_len, mask_zero=True))
# Only the final LSTM state is needed to predict the next product.
model.add(LSTM(128, return_sequences=False))
# One output unit per product class. Using vocab_size here would add an extra
# class that never appears as a target and that encoder.inverse_transform
# cannot decode if it ever shows up among the top predictions.
model.add(Dense(num_products, activation='softmax'))

# Targets are integer class ids, hence the sparse cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# summary() prints by itself and returns None; wrapping it in print() would
# emit a stray "None" line.
model.summary()




In [None]:
# === Training ===
# NOTE(review): the test split doubles as validation data here.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_test, y_test),
)

In [None]:
# === Evaluation on the test split ===
test_metrics = model.evaluate(X_test, y_test)
loss, acc = test_metrics
print(f"\nüéØ Test Accuracy: {acc:.4f}")

In [None]:
# === Evaluation on the training split ===
# Reported under its own label and stored in separate variables: this cell
# evaluates X_train/y_train, so calling the result "Test Accuracy" (and
# overwriting the test loss/acc) would be misleading.
train_loss, train_acc = model.evaluate(X_train, y_train)
print(f"\nüéØ Train Accuracy: {train_acc:.4f}")

In [None]:
# Class-probability predictions for every test sample (progress output suppressed).
y_test_pred_probs = model.predict(x=X_test, verbose=0)

In [None]:
# Save the trained model to a pickle file.
# NOTE(review): whether a Keras model pickles cleanly depends on the installed
# TensorFlow/Keras version - confirm; the model is also written via model.save.
model_pickle_path = 'C:\\diplomka_work\\python_code\\data_pickles\\RNN\\saved_model_RNN.pkl'
with open(model_pickle_path, 'wb') as pickle_file:
    pickle.dump(model, pickle_file)

In [None]:
# Save the trained model to an .h5 file using Keras' own serialisation.
model_h5_path = 'C:\\diplomka_work\\python_code\\data_pickles\\RNN\\saved_model_RNN.h5'
model.save(model_h5_path)

In [None]:
 
 # Persist the fitted label encoder so predicted class ids can be mapped back
 # to product names outside this notebook.
 encoder_pickle_path = 'C:\\diplomka_work\\python_code\\data_pickles\\RNN\\label_encoder_RNN.pkl'
 with open(encoder_pickle_path, 'wb') as encoder_file:
     pickle.dump(encoder, encoder_file)


In [None]:

# === Predict class probabilities for the test set ===
y_test_pred_probs = model.predict(X_test, verbose=0)

# === Top-2 predicted classes per sample ===
# argsort is ascending, so the last two columns hold the two most probable
# classes; reversing them puts the most probable class first.
ranked_classes = np.argsort(y_test_pred_probs, axis=1)
top_2_preds = ranked_classes[:, -2:][:, ::-1]

# === Decode class ids back to product names ===
top_1_names = encoder.inverse_transform(top_2_preds[:, 0])
top_2_names = encoder.inverse_transform(top_2_preds[:, 1])
true_names = encoder.inverse_transform(y_test)

# === Write the report to a .txt file in the current working directory ===
output_path = os.path.join(os.getcwd(), "predikcie_test_top2.txt")

with open(output_path, "w", encoding="utf-8") as report:
    report.write("=== TOP 2 PREDIKCIE + SKUTOƒåN√Å HODNOTA ===\n\n")
    # One three-line record per test sample: best guess, second guess, truth.
    for first_guess, second_guess, actual in zip(top_1_names, top_2_names, true_names):
        report.write(f"Top 1 predikcia: {first_guess}\n")
        report.write(f"Top 2 predikcia: {second_guess}\n")
        report.write(f"Skutoƒçn√° hodnota: {actual}\n\n")

print(f"‚úÖ V√Ωstup ulo≈æen√Ω do s√∫boru: {output_path}")
