<a href="https://colab.research.google.com/github/negovanovic-e/automatska-kategorizacija-proizvoda/blob/main/03_treniranje_modela.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.sparse import hstack
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Load the cleaned product table written by the preprocessing step and
# report how many rows it contains.
df = pd.read_csv('products_clean.csv')
print("Učitano proizvoda:", df.shape[0])

In [None]:
# Build the feature matrix: TF-IDF of the cleaned title plus two numeric
# columns, and pick the target column.
#
# NOTE(review): the vectorizer is fitted on every row of `df`, so the
# vocabulary and IDF weights also see rows that later end up in the test
# split. Consider fitting on the training rows only.

# pandas reads empty CSV fields back as NaN by default, and TfidfVectorizer
# refuses NaN documents; map them to empty strings so such rows do not crash
# the fit. Rows that already hold text are unaffected.
titles = df['cleaned_title'].fillna('').astype(str)

# TF-IDF vectorization (at most 5000 terms, English stop words removed)
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = tfidf.fit_transform(titles)

# Append the numeric features as extra columns. CSR is requested up front
# because the matrix is row-indexed later when it is split.
numeric_features = df[['title_length', 'word_count']].values
X_combined = hstack([X_tfidf, numeric_features], format='csr')
y = df['Category Label']

print(f"Dimenzije značajki: {X_combined.shape}")

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Train: {n_train} primjera")
print(f"Test: {n_test} primjera")

In [None]:
# Train several candidate classifiers on the same split and compare their
# test accuracy.
# NOTE: the SVC is created without probability=True, so it does not expose
# predict_proba; code that wants class probabilities must allow for that.
models = {
    'Slučajna šuma': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistička regresija': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(kernel='linear', random_state=42)
}

# Model name -> test accuracy.
results = {}
# Model name -> test predictions, kept so the detailed report can be built
# once the overall winner is known.
predictions = {}

print("TRENIRANJE MODELA:")
print("=" * 50)

for name, model in models.items():
    print(f"\nTreniram: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    predictions[name] = y_pred

    print(f"Točnost: {accuracy:.2%}")

# Detailed report for the best model only. Doing this after the loop matters:
# comparing against the running maximum inside the loop always fires for the
# first model and again for every model that ties or beats it.
top_name = max(results, key=results.get)
print(f"Detaljni izvještaj za {top_name}:")
print(classification_report(y_test, predictions[top_name]))

In [None]:
# Pick the model with the highest recorded test accuracy (the first one wins
# on ties) and announce it.
best_model_name = max(results, key=lambda model_name: results[model_name])
best_model = models[best_model_name]
best_accuracy = results[best_model_name]

print(f"\nNAJBOLJI MODEL: {best_model_name}")
print(f"Točnost: {best_accuracy:.2%}")

In [None]:
# Confusion matrix for the best model on the held-out test set.
y_pred_best = best_model.predict(X_test)

# Pin the label order explicitly and reuse it for the tick labels, so every
# row/column of the heatmap is named after its category instead of showing a
# bare integer index.
class_labels = best_model.classes_
cm = confusion_matrix(y_test, y_pred_best, labels=class_labels)

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title(f'Matrica zabune - {best_model_name}')
plt.ylabel('Stvarna kategorija')
plt.xlabel('Predviđena kategorija')
plt.show()

In [None]:
# Persist the selected classifier and the fitted TF-IDF vectorizer so they
# can be reloaded for inference.
import os

# exist_ok=True replaces the check-then-create pattern: it is a no-op when
# the directory is already there and cannot race with another process
# creating it in between.
os.makedirs('models', exist_ok=True)

joblib.dump(best_model, 'models/product_classifier.pkl')
joblib.dump(tfidf, 'models/tfidf_vectorizer.pkl')

print("Model i vektorizier su spremljeni!")

In [None]:
# Sanity-check the selected model on the sample products from the assignment.
#
# NOTE(review): `preprocess_text` is not defined in the visible part of this
# file; it has to be defined or imported (presumably from the preprocessing
# step) before this cell runs, otherwise it raises NameError.
test_products = [
    "iphone 7 32gb gold",
    "olympus e m10 mark iii",
    "kenwood k20mss15 solo",
    "bosch wap28390gb 8kg 1400 spin"
]

expected = ["Mobile Phones", "Digital Cameras", "Microwaves", "Washing Machines"]

# Not every classifier exposes predict_proba (an SVC built without
# probability=True does not), so a confidence is reported only when the
# winning model can provide one, instead of crashing.
supports_proba = hasattr(best_model, "predict_proba")

print("\nTESTIRANJE NA PRIMJERIMA IZ ZADATKA:")
for product, expected_cat in zip(test_products, expected):
    # Preprocessing
    cleaned = preprocess_text(product)

    # Numeric features.
    # NOTE(review): assumes `title_length` / `word_count` in the training
    # data were computed the same way, from the cleaned title; confirm.
    title_len = len(cleaned)
    word_cnt = len(cleaned.split())

    # TF-IDF plus numeric columns, in the same column order as in training.
    # The numeric part is passed as an explicit 2-D array rather than a
    # nested list so the stacking does not depend on implicit conversion.
    X_tfidf_test = tfidf.transform([cleaned])
    X_numeric_test = np.array([[title_len, word_cnt]])
    X_combined_test = hstack([X_tfidf_test, X_numeric_test], format='csr')

    # Prediction
    prediction = best_model.predict(X_combined_test)
    if supports_proba:
        probability = best_model.predict_proba(X_combined_test).max()
    else:
        probability = None

    status = "✅" if prediction[0] == expected_cat else "❌"
    print(f"{status} {product}")
    print(f"   Očekivano: {expected_cat}")
    if probability is None:
        print(f"   Predviđeno: {prediction[0]}")
    else:
        print(f"   Predviđeno: {prediction[0]} ({probability:.2%})")
    print()

print("Treniranje modela završeno!")