## 1. Import biblioteka i definicija funkcija
Prvo ćemo importovati sve potrebne biblioteke i definisati pomoćne funkcije
za treniranje modela i evaluaciju performansi.

In [31]:
import os
import pickle
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
from sklearn.naive_bayes import ComplementNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import save_npz, load_npz, csr_matrix

### 1.1 Funkcije za text model (Naive Bayes)
ComplementNB je pogodan za multi-label klasifikaciju sa nebalansiranim podacima.

In [32]:
def train_text_nb(X: csr_matrix, Y: np.ndarray):
    """Fit a one-vs-rest Complement Naive Bayes classifier on text features.

    Args:
        X: Feature matrix handed unchanged to ``fit`` (annotated as a SciPy
            CSR matrix).
        Y: Target array handed unchanged to ``fit``.

    Returns:
        The fitted ``OneVsRestClassifier`` wrapping ``ComplementNB(alpha=0.5)``.
    """
    base_estimator = ComplementNB(alpha=0.5)
    classifier = OneVsRestClassifier(base_estimator)
    classifier.fit(X, Y)
    return classifier

def save_text_model(model: OneVsRestClassifier, path: str):
    """Write the trained text model to ``path`` with ``joblib.dump``.

    Args:
        model: The classifier object to serialize.
        path: Destination file name.
    """
    joblib.dump(value=model, filename=path)

def load_text_model(path: str):
    """Read a previously saved text model back from ``path``.

    Args:
        path: File name to load with ``joblib.load``.

    Returns:
        Whatever object ``joblib.load`` reconstructs from the file.
    """
    # NOTE: joblib files are pickle-based; only load files from a trusted source.
    restored = joblib.load(filename=path)
    return restored

### 1.2 Funkcije za image model (KNN)
KNN sa Manhattan metrikom se koristi za klasifikaciju na osnovu vizuelnih karakteristika postera.

In [33]:
def train_image_knn(X_features: np.ndarray, Y: np.ndarray, n_neighbors=5, metric='manhattan'):
    """Standardize image features and fit a one-vs-rest KNN classifier on them.

    Args:
        X_features: Feature array; a ``StandardScaler`` is fitted on it.
        Y: Target array passed to ``fit``.
        n_neighbors: Forwarded to ``KNeighborsClassifier``.
        metric: Distance metric forwarded to ``KNeighborsClassifier``.

    Returns:
        Tuple ``(model, scaler)``: the fitted ``OneVsRestClassifier`` and the
        fitted ``StandardScaler``. The same scaler must be applied to any
        features later passed to the model.
    """
    scaler = StandardScaler()
    knn_options = {
        "n_neighbors": n_neighbors,
        "weights": "distance",
        "metric": metric,
    }
    model = OneVsRestClassifier(KNeighborsClassifier(**knn_options))
    # The scaler is fitted here, on the training features only.
    model.fit(scaler.fit_transform(X_features), Y)
    return model, scaler

def save_image_model(model: OneVsRestClassifier, path: str):
    """Write the trained image model to ``path`` with ``joblib.dump``.

    Args:
        model: The classifier object to serialize.
        path: Destination file name.
    """
    joblib.dump(value=model, filename=path)

def load_image_model(path: str):
    """Read a previously saved image model back from ``path``.

    Args:
        path: File name to load with ``joblib.load``.

    Returns:
        Whatever object ``joblib.load`` reconstructs from the file.
    """
    # NOTE: joblib files are pickle-based; only load files from a trusted source.
    restored = joblib.load(filename=path)
    return restored

### 1.3 Funkcije za fusion model
Late fusion kombinuje predikcije text i image modela.

In [34]:
def late_fusion(text_probs: np.ndarray, image_probs: np.ndarray, alpha: float = 0.6):
    """Blend text and image probabilities with a weighted sum.

    Args:
        text_probs: Probabilities from the text model.
        image_probs: Probabilities from the image model.
        alpha: Weight given to ``text_probs``; ``image_probs`` gets ``1 - alpha``.

    Returns:
        ``alpha * text_probs + (1 - alpha) * image_probs``.
    """
    text_weight = alpha
    image_weight = 1 - alpha
    weighted_text = text_weight * text_probs
    weighted_image = image_weight * image_probs
    return weighted_text + weighted_image

def predict_multilabel(probs: np.ndarray, threshold: float = 0.5):
    """Turn probabilities into 0/1 predictions.

    Args:
        probs: Array of probabilities.
        threshold: Cut-off; entries greater than or equal to it become 1.

    Returns:
        Integer array with the same shape as ``probs``.
    """
    is_positive = probs >= threshold
    return is_positive.astype(int)

### 1.4 Funkcija za evaluaciju
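Standardne metrike za multi-label klasifikaciju: F1-score (micro), Hamming loss i subset accuracy. Napomena: `accuracy_score` za multi-label ulaz računa subset accuracy (udeo primera kod kojih su sve labele tačno predviđene), a ne accuracy po labeli.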

In [35]:
def evaluate_model(y_true, y_pred, dataset_name=""):
    """Compute and print multi-label metrics for one set of predictions.

    Args:
        y_true: Ground-truth label indicators.
        y_pred: Predicted label indicators, laid out like ``y_true``.
        dataset_name: Text inserted into the printed header line.

    Returns:
        Tuple ``(f1, hamming, subset_accuracy)``: micro-averaged F1,
        Hamming loss, and the value returned by ``accuracy_score``.
    """
    f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
    hamming = hamming_loss(y_true, y_pred)
    # For multi-label indicator input, accuracy_score is the *subset* accuracy:
    # a sample counts as correct only when every label matches. It is not a
    # per-label accuracy (that would be 1 - Hamming loss), so it is labelled
    # accordingly below.
    subset_acc = accuracy_score(y_true, y_pred)

    print(f"\n--- {dataset_name} metrics ---")
    print(f"F1-score (micro): {f1:.4f}")
    print(f"Hamming loss: {hamming:.4f}")
    print(f"Subset accuracy (exact match): {subset_acc:.4f}")

    return f1, hamming, subset_acc

## 2. Treniranje Text Modela
Učitavamo podatke, pripremamo TF-IDF features i treniramo Naive Bayes model.
Pretpostavka je da su funkcije `clean_text`, `build_tfidf` i `prepare_labels` 
dostupne iz modula `src`.

In [36]:
from src.preprocessing.text_preproc import clean_text, build_tfidf
from src.data.dataset import prepare_labels

# Load the processed movie table and run every overview through clean_text.
# NOTE(review): fillna("") is only applied later, at vectorization time, so
# clean_text presumably receives NaN for missing overviews - confirm it copes.
csv_path = "data/processed/movies_valid.csv"
df = pd.read_csv(csv_path)
df["overview"] = df["overview"].apply(clean_text)

# 70/15/15 split: carve off 30% as a hold-out, then halve it into val/test.
# Both calls share one seed so the partition is reproducible.
split_seed = 42
train_df, temp_df = train_test_split(
    df, test_size=0.30, random_state=split_seed
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, random_state=split_seed
)

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

Train size: 3565, Val size: 764, Test size: 764


### 2.1 Priprema labela i TF-IDF vektorizacija
MultiLabelBinarizer transformiše žanrove u binarni format, a TF-IDF konvertuje tekst u numeričke features.

In [37]:
def _overview_texts(frame):
    """Return the ``overview`` column as a list, with missing values as ""."""
    return frame["overview"].fillna("").tolist()


# Labels: fit the binarizer on the training split only, then reuse it.
y_train, mlb = prepare_labels(train_df, fit_mlb=True)
y_val, y_test = (
    prepare_labels(part, fit_mlb=False, mlb=mlb) for part in (val_df, test_df)
)

# TF-IDF: the vectorizer is built from training text; val/test are only transformed.
x_train_tfidf, vectorizer = build_tfidf(_overview_texts(train_df))
x_val_tfidf, x_test_tfidf = (
    vectorizer.transform(_overview_texts(part)) for part in (val_df, test_df)
)

print(f"TF-IDF shape: {x_train_tfidf.shape}")
print(f"Number of labels: {y_train.shape[1]}")

TF-IDF shape: (3565, 6739)
Number of labels: 20


### 2.2 Treniranje i evaluacija text modela

In [38]:
# Fit the text classifier on the training TF-IDF matrix.
text_model = train_text_nb(x_train_tfidf, y_train)

# Hard (binary) predictions for the validation split, then the test split.
val_preds, test_preds = (
    text_model.predict(features) for features in (x_val_tfidf, x_test_tfidf)
)

# Report metrics; the last call stays a bare expression so the notebook
# also displays its returned tuple.
evaluate_model(y_val, val_preds, "Text Model - Validation")
evaluate_model(y_test, test_preds, "Text Model - Test")


--- Text Model - Validation metrics ---
F1-score (micro): 0.4530
Hamming loss: 0.1081
Accuracy per label: 0.1309

--- Text Model - Test metrics ---
F1-score (micro): 0.4633
Hamming loss: 0.1048
Accuracy per label: 0.1531


(0.46329198793161247, 0.10477748691099477, 0.1531413612565445)

### 2.3 Čuvanje text modela i features

In [39]:
# Make sure both output directories exist before writing anything.
for out_dir in ("data/processed", "models"):
    os.makedirs(out_dir, exist_ok=True)

# Sparse TF-IDF matrices, one .npz file per split.
sparse_outputs = {
    "data/processed/text_train_features.npz": x_train_tfidf,
    "data/processed/text_val_features.npz": x_val_tfidf,
    "data/processed/text_test_features.npz": x_test_tfidf,
}
for out_path, matrix in sparse_outputs.items():
    save_npz(out_path, matrix)

# Label arrays, one .npy file per split.
label_outputs = {
    "data/processed/y_train.npy": y_train,
    "data/processed/y_val.npy": y_val,
    "data/processed/y_test.npy": y_test,
}
for out_path, labels in label_outputs.items():
    np.save(out_path, labels)

# Vectorizer and classifier are written with plain pickle, the label
# binarizer with joblib.
pickled_outputs = {
    "data/processed/tfidf_vectorizer.pkl": vectorizer,
    "models/text_nb_model.pkl": text_model,
}
for out_path, obj in pickled_outputs.items():
    with open(out_path, "wb") as f:
        pickle.dump(obj, f)
joblib.dump(mlb, "data/processed/mlb.pkl")

print("✓ Text model i features saved")

✓ Text model i features saved


## 3. Treniranje Image Modela
Ekstrakcija vizuelnih features iz postera filmova i treniranje KNN klasifikatora.
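Features se ekstrahuju pomoću pre-trained CNN modela. Napomena: progress bar u ispisu ispod prikazuje „Extracting color histograms“, pa treba proveriti koji ekstraktor `extract_features_from_dataset` zaista koristi.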

In [40]:
from src.data.dataset import PosterDataset
from src.preprocessing.image_preproc import extract_features_from_dataset, get_transforms

# One PosterDataset per split, each paired with its label array.
# NOTE(review): the training split uses get_transforms(train=True). If that
# enables random augmentation, the features extracted from it afterwards
# would differ between runs - confirm against get_transforms.
train_dataset = PosterDataset(
    train_df,
    y_train,
    transforms=get_transforms(train=True),
)
val_dataset = PosterDataset(
    val_df,
    y_val,
    transforms=get_transforms(train=False),
)
test_dataset = PosterDataset(
    test_df,
    y_test,
    transforms=get_transforms(train=False),
)

print(f"Datasets created: Train={len(train_dataset)}, Val={len(val_dataset)}, Test={len(test_dataset)}")

Datasets created: Train=3565, Val=764, Test=764


### 3.1 Ekstrakcija image features
Ova operacija može trajati duže jer prolazi kroz sve slike i ekstrahuje deep features.

In [41]:
print("Extracting image features...")

# Each call yields a (features, labels) pair for one split; unpack it
# straight away so the per-split names are available to the later cells.
train_pair = extract_features_from_dataset(train_dataset)
x_train_img, y_train_img = train_pair

val_pair = extract_features_from_dataset(val_dataset)
x_val_img, y_val_img = val_pair

test_pair = extract_features_from_dataset(test_dataset)
x_test_img, y_test_img = test_pair

print(f"Image features shape: {x_train_img.shape}")

Extracting image features...


Extracting color histograms: 100%|██████████| 3565/3565 [00:54<00:00, 65.08it/s]
Extracting color histograms: 100%|██████████| 764/764 [00:07<00:00, 106.43it/s]
Extracting color histograms: 100%|██████████| 764/764 [00:07<00:00, 105.91it/s]

Image features shape: (3565, 512)





### 3.2 Treniranje i evaluacija image modela

In [42]:
# Fit the KNN classifier; the returned scaler was fitted on the training features.
image_model, scaler = train_image_knn(x_train_img, y_train_img)

# Apply that same scaler to the validation and test features.
X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (x_val_img, x_test_img)
)

# Hard (binary) predictions for both splits.
val_preds_img, test_preds_img = (
    image_model.predict(scaled) for scaled in (X_val_scaled, X_test_scaled)
)

# Report metrics against the labels returned by feature extraction; the last
# call stays a bare expression so the notebook also displays its tuple.
evaluate_model(y_val_img, val_preds_img, "Image Model - Validation")
evaluate_model(y_test_img, test_preds_img, "Image Model - Test")


--- Image Model - Validation metrics ---
F1-score (micro): 0.2324
Hamming loss: 0.1185
Accuracy per label: 0.0720

--- Image Model - Test metrics ---
F1-score (micro): 0.2178
Hamming loss: 0.1236
Accuracy per label: 0.0785


(0.21780538302277433, 0.12362565445026177, 0.07853403141361257)

### 3.3 Čuvanje image modela i features

In [43]:
# Persist the extracted features and labels of all three splits in one archive.
np.savez("data/processed/image_features.npz",
         X_train=x_train_img, Y_train=y_train_img,
         X_val=x_val_img, Y_val=y_val_img,
         X_test=x_test_img, Y_test=y_test_img)

# Persist the classifier together with the StandardScaler it was trained with.
# The KNN model was fitted on standardized features, so without the scaler the
# saved model cannot be applied to new, unscaled features.
os.makedirs("models", exist_ok=True)
save_image_model(image_model, "models/image_knn_model.pkl")
joblib.dump(scaler, "models/image_scaler.pkl")

print("✓ Image model i features saved")

✓ Image model i features saved


## 4. Late Fusion Model
Kombinujemo predikcije text i image modela kroz ponderisano usrednjavanje probability vrednosti.
Alpha parametar kontroliše koliki uticaj ima svaki model (alpha=0.6 znači 60% text, 40% image).

In [44]:
# Section banner for the fusion part of the run.
banner = "=" * 60
print("\n" + banner)
print("FUSION MODEL - Combining Text and Image Predictions")
print(banner)

# alpha: weight of the text model in the blend (the image model gets 1 - alpha).
# threshold: cut-off used to turn fused probabilities into 0/1 predictions.
alpha, threshold = 0.6, 0.5

print(f"Fusion parameters: alpha={alpha}, threshold={threshold}")


FUSION MODEL - Combining Text and Image Predictions
Fusion parameters: alpha=0.6, threshold=0.5


### 4.1 Generisanje probability predikcija
Koristimo `predict_proba` metod oba modela da dobijemo probability ocene umesto binarnih predikcija.

In [45]:
# Per-label probabilities from the text model (validation, then test).
text_val_probs, text_test_probs = (
    text_model.predict_proba(features)
    for features in (x_val_tfidf, x_test_tfidf)
)

# Per-label probabilities from the image model, on the scaled features.
image_val_probs, image_test_probs = (
    image_model.predict_proba(scaled)
    for scaled in (X_val_scaled, X_test_scaled)
)

print(f"Probability shapes - Text: {text_val_probs.shape}, Image: {image_val_probs.shape}")

Probability shapes - Text: (764, 20), Image: (764, 20)


### 4.2 Fusion i finalne predikcije
Kombinujemo probabilities i primenjujemo threshold da dobijemo binarne predikcije.

In [46]:
# Blend text and image probabilities for each split with the shared alpha.
# NOTE(review): this assumes row i of the text and image arrays refers to the
# same movie - confirm feature extraction keeps the DataFrame order.
fused_val_probs, fused_test_probs = (
    late_fusion(text_part, image_part, alpha=alpha)
    for text_part, image_part in (
        (text_val_probs, image_val_probs),
        (text_test_probs, image_test_probs),
    )
)

# Binarize the fused probabilities with the shared threshold.
val_preds_fusion, test_preds_fusion = (
    predict_multilabel(fused, threshold)
    for fused in (fused_val_probs, fused_test_probs)
)

### 4.3 Evaluacija fusion modela
Uporedimo performanse fusion modela sa pojedinačnim modelima.

In [47]:
# Score the fused predictions on the validation split (metrics are printed).
evaluate_model(y_val, val_preds_fusion, "Fusion Model - Validation")
# Same for the test split. This is left as a bare trailing expression, so the
# notebook also displays the returned (f1, hamming, accuracy) tuple.
evaluate_model(y_test, test_preds_fusion, "Fusion Model - Test")


--- Fusion Model - Validation metrics ---
F1-score (micro): 0.2975
Hamming loss: 0.0992
Accuracy per label: 0.1073

--- Fusion Model - Test metrics ---
F1-score (micro): 0.2605
Hamming loss: 0.1048
Accuracy per label: 0.1099


(0.2605080831408776, 0.10477748691099477, 0.1099476439790576)

## 5. Finalna Poređenja Modela
Rezime svih modela radi lakšeg poređenja performansi.

In [48]:
print("FINAL MODEL COMPARISON (Test Set)")
print("="*60)

# Test-set predictions of every model, keyed by the name shown in the table.
models_results = {
    "Text Model": test_preds,
    "Image Model": test_preds_img,
    "Fusion Model": test_preds_fusion
}


def _comparison_row(model_name, predictions):
    """Build one table row of test-set metrics for ``predictions``.

    All metrics are computed against ``y_test``. "Accuracy" is the value of
    ``accuracy_score``, which for multi-label input is exact-match accuracy.
    """
    return {
        "Model": model_name,
        "F1-Score": f1_score(y_test, predictions, average="micro", zero_division=0),
        "Hamming Loss": hamming_loss(y_test, predictions),
        "Accuracy": accuracy_score(y_test, predictions),
    }


results = [
    _comparison_row(name, preds) for name, preds in models_results.items()
]

comparison_df = pd.DataFrame(results)
print(comparison_df.to_string(index=False))

FINAL MODEL COMPARISON (Test Set)
       Model  F1-Score  Hamming Loss  Accuracy
  Text Model  0.463292      0.104777  0.153141
 Image Model  0.217805      0.123626  0.078534
Fusion Model  0.260508      0.104777  0.109948


## 6. Zaključak
Pipeline je kompletan! Svi modeli su trenirani i sačuvani:
- **Text Model**: `models/text_nb_model.pkl`
- **Image Model**: `models/image_knn_model.pkl`
- **Features**: `data/processed/` direktorijum

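Fusion model kombinuje predikcije oba pristupa. U ovom eksperimentu, međutim, text model postiže najbolji F1-score na test skupu (0.463 naspram 0.261 za fusion i 0.218 za image model), pa parametre `alpha` i `threshold` treba podesiti na validacionom skupu pre nego što se fusion koristi kao finalni model za multi-label klasifikaciju filmskih žanrova.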