In [74]:
# ============================================================
# 0. IMPORTI + SEED
# ============================================================

import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

import tensorflow as tf
from tensorflow.keras import layers, Model, Input

np.random.seed(42)
tf.random.set_seed(42)

# ============================================================
# 1. UČITAVANJE I ČIŠĆENJE PODATAKA
# ============================================================

df = pd.read_csv("./car-details-v3.csv")

# Zadržavamo bitne kolone
df = df[
    [
        "name",
        "year",
        "selling_price",
        "km_driven",
        "fuel",
        "seller_type",
        "transmission",
        "owner",
        "mileage",
        "engine",
        "max_power",
        "torque",
        "seats",
    ]
].copy()

# ----- Parsiranje numeričkih vrednosti iz string kolona -----
def extract_first_number(s):
    s = str(s)
    out = pd.Series(s).str.extract(r"(\d+\.?\d*)")[0]
    try:
        return float(out)
    except Exception:
        return np.nan

df["mileage"] = df["mileage"].apply(extract_first_number)
df["engine"] = df["engine"].apply(extract_first_number)
df["max_power"] = df["max_power"].apply(extract_first_number)
df["torque"] = df["torque"].apply(extract_first_number)
df["seats"] = df["seats"].astype(float)

# Uklanjamo redove sa NaN
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

print("Broj automobila nakon čišćenja:", len(df))

# ============================================================
# 2. FEATURE ENGINEERING ZA AUTOMOBILE
# ============================================================

# --- 2.1. Brand i flagovi (SUV, sportski, luksuzni) ---

def extract_brand(name: str) -> str:
    name = str(name)
    tokens = name.split()
    if len(tokens) >= 2 and tokens[0] == "Mercedes-Benz":
        return "Mercedes-Benz"
    if len(tokens) >= 2 and tokens[0] == "Land" and tokens[1] == "Rover":
        return "Land Rover"
    return tokens[0]

df["brand"] = df["name"].apply(extract_brand)

# gruba lista SUV/offroad modela
SUV_PAT = r"Scorpio|Bolero|Fortuner|Safari|Sumo|Innova|XUV|Creta|Duster|Terrano|EcoSport|CR-V|Endeavour|Hector|Harrier|Compass|BR-V|Captiva|Xtrail|X-Trail|X3|X5|GLA|GLC|GLE|Q3|Q5|Q7|XC60|XC90|X1|Innova Crysta"

df["is_suv"] = df["name"].str.contains(SUV_PAT, case=False, regex=True).astype(int)

# sportski modeli: GTI, RS, TSI, VTEC, Abarth, Cooper S...
SPORT_PAT = r"GTI|GT TSI|TSI|TFSI|vRS|RS\b|iVTEC|VTEC|Type R|Sports|Sportz|Abarth|Turbo|N Line|Cooper S|ST Line|AMG|M\b|M3|M4|M5|S\b|S3|S4|S5"

df["is_sport_model"] = df["name"].str.contains(
    SPORT_PAT, case=False, regex=True
).astype(int)

# luksuzni brendovi
LUX_BRANDS = {
    "BMW",
    "Mercedes-Benz",
    "Audi",
    "Volvo",
    "Jaguar",
    "Land Rover",
    "Porsche",
    "Lexus",
    "Mini",
    "MINI",
}

df["is_luxury_brand"] = df["brand"].isin(LUX_BRANDS).astype(int)

# --- 2.2. Normalizacija numeričkih kolona + enkodovanje kategorija ---

NUM_COLS = [
    "year",
    "selling_price",
    "km_driven",
    "mileage",
    "engine",
    "max_power",
    "torque",
    "seats",
]

BIN_COLS = ["is_suv", "is_sport_model", "is_luxury_brand"]

CAT_COLS = ["fuel", "seller_type", "transmission", "owner"]

# Skaliranje numeričkih u [0,1]
scaler = MinMaxScaler()
df[NUM_COLS] = scaler.fit_transform(df[NUM_COLS])

# Enkodovanje kategorija u ID-jeve
for col in CAT_COLS:
    df[col] = df[col].astype("category")

fuel_cat = df["fuel"].cat.categories
seller_cat = df["seller_type"].cat.categories
trans_cat = df["transmission"].cat.categories
owner_cat = df["owner"].cat.categories

df["fuel"] = df["fuel"].cat.codes
df["seller_type"] = df["seller_type"].cat.codes
df["transmission"] = df["transmission"].cat.codes
df["owner"] = df["owner"].cat.codes

num_fuel = len(fuel_cat)
num_seller = len(seller_cat)
num_trans = len(trans_cat)
num_owner = len(owner_cat)

print("fuel:", list(fuel_cat))
print("seller_type:", list(seller_cat))
print("transmission:", list(trans_cat))
print("owner:", list(owner_cat))

# Konačni skup feature-a za ITEM tower
ITEM_FEATURES = NUM_COLS + BIN_COLS + ["fuel", "seller_type", "transmission", "owner"]
df_items = df[ITEM_FEATURES].copy()

# Helper za dobijanje koda
def get_code(categories, name, default=0):
    if name in categories:
        return int(np.where(categories == name)[0][0])
    return default

petrol_code = get_code(fuel_cat, "Petrol", 0)
diesel_code = get_code(fuel_cat, "Diesel", 0)

manual_code = get_code(trans_cat, "Manual", 0)
auto_code = get_code(trans_cat, "Automatic", manual_code)

first_owner_code = 0
for i, c in enumerate(owner_cat):
    if "First Owner" in c:
        first_owner_code = i
        break

dealer_code = 0
individual_code = 0
for i, c in enumerate(seller_cat):
    if "Dealer" in c:
        dealer_code = i
    if "Individual" in c:
        individual_code = i

# ============================================================
# 3. GENERISANJE KORISNIČKIH SEGMENTA
# ============================================================

# segment_id:
# 0 - Budget
# 1 - Diesel commuter
# 2 - Family
# 3 - Sport
# 4 - Offroad/SUV
# 5 - Luxury premium

SEGMENT_NAMES = {
    0: "Budget",
    1: "Diesel commuter",
    2: "Family",
    3: "Sport",
    4: "Offroad",
    5: "Luxury",
}

def generate_segment_users(n_per_segment=80):
    users = []

    for _ in range(n_per_segment):
        # 0) Budget
        users.append(
            {
                "segment_id": 0,
                "year": np.random.uniform(0.3, 0.65),
                "selling_price": np.random.uniform(0.1, 0.4),
                "km_driven": np.random.uniform(0.4, 0.8),
                "mileage": np.random.uniform(0.6, 1.0),
                "engine": np.random.uniform(0.2, 0.5),
                "max_power": np.random.uniform(0.2, 0.5),
                "torque": np.random.uniform(0.3, 0.6),
                "seats": np.random.uniform(0.4, 0.8),
                "fuel": petrol_code,
                "seller_type": individual_code,
                "transmission": manual_code,
                "owner": first_owner_code,
            }
        )

        # 1) Diesel commuter
        users.append(
            {
                "segment_id": 1,
                "year": np.random.uniform(0.5, 0.8),
                "selling_price": np.random.uniform(0.3, 0.6),
                "km_driven": np.random.uniform(0.2, 0.5),
                "mileage": np.random.uniform(0.6, 1.0),
                "engine": np.random.uniform(0.4, 0.7),
                "max_power": np.random.uniform(0.3, 0.6),
                "torque": np.random.uniform(0.4, 0.8),
                "seats": np.random.uniform(0.4, 0.7),
                "fuel": diesel_code,
                "seller_type": dealer_code,
                "transmission": manual_code,
                "owner": first_owner_code,
            }
        )

        # 2) Family
        users.append(
            {
                "segment_id": 2,
                "year": np.random.uniform(0.6, 0.9),
                "selling_price": np.random.uniform(0.4, 0.7),
                "km_driven": np.random.uniform(0.1, 0.4),
                "mileage": np.random.uniform(0.5, 0.9),
                "engine": np.random.uniform(0.4, 0.7),
                "max_power": np.random.uniform(0.4, 0.7),
                "torque": np.random.uniform(0.4, 0.7),
                "seats": np.random.uniform(0.7, 1.0),
                "fuel": np.random.choice([petrol_code, diesel_code]),
                "seller_type": dealer_code,
                "transmission": auto_code,
                "owner": first_owner_code,
            }
        )

        # 3) Sport
        users.append(
            {
                "segment_id": 3,
                "year": np.random.uniform(0.5, 0.9),
                "selling_price": np.random.uniform(0.6, 0.9),
                "km_driven": np.random.uniform(0.05, 0.3),
                "mileage": np.random.uniform(0.3, 0.7),
                "engine": np.random.uniform(0.6, 0.9),
                "max_power": np.random.uniform(0.7, 1.0),
                "torque": np.random.uniform(0.6, 0.9),
                "seats": np.random.uniform(0.3, 0.6),
                "fuel": petrol_code,
                "seller_type": dealer_code,
                "transmission": manual_code,
                "owner": first_owner_code,
            }
        )

        # 4) Offroad / SUV
        users.append(
            {
                "segment_id": 4,
                "year": np.random.uniform(0.4, 0.8),
                "selling_price": np.random.uniform(0.4, 0.7),
                "km_driven": np.random.uniform(0.3, 0.7),
                "mileage": np.random.uniform(0.3, 0.7),
                "engine": np.random.uniform(0.6, 1.0),
                "max_power": np.random.uniform(0.5, 0.9),
                "torque": np.random.uniform(0.6, 1.0),
                "seats": np.random.uniform(0.6, 1.0),
                "fuel": diesel_code,
                "seller_type": dealer_code,
                "transmission": manual_code,
                "owner": first_owner_code,
            }
        )

        # 5) Luxury premium
        users.append(
            {
                "segment_id": 5,
                "year": np.random.uniform(0.8, 1.0),
                "selling_price": np.random.uniform(0.7, 1.0),
                "km_driven": np.random.uniform(0.0, 0.25),
                "mileage": np.random.uniform(0.3, 0.7),
                "engine": np.random.uniform(0.6, 0.9),
                "max_power": np.random.uniform(0.6, 0.9),
                "torque": np.random.uniform(0.5, 0.8),
                "seats": np.random.uniform(0.5, 0.9),
                "fuel": petrol_code,
                "seller_type": dealer_code,
                "transmission": auto_code,
                "owner": first_owner_code,
            }
        )

    return pd.DataFrame(users)


users_df = generate_segment_users(80)
print("Broj sintetičkih korisnika:", len(users_df))

# ============================================================
# 4. HEURISTIČKI SCORE (SEGMENT-SPECIFIČAN)
# ============================================================

# težine po segmentu za NUM_COLS (redosled: NUM_COLS)
SEG_NUM_WEIGHTS = {
    0: np.array([1.5, 3.0, 2.0, 3.0, 0.5, 0.5, 0.5, 1.0]),  # Budget
    1: np.array([2.0, 2.0, 3.0, 4.0, 1.0, 1.0, 2.0, 1.0]),  # Diesel commuter
    2: np.array([3.0, 2.0, 3.0, 2.0, 1.0, 1.0, 2.0, 4.0]),  # Family
    3: np.array([2.0, 2.0, 2.0, 1.0, 4.0, 5.0, 4.0, 3.0]),  # Sport
    4: np.array([2.0, 2.0, 1.0, 1.0, 4.0, 3.0, 4.0, 3.0]),  # Offroad
    5: np.array([4.0, 4.0, 3.0, 1.0, 3.0, 3.0, 2.0, 2.0]),  # Luxury
}

def get_segment_params(seg_id: int):
    """Vraća težine za kategorijske i binarne feature-e po segmentu."""
    # default
    params = {
        "w_fuel": 2.0,
        "w_trans": 2.0,
        "w_owner": 1.0,
        "w_suv": 0.0,
        "w_sport_model": 0.0,
        "w_lux": 0.0,
    }

    if seg_id == 0:  # Budget
        params.update(
            dict(
                w_fuel=2.0,
                w_trans=2.0,
                w_owner=1.0,
                w_suv=-1.0,
                w_sport_model=0.0,
                w_lux=-1.0,
            )
        )
    elif seg_id == 1:  # Diesel commuter
        params.update(
            dict(
                w_fuel=4.0,
                w_trans=2.0,
                w_owner=1.0,
                w_suv=0.5,
                w_sport_model=0.0,
                w_lux=0.0,
            )
        )
    elif seg_id == 2:  # Family
        params.update(
            dict(
                w_fuel=2.0,
                w_trans=3.0,
                w_owner=2.0,
                w_suv=3.0,
                w_sport_model=0.0,
                w_lux=2.0,
            )
        )
    elif seg_id == 3:  # Sport
        params.update(
            dict(
                w_fuel=3.0,
                w_trans=3.0,
                w_owner=1.0,
                w_suv=-4.0,      # kazni SUV
                w_sport_model=6.0,
                w_lux=2.0,
            )
        )
    elif seg_id == 4:  # Offroad
        params.update(
            dict(
                w_fuel=3.0,
                w_trans=2.0,
                w_owner=1.0,
                w_suv=7.0,       # traži SUV
                w_sport_model=0.0,
                w_lux=1.0,
            )
        )
    elif seg_id == 5:  # Luxury
        params.update(
            dict(
                w_fuel=2.0,
                w_trans=4.0,     # automatik
                w_owner=3.0,
                w_suv=1.0,
                w_sport_model=1.0,
                w_lux=6.0,       # luksuzni brend
            )
        )

    return params


def score_item_for_segment(user, car):
    seg = user["segment"]

    # ====== KOMUNALNI BONUSI ======
    base_num = 1 - np.abs(car[NUM_COLS].values - user[NUM_COLS].values)
    num_score = base_num.sum()

    # ====== DETEKCIJA BIN FLAGOVA ======
    suv      = car["is_suv"]
    sport    = car["is_sport_model"]
    offroad  = car["is_offroad"]
    premium  = car["is_premium"]
    family   = car["is_family"]
    econ     = car["is_economy"]
    perf     = car["is_performance"]

    # ============= 1) BUDGET USER =============
    if seg == 1:
        return (
            num_score
            + (1 - car["selling_price"]) * 8
            + (1 - car["engine"]) * 3
            + (1 - car["max_power"]) * 3
            - sport * 5
            - premium * 3
            + econ * 5
        )

    # ============= 2) DIESEL COMMUTER =============
    if seg == 2:
        return (
            num_score
            + (car["fuel"] == diesel_code) * 8
            + (car["mileage"]) * 4
            + (1 - car["km_driven"]) * 3
            + (1 - sport) * 2
            - suv * 3
        )

    # ============= 3) FAMILY USER =============
    if seg == 3:
        return (
            num_score
            + (car["seats"]) * 10
            + family * 8
            - sport * 10
            - perf * 8
            - offroad * 10
            + suv * 4
        )

    # ============= 4) SPORT USER =============
    if seg == 4:
        return (
            num_score
            + car["max_power"] * 15
            + car["engine"] * 10
            + car["torque"] * 8
            - suv * 10
            - family * 6
            + sport * 12
            + perf * 10
            - premium * 3
            - (car["seats"] > 0.6) * 10
        )

    # ============= 5) OFFROAD USER =============
    if seg == 5:
        return (
            num_score
            + offroad * 15
            + suv * 10
            + car["torque"] * 6
            - sport * 6
            + car["engine"] * 4
        )

    # ============= 6) LUXURY USER =============
    if seg == 6:
        return (
            num_score
            + premium * 12
            + (1 - car["km_driven"]) * 6
            + (car["year"]) * 8
            + (car["selling_price"]) * 8
            - sport * 8
            + suv * 3  # premium SUV ok
        )

    return num_score

def score_items_for_segment(user_row, cars_df):
    seg = user_row["segment"]
    scores = np.zeros(len(cars_df))

    # prolazak kroz sve automobile
    for i, car in cars_df.iterrows():
        scores[i] = score_item_for_segment(user_row, car)

    return scores

def score_user_item(user_row: pd.Series, cars_df: pd.DataFrame) -> np.ndarray:
    """
    Vraća vektor skorova za sve automobile za datog user-a.
    Skor je kombinacija:
    - blizine u NUM_COLS (sa težinama po segmentu)
    - slaganja kategorija (fuel, transmission, owner)
    - binarnih flagova (SUV, sport model, luksuzni brend)
    """
    seg_id = int(user_row["segment_id"])
    w_num = SEG_NUM_WEIGHTS[seg_id]
    params = get_segment_params(seg_id)

    # NUMERIC closeness
    car_num = cars_df[NUM_COLS].values  # (N, num_numeric)
    user_num_vec = user_row[NUM_COLS].values  # (num_numeric,)

    diff = np.abs(car_num - user_num_vec)
    closeness = 1.0 - diff  # veća vrednost = bliže preferenciji
    num_score = (closeness * w_num).sum(axis=1)  # (N,)

    # BINARNI flagovi
    suv = cars_df["is_suv"].values
    sport_flag = cars_df["is_sport_model"].values
    lux_flag = cars_df["is_luxury_brand"].values

    bin_score = (
        suv * params["w_suv"]
        + sport_flag * params["w_sport_model"]
        + lux_flag * params["w_lux"]
    )

    # KATEGORIJSKE preferencije (fuel, transmission, owner)
    fuel_match = (
        (cars_df["fuel"].values == int(user_row["fuel"])) * params["w_fuel"]
    )
    trans_match = (
        (cars_df["transmission"].values == int(user_row["transmission"]))
        * params["w_trans"]
    )
    owner_match = (
        (cars_df["owner"].values == int(user_row["owner"]))
        * params["w_owner"]
    )

    return num_score + bin_score + fuel_match + trans_match + owner_match


# ============================================================
# 5. GENERISANJE POZITIVNIH/NEGATIVNIH PAROVA ZA TRENING
# ============================================================

def generate_training_pairs_fast(users_df, cars_df, n_pos=20, n_neg=20):
    """
    Za svakog user-a:
      - izračunaj skor za sve automobile
      - uzmi top n_pos kao pozitivne
      - uzmi bottom n_neg kao negativne
    Vraća već SPLITOVANE matrice za Keras (user_num, user_fuel, ..., item_num, item_fuel, ...).
    """
    u_num_list = []
    u_fuel_list = []
    u_trans_list = []
    u_owner_list = []
    u_segment_list = []

    i_num_list = []    # NUM_COLS + BIN_COLS
    i_fuel_list = []
    i_trans_list = []
    i_owner_list = []

    y_list = []

    for _, user in users_df.iterrows():
        scores = score_user_item(user, cars_df)
        # indeks automobila u cars_df
        idx_sorted = np.argsort(scores)

        pos_idx = idx_sorted[-n_pos:]
        neg_idx = idx_sorted[:n_neg]

        # funkcija koja dodaje parove
        def add_pairs(car_indices, label):
            for idx in car_indices:
                car = cars_df.iloc[idx]
                # user features
                u_num_list.append(user[NUM_COLS].values.astype("float32"))
                u_fuel_list.append(int(user["fuel"]))
                u_trans_list.append(int(user["transmission"]))
                u_owner_list.append(int(user["owner"]))
                u_segment_list.append(int(user["segment_id"]))

                # item features (numeric + binarni)
                i_num_list.append(
                    np.concatenate(
                        [
                            car[NUM_COLS].values.astype("float32"),
                            car[BIN_COLS].values.astype("float32"),
                        ]
                    )
                )
                i_fuel_list.append(int(car["fuel"]))
                i_trans_list.append(int(car["transmission"]))
                i_owner_list.append(int(car["owner"]))

                y_list.append(float(label))

        add_pairs(pos_idx, 1.0)
        add_pairs(neg_idx, 0.0)

    # konverzija u np.array
    u_num = np.stack(u_num_list).astype("float32")
    u_fuel = np.array(u_fuel_list, dtype="int32")
    u_trans = np.array(u_trans_list, dtype="int32")
    u_owner = np.array(u_owner_list, dtype="int32")
    u_segment = np.array(u_segment_list, dtype="int32")

    i_num = np.stack(i_num_list).astype("float32")  # NUM_COLS + BIN_COLS
    i_fuel = np.array(i_fuel_list, dtype="int32")
    i_trans = np.array(i_trans_list, dtype="int32")
    i_owner = np.array(i_owner_list, dtype="int32")

    y = np.array(y_list, dtype="float32")

    return (u_num, u_fuel, u_trans, u_owner, u_segment,
            i_num, i_fuel, i_trans, i_owner, y)


(
    u_num,
    u_fuel,
    u_trans,
    u_owner,
    u_segment,
    i_num,
    i_fuel,
    i_trans,
    i_owner,
    y,
) = generate_training_pairs_fast(users_df, df_items, n_pos=15, n_neg=15)

print("u_num:", u_num.shape)
print("i_num:", i_num.shape)
print("y:", y.shape)

# ============================================================
# 6. TWO–TOWER MODEL
# ============================================================

embedding_dim = 32

num_numeric_user = len(NUM_COLS)          # 8
num_numeric_item = len(NUM_COLS) + len(BIN_COLS)  # 8 + 3 = 11

# ---- USER tower ----
user_num_in = Input(shape=(num_numeric_user,), name="user_num")
user_fuel_in = Input(shape=(), dtype="int32", name="user_fuel")
user_trans_in = Input(shape=(), dtype="int32", name="user_trans")
user_owner_in = Input(shape=(), dtype="int32", name="user_owner")
user_segment_in = Input(shape=(), dtype="int32", name="user_segment")

uf_emb = layers.Embedding(num_fuel, 8)(user_fuel_in)
ut_emb = layers.Embedding(num_trans, 8)(user_trans_in)
uo_emb = layers.Embedding(num_owner, 8)(user_owner_in)
usg_emb = layers.Embedding(len(SEGMENT_NAMES), 8)(user_segment_in)

user_concat = layers.Concatenate()(
    [
        user_num_in,
        layers.Flatten()(uf_emb),
        layers.Flatten()(ut_emb),
        layers.Flatten()(uo_emb),
        layers.Flatten()(usg_emb),
    ]
)

u_hidden = layers.Dense(128, activation="relu")(user_concat)
u_hidden = layers.Dropout(0.2)(u_hidden)
u_hidden = layers.Dense(64, activation="relu")(u_hidden)
u_vec = layers.Dense(embedding_dim)(u_hidden)

user_tower = Model(
    inputs=[user_num_in, user_fuel_in, user_trans_in, user_owner_in, user_segment_in],
    outputs=u_vec,
    name="user_tower",
)

# ---- ITEM tower ----
item_num_in = Input(shape=(num_numeric_item,), name="item_num")
item_fuel_in = Input(shape=(), dtype="int32", name="item_fuel")
item_trans_in = Input(shape=(), dtype="int32", name="item_trans")
item_owner_in = Input(shape=(), dtype="int32", name="item_owner")

if_emb = layers.Embedding(num_fuel, 8)(item_fuel_in)
it_emb = layers.Embedding(num_trans, 8)(item_trans_in)
io_emb = layers.Embedding(num_owner, 8)(item_owner_in)

item_concat = layers.Concatenate()(
    [
        item_num_in,
        layers.Flatten()(if_emb),
        layers.Flatten()(it_emb),
        layers.Flatten()(io_emb),
    ]
)

i_hidden = layers.Dense(128, activation="relu")(item_concat)
i_hidden = layers.Dropout(0.2)(i_hidden)
i_hidden = layers.Dense(64, activation="relu")(i_hidden)
i_vec = layers.Dense(embedding_dim)(i_hidden)

item_tower = Model(
    inputs=[item_num_in, item_fuel_in, item_trans_in, item_owner_in],
    outputs=i_vec,
    name="item_tower",
)

# Dot-product score
dot_score = layers.Dot(axes=1, normalize=False)([u_vec, i_vec])

model = Model(
    inputs=[
        user_num_in,
        user_fuel_in,
        user_trans_in,
        user_owner_in,
        user_segment_in,
        item_num_in,
        item_fuel_in,
        item_trans_in,
        item_owner_in,
    ],
    outputs=dot_score,
)

model.compile(optimizer="adam", loss="binary_crossentropy")
model.summary()

# ============================================================
# 7. TRENING TWO–TOWER MODELA
# ============================================================

history = model.fit(
    [
        u_num,
        u_fuel,
        u_trans,
        u_owner,
        u_segment,
        i_num,
        i_fuel,
        i_trans,
        i_owner,
    ],
    y,
    epochs=10,
    batch_size=128,
    verbose=1,
)

# ============================================================
# 8. PREKALKULISANI ITEM EMBEDDINGS
# ============================================================

# za sve automobile (u istom redosledu kao df_items)
item_num_all = np.concatenate(
    [df_items[NUM_COLS].values, df_items[BIN_COLS].values], axis=1
).astype("float32")
item_fuel_all = df_items["fuel"].values.astype("int32")
item_trans_all = df_items["transmission"].values.astype("int32")
item_owner_all = df_items["owner"].values.astype("int32")

item_embeddings = item_tower.predict(
    [item_num_all, item_fuel_all, item_trans_all, item_owner_all],
    verbose=0,
)

# ============================================================
# 9. FUNKCIJA ZA PREPORUKE
# ============================================================

def recommend_for_user(user_pref: dict, top_n=10):
    """
    user_pref mora imati ključeve:
    - NUM_COLS (year, selling_price, km_driven, mileage, engine, max_power, torque, seats) već u [0,1]
    - 'fuel', 'transmission', 'owner', 'segment_id' kao ID-jeve (code-ove)
    """
    # user numeric
    user_num_vec = np.array([[user_pref[c] for c in NUM_COLS]], dtype="float32")
    user_fuel_vec = np.array([user_pref["fuel"]], dtype="int32")
    user_trans_vec = np.array([user_pref["transmission"]], dtype="int32")
    user_owner_vec = np.array([user_pref["owner"]], dtype="int32")
    user_segment_vec = np.array([user_pref["segment_id"]], dtype="int32")

    u_emb = user_tower.predict(
        [user_num_vec, user_fuel_vec, user_trans_vec, user_owner_vec, user_segment_vec],
        verbose=0,
    )

    scores = cosine_similarity(u_emb, item_embeddings)[0]
    top_idx = np.argsort(scores)[::-1][:top_n]

    # mapiranje nazad na df (koristimo indexe df_items)
    car_indices = df_items.index[top_idx]
    result = df.loc[car_indices, ["name", "year", "selling_price", "km_driven", "fuel", "transmission", "owner"]].copy()

    return result

# ============================================================
# 10. TESTNE PERSONE KORISNIKA
# ============================================================

# Helper: malo utility za lep print
def print_recs(title, user_pref, top_n=10):
    print("=" * 20, title, "=" * 20)
    recs = recommend_for_user(user_pref, top_n=top_n)
    print(recs)
    print()

# SPORT USER (segment_id = 3)
sport_user = {
    "segment_id": 3,
    "year": 0.6,
    "selling_price": 0.75,
    "km_driven": 0.15,
    "mileage": 0.4,
    "engine": 0.8,
    "max_power": 0.9,
    "torque": 0.8,
    "seats": 0.4,
    "fuel": petrol_code,
    "transmission": manual_code,
    "owner": first_owner_code,
}

# LUXURY USER (segment_id = 5)
luxury_user = {
    "segment_id": 5,
    "year": 0.9,
    "selling_price": 0.9,
    "km_driven": 0.1,
    "mileage": 0.5,
    "engine": 0.75,
    "max_power": 0.75,
    "torque": 0.7,
    "seats": 0.8,
    "fuel": petrol_code,
    "transmission": auto_code,
    "owner": first_owner_code,
}

# FAMILY USER (segment_id = 2)
family_user = {
    "segment_id": 2,
    "year": 0.75,
    "selling_price": 0.55,
    "km_driven": 0.25,
    "mileage": 0.7,
    "engine": 0.6,
    "max_power": 0.6,
    "torque": 0.6,
    "seats": 0.9,
    "fuel": diesel_code,
    "transmission": auto_code,
    "owner": first_owner_code,
}

# BUDGET CITY USER (segment_id = 0)
budget_user = {
    "segment_id": 0,
    "year": 0.5,
    "selling_price": 0.2,
    "km_driven": 0.6,
    "mileage": 0.8,
    "engine": 0.4,
    "max_power": 0.4,
    "torque": 0.4,
    "seats": 0.6,
    "fuel": petrol_code,
    "transmission": manual_code,
    "owner": first_owner_code,
}

# OFFROAD USER (segment_id = 4)
offroad_user = {
    "segment_id": 4,
    "year": 0.6,
    "selling_price": 0.6,
    "km_driven": 0.5,
    "mileage": 0.5,
    "engine": 0.8,
    "max_power": 0.7,
    "torque": 0.9,
    "seats": 0.8,
    "fuel": diesel_code,
    "transmission": manual_code,
    "owner": first_owner_code,
}

# DIESEL COMMUTER (segment_id = 1)
diesel_commuter_user = {
    "segment_id": 1,
    "year": 0.7,
    "selling_price": 0.5,
    "km_driven": 0.3,
    "mileage": 0.85,
    "engine": 0.6,
    "max_power": 0.5,
    "torque": 0.7,
    "seats": 0.6,
    "fuel": diesel_code,
    "transmission": manual_code,
    "owner": first_owner_code,
}

# PRINT PREPORUKA
print_recs("SPORT USER", sport_user, top_n=10)
print_recs("LUXURY USER", luxury_user, top_n=10)
print_recs("FAMILY USER", family_user, top_n=10)
print_recs("BUDGET USER", budget_user, top_n=10)
print_recs("OFFROAD USER", offroad_user, top_n=10)
print_recs("DIESEL COMMUTER USER", diesel_commuter_user, top_n=10)


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Broj automobila nakon čišćenja: 7906
fuel: ['CNG', 'Diesel', 'LPG', 'Petrol']
seller_type: ['Dealer', 'Individual', 'Trustmark Dealer']
transmission: ['Automatic', 'Manual']
owner: ['First Owner', 'Fourth & Above Owner', 'Second Owner', 'Test Drive Car', 'Third Owner']
Broj sintetičkih korisnika: 480
u_num: (14400, 8)
i_num: (14400, 11)
y: (14400,)


Epoch 1/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - loss: 0.1771
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 0.0144
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 0.0021
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - loss: 0.0033
Epoch 5/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 7.7116e-04
Epoch 6/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 0.0016
Epoch 7/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 1.7872e-04
Epoch 8/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - loss: 1.4233e-04
Epoch 9/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 0.0012
Epoch 10/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1