In [135]:
import pandas as pd
import numpy as np

In [136]:
df = pd.read_csv('./car-details-v3.csv')
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [137]:
# clear data (biranje kljucnih kolona i ciscenje podataka)

df = df[['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type', 'transmission']]
df.dropna(inplace=True)

# TODO: ovo je minimalno ciscenje, u realnom sistemu mora postojati vise obrade

In [138]:
# Normalizacija numerickih atributa

# Skaliranje u opseg [0,1] omogućava da neuronska mreža lakše konvergira.
# Kategorije ostaju neskalirane — što je u redu na ovom nivou.

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[['year','selling_price','km_driven']] = scaler.fit_transform(df[['year','selling_price','km_driven']])

In [139]:
# Pretvaranje kategorija u kategorijske ID-jeve
df['fuel'] = df['fuel'].astype('category')
df['seller_type'] = df['seller_type'].astype('category')
df['transmission'] = df['transmission'].astype('category')

fuel_cat = df['fuel'].cat.categories
seller_cat = df['seller_type'].cat.categories
trans_cat = df['transmission'].cat.categories

df['fuel'] = df['fuel'].cat.codes
df['seller_type'] = df['seller_type'].cat.codes
df['transmission'] = df['transmission'].cat.codes

# Broj kategorija (za embedding)
num_fuel = len(fuel_cat)
num_seller = len(seller_cat)
num_trans = len(trans_cat)

print(num_fuel, num_seller, num_trans)

4 3 2


In [140]:
# Segment 1: Budget Buyer
# Želi jeftino vozilo, često benzin, umerena starost, manuelni menjač.

# Segment 2: Diesel Commuter
# Želi ekonomičan dizel, umereno novi auto, srednja cena, mala potrošnja.

# Segment 3: Family Buyer
# Novije vozilo, srednje–viša cena, mala pređena kilometraža, često automatik.

# Segment 4: Enthusiast / Sports Segment
# Benzin, jači modeli, viša cena, manja km (koliko dataset dozvoljava).

# Segment 5: Off-road / Utility Buyer
# Preferira Jeep / Gypsy stil (dizel, terensko, robustno).

# Segment 6: Premium Urban Buyer
# Novije vozilo, automatik, benzinsko, niska km, dealer.

def generate_segment_users(n_per_segment=50):
    users = []
    
    for _ in range(n_per_segment):
        # Segment 1: Budget Buyer
        users.append({
            "segment": 1,
            "year": np.random.uniform(0.3, 0.55),
            "selling_price": np.random.uniform(0.1, 0.35),
            "km_driven": np.random.uniform(0.1, 0.4),
            "fuel": 0,  # petrol
            "seller_type": 0,
            "transmission": np.random.choice([0,1])
        })

        # Segment 2: Diesel Commuter
        users.append({
            "segment": 2,
            "year": np.random.uniform(0.5, 0.8),
            "selling_price": np.random.uniform(0.3, 0.55),
            "km_driven": np.random.uniform(0.05, 0.25),
            "fuel": 1,  # diesel
            "seller_type": 1,
            "transmission": 0
        })

        # Segment 3: Family Buyer
        users.append({
            "segment": 3,
            "year": np.random.uniform(0.6, 0.9),
            "selling_price": np.random.uniform(0.4, 0.7),
            "km_driven": np.random.uniform(0.05, 0.2),
            "fuel": np.random.choice([0,1]),
            "seller_type": 1,
            "transmission": 1
        })

        # Segment 4: Sport Enthusiast
        users.append({
            "segment": 4,
            "year": np.random.uniform(0.4, 0.7),
            "selling_price": np.random.uniform(0.5, 0.9),
            "km_driven": np.random.uniform(0.05, 0.2),
            "fuel": 0,  # petrol
            "seller_type": np.random.choice([0,1]),
            "transmission": 0
        })

        # Segment 5: Off-road Utility
        users.append({
            "segment": 5,
            "year": np.random.uniform(0.2, 0.5),
            "selling_price": np.random.uniform(0.3, 0.6),
            "km_driven": np.random.uniform(0.1, 0.4),
            "fuel": np.random.choice([1,2]),  # diesel or CNG jeeps
            "seller_type": np.random.choice([0,1]),
            "transmission": 0
        })

        # Segment 6: Premium Urban Buyer
        users.append({
            "segment": 6,
            "year": np.random.uniform(0.75, 1.0),
            "selling_price": np.random.uniform(0.6, 0.9),
            "km_driven": np.random.uniform(0.0, 0.15),
            "fuel": 0,  # petrol
            "seller_type": 1,
            "transmission": 1
        })

    return pd.DataFrame(users)


user_df = generate_segment_users(50)
user_df.head()

Unnamed: 0,segment,year,selling_price,km_driven,fuel,seller_type,transmission
0,1,0.5049,0.29707,0.174595,0,0,1
1,2,0.692932,0.406554,0.205686,1,1,0
2,3,0.886203,0.550306,0.083896,1,1,1
3,4,0.518212,0.831726,0.156493,0,1,0
4,5,0.296067,0.498053,0.111874,2,0,0


In [141]:
def user_item_score(user, car):
    score = 0
    
    score += 1 - abs(user["year"] - car["year"])
    score += 1 - abs(user["selling_price"] - car["selling_price"])
    score += 1 - abs(user["km_driven"] - car["km_driven"])
    score += 1 if user["fuel"] == car["fuel"] else 0
    score += 1 if user["transmission"] == car["transmission"] else 0
    score += 1 if user["seller_type"] == car["seller_type"] else 0
    
    return score

In [142]:
def generate_training_pairs(users, cars, n_pos=10, n_neg=10):
    X_user = []
    X_item = []
    y = []

    for _, user in users.iterrows():
        # Izračunavanje skorova za sve automobile
        cars["score"] = cars.apply(lambda row: user_item_score(user, row), axis=1)
        
        # Pozitivni primeri (najboljih n_pos automobila)
        pos_items = cars.sort_values("score", ascending=False).head(n_pos)
        
        for _, item in pos_items.iterrows():
            X_user.append(user.values[:-1])  # sve osim segment
            X_item.append(item.values[:-1])  # sve osim score
            y.append(1)

        # Negativni primeri (najgorih n_neg automobila)
        neg_items = cars.sort_values("score").head(n_neg)
        
        for _, item in neg_items.iterrows():
            X_user.append(user.values[:-1])
            X_item.append(item.values[:-1])
            y.append(0)

    return (
        np.array(X_user, dtype="float32"),
        np.array(X_item, dtype="float32"),
        np.array(y, dtype="float32")
    )


In [143]:
# Pretvaranje u numpy matrice:
users = generate_segment_users(50)
cars = df_items.copy()  # df_items = df[FEATURES]

X_user, X_item, y = generate_training_pairs(users, cars)

In [144]:
# Two-Tower model sa embedding slojevima
import tensorflow as tf
from tensorflow.keras import layers, Model

embedding_dim = 32

# -------------------
# USER tower
# -------------------
user_numeric = layers.Input(shape=(3,), name="user_num")
user_fuel = layers.Input(shape=(), dtype="int32", name="user_fuel")
user_seller = layers.Input(shape=(), dtype="int32", name="user_seller")
user_trans = layers.Input(shape=(), dtype="int32", name="user_trans")

emb_fuel = layers.Embedding(num_fuel, 4)(user_fuel)
emb_seller = layers.Embedding(num_seller, 4)(user_seller)
emb_trans = layers.Embedding(num_trans, 4)(user_trans)

x = layers.Concatenate()([user_numeric, 
                          layers.Flatten()(emb_fuel),
                          layers.Flatten()(emb_seller),
                          layers.Flatten()(emb_trans)])

x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(embedding_dim)(x)

user_tower = Model([user_numeric, user_fuel, user_seller, user_trans], x)

# -------------------
# ITEM tower
# -------------------
item_numeric = layers.Input(shape=(3,), name="item_num")
item_fuel = layers.Input(shape=(), dtype="int32", name="item_fuel")
item_seller = layers.Input(shape=(), dtype="int32", name="item_seller")
item_trans = layers.Input(shape=(), dtype="int32", name="item_trans")

ei_fuel = layers.Embedding(num_fuel, 4)(item_fuel)
ei_seller = layers.Embedding(num_seller, 4)(item_seller)
ei_trans = layers.Embedding(num_trans, 4)(item_trans)

i = layers.Concatenate()([item_numeric,
                          layers.Flatten()(ei_fuel),
                          layers.Flatten()(ei_seller),
                          layers.Flatten()(ei_trans)])

i = layers.Dense(64, activation='relu')(i)
i = layers.Dropout(0.2)(i)
i = layers.Dense(embedding_dim)(i)

item_tower = Model([item_numeric, item_fuel, item_seller, item_trans], i)


In [145]:
# Dot-product model
u_vec = user_tower([user_numeric, user_fuel, user_seller, user_trans])
i_vec = item_tower([item_numeric, item_fuel, item_seller, item_trans])

score = layers.Dot(axes=1)([u_vec, i_vec])

model = Model(
    inputs=[user_numeric, user_fuel, user_seller, user_trans,
            item_numeric, item_fuel, item_seller, item_trans],
    outputs=score
)

model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

In [146]:
# Priprema inputa (razdvajanje numeric + kategorijskog dela)
def split_features(data):
    num = data[:, :3]
    fuel = data[:, 3].astype("int32")
    seller = data[:, 4].astype("int32")
    trans = data[:, 5].astype("int32")
    return num, fuel, seller, trans

u_num, u_f, u_s, u_t = split_features(X_user)
i_num, i_f, i_s, i_t = split_features(X_item)


In [147]:
# Trening
model.fit(
    [u_num, u_f, u_s, u_t,
     i_num, i_f, i_s, i_t],
    y,
    epochs=10,
    batch_size=64,
    verbose=1
)

Epoch 1/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - loss: 2.5368
Epoch 2/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - loss: 1.0236
Epoch 3/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 0.8098
Epoch 4/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 1.9346
Epoch 5/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 1.5810
Epoch 6/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - loss: 1.3526
Epoch 7/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - loss: 1.3186
Epoch 8/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 1.2099
Epoch 9/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 1.1929
Epoch 10/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - loss: 3.2007

<keras.src.callbacks.history.History at 0x35dd09b50>

In [130]:
# Generisanje preporuka (Top-K)
# 1. Uzimamo samo numeričke i kategorijske feature-e
FEATURES = ['year','selling_price','km_driven','fuel','seller_type','transmission']
df_items = df[FEATURES].copy()

# 2. Konverzija u float32 (numerički deo), int32 (kategorijski deo)
item_array = df_items.values.astype("float32")

# 3. Split kao i kod treninga
item_num, item_f, item_s, item_t = split_features(item_array)

# 4. Generisanje embeddinga za sve automobile
item_embeddings = item_tower.predict([item_num, item_f, item_s, item_t], verbose=0)

# 5. Korisnički embedding
user_vec_test = user_tower.predict(
    [u_num[:1], u_f[:1], u_s[:1], u_t[:1]],
    verbose=0
)

# 6. Kosinusna sličnost i Top-K preporuke
from sklearn.metrics.pairwise import cosine_similarity

scores = cosine_similarity(user_vec_test, item_embeddings)[0]
top_idx = np.argsort(scores)[::-1][:10]

df.iloc[top_idx][['name','year','selling_price','km_driven']]

Unnamed: 0,name,year,selling_price,km_driven
4900,Ford Endeavour 2.5L 4X2 MT,0.567568,0.037111,0.039575
1216,Mahindra Jeep Classic,0.351351,0.01334,0.002118
3922,Mahindra Jeep MM 540,0.405405,0.018054,0.02033
5451,Mahindra Marshal DI,0.459459,0.012036,0.021182
4496,Maruti Esteem DI,0.540541,0.004012,0.008473
5583,Mahindra Bolero LX,0.594595,0.032096,0.006354
4720,Maruti Zen D,0.513514,0.009027,0.025418
3310,Tata Estate Std,0.432432,0.004815,0.033891
2867,Mahindra Jeep MM 550 XDB,0.594595,0.037111,0.012709
7846,Toyota Qualis Fleet A3,0.459459,0.017051,0.042364


In [134]:
def recommend_for_user(user_dict, n=10):
    arr = np.array([
        user_dict["year"],
        user_dict["selling_price"],
        user_dict["km_driven"],
        user_dict["fuel"],
        user_dict["seller_type"],
        user_dict["transmission"]
    ], dtype="float32").reshape(1, -1)

    u_num, u_f, u_s, u_t = split_features(arr)
    u_emb = user_tower.predict([u_num, u_f, u_s, u_t], verbose=0)

    scores = cosine_similarity(u_emb, item_embeddings)[0]
    top_idx = np.argsort(scores)[::-1][:n]

    return df.iloc[top_idx][['name','year','selling_price','km_driven']]
