In [63]:
import pandas as pd
import numpy as np

In [64]:
df = pd.read_csv('./car-details-v3.csv')
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [65]:
# clear data (biranje kljucnih kolona i ciscenje podataka)

df = df[['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type', 'transmission']]
df.dropna(inplace=True)

# TODO: ovo je minimalno ciscenje, u realnom sistemu mora postojati vise obrade

In [66]:
# Normalizacija numerickih atributa

# Skaliranje u opseg [0,1] omogućava da neuronska mreža lakše konvergira.
# Kategorije ostaju neskalirane — što je u redu na ovom nivou.

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[['year','selling_price','km_driven']] = scaler.fit_transform(df[['year','selling_price','km_driven']])

In [67]:
# Pretvaranje kategorija u kategorijske ID-jeve
df['fuel'] = df['fuel'].astype('category')
df['seller_type'] = df['seller_type'].astype('category')
df['transmission'] = df['transmission'].astype('category')

fuel_cat = df['fuel'].cat.categories
seller_cat = df['seller_type'].cat.categories
trans_cat = df['transmission'].cat.categories

df['fuel'] = df['fuel'].cat.codes
df['seller_type'] = df['seller_type'].cat.codes
df['transmission'] = df['transmission'].cat.codes

# Broj kategorija (za embedding)
num_fuel = len(fuel_cat)
num_seller = len(seller_cat)
num_trans = len(trans_cat)

print(num_fuel, num_seller, num_trans)

4 3 2


In [68]:
def generate_users(n=50):
    users = []

    for _ in range(n):
        user = {
            "year": np.random.uniform(0.3,1.0),      # preferencija za novost
            "selling_price": np.random.uniform(0.2,0.8),
            "km_driven": np.random.uniform(0.0,0.5),
            "fuel": np.random.randint(0, num_fuel),
            "seller_type": np.random.randint(0, num_seller),
            "transmission": np.random.randint(0, num_trans),
        }
        users.append(user)

    return pd.DataFrame(users)

user_df = generate_users(50)
user_df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission
0,0.391421,0.522868,0.295399,3,1,0
1,0.535552,0.472449,0.224532,2,0,0
2,0.983565,0.735903,0.004141,0,2,1
3,0.444195,0.503473,0.054843,1,0,1
4,0.674607,0.473835,0.164593,2,2,0


In [83]:
# Kreiranje pozitivnih i negativnih parova
# Pozitivni par = ako item približno odgovara user preferencama.
def match_score(user, item):
    score = 0
    score += 1 - abs(user["year"] - item["year"])
    score += 1 - abs(user["selling_price"] - item["selling_price"])
    score += 1 - abs(user["km_driven"] - item["km_driven"])
    score += 1 if user["fuel"] == item["fuel"] else 0
    score += 1 if user["transmission"] == item["transmission"] else 0
    return score

# kreiramo training parove
FEATURES = ['year','selling_price','km_driven','fuel','seller_type','transmission']
df_items = df[FEATURES].copy()

pairs = []
labels = []

for _, user in user_df.iterrows():

    pos_items = df_items.sample(20)
    neg_items = df_items.sample(20)

    for _, item in pos_items.iterrows():
        pairs.append((user.values, item.values))
        labels.append(1)

    for _, item in neg_items.iterrows():
        pairs.append((user.values, item.values))
        labels.append(0)

pairs = np.array(pairs, dtype=object)
labels = np.array(labels)

In [84]:
# Pretvaranje u numpy matrice:
X_user = np.vstack([x[0] for x in pairs]).astype('float32')
X_item = np.vstack([x[1] for x in pairs]).astype('float32')
y = labels.astype('float32')

In [85]:
# Two-Tower model sa embedding slojevima
import tensorflow as tf
from tensorflow.keras import layers, Model

embedding_dim = 32

# -------------------
# USER tower
# -------------------
user_numeric = layers.Input(shape=(3,), name="user_num")
user_fuel = layers.Input(shape=(), dtype="int32", name="user_fuel")
user_seller = layers.Input(shape=(), dtype="int32", name="user_seller")
user_trans = layers.Input(shape=(), dtype="int32", name="user_trans")

emb_fuel = layers.Embedding(num_fuel, 4)(user_fuel)
emb_seller = layers.Embedding(num_seller, 4)(user_seller)
emb_trans = layers.Embedding(num_trans, 4)(user_trans)

x = layers.Concatenate()([user_numeric, 
                          layers.Flatten()(emb_fuel),
                          layers.Flatten()(emb_seller),
                          layers.Flatten()(emb_trans)])

x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(embedding_dim)(x)

user_tower = Model([user_numeric, user_fuel, user_seller, user_trans], x)

# -------------------
# ITEM tower
# -------------------
item_numeric = layers.Input(shape=(3,), name="item_num")
item_fuel = layers.Input(shape=(), dtype="int32", name="item_fuel")
item_seller = layers.Input(shape=(), dtype="int32", name="item_seller")
item_trans = layers.Input(shape=(), dtype="int32", name="item_trans")

ei_fuel = layers.Embedding(num_fuel, 4)(item_fuel)
ei_seller = layers.Embedding(num_seller, 4)(item_seller)
ei_trans = layers.Embedding(num_trans, 4)(item_trans)

i = layers.Concatenate()([item_numeric,
                          layers.Flatten()(ei_fuel),
                          layers.Flatten()(ei_seller),
                          layers.Flatten()(ei_trans)])

i = layers.Dense(64, activation='relu')(i)
i = layers.Dropout(0.2)(i)
i = layers.Dense(embedding_dim)(i)

item_tower = Model([item_numeric, item_fuel, item_seller, item_trans], i)


In [86]:
# Dot-product model
u_vec = user_tower([user_numeric, user_fuel, user_seller, user_trans])
i_vec = item_tower([item_numeric, item_fuel, item_seller, item_trans])

score = layers.Dot(axes=1)([u_vec, i_vec])

model = Model(
    inputs=[user_numeric, user_fuel, user_seller, user_trans,
            item_numeric, item_fuel, item_seller, item_trans],
    outputs=score
)

model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

In [87]:
# Priprema inputa (razdvajanje numeric + kategorijskog dela)
def split_features(data):
    num = data[:, :3]
    fuel = data[:, 3].astype("int32")
    seller = data[:, 4].astype("int32")
    trans = data[:, 5].astype("int32")
    return num, fuel, seller, trans

u_num, u_f, u_s, u_t = split_features(X_user)
i_num, i_f, i_s, i_t = split_features(X_item)


In [88]:
# Trening
model.fit(
    [u_num, u_f, u_s, u_t,
     i_num, i_f, i_s, i_t],
    y,
    epochs=10,
    batch_size=64,
    verbose=1
)

Epoch 1/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - loss: 0.7921
Epoch 2/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.7583
Epoch 3/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.7286
Epoch 4/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.7493
Epoch 5/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 0.7203
Epoch 6/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.7184
Epoch 7/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.7205
Epoch 8/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.7160
Epoch 9/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 0.7192
Epoch 10/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 0.7087

<keras.src.callbacks.history.History at 0x32aa07bd0>

In [90]:
# Generisanje preporuka (Top-K)
item_num, item_f, item_s, item_t = split_features(df.values[:, 1:])  # skip name

item_embeddings = item_tower.predict([item_num, item_f, item_s, item_t])
user_vec_test = user_tower.predict([u_num[:1], u_f[:1], u_s[:1], u_t[:1]])

from sklearn.metrics.pairwise import cosine_similarity

scores = cosine_similarity(user_vec_test, item_embeddings)[0]
top_idx = np.argsort(scores)[::-1][:10]

df.iloc[top_idx][['name', 'year', 'selling_price', 'km_driven']]


SyntaxError: invalid syntax (658274110.py, line 1)