In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

In [2]:
# Folder holding the CSV exports, relative to the notebook's working directory.
adatok_mappa = Path("../adatok")

# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns and the
# DtypeWarning that chunked inference emits for them.
# NOTE(review): the source line echoed in this cell's output looks like such a
# warning -- confirm on a fresh run.
hirdetesek = pd.read_csv(
    adatok_mappa / "advertisements_202006112147.csv",
    low_memory=False,
)

# Preview of the first rows (cell output).
hirdetesek.head()


  hirdetesek = pd.read_csv(adatok_mappa / "advertisements_202006112147.csv")


Unnamed: 0,ad_id,region_id,ad_price,numpictures,proseller,adoldness,postalcode,production,mileage,clime_id,...,ccm,highlighted,upload_date,description,advertisement_url,catalog_url,sales_date,is_sold,download_date,sales_update_date
0,1545570,5,530000,5,False,4584,5600,1997-12-01,244000,3,...,1896,False,2007-10-07,,,no catalog,,False,2020-04-25,2020-04-25
1,4066033,13,1290000,6,False,3509,2671,2005-12-01,148000,4,...,1364,False,2010-08-11,,,http://katalogus.hasznaltauto.hu/opel/astra_1....,,False,2020-03-20,2020-03-20
2,4109007,4,580000,6,False,3526,6000,1994-11-01,181900,2,...,2498,False,2010-08-30,,,http://katalogus.hasznaltauto.hu/saab/900_2.5_...,,False,2020-04-25,2020-04-25
3,4246385,9,1450000,6,False,3435,4033,2005-09-01,185000,2,...,1390,False,2010-10-25,,,http://katalogus.hasznaltauto.hu/seat/leon_1.4...,,False,2020-03-21,2020-03-21
4,5440448,15,9990000,6,False,3022,8600,1900-01-01,98500,5,...,5439,False,2012-01-17,,,http://katalogus.hasznaltauto.hu/mercedes-benz...,,False,2020-04-26,2020-04-26


In [3]:
# 33rd and 66th percentiles of the advertised price, taken in a single call.
# NOTE(review): these are computed over every row of `hirdetesek`, i.e. before
# the `ad_price > 0` filter applied further down -- confirm that is intended.
q33, q66 = hirdetesek["ad_price"].quantile([0.33, 0.66])


In [4]:
def ar_kategoria(ar, also_hatar=None, felso_hatar=None):
    """Map a price to one of the labels "olcso", "kozepes" or "draga".

    Parameters
    ----------
    ar : number
        The price to classify.
    also_hatar : number, optional
        Inclusive upper bound of the "olcso" class. When omitted, the
        notebook-level ``q33`` is used.
    felso_hatar : number, optional
        Inclusive upper bound of the "kozepes" class. When omitted, the
        notebook-level ``q66`` is used.

    Returns
    -------
    str
        "olcso" if ``ar <= also_hatar``, "kozepes" if ``ar <= felso_hatar``,
        otherwise "draga".
    """
    # Explicit thresholds make the function usable (and testable) without the
    # notebook globals; the globals remain the default for existing callers.
    if also_hatar is None:
        also_hatar = q33
    if felso_hatar is None:
        felso_hatar = q66

    if ar <= also_hatar:
        return "olcso"
    if ar <= felso_hatar:
        return "kozepes"
    # Everything that fails both comparisons ends up here; that includes NaN,
    # because comparisons with NaN evaluate to False.
    return "draga"

# Label every advertisement with its price category, element by element.
hirdetesek["ar_kategoria"] = hirdetesek["ad_price"].map(ar_kategoria)


In [5]:
# Load the four lookup tables in one pass; the order of the file names matches
# the order of the target variables on the left-hand side.
brand, model, car_type, drive = (
    pd.read_csv(adatok_mappa / fajlnev)
    for fajlnev in (
        "brand_202006112147.csv",
        "model_202006112147.csv",
        "car_type_202006112147.csv",
        "drive_202006112147.csv",
    )
)


In [6]:
# Attach the readable brand and model names to the advertisements.
# Both joins are left joins, so advertisements without a matching brand_id /
# model_id are kept (their name column is then missing).
adat = pd.merge(
    pd.merge(
        hirdetesek,
        brand[["brand_id", "brand_name"]],
        how="left",
        on="brand_id",
    ),
    model[["model_id", "model_name"]],
    how="left",
    on="model_id",
)


In [7]:
# Keep only advertisements with a strictly positive price; .copy() detaches
# the result from the unfiltered frame.
adat = adat.loc[adat["ad_price"].gt(0)].copy()

adat.shape


(38009, 31)

In [8]:
# Narrow the frame to the model features plus the target ("ar_kategoria"),
# working on an independent copy.
hasznalt_adat = adat.loc[
    :,
    [
        "production", "mileage", "ccm", "doorsnumber", "person_capacity",
        "brand_name", "model_name", "shifter", "color",
        "ar_kategoria",
    ],
].copy()

hasznalt_adat.head()


Unnamed: 0,production,mileage,ccm,doorsnumber,person_capacity,brand_name,model_name,shifter,color,ar_kategoria
0,1997-12-01,244000,1896,5,5,VOLKSWAGEN,GOLF III,M0,10,olcso
1,2005-12-01,148000,1364,5,5,OPEL,ASTRA H,M5,22,olcso
2,1994-11-01,181900,2498,5,5,SAAB,900,M5,62,olcso
3,2005-09-01,185000,1390,5,5,SEAT,LEON,M5,132,kozepes
4,1900-01-01,98500,5439,2,2,MERCEDES-BENZ,SL 55 AMG,T5,40,draga


In [9]:
# Derive the production year from the "production" date and drop the raw
# column. errors="coerce" turns unparseable dates into NaT, i.e. a missing year.
#
# The step is guarded and written without inplace=True so the cell can be
# re-run on a live kernel: once "production" has been removed, the original
# form raised a KeyError on the second execution.
if "production" in hasznalt_adat.columns:
    hasznalt_adat = hasznalt_adat.assign(
        gyartasi_ev=pd.to_datetime(
            hasznalt_adat["production"], errors="coerce"
        ).dt.year
    ).drop(columns=["production"])


In [10]:
numerikus = ["gyartasi_ev", "mileage", "ccm", "doorsnumber", "person_capacity"]
kategorikus = ["brand_name", "model_name", "shifter", "color"]

# One fill value per column: the column median for numeric features and the
# "ismeretlen" placeholder for categorical ones.
potlasok = hasznalt_adat[numerikus].median().to_dict()
potlasok.update(dict.fromkeys(kategorikus, "ismeretlen"))

hasznalt_adat = hasznalt_adat.fillna(value=potlasok)


In [11]:
# Numeric feature columns (same list as in the imputation cell).
numerikus = ["gyartasi_ev", "mileage", "ccm", "doorsnumber", "person_capacity"]


In [12]:
def iqr_szures(df, oszlopok, k=1.5):
    """Drop the rows that fall outside the IQR fences of the given columns.

    The columns are handled one after another, so the quartiles of each column
    are computed on the rows that survived the filters of the previous ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    oszlopok : iterable of str
        Columns to filter on, in processing order.
    k : float, default 1.5
        Fence multiplier: rows are kept when the value lies within
        ``[Q1 - k * IQR, Q3 + k * IQR]`` (both ends inclusive).

    Returns
    -------
    pandas.DataFrame
        The filtered copy, with the original index labels.
    """
    eredmeny = df.copy()
    for oszlop in oszlopok:
        also_kvartilis, felso_kvartilis = eredmeny[oszlop].quantile([0.25, 0.75])
        tures = k * (felso_kvartilis - also_kvartilis)
        # between() is inclusive on both ends and False for missing values.
        bent_van = eredmeny[oszlop].between(
            also_kvartilis - tures, felso_kvartilis + tures
        )
        eredmeny = eredmeny[bent_van]
    return eredmeny


In [13]:
# Apply the IQR filter to the numeric features, then show the shape before and
# after the filter.
adat_iqr = iqr_szures(df=hasznalt_adat, oszlopok=numerikus)

(hasznalt_adat.shape, adat_iqr.shape)


((38009, 10), (23126, 10))

In [14]:
# Extra trimming on top of the IQR filter: drop the top 1 % of mileage and of
# engine displacement, and every car with a production year before 1970.
adat_szurt = adat_iqr.loc[
    adat_iqr["mileage"].le(adat_iqr["mileage"].quantile(0.99))
    & adat_iqr["ccm"].le(adat_iqr["ccm"].quantile(0.99))
    & adat_iqr["gyartasi_ev"].ge(1970)
].copy()

(adat_iqr.shape, adat_szurt.shape)


((23126, 10), (22681, 10))

In [15]:
# Target = price category; features = every remaining column.
y = adat_szurt["ar_kategoria"]
X = adat_szurt.drop(columns="ar_kategoria")


In [16]:
# Same trimming as in the preceding filter cell (this cell repeats it): drop the
# top 1 % of mileage and of engine displacement, and production years before 1970.
adat_szurt = adat_iqr.loc[
    adat_iqr["mileage"].le(adat_iqr["mileage"].quantile(0.99))
    & adat_iqr["ccm"].le(adat_iqr["ccm"].quantile(0.99))
    & adat_iqr["gyartasi_ev"].ge(1970)
].copy()

(adat_iqr.shape, adat_szurt.shape)


((23126, 10), (22681, 10))

In [17]:
# Target = price category; features = every remaining column.
y = adat_szurt["ar_kategoria"]
X = adat_szurt.drop(columns="ar_kategoria")

(X.shape, y.shape)


((22681, 9), (22681,))

In [18]:
# Stratified 70 / 15 / 15 split: first hold out 30 % of the rows, then cut that
# hold-out in half to obtain the validation and the test set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

(X_train.shape, X_val.shape, X_test.shape)


((15876, 9), (3402, 9), (3403, 9))

In [19]:
numerikus = ["gyartasi_ev", "mileage", "ccm", "doorsnumber", "person_capacity"]
kategorikus = ["brand_name", "model_name", "shifter", "color"]

# Seed Python, NumPy and TensorFlow before the first layer is created. Without
# it the randomness used while building and training the model differs on every
# run, so the reported metrics cannot be reproduced. 42 matches the random_state
# used for the train/validation/test split.
tf.keras.utils.set_random_seed(42)

# One symbolic Keras input per feature, keyed (and named) by the column name:
# scalar numeric inputs first, then scalar string inputs for the categoricals.
inputs = {}

for col in numerikus:
    inputs[col] = tf.keras.Input(shape=(1,), name=col)

for col in kategorikus:
    inputs[col] = tf.keras.Input(shape=(1,), name=col, dtype=tf.string)


In [20]:
kategorikus = ["brand_name", "model_name", "shifter", "color"]

# Cast every categorical column to str in all three splits; the frames are
# rebound to the converted copies.
szoveg_tipusok = dict.fromkeys(kategorikus, str)

X_train = X_train.astype(szoveg_tipusok)
X_val = X_val.astype(szoveg_tipusok)
X_test = X_test.astype(szoveg_tipusok)


In [21]:
# Join the scalar numeric inputs into one tensor, in `numerikus` order.
num_stack = tf.keras.layers.Concatenate()([inputs[nev] for nev in numerikus])

# The normalisation statistics are learned from the training split only.
normalizer = tf.keras.layers.Normalization()
normalizer.adapt(X_train[numerikus].to_numpy())

num_norm = normalizer(num_stack)


In [22]:
def _beagyazott_oszlop(oszlop):
    """Build the lookup -> embedding -> flatten branch of one categorical input.

    The vocabulary is learned from the training split only. The embedding width
    is half the vocabulary size plus one, capped at 16.
    """
    kereso = tf.keras.layers.StringLookup(mask_token=None, output_mode="int")
    kereso.adapt(X_train[oszlop].values)

    szotar_meret = kereso.vocabulary_size()
    beagyazas_dim = min(16, szotar_meret // 2 + 1)

    indexek = kereso(inputs[oszlop])
    beagyazott = tf.keras.layers.Embedding(szotar_meret, beagyazas_dim)(indexek)
    # Collapse the length-1 axis so the branch can be concatenated with the rest.
    return tf.keras.layers.Reshape((beagyazas_dim,))(beagyazott)


# Normalised numeric block first, then one embedded branch per categorical column.
embed_layers = [num_norm] + [_beagyazott_oszlop(oszlop) for oszlop in kategorikus]

x = tf.keras.layers.Concatenate()(embed_layers)


In [23]:
# First hidden block: wide dense layer followed by batch norm and dropout.
x = tf.keras.layers.Dense(256, activation="relu")(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.3)(x)

# Funnel of plain ReLU layers with shrinking width.
for egysegek in (128, 64, 32):
    x = tf.keras.layers.Dense(egysegek, activation="relu")(x)


In [24]:
# Encode the string labels as integer class ids.
# num_oov_indices=0: the labels form a closed set, so no "[UNK]" slot is
# reserved. With the default (one OOV index) the vocabulary gains an extra
# entry, and the softmax below would get one more unit than there are classes,
# for a class id that never appears in the data.
label_lookup = tf.keras.layers.StringLookup(output_mode="int", num_oov_indices=0)
label_lookup.adapt(y_train.astype(str).values)

y_train_i = label_lookup(y_train.astype(str).values)
y_val_i   = label_lookup(y_val.astype(str).values)
y_test_i  = label_lookup(y_test.astype(str).values)

# One softmax unit per entry of the label vocabulary.
outputs = tf.keras.layers.Dense(
    label_lookup.vocabulary_size(),
    activation="softmax"
)(x)


In [25]:
# NOTE: from here on `model` is the Keras model; the name previously held the
# DataFrame read from model_202006112147.csv.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Integer class ids as targets -> sparse categorical cross-entropy.
adam = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=adam,
    metrics=["accuracy"],
)

model.summary()


In [26]:
def df_to_ds(Xdf, yarr, shuffle=False, batch_size=256):
    """Turn a feature frame and its labels into a batched ``tf.data.Dataset``.

    Parameters
    ----------
    Xdf : pandas.DataFrame
        Feature columns; each column becomes one entry of the feature dict,
        keyed by the column name.
    yarr : array-like
        Labels, aligned row by row with ``Xdf``.
    shuffle : bool, default False
        Shuffle with a buffer as large as the frame before batching.
    batch_size : int, default 256
        Number of rows per batch.

    Returns
    -------
    tf.data.Dataset
        Batches of ``(feature_dict, labels)`` with prefetching enabled.
    """
    X_dict = {}
    for col in Xdf.columns:
        oszlop = Xdf[col]
        # Dispatch on "is it numeric?" rather than "is it object?": category
        # and pandas string dtype columns are not `object`, and used to be sent
        # into the float32 cast, which raises on text values.
        if pd.api.types.is_numeric_dtype(oszlop.dtype):
            X_dict[col] = oszlop.astype("float32").to_numpy()
        else:
            # Object array of Python strings, whatever the column dtype was.
            X_dict[col] = oszlop.astype(str).to_numpy(dtype=object)

    ds = tf.data.Dataset.from_tensor_slices((X_dict, yarr))
    if shuffle:
        ds = ds.shuffle(len(Xdf))
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


In [27]:
# Only the training set is shuffled; validation and test keep their row order.
train_ds = df_to_ds(Xdf=X_train, yarr=y_train_i, shuffle=True)
val_ds = df_to_ds(Xdf=X_val, yarr=y_val_i, shuffle=False)
test_ds = df_to_ds(Xdf=X_test, yarr=y_test_i, shuffle=False)


In [28]:
# Stop once the validation loss has not improved for 5 epochs and roll back to
# the best weights seen so far.
early_stop = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True, patience=5, monitor="val_loss"
)

# Train for at most 50 epochs.
history = model.fit(
    train_ds, epochs=50, validation_data=val_ds, callbacks=[early_stop]
)


Epoch 1/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8044 - loss: 0.4937 - val_accuracy: 0.8225 - val_loss: 0.8761
Epoch 2/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8712 - loss: 0.3154 - val_accuracy: 0.8654 - val_loss: 0.7179
Epoch 3/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8847 - loss: 0.2823 - val_accuracy: 0.8871 - val_loss: 0.5723
Epoch 4/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8901 - loss: 0.2762 - val_accuracy: 0.8845 - val_loss: 0.4566
Epoch 5/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8941 - loss: 0.2623 - val_accuracy: 0.8880 - val_loss: 0.3908
Epoch 6/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8949 - loss: 0.2623 - val_accuracy: 0.8792 - val_loss: 0.3272
Epoch 7/50
[1m63/63[0m [32m━━━━━━━━━━

In [29]:
# Loss and accuracy on the held-out test set.
kiertekeles = model.evaluate(test_ds)
test_loss, test_acc = kiertekeles
(test_loss, test_acc)


[1m 1/14[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 15ms/step - accuracy: 0.8906 - loss: 0.2922

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8927 - loss: 0.2676 


(0.26761266589164734, 0.8927416801452637)

In [30]:
# Class probabilities for the test set; the predicted class is the index of the
# largest probability in each row.
y_pred_prob = model.predict(test_ds)
y_pred = y_pred_prob.argmax(axis=1)


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


In [31]:
# Translate the integer class ids back to the label strings through the
# lookup layer's vocabulary (position in the list = class id).
cimkek = label_lookup.get_vocabulary()
y_pred_labels = list(map(cimkek.__getitem__, y_pred))
y_true_labels = list(map(cimkek.__getitem__, y_test_i))


In [32]:
# Labels that actually occur in the test set, in alphabetical order; they fix
# the row/column order of the confusion matrix.
valos_cimkek = sorted(set(y_true_labels))

cm = confusion_matrix(y_true_labels, y_pred_labels, labels=valos_cimkek)

# Rows = true category, columns = predicted category.
cm_df = pd.DataFrame(data=cm, index=valos_cimkek, columns=valos_cimkek)

cm_df


Unnamed: 0,draga,kozepes,olcso
draga,833,104,1
kozepes,55,1150,51
olcso,3,151,1055


In [33]:
# Per-class precision / recall / F1; the report is a preformatted string, so it
# is printed rather than displayed.
jelentes = classification_report(y_true_labels, y_pred_labels)
print(jelentes)


              precision    recall  f1-score   support

       draga       0.93      0.89      0.91       938
     kozepes       0.82      0.92      0.86      1256
       olcso       0.95      0.87      0.91      1209

    accuracy                           0.89      3403
   macro avg       0.90      0.89      0.90      3403
weighted avg       0.90      0.89      0.89      3403



In [34]:
# Training vs. validation accuracy per epoch, written to disk instead of shown.
fig, ax = plt.subplots()
ax.plot(history.history["accuracy"], label="train")
ax.plot(history.history["val_accuracy"], label="validation")
ax.set_xlabel("Epoch")
ax.set_ylabel("Pontosság")
ax.set_title("Tanítási és validációs pontosság")
ax.legend()
fig.tight_layout()

fig.savefig("../kepek/pontossag.png", dpi=300)
plt.close(fig)


In [35]:
# Training vs. validation loss per epoch, written to disk instead of shown.
fig, ax = plt.subplots()
ax.plot(history.history["loss"], label="train")
ax.plot(history.history["val_loss"], label="validation")
ax.set_xlabel("Epoch")
ax.set_ylabel("Veszteség")
ax.set_title("Tanítási és validációs veszteség")
ax.legend()
fig.tight_layout()

fig.savefig("../kepek/veszteseg.png", dpi=300)
plt.close(fig)


In [36]:
# Annotated heatmap of the confusion matrix, written to disk instead of shown.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues", ax=ax)

ax.set_xlabel("Előrejelzett kategória")
ax.set_ylabel("Valós kategória")
ax.set_title("Konfúziós mátrix")

fig.tight_layout()
fig.savefig("../kepek/konfuzios_matrix.png", dpi=300)
plt.close(fig)


In [37]:
# Persist the trained model in the native Keras format.
modell_utvonal = "../modell/ar_kategoria_model.keras"
model.save(modell_utvonal)
