In [1]:
import pandas as pd
import plotly.express as px
from httpcore import NetworkError

from src.preprocessing.brand_mapping import brand_category_weights
from src.common.Config import Config
from src.common.exceptions import NoDataFile

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.set_option("display.max_columns", None)


In [None]:

# Fail fast with an actionable message when the raw dataset is not on disk.
if not Config.data_path.exists():
    raise NoDataFile(
        # The trailing space matters: adjacent string literals are concatenated verbatim,
        # so without it the message reads "...<path>.Run download_data.py ...".
        f"There is no data file in location {Config.data_path}. "
        "Run download_data.py script first!"
    )

# Load the raw CSV and preview the first rows.
df = pd.read_csv(Config.data_path)
df.head(5)

In [None]:
# Print column information
# `df.info()` prints its summary as a side effect, while `df.describe()`, being the
# last expression of the cell, is what the notebook renders as the cell output.
df.info()
df.describe()

## Display data characteristics to determine column processing


In [None]:
# Display unique car brands
# `unique()` keeps the distinct values in order of first appearance (no sorting).
unique_brands = df['Vehicle_brand'].unique()
unique_brands

In [None]:
# Count the number of cars for each unique brand
# `sort_values()` defaults to ascending order, so the rarest brands are listed first.
brand_counts = df['Vehicle_brand'].value_counts().sort_values()
brand_counts

In [None]:
# Share of each drive type within every car model.

# Helper key combining brand and model ("brand/model").
df["Vehicle"] = df["Vehicle_brand"] + "/" + df["Vehicle_model"]

# Number of rows with a known drive type per vehicle (the denominator).
total_counts = (
    df.groupby("Vehicle")["Drive"]
    .count()
    .reset_index(name="total_count")
)

# Rows per (vehicle, drive) pair, joined with the per-vehicle total and
# expressed as a percentage of that total.
drive_counts = (
    df.groupby(["Vehicle", "Drive"])
    .size()
    .reset_index(name="count")
    .merge(total_counts, on="Vehicle")
    .assign(percentage=lambda counts: counts["count"] / counts["total_count"] * 100)
)

# Display the result
drive_counts

In [None]:
# Share of each transmission type within every car model and production year.

# Helper key combining brand and model ("brand/model").
df["Vehicle"] = df["Vehicle_brand"] + "/" + df["Vehicle_model"]

# Number of rows with a known transmission per (vehicle, year) — the denominator.
total_counts = (
    df.groupby(["Vehicle", "Production_year"])["Transmission"]
    .count()
    .reset_index(name="total_count")
)

# Rows per (vehicle, year, transmission), joined with the matching total and
# expressed as a percentage of that total.
transmission_counts = (
    df.groupby(["Vehicle", "Production_year", "Transmission"])
    .size()
    .reset_index(name="count")
    .merge(total_counts, on=["Vehicle", "Production_year"])
    .assign(percentage=lambda counts: counts["count"] / counts["total_count"] * 100)
)

# Display the result
transmission_counts

## Small record count
Brands with small record counts are being deleted, due to lack of reliability during model training.


## Field handling

### **Index** (205,160 / 208,304 non-null)

**Description:** Index of record.

**Handling N/A:** REMOVE COLUMN as it's not important and might be harmful for training results.


### **Currency** (205,160 / 208,304 non-null)

**Description:** Currency of record.

**Handling N/A:** REMOVE COLUMN after price conversion to PLN.


### **Vehicle** (205,160 / 208,304 non-null)

**Description:** Contains "brand/model" information (e.g. **Alfa Romeo/Giulietta**).

**Importance:** Field is not important. It duplicates information from other fields.

**Handling N/A:** REMOVE COLUMN


### **Vehicle_version** (138,082 / 208,304 non-null)

**Description:** Contains information about model version (e.g. Alfa RomeoGiulietta **1.4 TB 16V**).

**Importance:** Field contains important values, due to value difference for specific trims.

**Handling N/A:** MERGE vehicle model and version into an extended column (`Advanced_model`)

### **Vehicle_generation** (147,860 / 208,304 non-null)
**Description:** Contains information about model generation.

**Importance:** Field might be sometimes important, because some generations might be more desired than others.

**Handling N/A:** FILL EMPTY WITH UNKNOWN


### **Mileage_km** (207,321 / 208,304 non-null)
**Description:** Contains information about car's mileage.

**Importance:** Field is very important, due to high influence of mileage on cars value.

**Handling N/A:** REMOVE ROWS: Cars without a given mileage are usually not representative of the average case; they are outliers that do not provide meaningful information.


### **Power_HP** (207,661 / 208,304 non-null)

**Description:** Contains information about car's horsepower.

**Importance:** Field is very important, due to high influence of engine power on price (more power == better car trim)

**Handling N/A:** DELETE ROWS - As there are not many of them and displacement is usually missing for them too, it's better to remove these rows.


### **Displacement_cm3** (206,338 / 208,304 non-null)

**Description:** Contains information about car's engine displacement in cm3

**Importance:** Field is rather very important, due to high influence of displacement on price (more displacement == more power == better car trim)

**Handling N/A:** DELETE ROWS - As there are not many of them and displacement is usually missing for them too, it's better to remove these rows.


### **CO2_emissions** (94,047 / 208,304 non-null)

**Description:** Contains information about car's CO2 emissions

**Importance:** Field is not important for car value.

**Handling N/A:** DELETE COLUMN - not important for evaluation


### **Drive** (94,047 / 208,304 non-null)

**Description:** Contains information about car's drive (e.g. 4-wd, front-wheel drive, rear-wheel drive)

**Importance:** Field is important as it usually means a rather significant price difference

**Handling N/A:** FILL WITH MOST FREQUENT VALUE - from analyzing data above, most car models tend to favor certain drive types.

### **Transmission** (94,047 / 208,304 non-null)

**Description:** Contains information about car's transmission (e.g. manual, automatic)

**Importance:** Similar to drive

**Handling N/A:** FILL WITH MOST FREQUENT VALUE - similar to drive

### **Doors_number** (206,817 / 208,304 non-null)

**Description:** Contains information about car's door count

**Importance:** Not very important

**Handling N/A:** FILL WITH MOST FREQUENT VALUE - The price difference between 3-door and 5-door cars is not great.


### **Origin_country** (118,312 / 208,304 non-null)

**Description:** Contains information about car's origin country

**Importance:** Not very important, as most of the cars on the Polish market are of European origin, with similar climate and regulations.

**Handling N/A:** REMOVE COLUMN


### **First_owner** (65,094 / 208,304 non-null)

**Description:** Contains information about if car had only one owner.

**Importance:** Not very important and high percentage of missing values

**Handling N/A:** NULL = NO


### **First_registration_date** (86,445 / 208,304 non-null)

**Description:** Contains information about car's first registration date

**Importance:** Not very important and high percentage of missing values

**Handling N/A:** REMOVE COLUMN


### **Offer_location** (208,304 / 208,304 non-null)
**Description:** Contains information about offer location

**Importance:** Not important

**Handling N/A:** REMOVE COLUMN

## Additional values
### **Brand_category**:
**Description:** Contains information about brand's luxury tier

**Importance:** Important for calculating feature value

### **Feature_score**:
**Description:** Sum of the weights of all features listed for the car, multiplied by the brand weight

**Importance:** Condenses the `Features` list into a single numeric value


In [8]:
# Drop rows that were designated for deletion:
# a row is removed when any of the mileage, power or displacement values is missing.
df = df.dropna(subset=["Mileage_km", "Power_HP", "Displacement_cm3"])

In [9]:
# Drop brands that have less than 50 cars
brand_counts = df["Vehicle_brand"].value_counts()

# Brands whose number of rows falls below the threshold.
brands_to_drop = brand_counts.loc[brand_counts.lt(50)].index

# Keep only the rows whose brand is not on the drop list.
df = df.loc[~df["Vehicle_brand"].isin(brands_to_drop)]

In [None]:
# Handle null values in the remaining columns


def _fill_with_group_mode(frame, column, default, group_key="Vehicle_model"):
    """Return ``frame[column]`` with gaps filled by the most frequent value of its group.

    Args:
        frame: DataFrame holding both ``column`` and ``group_key``.
        column: Name of the column whose missing values are filled.
        default: Value used when a group has no non-null value at all.
        group_key: Column that defines the groups.

    Returns:
        The Series produced by ``groupby(group_key)[column].transform``.
    """

    def _fill(values):
        # Compute the mode once per group; when several values tie, the first one is used.
        modes = values.mode()
        return values.fillna(modes.iloc[0] if not modes.empty else default)

    return frame.groupby(group_key)[column].transform(_fill)


# Per-model most frequent value, with a fixed fallback for models that have no data at all.
df["Drive"] = _fill_with_group_mode(df, "Drive", "Front wheels")
df["Transmission"] = _fill_with_group_mode(df, "Transmission", "Manual")
df["Doors_number"] = _fill_with_group_mode(df, "Doors_number", 5)

# NULL = NO: only an explicit "Yes" counts as first owner. A plain comparison yields a
# clean boolean column and avoids the deprecated object downcasting in fillna/replace.
df["First_owner"] = df["First_owner"].eq("Yes")

df["Vehicle_generation"] = df["Vehicle_generation"].fillna("Unknown")

# Convert to string to avoid TypeError with categorical data
df["Vehicle_model"] = df["Vehicle_model"].astype(str)
df["Vehicle_version"] = df["Vehicle_version"].fillna("").astype(str)

# Model and version merged into one descriptor; strip() removes the trailing
# space left behind when the version is empty.
df["Advanced_model"] = (df["Vehicle_model"] + " " + df["Vehicle_version"]).str.strip()

df.head(5)

In [None]:
import requests

# NBP API endpoint for the EUR exchange rate (table A); served over HTTPS so the
# rate cannot be tampered with in transit.
url = "https://api.nbp.pl/api/exchangerates/rates/A/EUR/"

# Make a GET request to the API.
# A timeout is mandatory: without it requests waits indefinitely on an unresponsive
# server and the whole notebook run would hang on this cell.
response = requests.get(url, timeout=10)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    # Extract the Euro value: the "mid" field of the first entry in "rates"
    euro_value = data["rates"][0]["mid"]
    print(f"The current value of Euro (EUR) is: {euro_value} PLN")
else:
    raise NetworkError("Failed to fetch the Euro value from NBP API")

In [12]:
# Work in float64 so the multiplication by the exchange rate is not truncated.
df["Price"] = df["Price"].astype("float64")

# Rows priced in EUR; the mask is computed once and reused below.
eur_rows = df["Currency"] == "EUR"

# Convert EUR prices to PLN, rounded to 2 decimal places.
df.loc[eur_rows, "Price"] = (df.loc[eur_rows, "Price"] * euro_value).round(2)

# Store prices as whole numbers in the nullable Int64 dtype.
df["Price"] = df["Price"].round(0).astype("Int64")

# The converted rows are now expressed in PLN.
df.loc[eur_rows, "Currency"] = "PLN"

# Features
Extract unique values first to evaluate the data.


In [None]:
import ast

# Each 'Features' cell holds the string representation of a list; parse every cell,
# gather the distinct feature names in a set and sort them for display.
unique_features = sorted(
    {
        feature
        for features_str in df["Features"]
        for feature in ast.literal_eval(features_str)
    }
)

# Show all unique feature values
unique_features

As the features column contains a normalized list of features (there are no two different values for the same feature, e.g. ABS), we will process it by using one-hot encoding.

In [None]:
# Give each row a brand weight derived from its brand category
from src.preprocessing.brand_mapping import brand_to_category


def get_brand_weight(brand):
    """Return the weight of the category ``brand`` belongs to.

    The brand is looked up in ``brand_to_category``; the capitalized category name is
    then looked up in ``brand_category_weights``. Brands without a category and
    categories without a weight both fall back to 1.0.
    """
    category = brand_to_category.get(brand)
    if not category:
        # Default weight for unrecognized brands
        return 1.0
    # Default weight is 1.0 if the category has no entry
    return brand_category_weights.get(category.capitalize(), 1.0)


df["Brand_weight"] = df["Vehicle_brand"].apply(get_brand_weight)
df.head(5)


In [None]:
# Calculate feature score based on the features and brand_weight
from src.preprocessing.feature_weights_mapping import feature_weights

# Turn the string representation of each feature list into a real list.
# NOTE(review): re-running this cell on already-parsed lists will presumably fail in
# ast.literal_eval — restart the kernel before re-executing.
df["Features"] = df["Features"].apply(ast.literal_eval)


def calculate_feature_score(row):
    """Return the summed weights of the row's features, scaled by its brand weight.

    Every entry of ``row["Features"]`` is looked up in ``feature_weights`` (an unknown
    feature raises ``KeyError``); the total is multiplied by ``row["Brand_weight"]``.
    """
    total_weight = sum(feature_weights[feature] for feature in row["Features"])
    return total_weight * row["Brand_weight"]


df['Feature_score'] = df.apply(calculate_feature_score, axis=1)
df.head(5)

In [16]:
# Remove columns that were designated for deletion
columns_to_remove = [
    # Raw columns that are not used any further
    "Currency", "CO2_emissions", "Origin_country", "First_registration_date",
    "Offer_location", "Vehicle_version", "Offer_publication_date",
    # Helper column created earlier in this notebook
    "Vehicle",
    # Inputs of Feature_score, no longer needed once the score is computed
    "Brand_weight", "Features",
]

df = df.drop(columns_to_remove, axis=1)

## Types
This mapping is designated for the initial table. All fields generated by one-hot encoding are boolean values.

In [None]:
from src.preprocessing.dtype_mapping import dtype_mapping
# Apply the dtype mapping to the DataFrame
# NOTE(review): `dtype_mapping` is a project helper called as a function here; it
# presumably returns the frame with its columns cast to their target dtypes — confirm in its module.
df = dtype_mapping(df)
df.head(5)

In [None]:
# Price bounds in the style of the IQR rule. Note that the upper quantile is 0.98
# rather than 0.75, so "IQR" here is the spread between the 25th and 98th percentiles.
Q1 = df["Price"].quantile(0.25)
Q3 = df["Price"].quantile(0.98)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Mark outliers: prices strictly outside [lower_bound, upper_bound]
df["is_outlier"] = df["Price"].lt(lower_bound) | df["Price"].gt(upper_bound)

# Scatter plot of all prices, coloured by the outlier flag
fig = px.scatter(
    df,
    x="Index",
    y="Price",
    color="is_outlier",
    title="Outliery w danych",
    labels={"is_outlier": "Czy outlier?", "Price": "Wartość", "Index": "Indeks"},
)
fig.update_traces(marker={"size": 10})
fig.show()

# Lowest price that was flagged as an outlier
print(df.loc[df["is_outlier"], "Price"].min())

# Keep only the rows whose price lies within the bounds
df = df[df["Price"].ge(lower_bound) & df["Price"].le(upper_bound)]

In [None]:
# Remove outliers separately for every brand.
# Per-brand quantiles (25th and 96th percentile of Price) and their spread.
Q1 = df.groupby('Vehicle_brand')['Price'].quantile(0.25)
Q3 = df.groupby('Vehicle_brand')['Price'].quantile(0.96)
IQR = Q3 - Q1

# Attach each brand's quantiles to its rows.
df = df.merge(Q1.rename('Q1'), on='Vehicle_brand').merge(Q3.rename('Q3'), on='Vehicle_brand')

# Derive the spread, the bounds and the outlier flag row by row; later keyword
# arguments of assign() may use the columns created by earlier ones.
df = df.assign(
    IQR=lambda frame: frame['Q3'] - frame['Q1'],
    lower_bound=lambda frame: frame['Q1'] - 1.5 * frame['IQR'],
    upper_bound=lambda frame: frame['Q3'] + 1.5 * frame['IQR'],
    is_outlier=lambda frame: (
        (frame['Price'] < frame['lower_bound']) | (frame['Price'] > frame['upper_bound'])
    ),
)

# One scatter plot per brand, in order of first appearance in the frame.
for brand in df['Vehicle_brand'].unique():
    subset = df[df['Vehicle_brand'] == brand]

    fig = px.scatter(
        subset,
        x="Index",
        y="Price",
        color="is_outlier",
        title=f"Outliers in {brand}",
        labels={"is_outlier": "Is Outlier?", "Price": "Price", "Index": "Index"},
    )
    fig.update_traces(marker={"size": 10})
    fig.show()

# Keep only the rows whose price lies within the bounds of their own brand.
df = df[(df["Price"] >= df["lower_bound"]) & (df["Price"] <= df["upper_bound"])]

# Drop intermediate columns
df = df.drop(columns=['Q1', 'Q3', 'IQR', 'lower_bound', 'upper_bound', 'is_outlier'])

# Future prediction

To easily predict prices for future years, we replace the production year column with car age.

In [23]:
# Replace the production year with the car's age relative to Config.actual_year.
df = df.assign(Car_age=Config.actual_year - df["Production_year"]).drop(
    columns=["Production_year"]
)

In [None]:
# The record index column is not wanted in the exported table.
df = df.drop("Index", axis=1)

# Write the processed table to CSV without the DataFrame index.
df.to_csv(Config.processed_data_path, index=False)

# Confirm where the file was written.
print(f"Resulting table has been exported to {Config.processed_data_path}")