# Car price 

In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
df = pd.read_csv("car_price.csv")

display(df.head())
print("Shape:", df.shape)
print("\nDtypes:\n", df.dtypes)


Unnamed: 0.1,Unnamed: 0,Classes,Old Price,Price Change,New Price,date_range
0,0,Peugeot 5008 A/T / Active Pack 2023,"1,029,990 EGP","trending_up +110,000 EGP","1,139,990 EGP",1/19/2023
1,1,Ssang Yong Tivoli XLV A/T / Style 2023,"675,000 EGP","trending_up +5,000 EGP","680,000 EGP",1/19/2023
2,2,Ssang Yong Tivoli XLV A/T / Comfort 2023,"605,000 EGP","trending_up +185,000 EGP","790,000 EGP",1/19/2023
3,3,Ssang Yong Tivoli A/T / Style Plus 2023,"665,000 EGP","trending_up +185,000 EGP","850,000 EGP",1/19/2023
4,4,Ssang Yong Tivoli A/T / Comfort 2023,"590,000 EGP","trending_up +170,000 EGP","760,000 EGP",1/19/2023


Shape: (3504, 6)

Dtypes:
 Unnamed: 0       int64
Classes         object
Old Price       object
Price Change    object
New Price       object
date_range      object
dtype: object


## Profilowanie danych (ydata-profiling)

In [3]:
from ydata_profiling import ProfileReport
from IPython.display import IFrame

profile = ProfileReport(df, title = "car_price.csv — Profiling report", explorative = True)
report_path = Path("car_price_profiling_report.html")
profile.to_file(report_path)

IFrame(src = str(report_path), width = "100%", height = 600)

ModuleNotFoundError: No module named 'ydata_profiling'

## Czyszczenie: stringi na liczby

In [None]:
if "Unnamed: 0" in df.columns:
    df = df.drop(columns = ["Unnamed: 0"])

df.columns = [c.strip().replace(" ", "_") for c in df.columns]

display(df.head())
print("Columns:", list(df.columns))

In [None]:
for c in ["Old_Price", "New_Price"]:
    df[c] = (df[c].astype("string")
             .str.replace("EGP", "", regex = False)
             .str.replace(r"[^0-9]", "", regex = True)
             .replace("", pd.NA)
             .astype(float))

price_cols = ["Old_Price", "New_Price"]

display(df[price_cols].head())

print(df[price_cols].describe().T)

### `Price_Change`: wyciągamy znak i wartość

In [None]:
s = df["Price_Change"].astype("string")
df["Price_Change_Value"] = (
        s.str.contains("trending_down", na = False).map({True: -1, False: 1})
        * s.str.replace(r"[^0-9]", "", regex = True).replace("", pd.NA).astype(float)
)
display(df[["Price_Change", "Price_Change_Value"]].head(10))

## Daty

In [None]:
date_cols = [c for c in df.columns if "date" in c.lower()]
print("Date-like columns:", date_cols)

for c in date_cols:
    df[c] = pd.to_datetime(df[c], errors = "coerce", dayfirst = True)

if len(date_cols) > 0:
    display(df[date_cols].head())
else:
    print("Brak kolumn z datą w tym zbiorze.")

## Braki danych i imputacja

In [None]:
missing = df.isna().mean().sort_values(ascending = False)
display(missing[missing > 0].to_frame("Brakujące dane").head(30))

## Outliery

In [None]:
candidate_cols = [c for c in ["New_Price", "Old_Price", "Price_Change_Value"] if c in df.columns]


def count_outliers_iqr (s):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return ((s < lo) | (s > hi)).sum()


for c in candidate_cols:
    n = count_outliers_iqr(df[c].dropna())
    print(f"{c}: {n} outliers")

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize = (8, 4))
df[candidate_cols].boxplot()
plt.title("Boxplot – detekcja outlierów")
plt.show()


In [None]:
df_wins = df.copy()
for c in candidate_cols:
    lo, hi = df_wins[c].quantile(0.01), df_wins[c].quantile(0.99)
    df_wins[c] = df_wins[c].clip(lower = lo, upper = hi)

print("Po winsoryzacji (opis):")
display(df_wins[candidate_cols].describe().T)

## Skalowanie

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

target = "New_Price" if "New_Price" in df.columns else None
feature_cols = [c for c in df.columns if c != target]

X = df[feature_cols].copy()
y = df[target].copy() if target else None

num_features = X.select_dtypes(include = [np.number]).columns.tolist()
cat_features = [c for c in X.columns if c not in num_features]

print("Target:", target)
print("Num features:", len(num_features), "Cat features:", len(cat_features))

numeric_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "median")),
    ("scaler", StandardScaler()),
])

categorical_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown = "ignore")),
])

preprocessor = ColumnTransformer(
    transformers = [
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ],
    remainder = "drop",
    verbose_feature_names_out = False,
)

X_train, X_test = train_test_split(X, test_size = 0.2, random_state = 42)

X_train_t = preprocessor.fit_transform(X_train)
X_test_t = preprocessor.transform(X_test)

print("X_train_t shape:", X_train_t.shape)
print("X_test_t shape:", X_test_t.shape)

## Szybki model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

target = "New_Price"

X = df.drop(columns = [target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

model = Pipeline(steps = [
    ("preprocess", preprocessor),
    ("reg", LinearRegression()),
])

model.fit(X_train, y_train)
preds = model.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, preds)

print(f"MAE:  {mae:,.0f}")
print(f"RMSE: {rmse:,.0f}")
print(f"R^2:  {r2:.3f}")


In [None]:
# --- NAPRAWA: usuwamy kolumny powodujące przeciek i budujemy preprocessor od nowa
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

target = "New_Price"

leak_cols = [c for c in ["Old_Price", "Price_Change_Value"] if c in df.columns]

X = df.drop(columns = [target] + leak_cols)
y = df[target]

num_features = X.select_dtypes(include = "number").columns
cat_features = X.select_dtypes(exclude = "number").columns

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy = "median")),
    ("scaler", StandardScaler()),
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown = "ignore")),
])

preprocessor_clean = ColumnTransformer([
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features),
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

model_clean = Pipeline([
    ("preprocess", preprocessor_clean),
    ("reg", LinearRegression()),
])

model_clean.fit(X_train, y_train)
preds = model_clean.predict(X_test)

mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

print("MODEL PO NAPRAWIE")
print(f"MAE:  {mae:,.0f}")
print(f"RMSE: {rmse:,.0f}")
print(f"R^2:  {r2:.3f}")
print("Usunięte kolumny:", leak_cols)
print("Liczba cech num:", len(num_features), "| kat:", len(cat_features))
