In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Load the used-car dataset (path is relative to the working directory).
df = pd.read_csv('./data/used_car_dataset.csv')

# Quick inspection of the raw frame.
# Only the last bare expression of a notebook cell is rendered, and these
# statements sit in the middle of the cell, so every summary is printed
# explicitly; otherwise head()/describe()/isnull() would be computed and lost.
print(df.head())
df.info()  # info() writes to stdout itself and returns None
print(df.describe())
print(df.isnull().sum())

# Draw one distribution panel per column, stacked vertically.
# squeeze=False keeps `axes` an array even when the frame has a single column
# (plt.subplots would otherwise return a bare Axes that cannot be zipped);
# ravel() then yields the same flat 1-D sequence of axes in every case.
fig, axes = plt.subplots(
    nrows=len(df.columns),
    figsize=(10, 5 * len(df.columns)),
    squeeze=False,
)
axes = axes.ravel()

for ax, column in zip(axes, df.columns):
    if pd.api.types.is_numeric_dtype(df[column]):
        # Numeric column: histogram with a kernel-density overlay.
        sns.histplot(df[column], bins=30, kde=True, ax=ax, color="blue")
    else:
        # Non-numeric column (object, string, category, ...): horizontal bar
        # chart of value counts, most frequent value first.
        sns.countplot(y=column, data=df, order=df[column].value_counts().index, ax=ax)
    ax.set_title(f'Rozkład zmiennej: {column}', fontsize=15)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
import shap

# Split the data: hold out 30 % of the rows first, then cut that hold-out in
# half to obtain the validation and test parts. The seed is fixed so the split
# is reproducible.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.3)
valid_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Separate the predictors from the target column ('price') in every part.
X_train, y_train = train_df.drop(columns=['price']), train_df['price']
X_valid, y_valid = valid_df.drop(columns=['price']), valid_df['price']
X_test, y_test = test_df.drop(columns=['price']), test_df['price']


# One-hot encode the categorical variables.
# The training split defines the feature space; drop_first=True removes one
# reference level per categorical column.
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

# Validation and test are encoded with ALL of their levels kept and are then
# aligned to the training columns. Applying drop_first=True to each split
# separately would drop whichever level happens to sort first in that split;
# if it differs from the training reference level, a real training column is
# lost and reindex() back-fills it with zeros, silently encoding those rows as
# the reference category.
# reindex() discards the training reference level and any level unseen in
# training, and adds all-zero columns for levels absent from the split.
X_valid_encoded = pd.get_dummies(X_valid).reindex(
    columns=X_train_encoded.columns, fill_value=0
)
X_test_encoded = pd.get_dummies(X_test).reindex(
    columns=X_train_encoded.columns, fill_value=0
)

# Impute missing values with column means. The imputer is fitted on the
# training split only and then applied unchanged to all three splits.
imputer = SimpleImputer(strategy='mean').fit(X_train_encoded)

X_train_imputed, X_valid_imputed, X_test_imputed = [
    imputer.transform(split)
    for split in (X_train_encoded, X_valid_encoded, X_test_encoded)
]

def _rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    The square root of the MSE is taken explicitly instead of passing
    ``squared=False`` to ``mean_squared_error``: that keyword was deprecated
    in scikit-learn 1.4 and removed in 1.6, where it raises ``TypeError``.
    This form works on old and new releases alike.
    """
    return mean_squared_error(y_true, y_pred) ** 0.5


# Candidate linear models, all with their default hyper-parameters.
models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet()
}

# Validation RMSE per model name.
results = {}

# Fit every candidate on the training split and score it on the validation split.
for name, model in models.items():
    model.fit(X_train_imputed, y_train)
    y_pred = model.predict(X_valid_imputed)
    rmse = _rmse(y_valid, y_pred)
    results[name] = rmse

# Show the validation scores.
print("Wyniki RMSE dla modeli:", results)

# Pick the model with the lowest validation RMSE (already fitted in the loop above).
best_model_name = min(results, key=results.get)
best_model = models[best_model_name]
print(f"Najlepszy model: {best_model_name}")

# Predict on the held-out test split with the selected model.
y_test_pred = best_model.predict(X_test_imputed)

# Final test-set RMSE of the selected model.
test_rmse = _rmse(y_test, y_test_pred)
print(f"Ostateczny Test RMSE dla {best_model_name}: {test_rmse:.2f}")

# SHAP needs DataFrames that carry column names, so the imputed feature
# matrices are rebuilt as frames labelled with the encoded training columns.
feature_names = X_train_encoded.columns
X_train_df = pd.DataFrame(X_train_imputed, columns=feature_names)
X_test_df = pd.DataFrame(X_test_imputed, columns=feature_names)

# Build the explainer for the selected model from the training frame, then
# compute SHAP values for the test frame.
explainer = shap.Explainer(best_model, X_train_df)
shap_values = explainer(X_test_df)

# Visualise the SHAP results.
shap.summary_plot(shap_values, features=X_test_df)