### Необходимые библиотеки

In [49]:
import random
import re

from typing import Callable, Self

import numpy as np
import pandas as pd
import pickle

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as MSE, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from IPython.display import display, display_markdown


# Seed the stdlib and NumPy global random generators so that any code relying on
# them produces the same results on every Restart & Run All.
random.seed(42)
np.random.seed(42)

### Скачиваем тренировочный и тестовый датасеты

In [50]:
# Read the train and test CSV files directly from their raw GitHub URLs
# (requires network access when the cell is executed).
df_train = pd.read_csv('https://raw.githubusercontent.com/Murcha1990/MLDS_ML_2022/main/Hometasks/HT1/cars_train.csv')
df_test = pd.read_csv('https://raw.githubusercontent.com/Murcha1990/MLDS_ML_2022/main/Hometasks/HT1/cars_test.csv')

### Удаление полных дубликатов

In [51]:
def _report_and_drop_duplicates(frame: pd.DataFrame, subset: pd.Index, heading: str) -> pd.DataFrame:
    """Display duplicate counts before and after de-duplication and return the cleaned frame.

    Rows are considered duplicates when they agree on every column in ``subset``;
    the first occurrence is kept and the index is renumbered from zero.
    """
    display_markdown(heading, raw=True)
    display_markdown(
        f"<h4>Количество дубликатов до обработки: {frame.duplicated(subset).sum()}</h4>",
        raw=True
    )

    deduplicated = frame.drop_duplicates(subset, ignore_index=True)

    display_markdown(
        f"<h4>Количество дубликатов после обработки: {deduplicated.duplicated(subset).sum()}</h4>",
        raw=True
    )
    return deduplicated


# Work on copies so the raw frames loaded above stay untouched.
df_train_cleaned = df_train.copy()
df_test_cleaned = df_test.copy()

# Duplicates are detected on the feature columns only (the target is excluded).
columns_without_target_1 = df_train_cleaned.drop(columns=["selling_price"]).columns

df_train_cleaned = _report_and_drop_duplicates(
    df_train_cleaned, columns_without_target_1, "<h3>Тренировочный датасет</h3>"
)

display_markdown("<hr>", raw=True)

df_test_cleaned = _report_and_drop_duplicates(
    df_test_cleaned, columns_without_target_1, "<h3>Тестовый датасет</h3>"
)

<h3>Тренировочный датасет</h3>

<h4>Количество дубликатов до обработки: 1159</h4>

<h4>Количество дубликатов после обработки: 0</h4>

<hr>

<h3>Тестовый датасет</h3>

<h4>Количество дубликатов до обработки: 69</h4>

<h4>Количество дубликатов после обработки: 0</h4>

### Отделяем целевую переменную

In [52]:
def _show_features_and_target(heading: str, features: pd.DataFrame, target: pd.Series) -> None:
    """Display the first ten feature rows and the first ten target values under ``heading``."""
    display_markdown(heading, raw=True)
    display_markdown("<h4>Объекты с признаками:</h4>", raw=True)
    display(features.head(10))
    display_markdown("<h4>Целевая переменная:</h4>", raw=True)
    display(target.head(10))


# Split each cleaned frame into the feature matrix and the target column.
X_train = df_train_cleaned.drop(columns=["selling_price"])
y_train = df_train_cleaned["selling_price"].copy()

X_test = df_test_cleaned.drop(columns=["selling_price"])
y_test = df_test_cleaned["selling_price"].copy()

_show_features_and_target("<h3>Тренировочный датасет</h3>", X_train, y_train)

display_markdown("<hr>", raw=True)

_show_features_and_target("<h3>Тестовый датасет</h3>", X_test, y_test)

<h3>Тренировочный датасет</h3>

<h4>Объекты с признаками:</h4>

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Hyundai i20 Sportz Diesel,2010,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
3,Maruti Swift VXI BSIII,2007,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0
4,Hyundai Xcent 1.2 VTVT E Plus,2017,45000,Petrol,Individual,Manual,First Owner,20.14 kmpl,1197 CC,81.86 bhp,113.75nm@ 4000rpm,5.0
5,Maruti Wagon R LXI DUO BSIII,2007,175000,LPG,Individual,Manual,First Owner,17.3 km/kg,1061 CC,57.5 bhp,"7.8@ 4,500(kgm@ rpm)",5.0
6,Maruti 800 DX BSII,2001,5000,Petrol,Individual,Manual,Second Owner,16.1 kmpl,796 CC,37 bhp,59Nm@ 2500rpm,4.0
7,Toyota Etios VXD,2011,90000,Diesel,Individual,Manual,First Owner,23.59 kmpl,1364 CC,67.1 bhp,170Nm@ 1800-2400rpm,5.0
8,Ford Figo Diesel Celebration Edition,2013,169000,Diesel,Individual,Manual,First Owner,20.0 kmpl,1399 CC,68.1 bhp,160Nm@ 2000rpm,5.0
9,Renault Duster 110PS Diesel RxL,2014,68000,Diesel,Individual,Manual,Second Owner,19.01 kmpl,1461 CC,108.45 bhp,248Nm@ 2250rpm,5.0


<h4>Целевая переменная:</h4>

0    450000
1    370000
2    225000
3    130000
4    440000
5     96000
6     45000
7    350000
8    200000
9    500000
Name: selling_price, dtype: int64

<hr>

<h3>Тестовый датасет</h3>

<h4>Объекты с признаками:</h4>

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Mahindra Xylo E4 BS IV,2010,168000,Diesel,Individual,Manual,First Owner,14.0 kmpl,2498 CC,112 bhp,260 Nm at 1800-2200 rpm,7.0
1,Tata Nexon 1.5 Revotorq XE,2017,25000,Diesel,Individual,Manual,First Owner,21.5 kmpl,1497 CC,108.5 bhp,260Nm@ 1500-2750rpm,5.0
2,Honda Civic 1.8 S AT,2007,218463,Petrol,Individual,Automatic,First Owner,12.9 kmpl,1799 CC,130 bhp,172Nm@ 4300rpm,5.0
3,Honda City i DTEC VX,2015,173000,Diesel,Individual,Manual,First Owner,25.1 kmpl,1498 CC,98.6 bhp,200Nm@ 1750rpm,5.0
4,Tata Indica Vista Aura 1.2 Safire BSIV,2011,70000,Petrol,Individual,Manual,Second Owner,16.5 kmpl,1172 CC,65 bhp,96 Nm at 3000 rpm,5.0
5,Mahindra Thar CRDe,2019,12584,Diesel,Dealer,Manual,First Owner,16.55 kmpl,2498 CC,105 bhp,247Nm@ 1800-2000rpm,6.0
6,Chevrolet Spark 1.0 LS,2011,35000,Petrol,Individual,Manual,First Owner,18.0 kmpl,995 CC,62 bhp,90.3Nm@ 4200rpm,5.0
7,Maruti Ritz ZXi,2012,70000,Petrol,Individual,Manual,Second Owner,18.5 kmpl,1197 CC,85.80 bhp,114Nm@ 4000rpm,5.0
8,Maruti Alto LX,2011,72000,Petrol,Individual,Manual,Second Owner,19.7 kmpl,796 CC,46.3 bhp,62Nm@ 3000rpm,5.0
9,Hyundai Creta 1.6 CRDi SX,2016,58000,Diesel,Individual,Manual,First Owner,19.67 kmpl,1582 CC,126.2 bhp,259.9Nm@ 1900-2750rpm,5.0


<h4>Целевая переменная:</h4>

0    229999
1    665000
2    175000
3    635000
4    130000
5    975000
6    150000
7    275000
8    140000
9    850000
Name: selling_price, dtype: int64

### Создание пайплайна

#### Вспомогательные функции и классы

In [53]:
template = re.compile(r"\d+\.?\d*")


def get_float(value: str | float) -> float | None:
    if pd.isna(value):
        return

    if not isinstance(value, str):
        value = str(value)

    match = template.search(value)

    if match:
        return float(match[0])

    return


def get_object(value: np.float64) -> str | None:
    if pd.isna(value):
        return np.float64("nan")

    return str(int(value))


class ColumnConverter(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``func`` to every value of every column.

    Nothing is learned in ``fit``; ``transform`` maps each column element-wise
    through ``func`` and returns a new DataFrame with the same column labels.
    """

    def __init__(self, func: Callable) -> None:
        """Store the element-wise conversion function.

        Args:
            func: Callable applied to each individual cell value.
        """
        self.func = func

    def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> Self:
        """Do nothing and return ``self``; present only to satisfy the estimator API."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with ``func`` applied to every cell.

        The input frame is never modified: assigning into ``X`` directly would
        change the caller's data (or trigger ``SettingWithCopyWarning`` when
        ``X`` is a slice of another frame).
        """
        converted = X.copy()
        for column in converted.columns:
            converted[column] = converted[column].apply(self.func)
        return converted

    def get_feature_names_out(self, input_features: pd.Series | None = None) -> pd.Index:
        """Return ``input_features`` unchanged, since the transform keeps column names as they are."""
        return input_features


def display_metrics(pipeline: Pipeline, X: pd.DataFrame, y: pd.Series) -> None:
    """Predict on ``X`` and display the first ten predictions, RMSE and R^2 against ``y``.

    Args:
        pipeline: A fitted estimator exposing ``predict``.
        X: Feature frame passed to ``pipeline.predict``.
        y: True target values compared with the predictions.
    """
    y_pred = pipeline.predict(X)

    display_markdown("<h4>Первые десять значений предсказаний:</h4>", raw=True)
    display(y_pred[:10])

    display_markdown("<br>", raw=True)

    # Root of the mean squared error, reported in the units of the target.
    rmse = MSE(y, y_pred) ** 0.5
    display_markdown(f"<h4>&radic;MSE: {rmse:.2f}</h4>", raw=True)

    r2 = r2_score(y, y_pred)
    display_markdown(f"<h4>R<sup>2</sup>: {r2:.5f}</h4>", raw=True)

#### Трансформеры

In [54]:
# Numeric features: replace missing values with the column mean, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("filler", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: replace missing values with the most frequent category,
# then one-hot encode. drop="first" removes the first category of each feature,
# handle_unknown="ignore" avoids errors on categories unseen during fit, and
# sparse_output=False yields a dense result.
categorial_transformer = Pipeline(
    steps=[
        ("filler", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False)),
    ]
)

#### Конвертер

In [55]:
# Element-wise converters built from the helper functions get_float / get_object.
to_float_converter = ColumnConverter(func=get_float)
to_object_converter = ColumnConverter(func=get_object)

# First pipeline stage: parse raw text columns into numbers and turn the seat
# count into a categorical label, returning a pandas DataFrame.
converter = ColumnTransformer(
    transformers=[
        ("to_float", to_float_converter, ["mileage", "engine", "max_power"]),  # e.g. "23.4 kmpl" -> 23.4
        ("to_object", to_object_converter, ["seats"]),  # e.g. 5.0 -> "5"
    ],
    remainder="passthrough",  # columns not listed above are forwarded unchanged
    verbose_feature_names_out=False  # keep the original column names without transformer prefixes
).set_output(transform="pandas")

#### Препроцессор

In [56]:
# Second pipeline stage: route numeric and categorical columns to their
# transformers. No `remainder` is given, so ColumnTransformer's default applies
# and columns not listed here (such as "name" and "torque") are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, ["year", "km_driven", "mileage", "engine", "max_power"]),
        ("cat", categorial_transformer, ["fuel", "seller_type", "transmission", "owner", "seats"]),
    ],
    verbose_feature_names_out=False
).set_output(transform="pandas")

#### Пайплайн

In [57]:
# Full model: raw-column conversion -> imputation/scaling/encoding -> Ridge regression.
# NOTE(review): alpha=21.54 is hard-coded; presumably chosen by an earlier
# hyperparameter search that is not part of this notebook — TODO confirm.
pipeline = Pipeline(
    steps=[
        ("converter", converter),
        ("preprocessor", preprocessor),
        ("regressor", Ridge(alpha=21.54)),
    ]
)

#### Обучение и проверка

In [58]:
# Fit the whole pipeline on the training data only.
pipeline.fit(X_train, y_train)

# Metrics on the data the model was fitted on.
display_metrics(pipeline, X_train, y_train)

display_markdown("<hr>", raw=True)

# Metrics on the held-out test data.
display_metrics(pipeline, X_test, y_test)

<h4>Первые десять значений предсказаний:</h4>

array([389057.31401186, 619485.5766867 , 405134.26169256, 166704.49504768,
       538980.34166315, -22945.43219705, -54275.69776756, 254459.12145578,
       271268.54561808, 671528.34217353])

<br>

<h4>&radic;MSE: 317442.52</h4>

<h4>R<sup>2</sup>: 0.64844</h4>

<hr>

<h4>Первые десять значений предсказаний:</h4>

array([ 608688.16443828,  878952.48851408,  797065.06821004,
        667851.39300868,   69186.08553817, 1044796.22856281,
        110739.44371467,  312909.43833842, -105298.97006937,
        978584.72625284])

<br>

<h4>&radic;MSE: 348089.80</h4>

<h4>R<sup>2</sup>: 0.66162</h4>

### Сохранение модели

In [59]:
# Serialize the fitted pipeline (converter, preprocessor and regressor) to a
# pickle file one directory above the notebook's working directory.
with open("../selling_price_model.pkl", "wb") as f:
    pickle.dump(pipeline, f)

### Загрузка модели и проверка

In [None]:
# Reload the pickled pipeline to verify the round trip.
# Unpickling executes arbitrary code, so only load files you created yourself.
# The ColumnConverter class and the get_float / get_object functions must be
# defined in the session before loading, because pickle stores them by reference.
with open("../selling_price_model.pkl", "rb") as f:
    model: Pipeline = pickle.load(f)

# NOTE(review): the output saved under this cell is identical to the
# training-set metrics shown earlier, although X_test is passed here; the cell
# has no execution count, so the output looks stale — re-run to refresh it.
display_metrics(model, X_test, y_test)

<h4>Первые десять значений предсказаний:</h4>

array([389057.31401186, 619485.5766867 , 405134.26169256, 166704.49504768,
       538980.34166315, -22945.43219705, -54275.69776756, 254459.12145578,
       271268.54561808, 671528.34217353])

<br>

<h4>&radic;MSE: 317442.52</h4>

<h4>R<sup>2</sup>: 0.64844</h4>