In [30]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
# from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import Ridge, LinearRegression
import numpy as np
# import matplotlib.pyplot as plt
import random
from sklearn.metrics import r2_score, mean_squared_error as MSE

# import seaborn as sns
from IPython.display import display as disp

random.seed(42)
np.random.seed(42)

In [104]:
df_train = pd.read_csv('https://raw.githubusercontent.com/Murcha1990/MLDS_ML_2022/main/Hometasks/HT1/cars_train.csv')
df_test = pd.read_csv('https://raw.githubusercontent.com/Murcha1990/MLDS_ML_2022/main/Hometasks/HT1/cars_test.csv')

In [98]:
# numeric_features = ["year", "km_driven", "seats", "engine", "max_power", 'mileage']
# categorical_features = ["fuel", "seller_type", "transmission", "owner"]

numeric_features = ["year", "km_driven", "seats", 'mileage', 'engine', 'max_power']
categorical_features = ['name', "fuel", "seller_type", "transmission", "owner"]
target = 'selling_price'


def remove_duplicates(df):
    df = df.drop_duplicates(keep='first')
    df = df.reset_index(drop=True)

    return df


def drop_cols(df):
    df = df[categorical_features + numeric_features + [target]]

    return df

def convert_str_cols(df):
    for c in ['mileage', 'engine', 'max_power']:
        df[c] = df[c].str.replace(r'[^\d.-]', '', regex=True)
        df[c] = pd.to_numeric(df[c], errors='coerce')

    return df

In [105]:
# вырезаем из марок авто бренды. считаем что это первое слово в названии
df_train['name'] = df_train['name'].str.split().str[0]
df_test['name'] = df_test['name'].str.split().str[0]

df_train = drop_cols(df_train)
df_train = remove_duplicates(df_train)
df_train = convert_str_cols(df_train)

df_test = drop_cols(df_test)
df_test = remove_duplicates(df_test)
df_test = convert_str_cols(df_test)

X_train = df_train[numeric_features + categorical_features]
y_train = df_train[target]

X_test = df_train[numeric_features + categorical_features]
y_test = df_train[target]

In [101]:
# Преобразование числовых столбцов
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Замена пропусков на среднее
    # ('scaler', StandardScaler())  # Масштабирование признаков
    ('scaler', MinMaxScaler())  # Масштабирование признаков
])

# Преобразование категориальных столбцов
categorical_transformer = Pipeline(steps=[
    # ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),  # Замена пропусков на 'NA'
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))  # OHE-кодирование
])

preprocessor_cols = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

preprocessor = Pipeline(steps=[
    # ('convert_str_to_num', TransformStrCols(['mileage', 'engine', 'max_power'])),
    ('preprocessor_cols', preprocessor_cols)

])

# Полный пайплайн с линейной регрессией
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # ('regressor', LinearRegression())
    ('regressor', Ridge())
    ])

In [106]:
pipe.fit(X_train, y_train)

y_test_predict = pipe.predict(X_test)

res = r2_score(y_test, y_test_predict)
disp(f"r2 = {res}")

feature_importance = pd.DataFrame({'coef': pipe['regressor'].coef_, 'param':pipe['preprocessor'].get_feature_names_out()})
disp(feature_importance.sort_values(by='coef', ascending=False))


'r2 = 0.7601698936494548'

Unnamed: 0,coef,param
43,2518947.0,cat__owner_Test Drive Car
5,2290282.0,num__max_power
34,1717501.0,cat__name_Volvo
21,1523162.0,cat__name_Lexus
0,1412349.0,num__year
7,1110111.0,cat__name_BMW
20,865377.3,cat__name_Land
17,677682.7,cat__name_Jaguar
25,591464.6,cat__name_Mercedes-Benz
6,420657.0,cat__name_Audi


In [107]:
from pickle import dump

with open("app/model_ridge_cars_brands.pkl", "wb") as f:
    dump(pipe, f)

чтобы получить 90+ на тесте по r2 надо ничего не делать с колонкой name 😀
он тупо выучит какие марки стоят дорого
а всякие Марути дешевые