In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Load the preprocessed car listings.
# The first CSV column is the index that was written out when the file was
# saved; read without index_col it comes back as a stray "Unnamed: 0" data
# column. Restoring it as the index keeps it out of the feature frame.
df = pd.read_csv("../data/processed/cars_processed.csv", index_col=0)
df.head()


Unnamed: 0,id,region,price,year,manufacturer,model,condition,fuel,transmission,drive;,car_age,log_price,year_bin
0,7316814884,auburn,33590.0,2014,gmc,sierra 1500 crew cab slt,good,gas,other,;,10,10.422013,2011-2015
1,7316814758,auburn,22590.0,2010,chevrolet,silverado 1500,good,gas,other,;,14,10.025307,2001-2010
2,7316814989,auburn,39590.0,2020,chevrolet,silverado 1500 crew,good,gas,other,;,4,10.586357,2016-2020
3,7316743432,auburn,30990.0,2017,toyota,tundra double cab sr,good,gas,other,;,7,10.341452,2016-2020
4,7316356412,auburn,15000.0,2013,ford,f-150 xlt,excellent,gas,automatic,rwd;,11,9.615872,2011-2015


In [3]:
# Target variable: the listing price.
y = df["price"]

# Features: everything except the target and its log-transformed copy.
X = df.drop(columns=["price", "log_price"])

# Column groups by type. The drive column is literally named "drive;" in the
# loaded frame, so that exact spelling is required here.
numeric_features = ["year", "car_age"]
categorical_features = [
    "manufacturer",
    "model",
    "condition",
    "fuel",
    "transmission",
    "region",
    "drive;",
    "year_bin",
]

# Preprocessor: numeric columns pass through unchanged; categorical columns are
# one-hot encoded, and categories not seen during fit are ignored at transform
# time instead of raising. Columns not listed in either group are dropped
# (ColumnTransformer's default remainder).
one_hot = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", one_hot, categorical_features),
    ]
)


In [4]:
# Baseline model: linear regression on top of the shared preprocessing recipe.
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place (no cloning unless `memory` is set), so
# passing the module-level `preprocessor` object directly would make every
# pipeline built from it share one fitted transformer, refit by whichever
# pipeline is trained last.
linreg_model = Pipeline(steps=[
    ("preprocessing", clone(preprocessor)),
    ("model", LinearRegression())
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

linreg_model.fit(X_train, y_train)

y_pred_linreg = linreg_model.predict(X_test)

# Test-set metrics for the linear baseline.
mae = mean_absolute_error(y_test, y_pred_linreg)
mse = mean_squared_error(y_test, y_pred_linreg)
r2 = r2_score(y_test, y_pred_linreg)

mae, mse, r2


(4114.012037570073, 41531456.77610931, 0.7848579041023526)

In [5]:
# Random forest on the same preprocessing recipe.
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place (no cloning unless `memory` is set), so
# passing the module-level `preprocessor` object directly would tie this
# pipeline's fitted transformer to any other pipeline built from that object.
rf_model = Pipeline(steps=[
    ("preprocessing", clone(preprocessor)),
    ("model", RandomForestRegressor(
        n_estimators=200,
        random_state=42,  # fixed seed for reproducible trees
        n_jobs=-1         # use all available cores
    ))
])

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

# Test-set metrics for the random forest.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mae_rf, mse_rf, r2_rf


(3169.5025021829183, 30017850.51794641, 0.8445009210828296)

In [6]:
# Side-by-side report of the test-set metrics for both models.
# Each entry pairs a heading with its (MAE, MSE, R2) values, in print order.
metric_labels = ("MAE:", "MSE:", "R2:")
report = [
    ("Linear Regression:", (mae, mse, r2)),
    ("\nRandom Forest:", (mae_rf, mse_rf, r2_rf)),
]

for heading, scores in report:
    print(heading)
    for label, value in zip(metric_labels, scores):
        print(label, value)


Linear Regression:
MAE: 4114.012037570073
MSE: 41531456.77610931
R2: 0.7848579041023526

Random Forest:
MAE: 3169.5025021829183
MSE: 30017850.51794641
R2: 0.8445009210828296


In [7]:
# Persist the fitted random-forest pipeline (preprocessing + model) under ../app.
# joblib is imported in the notebook's top import cell.
model_path = Path("../app/best_model.pkl")

# joblib.dump does not create missing directories, so make sure the target
# directory exists before writing.
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(rf_model, model_path)
"Model saved!"


'Model saved!'

In [8]:
# Number of rows per `year` value, ordered by year rather than by frequency.
year_counts = df["year"].value_counts()
year_counts.sort_index()


year
1980      20
1981      17
1982      10
1983       9
1984      20
1985      25
1986      22
1987      26
1988      22
1989      44
1990      32
1991      42
1992      32
1993      47
1994      58
1995      63
1996      79
1997     121
1998     121
1999     184
2000     208
2001     254
2002     358
2003     363
2004     465
2005     525
2006     677
2007     760
2008     853
2009     545
2010     654
2011    1051
2012    1358
2013    1785
2014    1676
2015    1872
2016    1895
2017    2433
2018    2322
2019    1906
2020    1437
2021     184
Name: count, dtype: int64