In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

import phik
from phik.report import plot_correlation_matrix
from phik import report

import shap

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

In [40]:
from typing import Literal, Optional

In [41]:
# Column-name lists; presumably passed to prepare_data() as its
# `cols_to_drop` / `cols_cats` arguments (the call is not in this cell).
cols_to_drop = [
    'car_id',
    'target_class',
    'deviation_normal_count',
]
cols_cats = [
    'model',
    'car_type',
    'fuel_type',
]

def prepare_data(
    df: pd.DataFrame,
    cols_to_drop: Optional[list] = None,
    cols_cats: Optional[list] = None,
    type_df: Literal['train', 'test'] = 'train',
    encoders: Optional[dict] = None,
):
    """Drop unused columns, integer-encode categorical columns and split off the target.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame to prepare.
    cols_to_drop : list, optional
        Column names removed before any other processing. ``None`` (the
        default) drops nothing.
    cols_cats : list, optional
        Column names to label-encode and then cast to the pandas
        ``category`` dtype. ``None`` (the default) encodes nothing.
    type_df : {'train', 'test'}
        ``'test'`` returns the prepared frame as-is. Any other value is
        treated as ``'train'`` and the ``target_reg`` column is split off.
    encoders : dict, optional
        Mapping ``column name -> fitted encoder`` (any object exposing
        ``transform``). When supplied, a column that already has an entry is
        encoded with that encoder; a column without one gets a freshly fitted
        ``LabelEncoder`` which is stored back into the mapping. Passing the
        same dict for the train and the test frame therefore gives both
        frames identical integer codes. With ``None`` (the default) a new
        encoder is fitted on every call, so codes from separate calls are
        only comparable if both frames contain exactly the same labels.

    Returns
    -------
    pd.DataFrame
        The prepared frame, when ``type_df == 'test'``.
    tuple of (pd.DataFrame, pd.Series)
        ``(X, y)`` otherwise, where ``y`` is the ``target_reg`` column and
        ``X`` is everything else.

    Raises
    ------
    KeyError
        If a name in ``cols_to_drop`` / ``cols_cats`` is not a column of
        ``df``, or if ``target_reg`` is missing in train mode.
    """
    cols_to_drop = [] if cols_to_drop is None else list(cols_to_drop)
    cols_cats = [] if cols_cats is None else list(cols_cats)

    # Work on an independent copy so the caller's frame stays untouched.
    df_copy = df.drop(columns=cols_to_drop).copy()

    for col in cols_cats:
        if encoders is not None and col in encoders:
            # Reuse the previously fitted encoder to keep codes consistent.
            codes = encoders[col].transform(df_copy[col])
        else:
            le = LabelEncoder()
            codes = le.fit_transform(df_copy[col])
            if encoders is not None:
                encoders[col] = le
        df_copy[col] = codes

    # Skip the dtype round-trip entirely when there is nothing to convert.
    if cols_cats:
        df_copy[cols_cats] = df_copy[cols_cats].astype('category')

    if type_df == 'test':
        return df_copy

    X = df_copy.drop('target_reg', axis=1)
    y = df_copy['target_reg']
    return X, y

In [9]:
# Read quickstart_train.csv from the Competitive_Data_Science GitHub repository
# and preview the first rows.
train_csv_url = 'https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/quickstart_train.csv'
df_train = pd.read_csv(train_csv_url)
df_train.head(5)

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,109.99,another_bug,4.737759,12141310.0,0.1,180.855726,0.023174,174,170
1,O41613818T,VW Polo VI,economy,petrol,3.9,2015,78218,2021,34.48,electro_bug,4.480517,18039090.0,0.0,187.862734,12.306011,174,174
2,d-2109686j,Renault Sandero,standart,petrol,6.3,2012,23340,2017,34.93,gear_stick,4.768391,15883660.0,0.1,102.382857,2.513319,174,173
3,u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,32.22,engine_fuel,3.88092,16518830.0,0.1,172.793237,-5.029476,174,170
4,N-8915870N,Renault Sandero,standart,petrol,4.7,2012,26428,2017,27.51,engine_fuel,4.181149,13983170.0,0.1,203.462289,-14.260456,174,171


In [12]:
# Read quickstart_test.csv from the Competitive_Data_Science GitHub repository
# and preview the first rows.
test_csv_url = 'https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/quickstart_test.csv'
df_test = pd.read_csv(test_csv_url)
df_test.head(5)

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_class,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq
0,P17494612l,Skoda Rapid,economy,petrol,4.8,2013,42269,2019,gear_stick,3.746207,14075390.0,0.1,195.454152,10.56622,174,170
1,N-1530212S,Renault Sandero,standart,petrol,4.32,2015,90014,2016,engine_overheat,4.318966,19703900.0,0.0,181.538685,11.807941,174,174
2,B-1154399t,Smart ForTwo,economy,petrol,4.46,2015,82684,2017,electro_bug,5.134655,9314946.0,0.1,118.440645,14.862538,174,172
3,F12725233R,Smart ForFour,economy,petrol,2.8,2014,68833,2021,engine_check,4.617356,9336838.0,0.83,112.829785,20.088904,174,172
4,l-1139189J,Skoda Rapid,economy,petrol,6.56,2013,42442,2021,another_bug,4.287471,11962500.0,0.0,187.846088,3.69846,174,172


(     model car_type fuel_type  car_rating  year_to_start  riders  \
 0        8        1         1        3.78           2015   76163   
 1       23        1         1        3.90           2015   78218   
 2       16        3         1        6.30           2012   23340   
 3       12        0         1        4.04           2011    1263   
 4       16        3         1        4.70           2012   26428   
 ...    ...      ...       ...         ...            ...     ...   
 2332    19        1         1        4.38           2017  121239   
 2333     1        2         1        4.30           2016  107793   
 2334     6        1         1        3.88           2015   80234   
 2335    16        3         1        4.50           2014   60048   
 2336    22        1         1        3.94           2015   92312   
 
       year_to_work  mean_rating  distance_sum  rating_min   speed_max  \
 0             2021     4.737759  1.214131e+07        0.10  180.855726   
 1             2021   