In [1]:
!pip install ydata_profiling prettytable imblearn catboost vininfo -q

In [2]:
import pandas as pd
import numpy as np
import warnings
import time

from vininfo import Vin

from dateutil.parser import parse

from prettytable import PrettyTable

from ydata_profiling import ProfileReport

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector

from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline

from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb


warnings.filterwarnings('ignore')


RANDOM_STATE = 54321

In [3]:
try:
    data = pd.read_csv('train.csv')
    data_test = pd.read_csv('test.csv')
except:
    print('Exeption')

In [4]:
data.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,sellingprice,saledate
0,2011,Ford,Edge,SEL,suv,automatic,2fmdk3jc4bba41556,md,4.2,111041.0,black,black,santander consumer,12500,Tue Jun 02 2015 02:30:00 GMT-0700 (PDT)
1,2014,Ford,Fusion,SE,Sedan,automatic,3fa6p0h75er208976,mo,3.5,31034.0,black,black,ars/avis budget group,14500,Wed Feb 25 2015 02:00:00 GMT-0800 (PST)
2,2012,Nissan,Sentra,2.0 SL,sedan,automatic,3n1ab6ap4cl698412,nj,2.2,35619.0,black,black,nissan-infiniti lt,9100,Wed Jun 10 2015 02:30:00 GMT-0700 (PDT)
3,2003,HUMMER,H2,Base,suv,automatic,5grgn23u93h101360,tx,2.8,131301.0,gold,beige,wichita falls ford lin inc,13300,Wed Jun 17 2015 03:00:00 GMT-0700 (PDT)
4,2007,Ford,Fusion,SEL,Sedan,automatic,3fahp08z17r268380,md,2.0,127709.0,black,black,purple heart,1300,Tue Feb 03 2015 04:00:00 GMT-0800 (PST)


In [None]:
# profile = ProfileReport(data, title="Profiling Report", html={'style':{'full_width':True}})
# profile.to_file("report_auto_raw.html")

In [None]:
# profile

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440236 entries, 0 to 440235
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          440236 non-null  int64  
 1   make          432193 non-null  object 
 2   model         432113 non-null  object 
 3   trim          431899 non-null  object 
 4   body          429843 non-null  object 
 5   transmission  388775 non-null  object 
 6   vin           440236 non-null  object 
 7   state         440236 non-null  object 
 8   condition     430831 non-null  float64
 9   odometer      440167 non-null  float64
 10  color         439650 non-null  object 
 11  interior      439650 non-null  object 
 12  seller        440236 non-null  object 
 13  sellingprice  440236 non-null  int64  
 14  saledate      440236 non-null  object 
dtypes: float64(2), int64(2), object(11)
memory usage: 50.4+ MB


In [6]:
data['state'].unique()

array(['md', 'mo', 'nj', 'tx', 'mi', 'nc', 'fl', 'oh', 'ca', 'az', 'ga',
       'tn', 'in', 'pa', 'va', 'wi', 'co', 'ny', 'il', 'nv', 'mn', 'wa',
       'sc', 'la', 'ne', 'ok', 'or', 'on', 'ma', 'ab', 'pr', 'hi', 'ut',
       'qc', 'ms', 'nm', 'ns', 'al'], dtype=object)

In [7]:
# выделим целевой признак
target = data['sellingprice']
# создадим признаки для модели
features = data.drop('sellingprice', axis=1)

features_train, features_test, target_train, target_test = train_test_split(features,
                                                                              target,
                                                                              test_size=0.25,
                                                                              random_state=RANDOM_STATE)

In [8]:
# подготовим кросс-валидацию на
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [9]:
data.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,sellingprice,saledate
0,2011,Ford,Edge,SEL,suv,automatic,2fmdk3jc4bba41556,md,4.2,111041.0,black,black,santander consumer,12500,Tue Jun 02 2015 02:30:00 GMT-0700 (PDT)
1,2014,Ford,Fusion,SE,Sedan,automatic,3fa6p0h75er208976,mo,3.5,31034.0,black,black,ars/avis budget group,14500,Wed Feb 25 2015 02:00:00 GMT-0800 (PST)
2,2012,Nissan,Sentra,2.0 SL,sedan,automatic,3n1ab6ap4cl698412,nj,2.2,35619.0,black,black,nissan-infiniti lt,9100,Wed Jun 10 2015 02:30:00 GMT-0700 (PDT)
3,2003,HUMMER,H2,Base,suv,automatic,5grgn23u93h101360,tx,2.8,131301.0,gold,beige,wichita falls ford lin inc,13300,Wed Jun 17 2015 03:00:00 GMT-0700 (PDT)
4,2007,Ford,Fusion,SEL,Sedan,automatic,3fahp08z17r268380,md,2.0,127709.0,black,black,purple heart,1300,Tue Feb 03 2015 04:00:00 GMT-0800 (PST)


In [10]:
def parse_vin(vin):
    try:
        parsed_vin = Vin(vin)
        return {
            'vin': vin,
            'make': parsed_vin.country,
            'country': parsed_vin.country,
            'manufacturer': parsed_vin.manufacturer,
        }
    except ValueError as e:
        return {'error': str(e)}

def data_preprocessing(df):
    
    df['saledate'] = pd.to_datetime(df['saledate'], utc=True, errors='coerce')

    # df['saledate'] = df['saledate'].apply(lambda x: parse(x))
    # Извлекаем год и месяц
    df['old'] = df['saledate'].dt.year-df['year']
    df['month'] = df['saledate'].dt.month

    # Определяем сезон на основе месяца
    def get_season(month):
        if 3 <= month <= 5:
            return 'Spring'
        elif 6 <= month <= 8:
            return 'Summer'
        elif 9 <= month <= 11:
            return 'Autumn'
        else:
            return 'Winter'

    df['season'] = df['month'].apply(get_season)
    
        
        
    
    # Preprocessing for the "body" column
    df['body'] = df['body'].str.lower()  # Convert to lowercase to handle case variations
    df['body'] = df['body'].fillna('unknown')  # Fill NaN values with 'unknown' or an appropriate placeholder
    
    # Standardize common body types (you can add more mappings as needed)
    body_mappings = {
        'suv': 'suv',
        'sedan': 'sedan',
        'wagon': 'wagon',
        'minivan': 'minivan',
        'hatchback': 'hatchback',
        'convertible': 'convertible',
        'coupe': 'coupe',
        'cab': 'cab',
        'nan': 'unknown'
    }
    
    df['body'] = df['body'].replace(body_mappings)
    
    # Preprocessing for the "make" column
    df['make'] = df['make'].str.lower()  # Convert to lowercase to handle case variations
    df['make'] = df['make'].fillna('unknown')  # Fill NaN values with 'unknown' or an appropriate placeholder
    
    # Standardize common makes (you can add more mappings as needed)
    make_mappings = {
        'ford': 'ford',
        'chevrolet': 'chevrolet',
        'toyota': 'toyota',
        'honda': 'honda',
        'nissan': 'nissan',
        'unknown': 'unknown'  # You can add more mappings here
    }
    
    df['make'] = df['make'].replace(make_mappings)
    
    # Preprocessing for the "model" column
    df['model'] = df['model'].str.lower()  # Convert to lowercase to handle case variations
    df['model'] = df['model'].fillna('unknown')  # Fill NaN values with 'unknown' or an appropriate placeholder
    
    # Standardize common models (you can add more mappings as needed)
    model_mappings = {
        'accord': 'accord',
        'civic': 'civic',
        'camry': 'camry',
        'corolla': 'corolla',
        'altima': 'altima',
        'unknown': 'unknown'  # You can add more mappings here
    }
    
    df['model'] = df['model'].replace(model_mappings)
    
    
    df['vin_country'] = df['vin'].apply(lambda x: parse_vin(x).get('country', 'unknown'))
    df['vin_manufacturer'] = df['vin'].apply(lambda x: parse_vin(x).get('manufacturer', 'unknown'))
    
    # Drop the original date and VIN columns
    df = df.drop(columns=['saledate', 'seller'])
    
    return df


In [11]:
features_train = data_preprocessing(features_train)

In [12]:
features_train.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,old,month,season,vin_country,vin_manufacturer
341924,2013,chevrolet,equinox,LTZ,suv,,2gnflgek2d6322177,pa,4.2,57789.0,white,black,2,5,Spring,Canada,Chevrolet Canada
144910,2014,chrysler,town and country,Touring,minivan,automatic,2c4rc1bg5er324766,fl,4.0,21318.0,gold,black,1,1,Winter,Canada,Chrysler Canada
45871,2013,chevrolet,sonic,LT,sedan,automatic,1g1jc5sh8d4160162,nv,3.3,41923.0,white,black,2,2,Winter,United States,Chevrolet
17060,2013,chrysler,200,Touring,sedan,automatic,1c3ccbbb8dn672268,fl,4.2,37035.0,silver,tan,2,1,Winter,United States,Chrysler
362995,2006,dodge,durango,SLT,suv,automatic,1d4hb48n36f166736,mo,2.5,85421.0,blue,gray,9,1,Winter,United States,Dodge


In [13]:
# Modify the pipeline steps to include the custom preprocessing function
step_FPP = ('FPP', ColumnTransformer([
    ('num', make_pipeline(StandardScaler()), make_column_selector(dtype_include=['number'])),
    ('cat', make_pipeline(OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
     make_column_selector(dtype_exclude=['number']))
]))

step_FPP_RS = ('FPPRS', ColumnTransformer([
    ('num', make_pipeline(RobustScaler(with_centering=True,
                                       with_scaling=True,
                                       quantile_range=(15.0, 85.0),
                                       copy=True, unit_variance=False)),
     make_column_selector(dtype_include=['number']))
]))

In [14]:
# Define a separate step for data preprocessing
# step_data_preprocessing = ('DataPreprocessing', data_preprocessing)

# Define the steps for the models
step_CB = ('CB', CatBoostRegressor(random_state=RANDOM_STATE))
step_LGBM = ('LGBM', lgb.LGBMRegressor(num_boost_round=2000, learning_rate=0.01, random_state=RANDOM_STATE))

# Combine data preprocessing with the model-specific steps for CB and LGBM
pipeline_CB = Pipeline(steps=[step_FPP, step_CB])
pipeline_LGBM = Pipeline(steps=[step_FPP, step_LGBM])

# Define the steps for the models with RobustScaler
step_CB_RS = ('CB', CatBoostRegressor(random_state=RANDOM_STATE))
step_LGBM_RS = ('LGBM', lgb.LGBMRegressor(num_boost_round=2000, learning_rate=0.01, random_state=RANDOM_STATE))

# Combine data preprocessing with RobustScaler and the model-specific steps for CB and LGBM
pipeline_CB_RS = Pipeline(steps=[step_FPP_RS, step_CB_RS])
pipeline_LGBM_RS = Pipeline(steps=[step_FPP_RS, step_LGBM_RS])


In [15]:
# The rest of your code remains the same
params_CB = {
    'CB__iterations': [100, 300, 500, 700, 1000],
    'CB__learning_rate': [0.001, 0.05, 0.1, 0.2],
    'CB__depth': [3, 5, 7, 9, 12]
}

params_LGBM = {
    'LGBM__n_estimators': [100, 500, 1000],
    'LGBM__learning_rate': [0.05, 0.1, 0.2],
    'LGBM__max_depth': [3, 5, 7]
}


# Define the RandomizedSearchCV object for each pipeline with MAPE scoring
# Define the RandomizedSearchCV object for each pipeline with MAPE scoring
rs_CB = RandomizedSearchCV(pipeline_CB,
                           param_distributions=params_CB,
                           scoring='neg_mean_absolute_percentage_error',  # Change scoring metric to MAPE
                           n_jobs=-1,
                           cv=kf,
                           verbose=1,
                           random_state=RANDOM_STATE)

rs_LGBM = RandomizedSearchCV(pipeline_LGBM,
                             param_distributions=params_LGBM,
                             scoring='neg_mean_absolute_percentage_error',  # Change scoring metric to MAPE
                             n_jobs=-1,
                             cv=kf,
                             verbose=1,
                             random_state=RANDOM_STATE)

rs_CB_RS = RandomizedSearchCV(pipeline_CB_RS,
                           param_distributions=params_CB,
                           scoring='neg_mean_absolute_percentage_error',  # Change scoring metric to MAPE
                           n_jobs=-1,
                           cv=kf,
                           verbose=1,
                           random_state=RANDOM_STATE)

rs_LGBM_RS = RandomizedSearchCV(pipeline_LGBM_RS,
                             param_distributions=params_LGBM,
                             scoring='neg_mean_absolute_percentage_error',  # Change scoring metric to MAPE
                             n_jobs=-1,
                             cv=kf,
                             verbose=1,
                             random_state=RANDOM_STATE)

In [16]:
%%time
start_time = time.time()
rs_CB.fit(features_train, target_train)
end_time = time.time()
print(f"Training time for pipeline: {end_time - start_time:.4f} seconds")
print('CatBoostRegressor')
print('Best hyperparameters:', rs_CB.best_params_)
print('MAPE:', -rs_CB.best_score_)
print('CV Results:', rs_CB.cv_results_)
print()

Fitting 5 folds for each of 10 candidates, totalling 50 fits
0:	learn: 8987.8776657	total: 93.1ms	remaining: 1m 33s
1:	learn: 8388.6259594	total: 155ms	remaining: 1m 17s
2:	learn: 7921.1794732	total: 183ms	remaining: 1m
3:	learn: 7582.8480888	total: 221ms	remaining: 55s
4:	learn: 7265.7145857	total: 253ms	remaining: 50.4s
5:	learn: 7035.4746898	total: 307ms	remaining: 50.8s
6:	learn: 6866.0028684	total: 325ms	remaining: 46s
7:	learn: 6722.0459368	total: 340ms	remaining: 42.1s
8:	learn: 6587.9196539	total: 360ms	remaining: 39.6s
9:	learn: 6483.5074248	total: 396ms	remaining: 39.2s
10:	learn: 6397.7134511	total: 441ms	remaining: 39.7s
11:	learn: 6325.1835135	total: 486ms	remaining: 40s
12:	learn: 6260.0513346	total: 531ms	remaining: 40.3s
13:	learn: 6200.3855276	total: 570ms	remaining: 40.1s
14:	learn: 6151.5116280	total: 606ms	remaining: 39.8s
15:	learn: 6100.8265721	total: 650ms	remaining: 40s
16:	learn: 6040.6686508	total: 700ms	remaining: 40.5s
17:	learn: 6004.2297196	total: 733ms	re

In [None]:
%%time
start_time = time.time()
rs_LGBM.fit(features_train, target_train)
end_time = time.time()
print(f"Training time for pipeline: {end_time - start_time:.4f} seconds")
print('LGBMRegressor')
print('Best hyperparameters:', rs_LGBM.best_params_)
print('MAPE:', -rs_LGBM.best_score_)
print()

In [None]:
%%time
start_time = time.time()
rs_CB_RS.fit(features_train, target_train)
end_time = time.time()
print(f"Training time for pipeline: {end_time - start_time:.4f} seconds")
print('LGBMRegressor')
print('Best hyperparameters:', rs_CB_RS.best_params_)
print('MAPE:', -rs_CB_RS.best_score_)
print()

In [None]:
%%time
start_time = time.time()
rs_LGBM_RS.fit(features_train, target_train)
end_time = time.time()
print(f"Training time for pipeline: {end_time - start_time:.4f} seconds")
print('LGBMRegressor')
print('Best hyperparameters:', rs_LGBM_RS.best_params_)
print('MAPE:', -rs_LGBM_RS.best_score_)
print()

In [17]:
table = PrettyTable(['Model', 'RMSE on CV', 'Fit time on CV', 'Prediction time on CV'])
# table.add_row(['LGBMRegressor', 
#                f'{-rs_LGBM.best_score_:.4f}',
#                f'{rs_LGBM.cv_results_["mean_fit_time"].min():.4f}',
#                f'{rs_LGBM.cv_results_["mean_score_time"].min():.4f}'])

table.add_row(['CatBoost',
               f'{-rs_CB.best_score_:.4f}',
               f'{rs_CB.cv_results_["mean_fit_time"].min():.4f}',
               f'{rs_CB.cv_results_["mean_score_time"].min():.4f}'])

# table.add_row(['LGBMRegressor + RS',
#                f'{-rs_LGBM_RS.best_score_:.4f}',
#                f'{rs_LGBM_RS.cv_results_["mean_fit_time"].min():.4f}',
#                f'{rs_LGBM_RS.cv_results_["mean_score_time"].min():.4f}'])

# table.add_row(['CatBoost + RS',
#                f'{-rs_CB_RS.best_score_:.4f}',
#                f'{rs_CB_RS.cv_results_["mean_fit_time"].min():.4f}',
#                f'{rs_CB_RS.cv_results_["mean_score_time"].min():.4f}'])

print(table)

+----------+------------+----------------+-----------------------+
|  Model   | RMSE on CV | Fit time on CV | Prediction time on CV |
+----------+------------+----------------+-----------------------+
| CatBoost |   0.2658   |    97.7246     |         1.9103        |
+----------+------------+----------------+-----------------------+


In [18]:
data_test.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,saledate
0,2005,Cadillac,CTS,Base,Sedan,automatic,1g6dp567450124779,ca,2.7,116970.0,silver,black,lexus of stevens creek,Wed Jan 14 2015 04:30:00 GMT-0800 (PST)
1,2014,GMC,Savana Cargo,2500,Van,,1gtw7fca7e1902207,pa,4.4,6286.0,white,gray,u-haul,Fri Feb 27 2015 01:00:00 GMT-0800 (PST)
2,2013,Nissan,Murano,S,SUV,automatic,jn8az1mw6dw303497,oh,4.6,11831.0,gray,black,nissan-infiniti lt,Tue Feb 24 2015 01:30:00 GMT-0800 (PST)
3,2013,Chevrolet,Impala,LS Fleet,Sedan,automatic,2g1wf5e34d1160703,fl,2.3,57105.0,silver,black,onemain rem/auto club of miami inc dba north dad,Fri Mar 06 2015 02:00:00 GMT-0800 (PST)
4,2013,Nissan,Titan,SV,Crew Cab,automatic,1n6aa0ec3dn301209,tn,2.9,31083.0,black,black,nissan north america inc.,Wed Jun 03 2015 03:30:00 GMT-0700 (PDT)


In [19]:
data_test = data_preprocessing(data_test)

In [20]:
data_test.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,old,month,season,vin_country,vin_manufacturer
0,2005,cadillac,cts,Base,sedan,automatic,1g6dp567450124779,ca,2.7,116970.0,silver,black,10,1,Winter,United States,Cadillac
1,2014,gmc,savana cargo,2500,van,,1gtw7fca7e1902207,pa,4.4,6286.0,white,gray,1,2,Winter,United States,GMC Truck
2,2013,nissan,murano,S,suv,automatic,jn8az1mw6dw303497,oh,4.6,11831.0,gray,black,2,2,Winter,Japan,Nissan
3,2013,chevrolet,impala,LS Fleet,sedan,automatic,2g1wf5e34d1160703,fl,2.3,57105.0,silver,black,2,3,Spring,Canada,Chevrolet
4,2013,nissan,titan,SV,crew cab,automatic,1n6aa0ec3dn301209,tn,2.9,31083.0,black,black,2,6,Summer,United States,Nissan


In [34]:
features_test = data_test
features_train = data_preprocessing(data.drop('sellingprice', axis=1))
target_train = data['sellingprice']

In [36]:
best_LGBM = rs_CB.best_estimator_.fit(features_train, target_train)

start_time = time.time()
pred_LGBM = best_LGBM.predict(features_test)
end_time = time.time()
pred_time_LGBM = end_time - start_time

# rmse_LGBM = np.sqrt(mean_squared_error(target_test, pred_LGBM))

0:	learn: 9453.5069802	total: 38.9ms	remaining: 38.8s
1:	learn: 9171.9498900	total: 72.6ms	remaining: 36.2s
2:	learn: 8906.1118127	total: 103ms	remaining: 34.4s
3:	learn: 8656.5125796	total: 136ms	remaining: 33.8s
4:	learn: 8423.4478760	total: 170ms	remaining: 33.8s
5:	learn: 8204.9646386	total: 202ms	remaining: 33.4s
6:	learn: 7993.8437496	total: 233ms	remaining: 33.1s
7:	learn: 7803.8908679	total: 267ms	remaining: 33.1s
8:	learn: 7626.7554385	total: 298ms	remaining: 32.8s
9:	learn: 7459.3270204	total: 329ms	remaining: 32.6s
10:	learn: 7285.9546809	total: 360ms	remaining: 32.4s
11:	learn: 7135.2377715	total: 391ms	remaining: 32.2s
12:	learn: 6997.8189106	total: 423ms	remaining: 32.1s
13:	learn: 6855.3227519	total: 454ms	remaining: 32s
14:	learn: 6735.2734823	total: 485ms	remaining: 31.8s
15:	learn: 6616.6122217	total: 515ms	remaining: 31.7s
16:	learn: 6505.2836886	total: 545ms	remaining: 31.5s
17:	learn: 6400.7261914	total: 578ms	remaining: 31.5s
18:	learn: 6309.5208949	total: 607ms	r

In [37]:
# print('LGBMRegressor')
# print('RMSE on test set:', rmse_LGBM)
# print()


# print('LGBMRegressor')
# print(f'Prediction time on test set: {pred_time_LGBM:.4f} seconds')
# print()

# table = PrettyTable(['Model', 'RMSE on test set', 'Prediction time on test set'])
# table.add_row(['LGBMRegressor', f'{rmse_LGBM:.4f}', f'{pred_time_LGBM:.4f}'])

# print(table)

In [38]:
predictions_test = pred_LGBM

predictions_test

array([ 3559.38661166, 19782.08404794, 17192.75578301, ...,
        2965.08490085, 17390.40654951, 14952.53480516])

In [39]:
pred_caggle = data_test
pred_caggle['sellingprice'] = predictions_test
pred_caggle = pred_caggle[['vin','sellingprice']]
pred_caggle.to_csv('pred_kaggle_v4.csv',index=False)

In [40]:
pred_caggle


Unnamed: 0,vin,sellingprice
0,1g6dp567450124779,3559.386612
1,1gtw7fca7e1902207,19782.084048
2,jn8az1mw6dw303497,17192.755783
3,2g1wf5e34d1160703,9513.382047
4,1n6aa0ec3dn301209,25994.365794
...,...,...
110053,1j4aa2d17bl584330,20281.387414
110054,jthbf1d23e5007526,22049.940321
110055,1n4ba41e54c831950,2965.084901
110056,jn1az4eh3dm382431,17390.406550
