In [95]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import re
import warnings

from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold, KFold, train_test_split
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE , BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from vecstack import stacking

In [96]:
import import_ipynb
from p3_utils import *

In [97]:
train_df = pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/train.csv")

In [98]:
train_df

Unnamed: 0,id,Nombre,Ciudad,Año,Kilometros,Combustible,Tipo_marchas,Mano,Consumo,Motor_CC,Potencia,Asientos,Descuento,Precio_cat
0,1.0,Maruti Swift Dzire ZDI,G,2012.0,83000.0,Diesel,Manual,First,23.4 kmpl,1248 CC,74 bhp,5.0,,3
1,2.0,Maruti Wagon R LXI Optional,I,2016.0,4800.0,Petrol,Manual,First,20.51 kmpl,998 CC,67.04 bhp,5.0,,3
2,3.0,Mahindra KUV 100 mFALCON D75 K8,F,2016.0,26000.0,Diesel,Manual,First,25.32 kmpl,1198 CC,77 bhp,6.0,,3
3,4.0,Hyundai i20 1.2 Magna,E,2013.0,56127.0,Petrol,Manual,First,18.5 kmpl,1197 CC,80 bhp,5.0,,3
4,5.0,Honda Jazz 1.2 SV i VTEC,H,2017.0,41981.0,Petrol,Manual,First,18.7 kmpl,1199 CC,88.7 bhp,5.0,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4814,4815.0,Maruti Swift VDI,E,2014.0,83274.0,Diesel,Manual,First,22.9 kmpl,1248 CC,74 bhp,5.0,,3
4815,,Porsche Panamera Diesel 250hp,H,2014.0,60033.0,Diesel,Automatic,First,18.18 kmpl,2967 CC,250 bhp,4.0,,5
4816,4817.0,Hyundai Creta 1.6 CRDi SX Option,C,2017.0,40158.0,Diesel,Manual,First,19.67 kmpl,,126.2 bhp,5.0,,4
4817,4818.0,Hyundai Xcent 1.2 VTVT S,E,2015.0,65743.0,Petrol,Manual,First,20.14 kmpl,1197 CC,81.86 bhp,5.0,7.85,3


In [99]:
train_df.shape

(4819, 14)

In [100]:
train_df_c = train_df.copy()

In [101]:
train_df_c.isnull().sum()

id                72
Nombre            72
Ciudad            72
Año               72
Kilometros        72
Combustible       72
Tipo_marchas      72
Mano              72
Consumo           73
Motor_CC         101
Potencia         175
Asientos         106
Descuento       4160
Precio_cat         0
dtype: int64

In [102]:
#drop id columns and descuento (>75% NaN)
train_df_c = train_df_c.drop(columns=["id","Descuento"])

In [103]:
for column in train_df_c.columns:
    train_df_c[column].fillna(train_df_c[column].mode()[0], inplace=True)
train_df_c.isnull().sum()  

Nombre          0
Ciudad          0
Año             0
Kilometros      0
Combustible     0
Tipo_marchas    0
Mano            0
Consumo         0
Motor_CC        0
Potencia        0
Asientos        0
Precio_cat      0
dtype: int64

In [104]:
train_df_encoded = train_df_c.copy()

In [105]:
#ENCODE DATA using LABEL ENCODER
labelNombre = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/nombre.csv").Nombre)
train_df_encoded.Nombre = labelNombre.transform(train_df_encoded.Nombre)

labelCiudad = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/ciudad.csv").Ciudad)
train_df_encoded.Ciudad = labelCiudad.transform(train_df_encoded.Ciudad)

labelAño = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/ao.csv").Año)
train_df_encoded.Año = labelAño.transform(train_df_encoded.Año)

labelKilometros = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/kilometros.csv").Kilometros)
train_df_encoded.Kilometros = labelKilometros.transform(train_df_encoded.Kilometros)

labelCombustible = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/combustible.csv").Combustible)
train_df_encoded.Combustible = labelCombustible.transform(train_df_encoded.Combustible)

labelMano = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/mano.csv").Mano)
train_df_encoded.Mano = labelMano.transform(train_df_encoded.Mano)

labelConsumo = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/consumo.csv").Consumo)
train_df_encoded.Consumo= labelConsumo.transform(train_df_encoded.Consumo)

labelMotor_CC = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/motor_cc.csv").Motor_CC)
train_df_encoded.Motor_CC = labelMotor_CC.transform(train_df_encoded.Motor_CC)

labelPotencia = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/potencia.csv").Potencia)
train_df_encoded.Potencia = labelPotencia.transform(train_df_encoded.Potencia)

labelAsientos = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/asientos.csv").Asientos)
train_df_encoded.Asientos = labelAsientos.transform(train_df_encoded.Asientos)


In [106]:
labelTipo = LabelEncoder().fit(train_df_encoded.Tipo_marchas)
train_df_encoded.Tipo_marchas = labelTipo.transform(train_df_encoded.Tipo_marchas)


In [107]:
train_df_encoded

Unnamed: 0,Nombre,Ciudad,Año,Kilometros,Combustible,Tipo_marchas,Mano,Consumo,Motor_CC,Potencia,Asientos,Precio_cat
0,1143,5,14,2623,1,1,0,353,14,299,3,3
1,1204,7,18,43,4,1,0,290,144,279,3,3
2,846,4,18,545,1,1,0,377,11,308,4,3
3,714,3,15,1763,4,1,0,235,10,312,3,3
4,472,6,19,1171,4,1,0,242,12,344,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...
4814,1159,3,16,2630,1,1,0,343,14,299,3,3
4815,1427,6,16,1923,1,0,0,224,110,188,2,5
4816,512,1,19,1099,1,1,0,268,10,56,3,4
4817,685,3,17,2163,4,1,0,280,10,317,3,3


In [108]:
X = train_df_encoded.drop(columns=["Precio_cat"]) 


In [109]:
X.shape

(4819, 11)

## Feature Engineering

In [110]:
# best results with all features besides Kilometros & Mano
X_noK_noM = train_df_encoded.drop(columns=["Precio_cat", "Kilometros", "Mano"])

#other combinations based on feature selection results
X_just_imp = train_df_encoded.drop(columns=["Precio_cat", "Kilometros", "Nombre", "Ciudad"
                                            , "Mano", "Consumo"])
X_noK_noCon = train_df_encoded.drop(columns=["Precio_cat", "Kilometros", "Consumo"])
X_noK_noN = train_df_encoded.drop(columns=["Precio_cat", "Kilometros", "Nombre"])
X_noK_noM_noCon = train_df_encoded.drop(columns=["Precio_cat", "Kilometros", "Consumo", "Mano"])

In [111]:
#create target dataframe with the column we want to predict
y = train_df_encoded["Precio_cat"].values
y

array([3, 3, 3, ..., 4, 3, 3])

In [112]:
Counter(y)

Counter({3: 2211, 2: 602, 4: 978, 5: 759, 1: 269})

In [113]:
test_df = pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/test.csv")

In [114]:
test_df

Unnamed: 0,id,Nombre,Ciudad,Año,Kilometros,Combustible,Tipo_marchas,Mano,Consumo,Motor_CC,Potencia,Asientos,Descuento
0,4820,Mercedes-Benz New C-Class C 220 CDI Avantgarde,H,2015,77771,Diesel,Automatic,First,19.27 kmpl,2143 CC,170 bhp,5.0,
1,4821,Hyundai i10 Magna 1.1,J,2011,62000,Petrol,Manual,First,19.81 kmpl,1086 CC,68.05 bhp,5.0,
2,4822,Ford Figo Diesel EXI,C,2010,129986,Diesel,Manual,Second,20.0 kmpl,1399 CC,68 bhp,5.0,
3,4823,Maruti Swift Dzire VXI Optional,D,2015,49900,Petrol,Manual,First,20.85 kmpl,1197 CC,83.14 bhp,5.0,
4,4824,Maruti Ritz VXI,J,2010,75319,Petrol,Manual,Second,21.1 kmpl,1197 CC,85.80 bhp,5.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1154,5974,Chevrolet Beat Diesel LT,G,2012,88000,Diesel,Manual,First,25.44 kmpl,936 CC,57.6 bhp,5.0,
1155,5975,Hyundai Verna CRDi,K,2010,72010,Diesel,Manual,First,16.8 kmpl,1493 CC,110 bhp,5.0,
1156,5976,Mercedes-Benz A Class A180 CDI,J,2013,34000,Diesel,Automatic,Second,20.0 kmpl,2143 CC,107.3 bhp,5.0,
1157,5977,BMW X1 M Sport sDrive 20d,B,2016,31000,Diesel,Automatic,First,17.05 kmpl,1995 CC,190 bhp,5.0,50.97


In [115]:
test_df_encoded = test_df.copy()

In [116]:
labelNombre = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/nombre.csv").Nombre)
test_df_encoded.Nombre = labelNombre.transform(test_df_encoded.Nombre)

labelCiudad = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/ciudad.csv").Ciudad)
test_df_encoded.Ciudad = labelCiudad.transform(test_df_encoded.Ciudad)

labelAño = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/ao.csv").Año)
test_df_encoded.Año = labelAño.transform(test_df_encoded.Año)

labelKilometros = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/kilometros.csv").Kilometros)
test_df_encoded.Kilometros = labelKilometros.transform(test_df_encoded.Kilometros)

labelCombustible = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/combustible.csv").Combustible)
test_df_encoded.Combustible = labelCombustible.transform(test_df_encoded.Combustible)

labelMano = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/mano.csv").Mano)
test_df_encoded.Mano = labelMano.transform(test_df_encoded.Mano)

labelConsumo = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/consumo.csv").Consumo)
test_df_encoded.Consumo= labelConsumo.transform(test_df_encoded.Consumo)

labelMotor_CC = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/motor_cc.csv").Motor_CC)
test_df_encoded.Motor_CC = labelMotor_CC.transform(test_df_encoded.Motor_CC)

labelPotencia = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/potencia.csv").Potencia)
test_df_encoded.Potencia = labelPotencia.transform(test_df_encoded.Potencia)

labelAsientos = LabelEncoder().fit(pd.read_csv("/Users/mikhail/Downloads/ugrin2020-vehiculo-usado-multiclase/asientos.csv").Asientos)
test_df_encoded.Asientos = labelAsientos.transform(test_df_encoded.Asientos)


In [117]:
labelTipo = LabelEncoder().fit(train_df_c.Tipo_marchas)
test_df_encoded.Tipo_marchas = labelTipo.transform(test_df_encoded.Tipo_marchas)


In [118]:
test_df_encoded

Unnamed: 0,id,Nombre,Ciudad,Año,Kilometros,Combustible,Tipo_marchas,Mano,Consumo,Motor_CC,Potencia,Asientos,Descuento
0,4820,1318,6,17,2512,1,0,0,257,70,119,3,
1,4821,695,8,13,1999,4,1,0,274,2,284,3,
2,4822,326,1,12,2963,1,1,2,279,26,283,3,
3,4823,1139,2,17,1502,4,1,0,301,10,326,3,
4,4824,1093,8,12,2452,4,1,2,308,10,334,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1154,5974,167,5,14,2709,1,1,0,379,140,259,3,
1155,5975,645,9,12,2365,1,1,0,177,33,27,3,
1156,5976,1235,8,15,848,1,0,2,279,70,20,3,
1157,5977,130,0,18,719,1,0,0,186,64,150,3,50.97


In [119]:
test_df_encoded.drop(columns=["id", "Descuento", "Kilometros", "Mano"], inplace = True)

In [120]:
test_df_encoded

Unnamed: 0,Nombre,Ciudad,Año,Combustible,Tipo_marchas,Consumo,Motor_CC,Potencia,Asientos
0,1318,6,17,1,0,257,70,119,3
1,695,8,13,4,1,274,2,284,3
2,326,1,12,1,1,279,26,283,3
3,1139,2,17,4,1,301,10,326,3
4,1093,8,12,4,1,308,10,334,3
...,...,...,...,...,...,...,...,...,...
1154,167,5,14,1,1,379,140,259,3
1155,645,9,12,1,1,177,33,27,3
1156,1235,8,15,1,0,279,70,20,3
1157,130,0,18,1,0,186,64,150,3


In [121]:
#OVERSAMLING MINORITY CLASS
samples = {1: 400, 3: 2211, 2: 602, 4: 978, 5: 759} #increase minority samples from 269 to 400 (almost doubling amount of instances)
over = BorderlineSMOTE(sampling_strategy=samples,n_jobs=-1, random_state = 42, k_neighbors=3)
X_noK_noM_o,y_o = over.fit_resample(X_noK_noM,y)

In [122]:
#TRAINING MODEL
model = XGBClassifier(objective="multi:softmax", scale_pos_weight=1,
                    learning_rate=0.1, colsample_bytree = 0.8,
                      subsample = 0.8, n_estimators=400, 
                     reg_alpha = 0.3, max_depth=4, gamma=0.6)
model.fit(X_noK_noM_o,y_o)



Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.6, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=400, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0.3,
              reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [123]:
#PREDICT ON UNSEESN DATA (TEST SET)
print(model)
predictions = model.predict(test_df_encoded)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.6, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=400, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0.3,
              reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [124]:
#Store predicitions in .csv file
data = pd.DataFrame()
data["id"] = test_df.id.values
data["Precio_cat"] = predictions
#prediction = data.to_csv('xgboost_tuned_fs_bov_2.csv', index=False)

In [125]:
#CROSS VALIDATION
xgb = XGBClassifier(objective="multi:softmax",scale_pos_weight=1,learning_rate=0.1, 
                    colsample_bytree = 0.8,subsample = 0.8, n_estimators=400, 
                     reg_alpha = 0.3, max_depth=4, gamma=0.6)

samples = {1: 400, 3: 2211, 2: 602, 4: 978, 5: 759}
samples_under = {1: 400, 3: 2000, 2: 602, 4: 978, 5: 759}

over = BorderlineSMOTE(sampling_strategy=samples, k_neighbors=3, 
                       n_jobs=-1, random_state = 42)
#under = RandomUnderSampler(sampling_strategy=samples_under)

steps = [('over', over), 
         #("under", under),
         ('model',xgb)]
pipeline = Pipeline(steps=steps)

# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline, X_noK_noM, y, scoring='accuracy', cv=cv, n_jobs=-1)
score = np.mean(scores)
print('Accuracy: %.5f' % score)

Accuracy: 0.83918


## Ensemble Classification using Stacking

In [132]:
#STACKING with vecstack
import lightgbm as lgbm
models = [    
    lgbm.LGBMClassifier(colsample_bytree= 0.7, learning_rate= 0.05, 
                        max_depth=4, n_estimators=350),
    RandomForestClassifier()]


In [133]:
X_train, X_test, y_train, y_test = train_test_split(X_noK_noM, y)

In [134]:
S_train, S_test = stacking(models, X_train, y_train, X_test,regression=False, 
                           mode='oof_pred_bag', needs_proba=False, save_dir=None, 
                           metric=accuracy_score, n_folds=5, stratified=True,
                           shuffle=True,random_state=0,verbose=2)

task:         [classification]
n_classes:    [5]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [2]

model  0:     [LGBMClassifier]
    fold  0:  [0.83125864]
    fold  1:  [0.83817427]
    fold  2:  [0.83955740]
    fold  3:  [0.81466113]
    fold  4:  [0.83518006]
    ----
    MEAN:     [0.83176630] + [0.00901401]
    FULL:     [0.83176536]

model  1:     [RandomForestClassifier]
    fold  0:  [0.80359613]
    fold  1:  [0.81466113]
    fold  2:  [0.81189488]
    fold  3:  [0.79806362]
    fold  4:  [0.81578947]
    ----
    MEAN:     [0.80880105] + [0.00685761]
    FULL:     [0.80879911]



In [137]:
_xgb = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.01, 
                      n_estimators=40, max_depth=3, silent=True, verbose=0)

_xgb = _xgb.fit(S_train, y_train)
y_pred = model.predict(S_test)
print('Final prediction score: [%.5f]' % accuracy_score(y_test, y_pred))


Parameters: { silent, verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Final prediction score: [0.83900]


In [136]:
parameters = {
    'max_depth': [3,4,5],
    'n_estimators': range(20, 50, 10),
    'learning_rate': [0.01,0.025,0.05,0.1]
}
XGB = XGBClassifier(objective="multi:softmax", random_state = 0, silent = True)

gsearch1 = GridSearchCV(XGB, 
param_grid = parameters, scoring='accuracy',n_jobs=-1, cv=5)
gsearch1.fit(S_train,y_train)

print(gsearch1.best_params_)
print(gsearch1.best_score_)



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 40}
0.8314831630287773
