In [1]:
import pandas as pd 
import re
import seaborn as sns 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.preprocessing import StandardScaler

In [22]:
# Load the training and test sets from the local Data directory.
data_train = pd.read_csv('./Data/train.csv')
data_test = pd.read_csv('./Data/test.csv')

# Keep the test identifiers aside (they are needed for the submission file)
# and remove them from the test features.
ids_test = data_test['Id']
data_test = data_test.drop(columns=['Id'])

# Work on a copy so that data_train keeps the frame exactly as loaded.
X = data_train.copy()

In [23]:
# Preprocessing


def _running_to_km(raw):
    """Return the leading number of a 'running' string, converted to km.

    The string is split on single spaces: token 0 is parsed as the amount and
    token 2 is compared with 'miles'. When it matches, the amount is
    multiplied by 1.60934; otherwise the amount is returned unchanged.
    """
    tokens = raw.split(' ')
    # The unit is looked up before the amount is parsed, so a malformed value
    # fails on the unit lookup first.
    # NOTE(review): the unit is read from index 2, not 1 - presumably the raw
    # values carry two spaces between amount and unit; confirm against data.
    is_miles = tokens[2] == 'miles'
    amount = float(tokens[0])
    return amount * 1.60934 if is_miles else amount


# Express every 'running' value in kilometres, in both train and test.
X['running'] = X['running'].apply(_running_to_km)
data_test['running'] = data_test['running'].apply(_running_to_km)

# Replace each status label with the median price of the training rows that
# carry that label.
# NOTE(review): the medians are computed on all of X, before any
# train/validation split, so held-out rows contribute to this encoding.
meds = {
    label: X.loc[X['status'] == label, 'price'].median()
    for label in X['status'].unique()
}

# Labels missing from `meds` (e.g. a status seen only in test) map to NaN.
X['status'] = X['status'].map(meds)
data_test['status'] = data_test['status'].map(meds)

In [24]:
# Separate the target (y) from the features (X).
# The right-hand side is evaluated in full before either name is rebound,
# so 'price' is read from X before it is dropped.
y, X = X['price'], X.drop(columns=['price'])

In [25]:
# Standardise the numeric columns.
# The scaler is fitted on the training features only and then applied, with
# the same statistics, to both the training and the test features.
# NOTE(review): the fit uses every row of X, before the train/validation
# split, so held-out rows contribute to the scaling statistics.
num_cols = X.select_dtypes(include=['number']).columns
scaler = StandardScaler()
scaler.fit(X[num_cols])

X[num_cols] = scaler.transform(X[num_cols])
data_test[num_cols] = scaler.transform(data_test[num_cols])



In [26]:
# One-hot encode the categorical variables.
# 'color' is dropped first, so it is not used as a feature.
X = pd.get_dummies(X.drop(columns=['color'])).astype(float)
data_test = pd.get_dummies(data_test.drop(columns=['color'])).astype(float)

# Give the test frame exactly the training columns, in the training order:
#   * dummy columns that only exist in the test set are discarded;
#   * columns that only exist in the training set are added to the test set
#     filled with 0.0 ("category absent") rather than NaN.
# fill_value only applies to the newly created columns; NaNs already present
# in existing columns are left untouched.
data_test = data_test.reindex(columns=X.columns, fill_value=0.0)

In [27]:
# Hold out 20% of the rows for validation; the seed is fixed so the split
# is the same on every run.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

In [28]:
# CatBoost with hyper-parameter tuning (experiment 2: fine tuning).
# Candidate values from which the randomised search samples combinations.
param_grid_gb = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [3, 5, 7],
}

# Randomised search over the grid above, scored by (negated) MAE with
# 10-fold cross-validation. n_iter is left at its default.
# random_state is fixed so that the same candidates are sampled on every run;
# without it the selected best_params_ can change between executions.
grid_gb = RandomizedSearchCV(
    estimator=CatBoostRegressor(loss_function='MAE', silent=True, random_state=0),
    param_distributions=param_grid_gb,
    scoring='neg_mean_absolute_error',
    n_jobs=4,
    verbose=True,
    cv=10,
    random_state=0,
)
grid_gb.fit(X_train_full, y_train)
print(grid_gb.best_params_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
{'learning_rate': 0.05, 'l2_leaf_reg': 5, 'iterations': 200, 'depth': 6}


In [29]:
# Evaluate the tuned model with the mean absolute error, first on the
# held-out split (printed as "Test Error:") and then on the training split.
# mae_1 is overwritten on each pass, so it ends up holding the training error.
for label, features, target in (
    ("Test Error:", X_valid_full, y_valid),
    ("Train Error:", X_train_full, y_train),
):
    mae_1 = mean_absolute_error(grid_gb.predict(features), target)
    print(label, mae_1)

Test Error: 1927.7258733164847
Train Error: 1642.4103081654873


In [30]:
# Predict prices for the processed test set and write the submission file:
# one row per test Id, without the DataFrame index.
preds = grid_gb.predict(data_test)
output = pd.DataFrame({'Id': ids_test}).assign(Predicted_Price=preds)
output.to_csv('submission.csv', index=False)