In [8]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer,r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, KFold
from xgboost import XGBRegressor, cv
from catboost import CatBoostRegressor
from scipy.stats import norm, skew

# Input locations, named once so they are easy to find and change.
TRAIN_CSV = '/Users/mariolamas/Desktop/Car-pred-Kaggle/Data/train.csv'
TEST_CSV = '/Users/mariolamas/Desktop/Car-pred-Kaggle/Data/test.csv'

# Training rows with exact duplicates removed (first occurrence kept) and a
# fresh 0..n-1 index; the test frame is loaded untouched.
df_train = pd.read_csv(TRAIN_CSV).drop_duplicates(ignore_index=True)
df_test = pd.read_csv(TEST_CSV)

def plot_price_distribution(prices, xlabel):
    """Plot a density histogram + KDE of ``prices`` with a fitted normal curve.

    Parameters
    ----------
    prices : pandas.Series
        Values to plot; the normal distribution is fitted to the same values.
    xlabel : str
        Label for the x axis.

    Returns
    -------
    tuple of float
        ``(mu, sigma)`` as returned by ``scipy.stats.norm.fit``.
    """
    mu, sigma = norm.fit(prices)
    grid = sns.displot(prices, kde=True, stat="density", height=6, aspect=2)
    ax = grid.ax

    # Draw the fitted normal pdf explicitly so that the legend entry refers to
    # an actual curve instead of being attached to whichever artist is first.
    xs = np.linspace(prices.min(), prices.max(), 200)
    ax.plot(
        xs,
        norm.pdf(xs, mu, sigma),
        color="black",
        linestyle="--",
        # Raw string: '\m' and '\s' are invalid escapes in a normal literal.
        label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma),
    )
    ax.set_xlabel(xlabel, fontsize=10)
    ax.set_ylabel("Density", fontsize=10)
    ax.legend()
    plt.show()
    return mu, sigma


# Distribution of the raw target.
(mu, sigma) = plot_price_distribution(df_train['price'], "Car's sale price in $")

# Model the target in log1p space. Plain column assignment replaces the column
# instead of writing floats in place into a possibly integer-typed column.
df_train['price'] = np.log1p(df_train['price'])

# Distribution of the transformed target.
(mu, sigma) = plot_price_distribution(df_train['price'], "log1p of car's sale price in $")

MILES_TO_KM = 1.60934   # factor applied to readings that are not expressed in km
REFERENCE_YEAR = 2024   # year against which a car's age is measured


def _running_to_km(reading):
    """Return the leading integer of ``reading`` expressed in kilometres.

    Readings containing ``'km'`` are returned as-is; any other reading is
    multiplied by ``MILES_TO_KM`` (presumably miles; confirm against the raw data).
    """
    amount = int(reading.split()[0])
    return amount if 'km' in reading else amount * MILES_TO_KM


# Stack train and test so both get identical feature engineering.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True, sort=False)

# Normalise the odometer reading to km, then compress its range with log1p.
# The column is replaced (not written in place), so an all-integer column
# cannot clash with the float result.
data['running'] = np.log1p(data['running'].map(_running_to_km).astype(float))
data = data.drop(['wheel', 'Id'], axis=1)

#data["running_per_motor"] = data["running"] / data["motor_volume"]

# A car from REFERENCE_YEAR would otherwise divide by zero and put inf/NaN
# into the feature; any age below one year is treated as one year.
car_age = (REFERENCE_YEAR - data['year']).clip(lower=1)
data['running_per_year'] = data['running'] / car_age

# Pairwise correlations between the numeric columns, shown as an annotated heatmap.
corr = data.select_dtypes('number').corr()
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corr, annot=True, fmt='0.3f', cmap='YlGnBu', cbar=False, ax=ax)
fig.tight_layout()
plt.show()

# One-hot encode the non-numeric columns on the combined frame so train and
# test end up with the same set of indicator columns.
data = pd.get_dummies(data, dtype=int).reset_index(drop=True)

# Rows with a price form the training set; rows without one form the test set.
has_price = data["price"].notna()
train = data[has_price]
test = data[~has_price]

# Feature matrices and target (the target is the log1p-transformed price).
X, y = train.drop(columns='price'), train['price']
X_test = test.drop(columns='price')

# Hyperparameters for the XGBoost regressor, grouped by role.
# (Values look like the output of a hyperparameter search; TODO confirm source.)
params = {
    # tree structure
    'booster': 'gbtree',
    'max_depth': 3,
    'max_leaves': 769,
    'min_child_weight': 13,
    # boosting schedule
    'n_estimators': 1171,
    'learning_rate': 0.04538451353216046,
    # row / column subsampling
    'subsample': 0.6578720167306904,
    'colsample_bytree': 0.9634723040072963,
    'colsample_bylevel': 0.7985625445322192,
    'colsample_bynode': 0.49814271378837316,
    # regularisation
    'reg_alpha': 0.4622943878867952,
    'reg_lambda': 0.6211309481623339,
    # objective and runtime
    'objective': 'reg:absoluteerror',
    'random_state': 42,
    'n_jobs': -1,
}

# Fit on the full training set; y is the log1p-transformed price.
xgb = XGBRegressor(**params)
xgb.fit(X, y)

# CatBoost settings for the second model of the blend.
parms = {
    'iterations': 200,
    'learning_rate': 0.05942399107771988,
    'depth': 8,
    'l2_leaf_reg': 7,
}

# MAE loss, silent training; fitted on the same features/target as XGBoost.
best_regressor = CatBoostRegressor(loss_function='MAE', silent=True, **parms)
best_regressor.fit(X, y)

# Test-set predictions, still in log1p space.
test_preds_1 = best_regressor.predict(X_test)

# Blend weights for the two models.
XGB_WEIGHT, CATBOOST_WEIGHT = 0.7, 0.3

# Both models predict log1p(price); expm1 maps each back before blending.
xgb_price = np.expm1(xgb.predict(X_test))
catboost_price = np.expm1(test_preds_1)
df_test['price'] = xgb_price * XGB_WEIGHT + catboost_price * CATBOOST_WEIGHT

# Write the Id / price pairs to the submission file.
submission = df_test.loc[:, ['Id', 'price']]
submission.to_csv("submission.csv", index=False)

1914.482064037366


In [None]:
# Current result: 1834