**CatBoost is a gradient boosting algorithm  optimized for handling categorical data efficiently, offering high performance with minimal preprocessing**

In [53]:
pip install catboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [54]:
import catboost
print(catboost.__version__)

1.2.7


In [55]:
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score
import pandas as pd
import numpy as np

In [56]:
df = pd.read_csv('datasets/Fuel_Consumption.csv')

In [57]:
df.head()

Unnamed: 0,Year,MAKE,MODEL,VEHICLE CLASS,ENGINE SIZE,CYLINDERS,TRANSMISSION,FUEL,FUEL CONSUMPTION,COEMISSIONS
0,2000,ACURA,1.6EL,COMPACT,1.6,4,A4,X,10.5,216
1,2000,ACURA,1.6EL,COMPACT,1.6,4,M5,X,9.8,205
2,2000,ACURA,3.2TL,MID-SIZE,3.2,6,AS5,Z,13.7,265
3,2000,ACURA,3.5RL,MID-SIZE,3.5,6,A4,Z,15.0,301
4,2000,ACURA,INTEGRA,SUBCOMPACT,1.8,4,A4,X,11.4,230


In [58]:
print(df.columns)

Index(['Year', 'MAKE', 'MODEL', 'VEHICLE CLASS', 'ENGINE SIZE', 'CYLINDERS',
       'TRANSMISSION', 'FUEL', 'FUEL CONSUMPTION', 'COEMISSIONS '],
      dtype='object')


In [59]:
# Trim the spaces from column names
df.columns = df.columns.str.strip()

In [60]:
# identify categorical columns
categorical_features = ['MAKE', 'MODEL', 'VEHICLE CLASS', 'TRANSMISSION', 'FUEL']

# Get the indices of categorical features
cat_feature_indices = [df.columns.get_loc(col) for col in categorical_features]

print('Categorical features indices:', cat_feature_indices)

Categorical features indices: [1, 2, 3, 6, 7]


In [61]:
# Define the feature and target
x =  df.drop(columns=["FUEL CONSUMPTION", "COEMISSIONS"])
y_regression = df["FUEL CONSUMPTION"]
y_classification = pd.qcut(df["COEMISSIONS"], q=3, labels=[0, 1, 2])

In [62]:
# split into train and test
x_train_reg, x_test_reg, y_train_reg, y_test_reg = train_test_split(x, y_regression, test_size=0.2, random_state=100)
x_train_clf, x_test_clf, y_train_clf, y_test_clf = train_test_split(x, y_classification, test_size=0.2, random_state=100)

In [63]:
regressor = CatBoostRegressor(
    iterations=100, 
    learning_rate=0.1, 
    depth=6, 
    cat_features=cat_feature_indices, 
    verbose=10
)

In [64]:
# iteration = 100: The number of boosting iterations (trees) to build
# Learning_rate = 0.1: The step size for gradient descent during training
# Depth = 6: The maximum depth of the decision tree
# cat_features = cat_feature_indices: Specifies the indices of categorical features in the dataset
# verbose = 10:  Logs training progress every 10 iterations

In [65]:
regressor.fit(x_train_reg,y_train_reg)
y_pred_reg = regressor.predict(x_test_reg)

0:	learn: 3.1438154	total: 17.5ms	remaining: 1.73s
10:	learn: 1.9059096	total: 219ms	remaining: 1.77s
20:	learn: 1.3641968	total: 444ms	remaining: 1.67s
30:	learn: 1.1679032	total: 663ms	remaining: 1.48s
40:	learn: 1.0532805	total: 880ms	remaining: 1.27s
50:	learn: 0.9970393	total: 1.1s	remaining: 1.06s
60:	learn: 0.9376839	total: 1.32s	remaining: 846ms
70:	learn: 0.8911410	total: 1.56s	remaining: 637ms
80:	learn: 0.8414095	total: 1.78s	remaining: 417ms
90:	learn: 0.8122649	total: 2s	remaining: 198ms
99:	learn: 0.7860317	total: 2.19s	remaining: 0us


In [66]:
# evaluate the model
reg_mse = mean_squared_error(y_test_reg, y_pred_reg)
print(f"Mean Squared Error for Regression: {reg_mse:.4f}")

Mean Squared Error for Regression: 0.9310


In [67]:
# Train the catboost classifier
classifier = CatBoostClassifier(
    iterations=100, 
    learning_rate=0.1, 
    depth=6, 
    cat_features=cat_feature_indices, 
    verbose=10
)
classifier.fit(x_train_clf, y_train_clf)
y_pred_clf = classifier.predict(x_test_clf)

0:	learn: 0.9971321	total: 24.6ms	remaining: 2.44s
10:	learn: 0.5757861	total: 265ms	remaining: 2.14s
20:	learn: 0.4301107	total: 493ms	remaining: 1.85s
30:	learn: 0.3585641	total: 741ms	remaining: 1.65s
40:	learn: 0.3256181	total: 978ms	remaining: 1.41s
50:	learn: 0.2963946	total: 1.27s	remaining: 1.22s
60:	learn: 0.2741160	total: 1.56s	remaining: 998ms
70:	learn: 0.2510543	total: 1.85s	remaining: 756ms
80:	learn: 0.2331046	total: 2.11s	remaining: 494ms
90:	learn: 0.2167725	total: 2.35s	remaining: 232ms
99:	learn: 0.2039461	total: 2.58s	remaining: 0us


In [70]:
# evaluate the model
clf_mse = mean_squared_error(y_test_clf, y_pred_clf)
print(f"Mean Squared Error for Regression: {clf_mse:.4f}")

Mean Squared Error for Regression: 0.1328
