In [2]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


In [3]:
df = pd.read_csv("clean_data.csv")


In [4]:
df = df.sample(n=100000, random_state=42)  # 100k rows


In [5]:
df['fuel_efficiency_class'].value_counts()

fuel_efficiency_class
High      33364
Medium    33318
Low       33318
Name: count, dtype: int64

In [6]:
df

Unnamed: 0,vehicle_mass_kg,fuel_type,engine_capacity_cc,engine_power_kw,fuel_efficiency_class
24894,1687.0,diesel,2191.0,110.0,Medium
449724,1555.0,petrol,1984.0,167.0,Low
599423,1507.0,petrol,1998.0,137.0,Medium
58225,1400.0,petrol,1998.0,131.0,Medium
249666,1463.0,petrol,1984.0,167.0,Low
...,...,...,...,...,...
1603398,1767.0,diesel,1968.0,147.0,Medium
1671941,1335.0,petrol,998.0,88.0,High
2744068,1315.0,petrol,999.0,92.0,High
2438558,1730.0,diesel,1950.0,110.0,Medium


In [7]:
X = df.drop("fuel_efficiency_class", axis=1)
y = df["fuel_efficiency_class"]

In [8]:
X = pd.get_dummies(X, dtype=int)


In [9]:
X

Unnamed: 0,vehicle_mass_kg,engine_capacity_cc,engine_power_kw,fuel_type_diesel,fuel_type_petrol
24894,1687.0,2191.0,110.0,1,0
449724,1555.0,1984.0,167.0,0,1
599423,1507.0,1998.0,137.0,0,1
58225,1400.0,1998.0,131.0,0,1
249666,1463.0,1984.0,167.0,0,1
...,...,...,...,...,...
1603398,1767.0,1968.0,147.0,1,0
1671941,1335.0,998.0,88.0,0,1
2744068,1315.0,999.0,92.0,0,1
2438558,1730.0,1950.0,110.0,1,0


In [10]:
X=X.drop(columns=["fuel_type_petrol"])

In [11]:
X

Unnamed: 0,vehicle_mass_kg,engine_capacity_cc,engine_power_kw,fuel_type_diesel
24894,1687.0,2191.0,110.0,1
449724,1555.0,1984.0,167.0,0
599423,1507.0,1998.0,137.0,0
58225,1400.0,1998.0,131.0,0
249666,1463.0,1984.0,167.0,0
...,...,...,...,...
1603398,1767.0,1968.0,147.0,1
1671941,1335.0,998.0,88.0,0
2744068,1315.0,999.0,92.0,0
2438558,1730.0,1950.0,110.0,1


In [12]:
target_le = LabelEncoder()
y = target_le.fit_transform(y)

print("Target classes:", target_le.classes_)
# VERY IMPORTANT TO CHECK THIS

Target classes: ['High' 'Low' 'Medium']


In [13]:
y

array([2, 1, 2, ..., 0, 2, 0], shape=(100000,))

In [14]:
from sklearn.preprocessing import StandardScaler

# Copy your sample


# Standardize only numeric columns
numeric_features = ['vehicle_mass_kg', 'engine_capacity_cc', 'engine_power_kw']
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])

# Check the result
print(X.head())


        vehicle_mass_kg  engine_capacity_cc  engine_power_kw  fuel_type_diesel
24894          0.604656            1.001137         0.046932                 1
449724         0.160013            0.623904         1.684827                 0
599423        -0.001676            0.649417         0.822777                 0
58225         -0.362107            0.649417         0.650367                 0
249666        -0.149891            0.623904         1.684827                 0


In [15]:

from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import classification_report, accuracy_score
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)


In [16]:
X_train

Unnamed: 0,vehicle_mass_kg,engine_capacity_cc,engine_power_kw,fuel_type_diesel
241723,-0.325053,-0.564291,-0.154213,0
1264390,-1.153707,-0.806668,-0.987528,0
1877414,2.108697,2.702333,1.684827,0
476378,-0.048835,-0.806668,-0.355358,0
542711,-1.406346,-1.171145,-1.705904,0
...,...,...,...,...
258774,0.102748,-0.329203,-0.700178,1
1544490,2.108697,2.462689,1.684827,1
2059053,-1.153707,-0.276354,-1.159938,0
873193,1.271622,0.636660,1.684827,0


In [17]:
y_train

array([2, 0, 1, ..., 0, 1, 2], shape=(80000,))

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score



# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Random Forest
rf = RandomForestClassifier(random_state=42)

# Grid for tuning (small to keep it fast)
param_grid = {
    'n_estimators': [50, 100],        # number of trees
    'max_depth': [None, 10, 20],      # depth of trees
    'min_samples_split': [2, 5],      # min samples to split a node
    'min_samples_leaf': [1, 2]        # min samples at leaf
}

# Grid Search
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,                # 3-fold cross-validation
    scoring='accuracy',
    verbose=2,
    n_jobs=-1            # use all CPUs
)

print("ðŸ”¹ Starting Grid Search for Random Forest...")
grid_search.fit(X_train, y_train)

# Best parameters
print("âœ… Best Parameters:", grid_search.best_params_)

# Evaluate on test set
y_pred = grid_search.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


ðŸ”¹ Starting Grid Search for Random Forest...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
âœ… Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Accuracy: 0.93965
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.93      0.94      6542
           1       0.96      0.96      0.96      6711
           2       0.90      0.93      0.92      6747

    accuracy                           0.94     20000
   macro avg       0.94      0.94      0.94     20000
weighted avg       0.94      0.94      0.94     20000



In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Define Decision Tree
dt = DecisionTreeClassifier(random_state=42)

# Grid of hyperparameters
param_grid_dt = {
    'max_depth': [None, 5, 10, 20],          # max depth of tree
    'min_samples_split': [2, 5, 10],         # min samples to split a node
    'min_samples_leaf': [1, 2, 5],           # min samples at a leaf
    'criterion': ['gini', 'entropy']         # splitting criterion
}

# Grid Search
grid_search_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_dt,
    cv=3,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

print("ðŸ”¹ Starting Grid Search for Decision Tree...")
grid_search_dt.fit(X_train, y_train)

# Best parameters
print("âœ… Best Parameters:", grid_search_dt.best_params_)

# Evaluate on test set
y_pred_dt = grid_search_dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))


ðŸ”¹ Starting Grid Search for Decision Tree...
Fitting 3 folds for each of 72 candidates, totalling 216 fits
âœ… Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Decision Tree Accuracy: 0.9392
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.93      0.94      6542
           1       0.96      0.96      0.96      6711
           2       0.90      0.93      0.91      6747

    accuracy                           0.94     20000
   macro avg       0.94      0.94      0.94     20000
weighted avg       0.94      0.94      0.94     20000



In [20]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

svm = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)

svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


SVM Accuracy: 0.7902
              precision    recall  f1-score   support

           0       0.85      0.77      0.81      6542
           1       0.83      0.88      0.85      6711
           2       0.70      0.72      0.71      6747

    accuracy                           0.79     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.79      0.79      0.79     20000



In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Logistic Regression model
log_reg = LogisticRegression(
    max_iter=2000,
    n_jobs=-1,
    random_state=42
)

# Parameter grid (kept SMALL for speed)
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}

# Grid Search
grid_lr = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid_lr,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

print("ðŸ”¹ Starting Grid Search for Logistic Regression...")
grid_lr.fit(X_train, y_train)

# Best model
print("âœ… Best Parameters:", grid_lr.best_params_)

# Evaluation
y_pred = grid_lr.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


ðŸ”¹ Starting Grid Search for Logistic Regression...
Fitting 3 folds for each of 8 candidates, totalling 24 fits




âœ… Best Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.74
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.80      0.78      6542
           1       0.76      0.86      0.81      6711
           2       0.69      0.56      0.62      6747

    accuracy                           0.74     20000
   macro avg       0.74      0.74      0.74     20000
weighted avg       0.74      0.74      0.73     20000



In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

knn = KNeighborsClassifier()

param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

grid_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid_knn,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

print("ðŸ”¹ Starting Grid Search for KNN...")
grid_knn.fit(X_train, y_train)

print("âœ… Best Parameters:", grid_knn.best_params_)

y_pred = grid_knn.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


ðŸ”¹ Starting Grid Search for KNN...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
âœ… Best Parameters: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'distance'}
Accuracy: 0.93685
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94      6542
           1       0.96      0.96      0.96      6711
           2       0.90      0.92      0.91      6747

    accuracy                           0.94     20000
   macro avg       0.94      0.94      0.94     20000
weighted avg       0.94      0.94      0.94     20000



In [23]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

nb = GaussianNB()

nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.65845
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.72      0.71      6542
           1       0.71      0.72      0.71      6711
           2       0.56      0.54      0.55      6747

    accuracy                           0.66     20000
   macro avg       0.66      0.66      0.66     20000
weighted avg       0.66      0.66      0.66     20000



In [24]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB(alpha=1.0)
bnb.fit(X_train, y_train)

y_pred_bnb = bnb.predict(X_test)

print("BernoulliNB Accuracy:", accuracy_score(y_test, y_pred_bnb))
print("BernoulliNB Report:\n", classification_report(y_test, y_pred_bnb))


BernoulliNB Accuracy: 0.6062
BernoulliNB Report:
               precision    recall  f1-score   support

           0       0.57      0.77      0.66      6542
           1       0.65      0.85      0.74      6711
           2       0.56      0.20      0.29      6747

    accuracy                           0.61     20000
   macro avg       0.59      0.61      0.56     20000
weighted avg       0.59      0.61      0.56     20000

