In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the raw vehicle dataset from the project's artifacts folder.
raw_data_path = "../artifacts/raw/vehicledata.csv"
df = pd.read_csv(raw_data_path)

In [None]:
# Dimensions of the raw frame as (rows, columns).
df.shape

In [None]:
# Print the per-column dtype / non-null summary, then display the count of
# missing values in each column.
df.info()
df.isna().sum()

We begin with a RandomForest model, since it works well on large datasets with many features. There is also no need for any imputation, since there are no missing values.

The target is `Maintenance_Required`, i.e. whether maintenance is required or not.

Identifier-like features such as Vehicle ID, Make and Model, and Route Info can easily be dropped. (Vehicle Type is kept and encoded numerically further below.)



In [None]:
# List every column name in the raw frame.
print(list(df.columns))

In [None]:
# Show the dtype of each column to spot the non-numeric ones.
print(df.dtypes)

We now need to convert the object-typed columns into numeric data types.


In [None]:
# Collect the names of the text / categorical columns; these need encoding
# before they can be fed to the model.
categorical_frame = df.select_dtypes(include=['object', 'category'])
categorical_cols = categorical_frame.columns
print("Categorical columns:", categorical_cols)

In [None]:
# Frequency table for every categorical column.
for col in categorical_cols:
    print(f"\nColumn: {col}", df[col].value_counts(), sep="\n")

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Distinct values of each categorical column that is still present in df.
for col in categorical_cols:
    if col not in df.columns:
        continue  # column was removed by an earlier step
    print(f"\nColumn: {col}")
    print(df[col].unique())

In [None]:
# Current column names.
print(list(df.columns))

In [None]:
# Frequency table for each categorical column that still exists in df.
for col in categorical_cols:
    if col not in df.columns:
        continue  # skip columns removed by an earlier step
    print(f"\nColumn: {col}", df[col].value_counts(), sep="\n")


In [None]:
# Preview the first five rows before encoding.
df.head(n=5)

Now I need to convert the categorical values into numerical values, using different types of encoding depending on the kind of data present in each categorical column, which we already saw earlier.

In [None]:
# Parse the last-maintenance date once, expand it into calendar parts
# (year, month, day of month, weekday number), and drop the raw date column.
maintenance_date = pd.to_datetime(df['Last_Maintenance_Date'])

df = df.assign(
    Maintenance_Year=maintenance_date.dt.year,
    Maintenance_Month=maintenance_date.dt.month,
    Maintenance_Day=maintenance_date.dt.day,
    Maintenance_Weekday=maintenance_date.dt.weekday,
).drop(columns='Last_Maintenance_Date')

In [None]:
# Column names after the date expansion.
print(list(df.columns))



In [None]:
# Distinct values of the categorical columns that remain in df.
remaining_categoricals = [name for name in categorical_cols if name in df.columns]
for col in remaining_categoricals:
    print(f"\nColumn: {col}")
    print(df[col].unique())

In [None]:
# Current column names.
print(list(df.columns))

In [None]:
# Remove the columns that are not used as model inputs.
identifier_cols = ['Vehicle_ID', 'Make_and_Model', 'Route_Info']
df = df.drop(columns=identifier_cols)
print(list(df.columns))

In [None]:
# Dtypes of the three nominal columns that are one-hot encoded later.
nominal_cols = ['Maintenance_Type', 'Weather_Conditions', 'Road_Conditions']
print(df.dtypes.loc[nominal_cols])

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 1. Vehicle_Type: replace each category with an integer code.
#    NOTE(review): the original comment calls this column binary; confirm
#    against the value counts printed earlier.
le = LabelEncoder()
df['Vehicle_Type'] = le.fit_transform(df['Vehicle_Type'])

# 2. Brake_Condition: ordinal encoding, higher number = better condition.
brake_map = {'Good': 2, 'Fair': 1, 'Poor': 0}
brake_raw = df['Brake_Condition']

# Series.map turns every value missing from the dict into NaN without any
# warning (this also happens when the cell is re-run on already-encoded data),
# so fail loudly instead of corrupting the column.
unmapped = brake_raw[brake_raw.notna() & ~brake_raw.isin(list(brake_map))]
if not unmapped.empty:
    raise ValueError(
        f"Unexpected Brake_Condition values: {unmapped.unique().tolist()}; "
        f"expected one of {list(brake_map)}"
    )

df['Brake_Condition'] = brake_raw.map(brake_map)

In [None]:
# Normalise header whitespace so the column names used below match exactly.
stripped_names = df.columns.str.strip()
df.columns = stripped_names
print(list(stripped_names))

# One-hot encode the nominal columns (they have no natural ordering).
nominal_cols = ['Maintenance_Type', 'Weather_Conditions', 'Road_Conditions']
df = pd.get_dummies(df, columns=nominal_cols)

In [None]:
# Sanity check: list any columns that are still not numeric after encoding.
non_numeric_frame = df.select_dtypes(exclude=['number'])
non_numeric_cols = non_numeric_frame.columns
print("Non-numeric columns:", list(non_numeric_cols))


In [None]:
# Dtypes of the indicator columns produced by the one-hot encoding.
dummy_cols = [
    'Maintenance_Type_Engine Overhaul',
    'Maintenance_Type_Oil Change',
    'Maintenance_Type_Tire Rotation',
    'Weather_Conditions_Clear',
    'Weather_Conditions_Rainy',
    'Weather_Conditions_Snowy',
    'Weather_Conditions_Windy',
    'Road_Conditions_Highway',
    'Road_Conditions_Rural',
    'Road_Conditions_Urban',
]
print(df[dummy_cols].dtypes)

In [None]:
# Cast every boolean column to int (optional, safe), then re-check the
# dtypes of the one-hot indicator columns.
bool_cols = df.select_dtypes(include='bool').columns
df = df.astype(dict.fromkeys(bool_cols, 'int'))

dummy_cols = [
    'Maintenance_Type_Engine Overhaul',
    'Maintenance_Type_Oil Change',
    'Maintenance_Type_Tire Rotation',
    'Weather_Conditions_Clear',
    'Weather_Conditions_Rainy',
    'Weather_Conditions_Snowy',
    'Weather_Conditions_Windy',
    'Road_Conditions_Highway',
    'Road_Conditions_Rural',
    'Road_Conditions_Urban',
]
print(df[dummy_cols].dtypes)

In [None]:
# Final set of columns after all encoding steps.
df.columns

In [None]:
# Preview the fully encoded frame (first five rows).
df.head(n=5)

## Let's build the model: RandomForest


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:
# Separate the target column from the feature matrix.
target_col = 'Maintenance_Required'
y = df[target_col]
X = df.drop(columns=target_col)

In [None]:
# Names of the features the model will see.
feature_names = X.columns
print(feature_names)

In [None]:
# Preview the feature matrix (first five rows).
X.head(n=5)

In [None]:
# Plain-text dump of the feature matrix.
print(X)

In [None]:
# Plain-text dump of the target series.
print(y)

In [None]:
# 80/20 split with a fixed seed; stratifying on y keeps the class proportions
# the same in the train and test parts.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Any text-like columns left in the training features? (Displays their names.)
text_like = X_train.select_dtypes(include=['object', 'string', 'category'])
list(text_like.columns)


In [None]:
# Baseline random forest: 100 trees, fixed seed for reproducibility.
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows.
y_pred = rf_model.predict(X_test)

In [None]:
# Evaluate the baseline predictions against the held-out labels.
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
baseline_accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", baseline_cm)
print("\nClassification Report:\n", baseline_report)
print("Accuracy Score:", baseline_accuracy)


The accuracy we received is too good to be true, so there has probably been some leakage. We check this by looking at each feature's correlation with the target.

In [None]:
# Correlation of every column with the target, strongest positive first.
target_corr = df.corr()['Maintenance_Required']
target_corr.sort_values(ascending=False)

Here we see that some columns have a high correlation with the target, for example:

- Anomalies_Detected: 0.499375
- Failure_History: 0.448371
- Downtime_Maintenance: 0.274428


### What we found
Here we can see that some columns are highly correlated with the target, and we need to fix that. First, we would drop columns like engine temperature, since it has only one value and doesn't add anything to the model. Next, we have the leakage features, which the model shouldn't see first-hand, because that is cheating, my friend. It's like sitting an exam after getting to see the question paper beforehand.

In [None]:
# Distinct values present in the Engine_Temperature column.
engine_temps = df['Engine_Temperature'].unique()
print(engine_temps)

So here we drop these features that the model shouldn't see.

In [None]:
# Columns treated as target leakage and therefore hidden from the model.
# NOTE(review): the narrative also mentions dropping Engine_Temperature,
# but it is not part of this list - confirm whether that is intended.
leakage_features = [
    'Anomalies_Detected',
    'Failure_History',
    'Downtime_Maintenance',
    'Predictive_Score'  # also suspiciously correlated
]
excluded_cols = ['Maintenance_Required', *leakage_features]

y = df['Maintenance_Required']
X = df.drop(columns=excluded_cols)

In [None]:
# Re-split the leakage-free features (same seed and stratification as before)
# and fit a fresh 100-tree forest on the training part.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows with the retrained model.
y_pred = rf_model.predict(X_test)


In [None]:
# Evaluate the leakage-free model on the held-out labels.
cm_no_leak = confusion_matrix(y_test, y_pred)
report_no_leak = classification_report(y_test, y_pred)
accuracy_no_leak = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm_no_leak)
print("\nClassification Report:\n", report_no_leak)
print("Accuracy Score:", accuracy_no_leak)

Now, an accuracy of 77 percent at least feels real. But we still need to check whether our dataset is balanced, that is, whether there is enough data for both "maintenance required" and "maintenance not required", or whether the data is already skewed in a way that gives us a biased result.

In [None]:
# Share of rows in each target class (fractions summing to 1).
y.value_counts(normalize=True)

Clearly, the classes are imbalanced in favour of "maintenance required", so the model is biased towards predicting that class.

To balance things out, we use `class_weight='balanced'` and re-evaluate our model.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Same forest as before, but with class_weight='balanced' so the classes are
# re-weighted during training.
balanced_params = dict(n_estimators=100, random_state=42, class_weight='balanced')
rf_balanced = RandomForestClassifier(**balanced_params)
rf_balanced.fit(X_train, y_train)

# Score the class-weighted model on the same held-out test rows.
y_pred_balanced = rf_balanced.predict(X_test)
cm_balanced = confusion_matrix(y_test, y_pred_balanced)
report_balanced = classification_report(y_test, y_pred_balanced)
accuracy_balanced = accuracy_score(y_test, y_pred_balanced)

print("Confusion Matrix:\n", cm_balanced)
print("\nClassification Report:\n", report_balanced)
print("Accuracy Score:", accuracy_balanced)

So, no significant change has been achieved, which suggests our dataset is imbalanced to such an extent that adding class weights alone won't suffice.

Let's use SMOTE (Synthetic Minority Over-Sampling Technique)

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# Step 1: Hold out a test set of REAL rows first.
# Resampling before the split would put synthetic points interpolated from
# test rows into the training data (and synthetic rows into the test set),
# which inflates the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 2: Balance the training portion only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Step 3: Train the model on the balanced training data.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_resampled, y_resampled)

# Step 4: Evaluate on the untouched, original-distribution test set.
y_pred = rf.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

What does SMOTE do? It generates synthetic samples for the under-represented class to compensate for the imbalance. Here, the dataset was imbalanced in favour of "maintenance required", so SMOTE adds synthetic samples of the minority class to balance things out.

In [None]:
# Rank the features by the fitted forest's importance scores and plot the
# fifteen largest as a horizontal bar chart.
feat_importances = pd.Series(rf.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(15)

ax = top_features.plot.barh()
ax.set_title("Top 15 Feature Importances")
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Cross-validate on the ORIGINAL (leakage-free) X / y with SMOTE inside the
# pipeline. The sampler is then fitted on each training fold only, so the
# validation folds contain real rows exclusively. Running CV directly on
# pre-resampled data lets synthetic neighbours of validation rows into the
# training folds and overstates the score.
cv_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
])

scores = cross_val_score(cv_pipeline, X, y, cv=5)
print("CV Accuracy scores:", scores)
print("Mean CV Accuracy:", scores.mean())

Cross-validation is a technique to:

- Test how well your model generalizes to unseen data
- Reduce the risk of overfitting or underfitting
- Provide a more robust estimate of model performance