In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw vehicle-maintenance records from a CSV file in the working directory.
DATA_PATH = "vehicle_maintenance_data.csv"
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows (head() defaults to n=5) to sanity-check the load.
data.head()

Unnamed: 0,Vehicle_Model,Mileage,Maintenance_History,Reported_Issues,Vehicle_Age,Fuel_Type,Transmission_Type,Engine_Size,Odometer_Reading,Last_Service_Date,Warranty_Expiry_Date,Owner_Type,Insurance_Premium,Service_History,Accident_History,Fuel_Efficiency,Tire_Condition,Brake_Condition,Battery_Status,Need_Maintenance
0,Truck,58765,Good,0,4,Electric,Automatic,2000,28524,2023-11-23,2025-06-24,Second,20782,6,3,13.622204,New,New,Weak,1
1,Van,60353,Average,1,7,Electric,Automatic,2500,133630,2023-09-21,2025-06-04,Second,23489,7,0,13.625307,New,New,Weak,1
2,Bus,68072,Poor,0,2,Electric,Automatic,1500,34022,2023-06-27,2025-04-27,First,17979,7,0,14.306302,New,Good,Weak,1
3,Bus,60849,Average,4,5,Petrol,Automatic,2500,81636,2023-08-24,2025-11-05,Second,6220,7,3,18.709467,New,Worn Out,New,1
4,Bus,45742,Poor,5,1,Petrol,Manual,2000,97162,2023-05-25,2025-09-14,Third,16446,6,2,16.977482,Good,Good,Weak,1


In [4]:
# Summarise column names, non-null counts and dtypes to spot missing values
# and the text (object) columns that need encoding before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Vehicle_Model         50000 non-null  object 
 1   Mileage               50000 non-null  int64  
 2   Maintenance_History   50000 non-null  object 
 3   Reported_Issues       50000 non-null  int64  
 4   Vehicle_Age           50000 non-null  int64  
 5   Fuel_Type             50000 non-null  object 
 6   Transmission_Type     50000 non-null  object 
 7   Engine_Size           50000 non-null  int64  
 8   Odometer_Reading      50000 non-null  int64  
 9   Last_Service_Date     50000 non-null  object 
 10  Warranty_Expiry_Date  50000 non-null  object 
 11  Owner_Type            50000 non-null  object 
 12  Insurance_Premium     50000 non-null  int64  
 13  Service_History       50000 non-null  int64  
 14  Accident_History      50000 non-null  int64  
 15  Fuel_Efficiency    

In [5]:
from sklearn.preprocessing import LabelEncoder

# Text-valued columns that are replaced by integer codes below.
CATEGORICAL_COLUMNS = [
    'Vehicle_Model',
    'Maintenance_History',
    'Fuel_Type',
    'Transmission_Type',
    'Owner_Type',
    'Tire_Condition',
    'Brake_Condition',
    'Battery_Status',
]

# Fit a separate encoder per column and keep each one, so every
# label -> code mapping stays available afterwards (e.g. for
# inverse_transform). Refitting a single shared encoder would discard all
# mappings except the last.
# This overwrites the string columns of `data` in place.
# NOTE(review): LabelEncoder numbers classes in sorted label order, so the
# codes carry no domain ordering (e.g. for the *_Condition columns) — confirm
# this is acceptable for a linear model.
label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# `label_encoder` is left bound to the encoder fitted on the last column
# ('Battery_Status'), so code that still refers to that name keeps working.

In [6]:
# Target: the binary Need_Maintenance flag.
y = data['Need_Maintenance']

# Features: every remaining column except the two date columns, which are
# still raw strings (object dtype) and are not encoded anywhere above.
X = data.drop(columns=['Need_Maintenance', 'Last_Service_Date', 'Warranty_Expiry_Date'])

In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the classes are imbalanced (the printed support counts show
# roughly 81% positives) — consider stratify=y if exact class ratios matter.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Peek at the first four training rows to confirm every feature is now numeric.
X_train.iloc[:4]

Unnamed: 0,Vehicle_Model,Mileage,Maintenance_History,Reported_Issues,Vehicle_Age,Fuel_Type,Transmission_Type,Engine_Size,Odometer_Reading,Owner_Type,Insurance_Premium,Service_History,Accident_History,Fuel_Efficiency,Tire_Condition,Brake_Condition,Battery_Status
38094,3,59332,2,4,2,0,1,1500,61116,1,23511,7,1,19.037102,2,1,0
40624,0,72285,0,2,5,1,1,2000,86169,0,10934,7,3,19.549713,1,2,0
49425,1,79581,0,1,7,2,0,1000,105637,1,10340,9,1,15.562817,1,0,1
35734,4,37648,1,1,3,0,1,1500,89920,0,25518,5,3,12.335439,2,2,2


In [9]:
# First five training labels (head() defaults to n=5); their index matches X_train.
y_train.head()

38094    1
40624    1
49425    0
35734    1
41708    0
Name: Need_Maintenance, dtype: int64

In [10]:
def score_results(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for one split.

    Parameters
    ----------
    clf
        Fitted classifier exposing ``predict``.
    X_train, y_train
        Training features and labels; scored when ``train`` is truthy.
    X_test, y_test
        Held-out features and labels; scored when ``train`` is falsy.
    train : bool, default True
        Selects the split to report on. Any falsy value (including ``None``)
        selects the test split, so a call never silently prints nothing.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # Pick the split once so the reporting code below exists in a single copy.
    if train:
        split_name, features, target = "Train", X_train, y_train
    else:
        split_name, features, target = "Test", X_test, y_test

    pred = clf.predict(features)
    # output_dict=True returns nested dicts, which pandas renders as a table.
    clf_report = pd.DataFrame(classification_report(target, pred, output_dict=True))

    print(f"{split_name} Results:\n========================")
    print(f"Accuracy Score: {accuracy_score(target, pred) *100:.2f}%")
    print("-----------------------")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print(f"Confusion Matrix: \n {confusion_matrix(target, pred)}\n")

In [11]:
# Baseline model: logistic regression with library defaults (fit returns the estimator).
# NOTE(review): the features are unscaled and warnings are silenced globally in
# the import cell, so a solver convergence warning would be hidden here —
# confirm the fit converged (e.g. inspect lr.n_iter_).
lr = LogisticRegression().fit(X_train, y_train)

# Report the training split first, then the held-out split.
for is_train in (True, False):
    score_results(lr, X_train, y_train, X_test, y_test, train=is_train)

Train Results:
Accuracy Score: 83.58%
-----------------------
CLASSIFICATION REPORT:
                     0             1  accuracy     macro avg  weighted avg
precision     0.613539      0.864115  0.835771      0.738827      0.816527
recall        0.365428      0.946037  0.835771      0.655733      0.835771
f1-score      0.458043      0.903223  0.835771      0.680633      0.818677
support    6647.000000  28353.000000  0.835771  35000.000000  35000.000000
Confusion Matrix: 
 [[ 2429  4218]
 [ 1530 26823]]

Test Results:
Accuracy Score: 83.29%
-----------------------
CLASSIFICATION REPORT:
                     0             1  accuracy     macro avg  weighted avg
precision     0.601163      0.862877  0.832867      0.732020      0.813064
recall        0.362172      0.943516  0.832867      0.652844      0.832867
f1-score      0.452022      0.901396  0.832867      0.676709      0.815865
support    2855.000000  12145.000000  0.832867  15000.000000  15000.000000
Confusion Matrix: 
 [[ 1034  