In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

In [2]:
df = pd.read_csv(r"C:\Users\hp\OneDrive\Desktop\Computer Science\CS-04\CS4-01\Machine Learning\tasks\salaries(1).csv")
print("Dataset Loaded Successfully!")
print(df.head())

Dataset Loaded Successfully!
  age_group bmi_category family_history glucose_level  has_diabetes
0     young       normal             No          high             0
1     young       normal            Yes           low             0
2     young   overweight             NO        normal             0
3     young   overweight            Yes        normal             1
4     young        obese             No          high             1


In [3]:
target_col = "salary" if "salary" in df.columns else df.columns[-1]
print("\nTarget column is:", target_col)


Target column is: has_diabetes


In [4]:
df = df.dropna(subset=[target_col]).reset_index(drop=True)

for col in df.columns:
    if df[col].isna().any():
        if df[col].dtype in [np.float64, np.int64]:
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna(df[col].mode().iloc[0], inplace=True)

print("\nMissing values handled!")


Missing values handled!


In [5]:
le_dict = {}

for col in df.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    le_dict[col] = le

print("\nLabel Encoding Done!")
print(df.head())


Label Encoding Done!
   age_group  bmi_category  family_history  glucose_level  has_diabetes
0          2             0               1              0             0
1          2             0               2              1             0
2          2             2               0              2             0
3          2             2               2              2             1
4          2             1               1              0             1


In [6]:
X = df.drop(columns=[target_col])
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

print("\nData Split Completed!")


Data Split Completed!


In [7]:
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

print("\nModel Trained Successfully!")


Model Trained Successfully!


In [8]:
y_pred = model.predict(X_test)

print("\n========== MODEL RESULTS ==========\n")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.5

Confusion Matrix:
 [[0 2]
 [0 2]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      1.00      0.67         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
joblib.dump(model, "decision_tree_model.pkl")
joblib.dump(le_dict, "label_encoders.pkl")
df.to_csv("cleaned_salaries.csv", index=False)

print("\nFiles Saved Successfully:")
print("- decision_tree_model.pkl")
print("- label_encoders.pkl")
print("- cleaned_salaries.csv")


Files Saved Successfully:
- decision_tree_model.pkl
- label_encoders.pkl
- cleaned_salaries.csv


In [10]:
importances = pd.Series(model.feature_importances_, index=X.columns)
print("\nTop Features:\n")
print(importances.sort_values(ascending=False))


Top Features:

glucose_level     0.583333
family_history    0.416667
age_group         0.000000
bmi_category      0.000000
dtype: float64
