In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Step 2: Load Dataset
data = pd.read_csv("Processed Dataset.csv")

# Display first 5 rows
data.head()


Unnamed: 0,Age,Gender,TO,TH,AH,BH,OX2,OXK,OX9,A,M,Rickettsia_Suspect,Acute_typhoid,Paratyphoid_A,Paratyphoid_B,Typhoid
0,5y,Male,"""1:80""","""1:80""","""1:80""","""1:80""","""1:320""","""1:320""","""1:80""","""1:80""","""1:80""",Yes,No,No,No,Negative
1,3.5y,Male,"""1:160""","""1:80""","""1:80""","""1:80""","""1:80""","""1:80""","""1:80""","""1:80""","""1:80""",No,Yes,No,No,Minimal
2,45y,Male,"""1:80""","""1:80""","""1:80""","""1:80""","""1:160""","""1:160""","""1:80""","""1:80""","""1:80""",Yes,No,No,No,Negative
3,13y,Female,"""1:80""","""1:160""","""1:80""","""1:80""","""1:160""","""1:320""","""1:80""","""1:80""","""1:160""",Yes,No,No,No,Minimal
4,12y,Female,"""1:160""","""1:320""","""1:80""","""1:80""","""1:160""","""1:160""","""1:320""","""1:80""","""1:80""",Yes,Yes,No,No,Positive


In [3]:
# Step 3: Basic Cleaning
# Remove extra quotes from cells
data = data.replace(r'\"', '', regex=True)

# Convert target column 'Typhoid' into True/False
data['Typhoid'] = data['Typhoid'].str.strip().str.lower().map({
    'positive': True,
    'negative': False
})

# Check for missing values
data.isnull().sum()


Age                     0
Gender                  0
TO                      0
TH                      0
AH                      0
BH                      0
OX2                     0
OXK                     0
OX9                     0
A                       0
M                       0
Rickettsia_Suspect      0
Acute_typhoid           0
Paratyphoid_A           0
Paratyphoid_B           0
Typhoid               560
dtype: int64

In [4]:
# Step 4: Encode Categorical Columns
label_encoder = LabelEncoder()

for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = label_encoder.fit_transform(data[col].astype(str))

data.head()


Unnamed: 0,Age,Gender,TO,TH,AH,BH,OX2,OXK,OX9,A,M,Rickettsia_Suspect,Acute_typhoid,Paratyphoid_A,Paratyphoid_B,Typhoid
0,80,1,2,2,2,2,1,1,2,0,1,1,0,0,0,0
1,42,1,0,2,2,2,2,2,2,0,1,0,1,0,0,2
2,60,1,2,2,2,2,0,0,2,0,1,1,0,0,0,0
3,11,0,2,0,2,2,0,1,2,0,0,1,0,0,0,2
4,9,0,0,1,2,2,0,0,1,0,1,1,1,0,0,1


In [5]:
# Step 5: Split Dataset
X = data.drop('Typhoid', axis=1)
y = data['Typhoid']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Step 6: Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [7]:
# Step 7: Evaluate the Model
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9863636363636363

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99        90
           1       1.00      0.88      0.93        16
           2       0.98      0.99      0.99       114

    accuracy                           0.99       220
   macro avg       0.99      0.96      0.97       220
weighted avg       0.99      0.99      0.99       220


Confusion Matrix:
 [[ 90   0   0]
 [  0  14   2]
 [  1   0 113]]


In [8]:
# Step 8: Predict for a New Patient Example (robust to feature ordering)
# Construct a new patient record that matches the model's training features (X columns).
example_values = {'Age': 25, 'Gender': 1, 'TO': 2, 'TH': 1}

# Create a template Series with the same index/order as training features
if 'X' in globals():
    template = pd.Series(0, index=X.columns, name=0)
else:
    # fallback: use columns from the dataset 'data' if X not present
    template = pd.Series(0, index=data.drop('Typhoid', axis=1).columns, name=0)

for k, v in example_values.items():
    if k in template.index:
        template[k] = v
    else:
        print(f"Warning: column '{k}' not found in training features; skipping assignment")

new_patient = template.to_frame().T

# Ensure dtypes match (optional): attempt to cast to training dtypes if available
try:
    for col in new_patient.columns:
        if col in X.dtypes.index:
            new_patient[col] = new_patient[col].astype(X.dtypes[col])
except Exception:
    # If casting fails, continue — model will raise a helpful error if necessary
    pass

# Make Prediction
try:
    prediction = model.predict(new_patient)[0]
    print("Typhoid Detected" if prediction else "No Typhoid Detected")
except Exception as e:
    print("Prediction failed:", e)


Typhoid Detected


In [9]:
import joblib
joblib.dump(model, "typhoid_model.pkl")
print("✅ Model saved as typhoid_model.pkl")


✅ Model saved as typhoid_model.pkl


In [10]:
# To load your model later:
model = joblib.load("typhoid_model.pkl")


In [11]:
# After training your model, for example:
model = RandomForestClassifier()
model.fit(X_train, y_train)

# ✅ Save only the trained model
import pickle
with open("typhoid_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("Model saved successfully!")


Model saved successfully!


In [12]:
with open("typhoid_model.pkl", "rb") as f:
    test_model = pickle.load(f)

print(type(test_model))
# Expected output: <class 'sklearn.ensemble._forest.RandomForestClassifier'>


<class 'sklearn.ensemble._forest.RandomForestClassifier'>
