In [None]:
import pandas as pd

In [None]:
# Raw knowledge base: one (disease, comma-separated symptom description) pair per entry
disease_records = [
    (
        'Malaria',
        'Fever, General body weakness, Joint pains, Chills, Headache, Abdominal pain, Nausea and vomiting, Confusion and disorientation, Anemia, Seizures',
    ),
    (
        'Pneumonia',
        'Cough, Fever, Difficulty breathing, Chest pain, Feeling weak, Nausea and vomiting, Chills',
    ),
    (
        'Pulmonary TB',
        'Cough more than 2 weeks, Chest pain, Fatigue, Loss of appetite, Chills, Night sweats, Shortness of breath',
    ),
    (
        'Urinary Tract Infections',
        'Burning sensation while urinating, Pain in the lower abdomen, Painful urination, Back pain, Fever and chills, Leaking of urine, Urgent need to urinate',
    ),
    (
        'Pelvic Inflammatory Disease',
        'Lower abdominal pain, Fever, Painful intercourse, Painful urination, Abnormal vaginal bleeding, Foul-smelling vaginal discharge, Pelvis swelling, Spotting or bleeding between periods',
    ),
]

# Column-oriented layout: one list per column, rows kept in the order given above
data = {
    'Disease': [name for name, _ in disease_records],
    'Symptoms': [symptoms for _, symptoms in disease_records],
}

In [None]:
# Turn the column-oriented dict into a table: one row per disease,
# with the symptom description still held as a single string
df = pd.DataFrame.from_dict(data)
print(df)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split each comma-separated symptom string into a list of individual symptoms.
# - The isinstance guard makes this cell safe to re-run: on a second run the
#   column already holds lists, which have no .split() method.
# - Splitting on ',' and stripping tolerates irregular spacing around commas;
#   empty fragments (e.g. from a trailing comma) are discarded.
df['Symptoms'] = df['Symptoms'].apply(
    lambda x: [part.strip() for part in x.split(',') if part.strip()]
    if isinstance(x, str)
    else x
)

# One-Hot Encode the symptoms: one 0/1 column per distinct symptom name
mlb = MultiLabelBinarizer()
encoded_symptoms = mlb.fit_transform(df['Symptoms'])

# Convert back to DataFrame and merge with original.
# Reusing df.index keeps every encoded row aligned with its disease even when
# df does not carry the default 0..n-1 index (concat along axis=1 joins on index).
symptom_df = pd.DataFrame(encoded_symptoms, columns=mlb.classes_, index=df.index)
df_encoded = pd.concat([df['Disease'], symptom_df], axis=1)

print(df_encoded)


In [None]:
# Target: the disease name attached to each row
y = df_encoded['Disease']

# Feature matrix: every one-hot symptom column, i.e. everything except the target
X = df_encoded.drop(columns=['Disease'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the data defined in this notebook has exactly one row per
# disease, so the held-out row's label does not occur among the training rows
# and the model cannot predict it — confirm the evaluation below is meaningful.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model.
# A random forest bootstraps rows and samples features at random, so without a
# seed every run can produce a different model. The seed matches the one used
# for the train/test split so the whole notebook is reproducible end to end.
model = RandomForestClassifier(random_state=42)

# Train the model on the training portion only
model.fit(X_train, y_train)

In [None]:
# Predict a disease label for every held-out row in X_test;
# y_pred is compared against y_test in the evaluation step
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Share of held-out rows whose predicted disease equals the true one
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Per-class precision / recall / F1 breakdown as a text table
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# New input data (raw symptom names, spelled however the caller typed them)
new_symptoms = ['Chest pain', 'fatigue', 'loss of appetite']  # Example symptoms

# MultiLabelBinarizer matches labels by exact string, so 'fatigue' would not
# match the fitted class 'Fatigue' and would be left out of the encoding.
# Map each input onto the fitted spelling, ignoring case and surrounding
# whitespace. Names with no match are passed through unchanged so that
# mlb.transform still reports them as unknown instead of hiding them.
known_symptoms = {str(label).lower(): label for label in mlb.classes_}
matched_symptoms = [
    known_symptoms.get(symptom.strip().lower(), symptom) for symptom in new_symptoms
]
new_symptoms_encoded = mlb.transform([matched_symptoms])

# Predict the disease.
# The model was fitted on a DataFrame whose columns are the symptom names, so
# present the new row with the same column labels rather than as a bare array.
new_symptoms_features = pd.DataFrame(new_symptoms_encoded, columns=mlb.classes_)
prediction = model.predict(new_symptoms_features)
print(f'Predicted Disease: {prediction[0]}')


In [None]:
import pickle

# Persist both fitted objects to the working directory, model first.
# The binarizer is saved alongside the model because new symptom lists are
# encoded with it before being passed to the model.
artifacts = (
    ('disease_model.pkl', model),  # the trained model
    ('mlb.pkl', mlb),              # the fitted MultiLabelBinarizer
)
for filename, fitted_obj in artifacts:
    with open(filename, 'wb') as out_file:
        pickle.dump(fitted_obj, out_file)
