In [26]:
import pandas as pd

url = 'https://raw.githubusercontent.com/nelsonrandy111/Team7CC/main/data/smote_balanced_symbipredict_2022.csv'
df = pd.read_csv(url)

# Workflow for Patient Symptom Prediction System

## 1. Nurse Interface
### Service: **AWS S3 (Static Website Hosting)**
- **Action**:  
  - The nurse accesses a web interface to input patient details.
  - Generates a unique link for the patient to input symptoms.
- **Output**:  
  - A unique link is created and sent to the patient’s phone.

---

## 2. Patient Interaction
### Services: **AWS S3 + AWS SNS**
- **AWS SNS (Simple Notification Service)**: Sends the unique link to the patient’s phone via SMS.
- **AWS S3**: Hosts the web form for the patient to input symptoms.

### Action:
1. The patient receives an SMS with the link.
2. They click the link, access the web form, and enter their symptoms via a user-friendly interface.

---

## 3. API Endpoint
### Services: **AWS API Gateway + AWS Lambda**
- **AWS API Gateway**: Exposes an endpoint to receive patient symptom data.
- **AWS Lambda**: Processes the data and runs the ML model.

### Action:
1. Web form submission triggers an API call to the endpoint.
2. **AWS Lambda** processes the submitted data using the deployed machine learning model.

---

## 4. Machine Learning Model Processing
### Services: **AWS Lambda + AWS S3**
- **AWS Lambda**: Executes the machine learning model (Logistic Regression).
- **AWS S3**: Stores the serialized model and any required dependencies.

### Action:
1. Lambda retrieves the serialized ML model from **AWS S3**.
2. The model predicts the top 5 likely prognoses along with their probabilities based on the patient’s symptoms.



### **Data Assessment**

In [27]:
df['prognosis'].unique()

array(['Fungal Infection', 'Allergy', 'GERD', 'Chronic Cholestasis',
       'Drug Reaction', 'Peptic Ulcer Disease', 'AIDS', 'Diabetes ',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine',
       'Cervical Spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chickenpox', 'Dengue', 'Typhoid', 'Hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic Hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic Hemmorhoids (piles)', 'Heart Attack', 'Varicose Veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthritis', 'Arthritis', 'Vertigo', 'Acne',
       'Urinary Tract Infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [28]:
df.isna().sum().sort_values(ascending=True)

itching                 0
irritability            0
depression              0
toxic_look_(typhos)     0
internal_itching        0
                       ..
pain_behind_the_eyes    0
loss_of_appetite        0
nausea                  0
malaise                 0
prognosis               0
Length: 133, dtype: int64

### **Pipeline**

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X = df.drop(columns=['prognosis', 'family_history'])
y = df['prognosis']

X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)

steps=[('lg',LogisticRegression())]
pipe=Pipeline(steps)

### **Assessment**

In [30]:
from sklearn.metrics import classification_report

# Fit the pipeline to the training data
pipe.fit(X_train, y_train)

# Predictions
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
                               precision    recall  f1-score   support

                         AIDS       1.00      1.00      1.00        29
                         Acne       1.00      1.00      1.00        25
          Alcoholic Hepatitis       1.00      1.00      1.00        29
                      Allergy       1.00      1.00      1.00        26
                    Arthritis       1.00      1.00      1.00        24
             Bronchial Asthma       1.00      1.00      1.00        29
         Cervical Spondylosis       1.00      1.00      1.00        21
                   Chickenpox       1.00      1.00      1.00        20
          Chronic Cholestasis       1.00      1.00      1.00        24
                  Common Cold       1.00      1.00      1.00        20
                       Dengue       1.00      1.00      1.00        29
                    Diabetes        1.00      1.00      1.00        21
Dimorphic Hemmorhoids (piles)       1.00      1.00   

In [31]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipe, X, y, cv=5, scoring='f1_macro')  # Adjust scoring if necessary
print(f"Cross-Validation F1 Scores: {scores}")
print(f"Mean F1 Score: {scores.mean():.4f}")


Cross-Validation F1 Scores: [1. 1. 1. 1. 1.]
Mean F1 Score: 1.0000


### **Example Patient**

In [32]:
# List of all symptoms (columns from the dataset)
symptoms_list = list(X.columns)

# Patient inputs their symptoms as a list
patient_symptoms = ["skin_rash", "nodal_skin_eruptions", "headache", ]  # Example input

# Create a dictionary with all symptoms set to 0
patient_data = {symptom: 0 for symptom in symptoms_list}

# Update the symptoms specified by the patient to 1
for symptom in patient_symptoms:
    if symptom in patient_data:
        patient_data[symptom] = 1
    else:
        print(f"Warning: {symptom} is not a recognized symptom.")

# Convert to a DataFrame for model input
new_patient = pd.DataFrame([patient_data])

# Predict prognosis using the trained model
prediction = pipe.predict(new_patient)

# Display the result
print(f"Predicted prognosis: {prediction[0]}")


Predicted prognosis: Fungal Infection


### **Random Patient Test**

In [34]:
import random

# Randomly select a subset of symptoms for a simulated patient
random_symptoms = random.sample(X.columns.tolist(), k=5)  # Pick 5 random symptoms

# Create the input for the model based on the random symptoms
random_patient_data = {symptom: 0 for symptom in X.columns}
for symptom in random_symptoms:
    random_patient_data[symptom] = 1

# Convert to a DataFrame
random_patient = pd.DataFrame([random_patient_data])

# Display the random symptoms and the predicted prognosis
print(f'Symptoms: {random_symptoms}')

# Get the probabilities for each class
probabilities = pipe.predict_proba(random_patient)

# Combine class labels with their probabilities
class_probabilities = list(zip(pipe.classes_, probabilities[0]))

# Sort the classes by probability in descending order
sorted_probabilities = sorted(class_probabilities, key=lambda x: x[1], reverse=True)

# Get the top 5 most likely prognoses
top_5 = sorted_probabilities[:5]

# Display the top 5 prognoses
print("Top 5 Most Likely Prognoses:")
for label, prob in top_5:
    print(f"{label}: {prob * 100:.2f}%")


Symptoms: ['headache', 'blurred_and_distorted_vision', 'depression', 'lack_of_concentration', 'dark_urine']
Top 5 Most Likely Prognoses:
Hypertension : 43.18%
Migraine: 13.69%
Paralysis (brain hemorrhage): 6.85%
Urinary Tract Infection: 1.98%
Allergy: 1.90%
