In [5]:
# Extended Processed Data EDA (with decoding of encoded features)

import pandas as pd
import joblib

# --- Step 1: Load processed data ---
# NOTE: the path is machine-specific; adjust it when running on another machine.
df = pd.read_csv(r"C:\Users\Oscar's PC\Enhanced_fever_medication_recommendation\data\processed\processed_data.csv")

print("Shape of processed data:", df.shape)
print("\nColumns:", list(df.columns))

# --- Step 2: Numerical features summary ---
# describe() is transposed so each feature is a row and each statistic a column.
num_cols = df.select_dtypes(include=["number"]).columns
print("\n--- Numerical Features Summary ---")
print(df[num_cols].describe().T)

# --- Step 3: Categorical features summary (raw form, before encoding) ---
# Object-dtype columns are treated as the categorical features here.
cat_cols = df.select_dtypes(include=["object"]).columns
print("\n--- Categorical Features Unique Values ---")
for col in cat_cols:
    print(f"\nColumn: {col}")
    print("Unique values:", df[col].unique())
    print("Counts:\n", df[col].value_counts())

# --- Step 4: Target distribution ---
# normalize=True reports class proportions rather than raw counts.
# The section is skipped silently when the target column is absent.
target_col = "Recommended_Medication"
if target_col in df.columns:
    print(f"\n--- Target Distribution ({target_col}) ---")
    print(df[target_col].value_counts(normalize=True))

# --- Step 5: Decode one-hot encoded categories from the trained model ---
print("\n--- Encoded Feature Categories (from trained model) ---")
# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
pipeline = joblib.load(r"C:\Users\Oscar's PC\Enhanced_fever_medication_recommendation\models\fever_model.pkl")
# Presumably a fitted scikit-learn ColumnTransformer -- confirm against the
# training script.
preprocessor = pipeline.named_steps["preprocessor"]

# Fitted one-hot encoder: the "onehot" step of the "cat" sub-pipeline.
cat_encoder = preprocessor.named_transformers_["cat"].named_steps["onehot"]

# Original categorical column names.
# Look the "cat" entry up by name (matching the encoder lookup above) instead of
# relying on it being at a fixed position in `transformers_`, so a reordered
# ColumnTransformer cannot silently pair the encoder with the wrong columns.
cat_feature_names = next(
    columns
    for name, _, columns in preprocessor.transformers_
    if name == "cat"
)

# Encoded one-hot column names
encoded_feature_names = cat_encoder.get_feature_names_out(cat_feature_names)

print("Original categorical columns:", cat_feature_names)
print("Encoded feature names:", encoded_feature_names)

import pandas as pd
from sklearn.utils import resample

# Balance the target classes by random oversampling (sampling with replacement).

# Load processed data
# NOTE: the path is machine-specific; adjust it when running on another machine.
df = pd.read_csv(r"C:\Users\Oscar's PC\Enhanced_fever_medication_recommendation\data\processed\processed_data.csv")

target_col = "Recommended_Medication"

# Separate features and target
X = df.drop(target_col, axis=1)
y = df[target_col]

# Combine for upsampling (this places the target as the last column)
df_combined = pd.concat([X, y], axis=1)

# Determine the majority class from the data instead of hard-coding which
# medication is the majority/minority. This keeps every class in the balanced
# set and always upsamples towards the largest class.
class_counts = df_combined[target_col].value_counts()
majority_label = class_counts.idxmax()
n_majority = int(class_counts.max())
minority_labels = [label for label in class_counts.index if label != majority_label]

# Separate majority and minority classes
df_majority = df_combined[df_combined[target_col] == majority_label]
df_minority = df_combined[df_combined[target_col].isin(minority_labels)]

# Upsample each minority class to the size of the majority class
upsampled_parts = [
    resample(
        df_combined[df_combined[target_col] == label],
        replace=True,          # sample with replacement
        n_samples=n_majority,  # match majority class
        random_state=42,
    )
    for label in minority_labels
]
# With a single class there is nothing to upsample; fall back to the (empty)
# minority frame so the concatenation below still works.
df_minority_upsampled = pd.concat(upsampled_parts) if upsampled_parts else df_minority

# Combine majority and upsampled minority
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Shuffle so the classes are interleaved, then rebuild a clean 0..n-1 index
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the target distribution
print("--- Balanced Target Distribution ---")
print(df_balanced[target_col].value_counts())




Shape of processed data: (800, 19)

Columns: ['Temperature', 'Fever_Severity', 'Age', 'Gender', 'BMI', 'Headache', 'Body_Ache', 'Fatigue', 'Chronic_Conditions', 'Allergies', 'Smoking_History', 'Alcohol_Consumption', 'Humidity', 'AQI', 'Physical_Activity', 'Diet_Type', 'Heart_Rate', 'Blood_Pressure', 'Recommended_Medication']

--- Numerical Features Summary ---
             count        mean         std   min     25%     50%      75%  \
Temperature  800.0   38.092500    1.150140  36.0   37.10   38.15   39.100   
Age          800.0   51.466250   28.999225   1.0   26.00   52.50   76.000   
BMI          800.0   26.338250    4.902487  18.0   22.20   26.40   30.525   
Humidity     800.0   60.603625   17.231201  30.0   46.60   60.80   75.400   
AQI          800.0  250.742500  144.846036   0.0  127.00  250.00  377.250   
Heart_Rate   800.0   80.236250   11.694074  60.0   70.75   80.00   90.000   

               max  
Temperature   40.0  
Age          100.0  
BMI           35.0  
Humidity     