In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the two input tables: the cleaned disease/symptom records and the
# raw symptom -> severity weight lookup.
main_data = pd.read_csv(
    filepath_or_buffer='../../data/processed/cleaned_main_data.csv',
)
severity_data = pd.read_csv(
    filepath_or_buffer='../../data/raw/Symptom-severity.csv',
)

In [5]:
# Lookup table: symptom name -> severity weight.
# If a symptom is listed more than once, the last listed weight is kept.
severity_map = {
    symptom_name: symptom_weight
    for symptom_name, symptom_weight in zip(
        severity_data['Symptom'], severity_data['weight']
    )
}

In [6]:
# Feature builder: per-row symptom indicator columns plus severity aggregates
def create_feature_matrix(df, severity_map):
    """Build a per-row feature table from the symptom columns of ``df``.

    Every column whose name contains the substring ``'Symptom'`` is treated
    as a symptom slot. For each input row the result contains:

    * ``has_<symptom>`` -- 1 if that symptom occurs in the row, else 0. There
      is one such column per distinct non-null value found in the symptom
      columns, ordered by first appearance when scanning row by row.
    * ``total_severity`` -- sum of the ``severity_map`` weights of the row's
      symptoms. Symptoms absent from ``severity_map`` contribute 0.
    * ``symptom_count`` -- number of non-null symptom cells in the row
      (repeated symptoms are counted each time).
    * ``avg_severity`` -- ``total_severity / symptom_count``, or 0 when the
      row has no symptoms.

    Parameters
    ----------
    df : pandas.DataFrame
        Input records; only the columns containing ``'Symptom'`` are read.
    severity_map : dict
        Mapping from symptom value to numeric weight (must support ``.get``).

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df``, in the same order, with a fresh default
        integer index. The column set is the same whether or not ``df`` has
        any rows.
    """
    symptom_cols = [col for col in df.columns if 'Symptom' in col]

    # Pull the symptom block out once instead of slicing every row Series.
    symptom_values = df[symptom_cols].to_numpy()
    flat_values = symptom_values.ravel()
    unique_symptoms = pd.unique(flat_values[pd.notna(flat_values)])

    # Indicator column names are loop-invariant, so format them once.
    indicator_names = [(symptom, f'has_{symptom}') for symptom in unique_symptoms]

    records = []
    for row_values in symptom_values:
        present = row_values[pd.notna(row_values)]
        # Set membership avoids a linear array scan per (row, symptom) pair.
        present_lookup = set(present)

        features = {
            column_name: 1 if symptom in present_lookup else 0
            for symptom, column_name in indicator_names
        }

        total_severity = sum(severity_map.get(symptom, 0) for symptom in present)
        symptom_count = len(present)

        features['total_severity'] = total_severity
        features['symptom_count'] = symptom_count
        features['avg_severity'] = total_severity / symptom_count if symptom_count > 0 else 0

        records.append(features)

    # Passing the columns explicitly keeps the schema stable for empty input,
    # where an empty record list alone would yield a frame with no columns.
    columns = [column_name for _, column_name in indicator_names]
    columns += ['total_severity', 'symptom_count', 'avg_severity']
    return pd.DataFrame(records, columns=columns)


In [7]:
# Assemble the modelling inputs: y is the disease label column,
# X is the engineered feature table built from the symptom columns.
y = main_data['Disease']
X = create_feature_matrix(df=main_data, severity_map=severity_map)

In [8]:
# Persist the feature table and the labels as CSV files (row index omitted).
X.to_csv(
    path_or_buf='../../data/processed/enhanced_features.csv',
    index=False,
)
pd.Series(y).to_csv(
    path_or_buf='../../data/processed/target.csv',
    index=False,
)