In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import joblib

In [13]:
# Load and examine raw data
dataset = pd.read_csv('../../data/raw/dataset.csv')
severity = pd.read_csv('../../data/raw/Symptom-severity.csv')

print("Dataset shape:", dataset.shape)
print("\nDataset columns:")
print(dataset.columns.tolist())


# Check for missing values
print("\nMissing values in dataset:")
print(dataset.isnull().sum())

Dataset shape: (4920, 18)

Dataset columns:
['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9', 'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14', 'Symptom_15', 'Symptom_16', 'Symptom_17']

Missing values in dataset:
Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64


In [14]:
# Create severity dictionary
severity_dict = dict(zip(severity['Symptom'], severity['weight']))


In [15]:
# Get all unique symptoms (this part worked before)
all_symptoms = []
for col in dataset.columns:
    if 'Symptom_' in col:
        all_symptoms.extend(dataset[col].dropna().unique())
unique_symptoms = list(set(all_symptoms) | set(severity['Symptom'].unique()))

print("Number of unique symptoms:", len(unique_symptoms))
print("\nSample of unique symptoms:", unique_symptoms[:5])

Number of unique symptoms: 262

Sample of unique symptoms: [' increased_appetite', ' distention_of_abdomen', ' loss_of_balance', ' irritability', ' weakness_of_one_body_side']


In [16]:
def create_symptom_features(row):
    symptoms_dict = {symptom: 0 for symptom in unique_symptoms}
    
    for col in dataset.columns:
        if 'Symptom_' in col and pd.notna(row[col]):
            symptom = row[col].strip()
            if symptom in severity_dict:
                symptoms_dict[symptom] = severity_dict[symptom]
    return pd.Series(symptoms_dict)

In [17]:
# Create feature matrix
X = dataset.apply(create_symptom_features, axis=1)

In [18]:

# Print diagnostic information
print("\nSample of created features:")
print(X.head())
print("\nNumber of non-zero values:", (X != 0).sum().sum())


Sample of created features:
    increased_appetite   distention_of_abdomen   loss_of_balance  \
0                    0                       0                 0   
1                    0                       0                 0   
2                    0                       0                 0   
3                    0                       0                 0   
4                    0                       0                 0   

    irritability   weakness_of_one_body_side  loss_of_balance  sunken_eyes  \
0              0                           0                0            0   
1              0                           0                0            0   
2              0                           0                0            0   
3              0                           0                0            0   
4              0                           0                0            0   

   bladder_discomfort  swelling_joints  patches_in_throat  ...  \
0                   0      

In [19]:
# Encode target
le = LabelEncoder()
y = le.fit_transform(dataset['Disease'])


In [20]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [21]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [22]:
# Save processed data
processed_data = {
    'X_train': X_train_scaled,
    'X_test': X_test_scaled,
    'y_train': y_train,
    'y_test': y_test,
    'feature_names': X.columns.tolist(),
    'target_names': le.classes_
}

In [23]:
joblib.dump(processed_data, '../../data/processed/processed_data_v2.joblib')
joblib.dump(scaler, '../../data/processed/scaler_v2.joblib')
joblib.dump(le, '../../data/processed/label_encoder_v2.joblib')

print("\nData preprocessing completed!")
print(f"Number of features: {X.shape[1]}")
print(f"Number of classes: {len(le.classes_)}")


Data preprocessing completed!
Number of features: 262
Number of classes: 41


In [24]:
# Load the processed data
processed_data = joblib.load('../../data/processed/processed_data_v2.joblib')

# Print the keys in the processed data
print("Keys in processed data:", processed_data.keys())

# Print information about each component
print("\nTraining data shape:", processed_data['X_train'].shape)
print("Test data shape:", processed_data['X_test'].shape)
print("\nNumber of features:", len(processed_data['feature_names']))
print("Number of classes:", len(processed_data['target_names']))



Keys in processed data: dict_keys(['X_train', 'X_test', 'y_train', 'y_test', 'feature_names', 'target_names'])

Training data shape: (3936, 262)
Test data shape: (984, 262)

Number of features: 262
Number of classes: 41


In [25]:
# Print sample of feature names
print("\nSample of feature names:")
processed_data['feature_names'][:5]



Sample of feature names:


[' increased_appetite',
 ' distention_of_abdomen',
 ' loss_of_balance',
 ' irritability',
 ' weakness_of_one_body_side']

In [26]:

# Print sample of target names (diseases)
print("\nSample of target names (diseases):")
processed_data['target_names'][:5]




Sample of target names (diseases):


array(['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne',
       'Alcoholic hepatitis', 'Allergy'], dtype=object)

In [27]:

print("\nSample of training data:")
processed_data['X_train'][:2]


Sample of training data:


array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         3.67749714, -0.14856712, -0.15297364, -0.22081891, -0.14945789,
        -0.32163376,  0.        ,  0.        , -0.15811388, -0.15297364,
        -0.32018902, -0.15384115,  0.        , -0.36063873,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        , -0.15470424,
         0.        ,  0.        , -0.61847223, -0.15811388,  0.        ,
         0.        ,  6.69084762, -0.22018542,  0.        ,  0.        ,
         0.        , -0.21763644, -0.21763644,  0.        ,  0.        ,
         0.        ,  0.        , -0.14945789, -0.14856712,  0.        ,
         0.        , -0.14945789, -0.15811388,  0.        ,  0.        ,
         0.        , -0.15384115,  0.        ,  0.        , -0.15556299,
         0.        ,  0.        , -0.15470424, -0.15556299, -0.15811388,
         0.        , -0.14945789,  0.        , -0.15297364,  0.        ,
         0.        , -0.15811388, -0.15556299, -0.1