Label Encodings for changes to numeric values

In [65]:
# Import necessary libraries
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import ast

In [66]:
# Load the data from the CSV file
file_path = r'D:\\CADT University\\CADT-Y3\\CodeAllSub\\Capstone\\Capstone-Project-I\\data\\processed\\lyleab_file\\Cleaned_Top15Diseases.csv'
df = pd.read_csv(file_path)
df.head(2)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Symptom_Count,Symptoms
0,Diabetes,fatigue,weight_loss,restlessness,lethargy,irregular_sugar_level,blurred_and_distorted_vision,obesity,excessive_hunger,increased_appetite,polyuria,,,,,,,,10,"[' fatigue', ' weight_loss', ' restlessness', ..."
1,Diabetes,fatigue,weight_loss,restlessness,lethargy,irregular_sugar_level,blurred_and_distorted_vision,obesity,excessive_hunger,increased_appetite,polyuria,,,,,,,,10,"[' fatigue', ' weight_loss', ' restlessness', ..."


In [67]:
# Count the occurrences of each disease in the dataset
disease_counts = df['Disease'].value_counts()
print(f'Disease counts:\n{disease_counts}')

Disease counts:
Disease
Diabetes           120
Migraine           120
Chicken pox        120
Dengue             120
Typhoid            120
hepatitis A        120
Hepatitis B        120
Hepatitis D        120
Hepatitis E        120
Tuberculosis       120
Common Cold        120
Pneumonia          120
Hypothyroidism     120
Hyperthyroidism    120
Hypoglycemia       120
Name: count, dtype: int64


In [68]:
# Drop all columns in the dataset except "Disease", "Symptom_Count", and "Symptoms" columns
df_filtered = df[["Disease", "Symptom_Count", "Symptoms"]]
df_filtered.head(2)

Unnamed: 0,Disease,Symptom_Count,Symptoms
0,Diabetes,10,"[' fatigue', ' weight_loss', ' restlessness', ..."
1,Diabetes,10,"[' fatigue', ' weight_loss', ' restlessness', ..."


In [69]:
# Encode the "Disease" column
label_encoder_disease = LabelEncoder()
df_filtered["Disease_Encoded"] = label_encoder_disease.fit_transform(df_filtered['Disease'])
df_filtered.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["Disease_Encoded"] = label_encoder_disease.fit_transform(df_filtered['Disease'])


Unnamed: 0,Disease,Symptom_Count,Symptoms,Disease_Encoded
0,Diabetes,10,"[' fatigue', ' weight_loss', ' restlessness', ...",3
1,Diabetes,10,"[' fatigue', ' weight_loss', ' restlessness', ...",3


In [70]:
# Create the mapping of encoded values to original values for diseases
disease_mapping = dict(zip(label_encoder_disease.classes_, label_encoder_disease.transform(label_encoder_disease.classes_)))
disease_mapping

{'Chicken pox': 0,
 'Common Cold': 1,
 'Dengue': 2,
 'Diabetes ': 3,
 'Hepatitis B': 4,
 'Hepatitis D': 5,
 'Hepatitis E': 6,
 'Hyperthyroidism': 7,
 'Hypoglycemia': 8,
 'Hypothyroidism': 9,
 'Migraine': 10,
 'Pneumonia': 11,
 'Tuberculosis': 12,
 'Typhoid': 13,
 'hepatitis A': 14}

In [71]:
# Count the unique symptom names in the "Symptoms" column
unique_symptoms = set()
for symptoms_list in df_filtered["Symptoms"]:
    symptoms = ast.literal_eval(symptoms_list)
    unique_symptoms.update(symptoms)
print("Number of unique symptoms:", len(unique_symptoms))

Number of unique symptoms: 75


In [72]:
# Apply MultiLabelBinarizer to encode the symptoms
mlb = MultiLabelBinarizer()
df_symptoms_encoded = pd.DataFrame(mlb.fit_transform(df_filtered["Symptoms"]), columns=mlb.classes_)

# Concatenate the encoded symptoms with the filtered dataset, dropping the original Symptoms column
df_encoded = pd.concat([df_filtered.drop("Symptoms", axis=1), df_symptoms_encoded], axis=1)
df_encoded.head(2)

Unnamed: 0,Disease,Symptom_Count,Disease_Encoded,Unnamed: 4,',(,),",",[,],...,p,r,s,t,u,v,w,x,y,z
0,Diabetes,10,3,1,1,0,0,1,1,1,...,1,1,1,1,1,1,1,1,1,0
1,Diabetes,10,3,1,1,0,0,1,1,1,...,1,1,1,1,1,1,1,1,1,0


In [73]:
df_encoded = df_encoded.drop(columns=["Disease", "Symptom_Count"])
df_encoded.head(2)

Unnamed: 0,Disease_Encoded,Unnamed: 2,',(,),",",[,],_,a,...,p,r,s,t,u,v,w,x,y,z
0,3,1,1,0,0,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0
1,3,1,1,0,0,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0


In [76]:
print(df_encoded['Disease_Encoded'].value_counts())

Disease_Encoded
3     120
10    120
0     120
2     120
13    120
14    120
4     120
5     120
6     120
12    120
1     120
11    120
9     120
7     120
8     120
Name: count, dtype: int64


In [74]:
# Save the encoded dataset to a CSV file
encoded_file_path = 'D:\CADT University\CADT-Y3\CodeAllSub\Capstone\Capstone-Project-I\data\processed\lyleab_file\encoded_cleaned_top15diseases.csv'
df_encoded.to_csv(encoded_file_path, index=False)

# Optionally save the LabelEncoders for Disease and Symptoms (for future use)
joblib.dump(label_encoder_disease, 'disease_encoder.pkl')
joblib.dump(mlb, 'symptom_encoder.pkl')

# Confirmation
print(f'Encoded data saved to {encoded_file_path}')
print('Label encoders saved: disease_encoder.pkl, symptom_encoder.pkl')


Encoded data saved to D:\CADT University\CADT-Y3\CodeAllSub\Capstone\Capstone-Project-I\data\processed\lyleab_file\encoded_cleaned_top15diseases.csv
Label encoders saved: disease_encoder.pkl, symptom_encoder.pkl


  encoded_file_path = 'D:\CADT University\CADT-Y3\CodeAllSub\Capstone\Capstone-Project-I\data\processed\lyleab_file\encoded_cleaned_top15diseases.csv'
