In [27]:
# Load the libraries as required.

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, RobustScaler, FunctionTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
import shap
import pickle
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import pandas as pd
import warnings

In [28]:
############################## LOAD DATA SET BELOW #################################

# Locations of the raw CSV files, relative to the notebook's working directory.
training_file = "raw/Training.csv"
testing_file = "raw/Testing.csv"

# Read both splits and strip the stray 'Unnamed: 133' column in one expression.
# pandas gives header-less columns an 'Unnamed: <position>' name; presumably this
# one comes from a trailing delimiter in the CSV -- TODO confirm against the raw file.
# errors='ignore' makes the drop a no-op when the column is absent, and applying it
# to BOTH frames keeps the train and test feature sets identical. Not mutating in
# place also keeps the cell safe to re-run.
disease_training = pd.read_csv(training_file).drop(columns=['Unnamed: 133'], errors='ignore')
disease_testing = pd.read_csv(testing_file).drop(columns=['Unnamed: 133'], errors='ignore')

    

In [29]:
############################## SPLIT X AND Y BELOW #################################

# Features: every column except the 'prognosis' target.
X_train = disease_training.drop('prognosis', axis=1)
X_test = disease_testing.drop('prognosis', axis=1)

# Targets: the 'prognosis' column of each split.
y_train, y_test = disease_training['prognosis'], disease_testing['prognosis']

# Print a structural summary of the training features.
X_train.info()

# Only the final expression of a cell is rendered, so show the test target here.
y_test

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Columns: 132 entries, itching to yellow_crust_ooze
dtypes: int64(132)
memory usage: 5.0 MB


0                            Fungal infection
1                                     Allergy
2                                        GERD
3                         Chronic cholestasis
4                               Drug Reaction
5                         Peptic ulcer diseae
6                                        AIDS
7                                   Diabetes 
8                             Gastroenteritis
9                            Bronchial Asthma
10                              Hypertension 
11                                   Migraine
12                       Cervical spondylosis
13               Paralysis (brain hemorrhage)
14                                   Jaundice
15                                    Malaria
16                                Chicken pox
17                                     Dengue
18                                    Typhoid
19                                hepatitis A
20                                Hepatitis B
21                                

In [30]:
############################## ENCODE TARGET BELOW #################################

###### TRAIN SET #########
# Encode the target variable (categorical y -> integer codes).
# The encoder is fitted on the training labels only.
label_num_encoder = LabelEncoder()
y_train_num_encoded = label_num_encoder.fit_transform(y_train)  # Convert to integers

# NOTE: despite the wording of the message, this prints the encoded label array,
# not label_num_encoder.classes_.
print(f'label encoder classes: {y_train_num_encoded}')

# Encode the target variable (categorical y -> one-hot encoding).
one_hot_encoder = OneHotEncoder(sparse_output=False)  # Use sparse_output instead of sparse
y_train_hot_encoded = one_hot_encoder.fit_transform(y_train.values.reshape(-1, 1))

print(f"One-hot encoder categories: {one_hot_encoder.categories_}")

###### TEST SET #########
# Reuse the encoders fitted on the training labels instead of fitting new ones on
# y_test. Fitting separately derives the code / column assignment from the test
# set's own class list, which need not match the training encoding when the two
# label sets differ. With .transform(), a test label that was not seen during
# fitting raises ValueError instead of being silently mis-encoded.
# The *_test names are kept as aliases so any later code that refers to them still works.
label_num_encoder_test = label_num_encoder
y_test_num_encoded = label_num_encoder_test.transform(y_test)  # Convert to integers

one_hot_encoder_test = one_hot_encoder
y_test_hot_encoded = one_hot_encoder_test.transform(y_test.values.reshape(-1, 1))

label encoder classes: [15 15 15 ... 38 35 27]
One-hot encoder categories: [array(['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne',
       'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Bronchial Asthma',
       'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis',
       'Common Cold', 'Dengue', 'Diabetes ',
       'Dimorphic hemmorhoids(piles)', 'Drug Reaction',
       'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Hypertension ', 'Hyperthyroidism', 'Hypoglycemia',
       'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine',
       'Osteoarthristis', 'Paralysis (brain hemorrhage)',
       'Peptic ulcer diseae', 'Pneumonia', 'Psoriasis', 'Tuberculosis',
       'Typhoid', 'Urinary tract infection', 'Varicose veins',
       'hepatitis A'], dtype=object)]


In [38]:
############################## SAVE DATA TO PICKLE FILE BELOW #################################

############## TRAIN DATA ################
# Training features bundled with each representation of the target.
train_data = {"X_train": X_train, "y_train": y_train}  # y as in the raw data
train_data_to_save_numerical = {  # y as integer codes
    "X_train": X_train,
    "y_train_num_encoded": y_train_num_encoded,
}
train_data_to_save_hot_encoded = {  # y one-hot encoded
    "X_train": X_train,
    "y_train_hot_encoded": y_train_hot_encoded,
}

############## TEST DATA ################
# Test features bundled with each representation of the target.
test_data = {"X_test": X_test, "y_test": y_test}  # y as in the raw data
test_data_to_save_numerical = {  # y as integer codes
    "X_test": X_test,
    "y_test_num_encoded": y_test_num_encoded,
}
test_data_to_save_hot_encoded = {  # y one-hot encoded
    "X_test": X_test,
    "y_test_hot_encoded": y_test_hot_encoded,
}

# Write each bundle to its own pickle file, in the order listed.
for _pickle_path, _bundle in (
    ("train_data.pkl", train_data),
    ("train_data_numerical.pkl", train_data_to_save_numerical),
    ("train_data_hot_encoded.pkl", train_data_to_save_hot_encoded),
    ("test_data.pkl", test_data),
    ("test_data_numerical.pkl", test_data_to_save_numerical),
    ("test_data_hot_encoded.pkl", test_data_to_save_hot_encoded),
):
    with open(_pickle_path, "wb") as f:
        pickle.dump(_bundle, f)

In [39]:
############################## LOAD PICKLE BELOW #################################

# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.

############### Load train data ###############
# Train bundle with the target as in the raw data.
with open("train_data.pkl", "rb") as f:
    loaded_data_raw = pickle.load(f)
loaded_X_train, loaded_y_train = loaded_data_raw["X_train"], loaded_data_raw["y_train"]

# Train bundle with the target as integer codes.
with open("train_data_numerical.pkl", "rb") as f:
    loaded_data_numerical = pickle.load(f)
loaded_X_train_num_encoded, loaded_y_train_num_encoded = (
    loaded_data_numerical["X_train"],
    loaded_data_numerical["y_train_num_encoded"],
)

# Train bundle with the target one-hot encoded.
with open("train_data_hot_encoded.pkl", "rb") as f:
    loaded_data_hot_encoded = pickle.load(f)
loaded_X_train_hot_encoded, loaded_y_train_hot_encoded = (
    loaded_data_hot_encoded["X_train"],
    loaded_data_hot_encoded["y_train_hot_encoded"],
)

############### Load test data  ###############
# `loaded_data` is rebound for each test file; after this cell it holds the
# one-hot test bundle.

# Test bundle with the target as in the raw data.
with open("test_data.pkl", "rb") as f:
    loaded_data = pickle.load(f)
loaded_X_test, loaded_y_test = loaded_data["X_test"], loaded_data["y_test"]

# Test bundle with the target as integer codes.
with open("test_data_numerical.pkl", "rb") as f:
    loaded_data = pickle.load(f)
loaded_X_test_num_encoded, loaded_y_test_num_encoded = (
    loaded_data["X_test"],
    loaded_data["y_test_num_encoded"],
)

# Test bundle with the target one-hot encoded.
with open("test_data_hot_encoded.pkl", "rb") as f:
    loaded_data = pickle.load(f)
loaded_X_test_hot_encoded, loaded_y_test_hot_encoded = (
    loaded_data["X_test"],
    loaded_data["y_test_hot_encoded"],
)