In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import shap
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [39]:
# Load the encoded dataset from the processed-data directory.
encoded_data_path = '../data/processed/encoded_data.csv'
encoded_df = pd.read_csv(encoded_data_path)

In [40]:
# Name of the target column -- make sure this is the disease column name.
disease_col = 'Disease'

# Target labels first, then the feature matrix (every column except the target).
y = encoded_df[disease_col]
X = encoded_df.drop(columns=[disease_col])

In [41]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same across re-runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [42]:
# Scaling (if needed for specific models)
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test rows enter the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [43]:
# Feature Selection (SHAP)
# Fit a random forest on the scaled training data, compute SHAP attributions
# for the scaled test rows, and keep the feature columns whose mean absolute
# SHAP value exceeds `threshold`.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# NOTE(review): the explainer is given `model.predict`, so attributions are
# computed on the predicted label values rather than on class probabilities.
# If `Disease` is a nominal, integer-encoded target, the SHAP magnitudes (and
# therefore the threshold below) depend on the arbitrary label encoding.
# Consider explaining `model.predict_proba` and aggregating across classes --
# TODO confirm before relying on the selected feature set.
explainer = shap.Explainer(model.predict, X_train_scaled)
shap_values = explainer(X_test_scaled)
# shap.summary_plot(shap_values, X_test_scaled, feature_names=X.columns)

# Calculate average absolute SHAP values (averaged over axis 0, the explained rows)
shap_values_abs = np.abs(shap_values.values).mean(axis=0)

# Adjust the threshold as needed
threshold = 0.1  # Experiment with different thresholds

# Select important features
important_mask = shap_values_abs > threshold
important_features = X.columns[important_mask]

# Filter train and test data.
# Column positions come straight from the boolean mask rather than from a
# name lookup: this avoids a per-column membership scan and stays correct
# even if two columns happen to share a name.
important_feature_indices = np.flatnonzero(important_mask).tolist()
X_train_selected = X_train_scaled[:, important_feature_indices]
X_test_selected = X_test_scaled[:, important_feature_indices]

print("Selected Features:", important_features)

PermutationExplainer explainer: 361it [01:49,  2.98it/s]                         

Selected Features: Index(['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_7',
       'Symptom_8', 'Symptom_9', 'Symptom_10', 'Symptom_11', 'Symptom_12',
       'Symptom_13', 'Symptom_14', 'Symptom_15'],
      dtype='object')





In [44]:
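# NOTE: superseded draft. This and the commented-out cells below (apart from
# the matplotlib version check) repeat steps of the SHAP feature-selection
# cell above; kept commented out for reference only.
# # Feature Selection (SHAP example - using Random Forest as an example model)
# model = RandomForestClassifier(random_state=42)
# model.fit(X_train_scaled, y_train)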

In [45]:
# import matplotlib
# print(matplotlib.__version__)

In [46]:
# explainer = shap.Explainer(model.predict, X_train_scaled)
# shap_values = explainer(X_test_scaled)
# # shap.summary_plot(shap_values, X_test_scaled, feature_names=X.columns) #show feature names.

In [47]:
# shap_values_abs = np.abs(shap_values.values).mean(axis=0)
# important_features = X.columns[shap_values_abs > 0.1] #change 0.1 to your threshold.

In [48]:
# X_train_selected = X_train_scaled[:, [i for i, col in enumerate(X.columns) if col in important_features]]
# X_test_selected = X_test_scaled[:, [i for i, col in enumerate(X.columns) if col in important_features]]

# print("Selected Features:", important_features)