In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Load and Clean the Dataset
df = pd.read_csv('../processed/Top15diseases_clean.csv')
# Assuming symptoms are numerical, so we don't need encoding for symptoms, but diseases need OneHotEncoding
# Separate features (X) and target (y)
X = df.drop(columns=[col for col in df.columns if col.startswith('Disease')])  # Symptoms
y = df[[col for col in df.columns if col.startswith('Disease')]]  # Diseases (one-hot encoded)

# Step 2: Apply Imbalanced Technique – SMOTE
# Convert y (target) to the disease labels (index of the disease column)
y_labels = y.idxmax(axis=1)

# Split data into training and testing (70/30)
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.3, random_state=42)

# Apply SMOTE (oversampling the minority class in training data)
smote = SMOTE(sampling_strategy=0.55, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 3: Data Pre-processing
# Label encoding for symptoms is not needed since they are already numerical

# Step 4: Train-Test Split (already done above)

# Step 5: Try Random Forest Classifier
# Initialize Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# Step 6: Evaluation Metric

# Make predictions
y_pred = model.predict(X_test)

# Convert predictions back to labels (disease names)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print Classification Report (Precision, Recall, F1-score)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Visualize the Confusion Matrix
plt.figure(figsize=(10,7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=y_labels.unique(), yticklabels=y_labels.unique())
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


ValueError: could not convert string to float: ' fatigue'