In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# --- Load the raw dataset -----------------------------------------------------
# Path points at the Colab working directory; adjust when running elsewhere.
file_path = '/content/Disease_symptom_and_patient_profile_dataset.csv'
data = pd.read_csv(file_path)

# Work on a copy so the frame as loaded stays available in `data`.
df = data.copy()

# Impute missing values column-by-column with that column's most frequent value
# (first row of df.mode()).
if df.isnull().sum().sum() > 0:
    df = df.fillna(df.mode().iloc[0])

# --- Encode categorical columns as integers -----------------------------------
# One LabelEncoder is fitted per column and kept in `label_encoders` so that the
# same mapping can be reused later to encode new inputs and to decode predicted
# 'Disease' codes back to their names.
categorical_columns = ['Disease', 'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing',
                       'Gender', 'Blood Pressure', 'Cholesterol Level']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# --- Assemble feature matrix and target ---------------------------------------
# 'Age' is the only feature that is not label-encoded above.
relevant_features = ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing',
                     'Age', 'Gender', 'Blood Pressure', 'Cholesterol Level']
X = df[relevant_features]
y = df['Disease']

# Standardise every feature; the fitted `scaler` is reused for new inputs.
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split, so test-row statistics influence the scaling -- confirm this is
# acceptable or move the fit onto the training split.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Drop classes that are too small ------------------------------------------
# A class with a single sample can neither be split nor oversampled by SMOTE.
class_counts = y.value_counts()
min_class_count = class_counts.min()

if min_class_count < 2:
    print("Some classes have fewer than 2 samples. Removing these classes.")
    valid_classes = class_counts[class_counts >= 2].index
    mask = y.isin(valid_classes)
    # Index the NumPy array with a plain boolean array (not a pandas Series).
    X_scaled = X_scaled[mask.to_numpy()]
    y = y[mask].reset_index(drop=True)

class_counts = y.value_counts()
min_class_count = class_counts.min()

# --- Hold out the test set BEFORE oversampling --------------------------------
# SMOTE creates synthetic rows by interpolating between real neighbours. If it
# runs before the split, synthetic rows derived from test rows land in the
# training set (and vice versa), which leaks information and inflates the
# reported accuracy. The test set therefore contains real rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# --- Oversample the training split only ---------------------------------------
# SMOTE needs at least 2 samples of a class to interpolate, so classes that are
# left with a single training row are kept as-is instead of being oversampled.
train_class_counts = y_train.value_counts()
target_count = int(train_class_counts.max())
oversample_counts = train_class_counts[
    (train_class_counts >= 2) & (train_class_counts < target_count)
]
sampling_strategy = {cls: target_count for cls in oversample_counts.index}

# k_neighbors must be smaller than the smallest class being oversampled.
if len(oversample_counts) > 0:
    k_neighbors_value = max(1, min(5, int(oversample_counts.min()) - 1))
else:
    k_neighbors_value = 1
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42,
              k_neighbors=k_neighbors_value)
if sampling_strategy:
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
else:
    # Nothing to oversample: the training split is used unchanged.
    X_resampled, y_resampled = X_train, y_train

# From here on the model is trained on the resampled training data.
X_train, y_train = X_resampled, y_resampled

columns = X.columns
df_resampled = pd.DataFrame(X_resampled, columns=columns)
# Assign positionally so a non-default index on y_resampled cannot misalign rows.
df_resampled['Disease'] = np.asarray(y_resampled)

# --- Hyper-parameter search ---------------------------------------------------
# Every CV fit only sees (cv_folds - 1) / cv_folds of the training rows, so
# n_neighbors must not exceed that fold size; the lower bound is clamped too so
# the candidate range is never empty on very small data.
cv_folds = 5
n_neighbors_max = max(1, min(15, (len(X_train) * (cv_folds - 1)) // cv_folds))
n_neighbors_min = min(3, n_neighbors_max)
param_grid = {
    'n_neighbors': list(range(n_neighbors_min, n_neighbors_max + 1)),
    'weights': ['uniform', 'distance'],
    # 'minkowski' with the default p=2 is the same distance as 'euclidean'.
    'metric': ['euclidean', 'manhattan', 'minkowski']
}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=cv_folds,
                           scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_knn = grid_search.best_estimator_
print(f"Best KNN Parameters: {grid_search.best_params_}")

# --- Evaluation on the untouched test split -----------------------------------
y_pred = best_knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy of the optimized KNN model: {accuracy * 100:.2f}%")

# Report over every class that occurs in either the truth or the predictions and
# pass the same list as `labels`, so that `target_names` always lines up even
# when the model predicts a class that is absent from y_test.
unique_classes = np.union1d(y_test, y_pred).tolist()
target_names = list(label_encoders['Disease'].inverse_transform(unique_classes))
print("\nClassification Report:")
report = classification_report(y_test, y_pred, labels=unique_classes,
                               target_names=target_names)
print(report)

# --- Collect a new patient's profile interactively ---------------------------
new_patient = {}
features = ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Age', 'Gender', 'Blood Pressure', 'Cholesterol Level']
for feature in features:
    value = input(f"Enter {feature}: ")
    # Surrounding whitespace would otherwise make a valid answer look unknown.
    new_patient[feature] = value.strip()

try:
    # Encode the answers in the exact column order the scaler/model were fitted on.
    new_patient_encoded = []
    for feature in X.columns:
        raw_value = new_patient[feature]
        if feature in label_encoders:
            encoder = label_encoders[feature]
            # Match on the text form of each known category: exact match first,
            # then case-insensitive, so "yes" is accepted for "Yes".
            exact_lookup = {str(cls): cls for cls in encoder.classes_}
            folded_lookup = {str(cls).casefold(): cls for cls in encoder.classes_}
            if raw_value in exact_lookup:
                category = exact_lookup[raw_value]
            elif raw_value.casefold() in folded_lookup:
                category = folded_lookup[raw_value.casefold()]
            else:
                raise ValueError(
                    f"Invalid value {raw_value!r} for {feature}; "
                    f"expected one of {list(exact_lookup)}"
                )
            new_patient_encoded.append(encoder.transform([category])[0])
        else:
            # Non-categorical features (e.g. Age) arrive as text from input() and
            # must be converted explicitly rather than left as strings.
            new_patient_encoded.append(float(raw_value))

    # Wrap in a DataFrame so the scaler sees the same feature names it was fitted with.
    new_patient_frame = pd.DataFrame([new_patient_encoded], columns=X.columns)
    new_patient_scaled = scaler.transform(new_patient_frame)

    predicted_disease = best_knn.predict(new_patient_scaled)
    predicted_disease_label = label_encoders['Disease'].inverse_transform(predicted_disease)
    print(f"\nPredicted Disease for the new patient: {predicted_disease_label[0]}")

    # Rank every resampled case by cosine similarity to the new patient.
    similarities = cosine_similarity(new_patient_scaled, X_resampled)
    df_resampled['Similarity_Score'] = similarities[0]

    similar_cases = df_resampled.sort_values(by='Similarity_Score', ascending=False)

    # Decode all disease codes in one vectorised call instead of once per row.
    similar_cases['Disease'] = label_encoders['Disease'].inverse_transform(
        similar_cases['Disease'].astype(int)
    )
    # The three most similar cases that belong to distinct diseases.
    top_unique_diseases = similar_cases['Disease'].drop_duplicates().head(3).tolist()

    print("\nTop 3 Predicted Diseases:")
    print(top_unique_diseases)

except Exception as e:
    # Deliberately broad: bad user input should be reported, not crash the cell.
    print(f"Error encoding or predicting for the new patient: {e}")

Some classes have fewer than 2 samples. Removing these classes.
Best KNN Parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}

Accuracy of the optimized KNN model: 87.75%

Classification Report:
                                              precision    recall  f1-score   support

                           Allergic Rhinitis       1.00      1.00      1.00         7
                         Alzheimer's Disease       1.00      1.00      1.00         6
                           Anxiety Disorders       0.83      1.00      0.91         5
                                      Asthma       1.00      0.50      0.67         8
                                  Bronchitis       1.00      0.80      0.89         5
                                  Chickenpox       1.00      1.00      1.00         6
                                     Cholera       1.00      1.00      1.00         3
Chronic Obstructive Pulmonary Disease (COPD)       1.00      1.00      1.00         5
      