In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Load the pre-split data: one CSV for fitting, one held out for evaluation.
# Both files are read from the notebook's working directory.
# NOTE(review): the two file names differ in capitalisation ("training" vs
# "Testing"); this matters on case-sensitive filesystems - confirm they match
# the files on disk.
train = pd.read_csv("training.csv")
test = pd.read_csv("Testing.csv")


In [3]:
# Remove columns that carry no information, from both splits:
#   - 'fluid_overload' only ever contains 0;
#   - 'Unnamed: 133' is all NaN (dropped only if the CSV actually has it).
# errors="ignore" skips labels that are not present, so this cell can be re-run
# after the columns are already gone without raising KeyError.
uninformative_columns = ['fluid_overload', 'Unnamed: 133']
train = train.drop(columns=uninformative_columns, errors="ignore")
test = test.drop(columns=uninformative_columns, errors="ignore")

In [4]:
# Preview the first rows of the training table after the column drop above.
train.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [5]:
# Preview the first rows of the held-out table after the column drop above.
test.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Allergy
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,GERD
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Drug Reaction


In [6]:
# Baseline ("before feature selection") matrices: 'prognosis' is the label and
# every other remaining column is used as a feature.
y_train_before = train['prognosis']
y_test_before = test['prognosis']
X_train_before = train.drop(columns='prognosis')
X_test_before = test.drop(columns='prognosis')

In [7]:
# Hand-picked symptom columns excluded before automatic feature selection.
# The same list is applied to both splits so their columns stay aligned.
# NOTE(review): the criterion used to pick these columns is not recorded in
# this notebook - worth documenting.
manually_removed = [
    'burning_micturition',
    'swelled_lymph_nodes',
    'malaise',
    'phlegm',
    'dischromic _patches',
    'family_history',
    'mucoid_sputum',
    'rusty_sputum',
    'coma',
    'palpitations',
]
train_dropped = train.drop(columns=manually_removed)
test_dropped = test.drop(columns=manually_removed)

In [8]:
# Feature/label split of the manually reduced tables; these are the inputs to
# the recursive feature elimination step.
y_train = train_dropped['prognosis']
y_test = test_dropped['prognosis']
X_train = train_dropped.drop(columns='prognosis')
X_test = test_dropped.drop(columns='prognosis')

In [9]:
# Recursive feature elimination (RFE) driven by a decision tree.
# The tree is seeded so the selected feature set - and therefore everything
# derived from it further down - is the same on every Restart & Run All.
dct = DecisionTreeClassifier(random_state=42)

# Create the RFE model and keep the 40 highest-ranked features
rfe = RFE(estimator=dct, n_features_to_select=40)

# Fit the RFE model on the training split only
rfe.fit(X_train, y_train)

# Reduce both splits to the 40 selected features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

In [10]:
# Decision tree trained on the RFE-selected features, scored on the held-out
# split. Seeded so the reported numbers are reproducible; the metric functions
# are imported in the notebook's import cell.
model1 = DecisionTreeClassifier(random_state=42)
model1.fit(X_train_rfe, y_train)
y_pred1 = model1.predict(X_test_rfe)

accuracy = accuracy_score(y_test, y_pred1)
print(f'Accuracy after feature selection: {accuracy * 100:.8f}%')

# Macro averaging: every class contributes equally regardless of its support.
precision = precision_score(y_test, y_pred1, average='macro')
print(f'Precision (macro): {precision:.8f}')

f1 = f1_score(y_test, y_pred1, average='macro')
print(f'F1 Score (macro): {f1:.8f}')

Accuracy after feature selection: 97.61904762%
Precision (macro): 0.98780488
F1 Score (macro): 0.98373984


In [11]:
# Baseline decision tree trained on all features (no selection), scored on the
# held-out split for comparison with the feature-selected model.
# Seeded so the reported numbers are reproducible.
model2 = DecisionTreeClassifier(random_state=42)
model2.fit(X_train_before, y_train_before)
y_pred2 = model2.predict(X_test_before)

accuracy = accuracy_score(y_test_before, y_pred2)
print(f'Accuracy before feature selection: {accuracy * 100:.8f}%')

precision = precision_score(y_test_before, y_pred2, average='macro')  # You can change 'macro' to 'micro' or 'weighted'
print(f'Precision (macro): {precision:.8f}')

# Score this model's own predictions (y_pred2) against its own labels; the
# previous version scored y_pred1 and so repeated the other model's F1.
f1 = f1_score(y_test_before, y_pred2, average='macro')  # You can change 'macro' to 'micro' or 'weighted'
print(f'F1 Score (macro): {f1:.8f}')

Accuracy before feature selection: 97.61904762%
Precision (macro): 0.98780488
F1 Score (macro): 0.98373984


In [12]:
# Positions (within X_train's columns) of the features RFE kept.
# rfe.support_ is a 1-D boolean mask, so np.where yields a one-element tuple.
(selected_features,) = np.where(rfe.support_)
print(f'Selected features: {selected_features}')

Selected features: [  0   2   5   7  11  12  13  18  21  23  26  32  35  38  39  40  51  53
  54  59  69  72  74  75  81  84  87  92  93  94  96  99 100 103 104 105
 109 112 115 118]


In [13]:
# Translate the selected column positions into column names of X_train.
selected_features = list(selected_features)
columns = list(X_train.columns)
final_features = [columns[position] for position in selected_features]

In [14]:
# Show the names of the selected features (same order as the indices above).
print(final_features)

['itching', 'nodal_skin_eruptions', 'chills', 'stomach_pain', 'vomiting', 'spotting_ urination', 'fatigue', 'weight_loss', 'patches_in_throat', 'cough', 'breathlessness', 'dark_urine', 'pain_behind_the_eyes', 'abdominal_pain', 'diarrhoea', 'mild_fever', 'chest_pain', 'fast_heart_rate', 'pain_during_bowel_movements', 'dizziness', 'excessive_hunger', 'slurred_speech', 'hip_joint_pain', 'muscle_weakness', 'unsteadiness', 'bladder_discomfort', 'passage_of_gases', 'muscle_pain', 'altered_sensorium', 'red_spots_over_body', 'abnormal_menstruation', 'polyuria', 'lack_of_concentration', 'receiving_unsterile_injections', 'stomach_bleeding', 'distention_of_abdomen', 'prominent_veins_on_calf', 'blackheads', 'silver_like_dusting', 'blister']


In [15]:
# Final modelling data: the selected feature columns taken from the baseline
# matrices; the labels are reused unchanged.
y_train_final, y_test_final = y_train_before, y_test_before
X_test_final = X_test_before[final_features]
X_train_final = X_train_before[final_features]

In [16]:
# Sanity check: first 10 rows of the final training matrix (selected columns only).
X_train_final.head(10)

Unnamed: 0,itching,nodal_skin_eruptions,chills,stomach_pain,vomiting,spotting_ urination,fatigue,weight_loss,patches_in_throat,cough,...,abnormal_menstruation,polyuria,lack_of_concentration,receiving_unsterile_injections,stomach_bleeding,distention_of_abdomen,prominent_veins_on_calf,blackheads,silver_like_dusting,blister
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Model Prediction

In [17]:
# Random Forest Classifier
# 150 trees split on entropy, fitted on the selected features. The forest is
# seeded so the per-class report is reproducible across runs.
# RandomForestClassifier is imported in the notebook's import cell.
model_RFC = RandomForestClassifier(n_estimators=150, criterion="entropy", random_state=42)
model_RFC.fit(X_train_final, y_train_final)
RFC_pred = model_RFC.predict(X_test_final)
print(classification_report(y_test_final, RFC_pred))

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       0.50      1.00      0.67         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
                           

In [18]:
# KNN
# 3-nearest-neighbour classifier with Euclidean distance on the selected
# features. KNeighborsClassifier is imported in the notebook's import cell.
model_knn = KNeighborsClassifier(n_neighbors=3, metric="euclidean")
model_knn.fit(X_train_final, y_train_final)
knn_pred = model_knn.predict(X_test_final)
print(classification_report(y_test_final, knn_pred))

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
                           

In [19]:
# Gradient Boosting
# 150 boosting stages on the selected features; seeded so the per-class report
# is reproducible across runs.
# GradientBoostingClassifier is imported in the notebook's import cell.
model_gbc = GradientBoostingClassifier(n_estimators=150, random_state=42)
model_gbc.fit(X_train_final, y_train_final)
gbc_pred = model_gbc.predict(X_test_final)
print(classification_report(y_test_final, gbc_pred))

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       0.50      1.00      0.67         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
                           

In [20]:
# Stacked ensemble: the three classifiers evaluated individually above become
# base learners, and a logistic regression combines their outputs.
# All estimator classes and joblib are imported in the notebook's import cell.

# Initialize base models (fresh, unfitted instances with the same
# hyper-parameters as the standalone models; stochastic ones are seeded so the
# fitted ensemble is reproducible)
base_models = [
    ('knn', KNeighborsClassifier(n_neighbors=3, metric="euclidean")),
    ('rfc', RandomForestClassifier(n_estimators=150, criterion="entropy", random_state=42)),
    ('gbc', GradientBoostingClassifier(n_estimators=150, random_state=42))
]

# Initialize meta-learner
meta_learner = LogisticRegression()

# Create and train the Stacking Classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_learner)
stacking_clf.fit(X_train_final, y_train_final)



In [21]:
# Score the stacked ensemble on the held-out split: per-class report first,
# then overall accuracy.
final_pred = stacking_clf.predict(X_test_final)
stacking_report = classification_report(y_test_final, final_pred)
print(stacking_report)
print(accuracy_score(y_test_final, final_pred))

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
                           

In [36]:
# Saving the Model
# Write the fitted stacking ensemble to 'disease_prediction_model.pkl' in the
# working directory. The model was fitted on the columns in `final_features`,
# so callers that load it must supply those same columns.
# NOTE(review): joblib files are pickle-based - only load them from trusted sources.
joblib.dump(stacking_clf, 'disease_prediction_model.pkl')

['disease_prediction_model.pkl']