In [127]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [128]:
train = pd.read_csv("../dataset/Training.csv")
train.shape

(4920, 133)

## Train Test split

In [129]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [130]:
X = train.iloc[: , :-1]
y= train.iloc[:, -1]
X.shape,y.shape

((4920, 132), (4920,))

In [131]:
X.columns

Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'pus_filled_pimples', 'blackheads', 'scurring', 'skin_peeling',
       'silver_like_dusting', 'small_dents_in_nails', 'inflammatory_nails',
       'blister', 'red_sore_around_nose', 'yellow_crust_ooze'],
      dtype='object', length=132)

<!-- ### using label encoder for y or target variable -->

In [132]:
le = LabelEncoder()
le.fit(y)
Y=le.transform(y)
Y



array([15, 15, 15, ..., 38, 35, 27], shape=(4920,))

In [133]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.3, random_state=20)

## Training top models

In [134]:
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB 

from sklearn.metrics import accuracy_score, confusion_matrix


In [135]:
# creating a dictionary to store models
models={
    "SVC":SVC(kernel='linear'),
    # train an svm that tries to sepearte data using straight line or flat hyperplane 
    # since our dataset contains symptoms(for each column) as 0 or 1 not continous values 
    # in such cases of sparse or high dimensional the data points of differnet classes 
    # are often seperable by straight boundary so linear hyperplane works perfect

    "RandomForest":RandomForestClassifier(n_estimators=100, random_state=42),
    # 100 trees is a sweet spot between acuracy and stability 
    # it gives resonable trainnig time and memory usage 
    # after 100 there is very tiny imporvemnet 

    "GradientBoosting":GradientBoostingClassifier(n_estimators=100,random_state=42),

    "KNN":KNeighborsClassifier(n_neighbors=5),
    #n_neighbors=5 default value of scikit learn 
    # balances bias and variance well in most cases 
    # avoids being too sensitive to noise (k=1)
    # keeps computaion resonable for large dataset 
    # all in all a good starting point for knn 

    "MultinomialNB":MultinomialNB()

}



In [136]:
for model_name , model in models.items():
      
    #train model
    model.fit(X_train,y_train)
    #test model
    predictions=model.predict(X_test)
    #calculate accuracy 
    accuracy=accuracy_score(y_test,predictions)
    #calculate confusion matrix
    cm=confusion_matrix(y_test,predictions)

    print(f"{model_name} accuracy:{accuracy}")
    print('Confusion Matrix ')
    print(np.array2string(cm,separator=','))
    

SVC accuracy:1.0
Confusion Matrix 
[[40, 0, 0,..., 0, 0, 0],
 [ 0,43, 0,..., 0, 0, 0],
 [ 0, 0,28,..., 0, 0, 0],
 ...,
 [ 0, 0, 0,...,34, 0, 0],
 [ 0, 0, 0,..., 0,41, 0],
 [ 0, 0, 0,..., 0, 0,31]]
RandomForest accuracy:1.0
Confusion Matrix 
[[40, 0, 0,..., 0, 0, 0],
 [ 0,43, 0,..., 0, 0, 0],
 [ 0, 0,28,..., 0, 0, 0],
 ...,
 [ 0, 0, 0,...,34, 0, 0],
 [ 0, 0, 0,..., 0,41, 0],
 [ 0, 0, 0,..., 0, 0,31]]
GradientBoosting accuracy:1.0
Confusion Matrix 
[[40, 0, 0,..., 0, 0, 0],
 [ 0,43, 0,..., 0, 0, 0],
 [ 0, 0,28,..., 0, 0, 0],
 ...,
 [ 0, 0, 0,...,34, 0, 0],
 [ 0, 0, 0,..., 0,41, 0],
 [ 0, 0, 0,..., 0, 0,31]]
KNN accuracy:1.0
Confusion Matrix 
[[40, 0, 0,..., 0, 0, 0],
 [ 0,43, 0,..., 0, 0, 0],
 [ 0, 0,28,..., 0, 0, 0],
 ...,
 [ 0, 0, 0,...,34, 0, 0],
 [ 0, 0, 0,..., 0,41, 0],
 [ 0, 0, 0,..., 0, 0,31]]
MultinomialNB accuracy:1.0
Confusion Matrix 
[[40, 0, 0,..., 0, 0, 0],
 [ 0,43, 0,..., 0, 0, 0],
 [ 0, 0,28,..., 0, 0, 0],
 ...,
 [ 0, 0, 0,...,34, 0, 0],
 [ 0, 0, 0,..., 0,41, 0],
 [ 0, 0, 

## Single Predictions


In [137]:
svc=SVC(kernel='linear')
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
accuracy_score(y_test,y_pred)

1.0

In [138]:
# saving model
import pickle 
# pickle.dump(svc,open("../models/svc.pkl","wb"))

In [139]:
#load model
svc=pickle.load(open('../models/svc.pkl','rb'))

In [140]:
#X_test.iloc[0].values
# reshaping since this was 1d array but our model assumes input to be 2d 
#test1 
svc_x=X_test.iloc[0].values.reshape(1,-1)
print("Predicted Label:",svc.predict(svc_x))
print("Actual Level:",y_test[0])



Predicted Label: [40]
Actual Level: 40




In [141]:
svc_x=X_test.iloc[10].values.reshape(1,-1)
print("Predicted Label:",svc.predict(svc_x))
print("Actual Level:",y_test[10])

Predicted Label: [20]
Actual Level: 20




Recommendation System and Prediction

Load databases and use logic for recommnedations

In [142]:
sym_disease=pd.read_csv('../dataset/symtoms_df.csv')
precautions=pd.read_csv('../dataset/precautions_df.csv')
workout=pd.read_csv('../dataset/workout_df.csv')
description=pd.read_csv('../dataset/description.csv')
medication=pd.read_csv('../dataset/medications.csv')
diets=pd.read_csv('../dataset/diets.csv')



In [143]:
#mapping function to get the labels for differnet diseases.
def get_label_mapping(y):
    """
    Returns a dictionary mapping encoded label numbers
    to their corresponding disease names.
    Example: {0: 'Fungal infection', 1: 'Allergy', ...}
    """
    mapping = dict(zip(le.transform(le.classes_), le.classes_))
    return mapping

disease_list=get_label_mapping(y)
disease_list

{np.int64(0): '(vertigo) Paroymsal  Positional Vertigo',
 np.int64(1): 'AIDS',
 np.int64(2): 'Acne',
 np.int64(3): 'Alcoholic hepatitis',
 np.int64(4): 'Allergy',
 np.int64(5): 'Arthritis',
 np.int64(6): 'Bronchial Asthma',
 np.int64(7): 'Cervical spondylosis',
 np.int64(8): 'Chicken pox',
 np.int64(9): 'Chronic cholestasis',
 np.int64(10): 'Common Cold',
 np.int64(11): 'Dengue',
 np.int64(12): 'Diabetes ',
 np.int64(13): 'Dimorphic hemmorhoids(piles)',
 np.int64(14): 'Drug Reaction',
 np.int64(15): 'Fungal infection',
 np.int64(16): 'GERD',
 np.int64(17): 'Gastroenteritis',
 np.int64(18): 'Heart attack',
 np.int64(19): 'Hepatitis B',
 np.int64(20): 'Hepatitis C',
 np.int64(21): 'Hepatitis D',
 np.int64(22): 'Hepatitis E',
 np.int64(23): 'Hypertension ',
 np.int64(24): 'Hyperthyroidism',
 np.int64(25): 'Hypoglycemia',
 np.int64(26): 'Hypothyroidism',
 np.int64(27): 'Impetigo',
 np.int64(28): 'Jaundice',
 np.int64(29): 'Malaria',
 np.int64(30): 'Migraine',
 np.int64(31): 'Osteoarthristi

In [144]:
def get_feature_mapping(X):
    """
    Returns a dictionary where each symptom (column name)
    is mapped to a unique integer starting from 0.
    Example: {'itching': 0, 'skin_rash': 1, ...}
    """
    feature_mapping = {feature: idx for idx, feature in enumerate(X.columns)}
    return feature_mapping


symptoms_dict = get_feature_mapping(X)
symptoms_dict


{'itching': 0,
 'skin_rash': 1,
 'nodal_skin_eruptions': 2,
 'continuous_sneezing': 3,
 'shivering': 4,
 'chills': 5,
 'joint_pain': 6,
 'stomach_pain': 7,
 'acidity': 8,
 'ulcers_on_tongue': 9,
 'muscle_wasting': 10,
 'vomiting': 11,
 'burning_micturition': 12,
 'spotting_ urination': 13,
 'fatigue': 14,
 'weight_gain': 15,
 'anxiety': 16,
 'cold_hands_and_feets': 17,
 'mood_swings': 18,
 'weight_loss': 19,
 'restlessness': 20,
 'lethargy': 21,
 'patches_in_throat': 22,
 'irregular_sugar_level': 23,
 'cough': 24,
 'high_fever': 25,
 'sunken_eyes': 26,
 'breathlessness': 27,
 'sweating': 28,
 'dehydration': 29,
 'indigestion': 30,
 'headache': 31,
 'yellowish_skin': 32,
 'dark_urine': 33,
 'nausea': 34,
 'loss_of_appetite': 35,
 'pain_behind_the_eyes': 36,
 'back_pain': 37,
 'constipation': 38,
 'abdominal_pain': 39,
 'diarrhoea': 40,
 'mild_fever': 41,
 'yellow_urine': 42,
 'yellowing_of_eyes': 43,
 'acute_liver_failure': 44,
 'fluid_overload': 45,
 'swelling_of_stomach': 46,
 'swelle

In [145]:
#helper funtion
def helper(dis):
    desc = description[description['Disease'] == dis]['Description']
    desc = "".join([w for w in desc])

    pre = precautions[precautions['Disease']==dis][['Precaution_1','Precaution_2','Precaution_3','Precaution_4']] 
    pre = [col for col in pre.values]

    med = medication[medication['Disease']==dis]['Medication']
    med = [med for med in med.values]

    diet = diets[diets['Disease']==dis]['Diet']
    diet = [diet for diet in diet.values]

    workout_info=workout[workout['disease']==dis]['workout']

    return desc,pre,med,diet,workout_info


# model prediction  fucntion 


def get_predicted_value(patient_symptoms):
    #converting the input from user into a vector
    input_vector =np.zeros(len(symptoms_dict))
    for item in patient_symptoms:
        input_vector[symptoms_dict[item]] = 1
    
    #using our model to predict values
    return disease_list[svc.predict([input_vector])[0]]



In [152]:
# --- test 1 ---
import ast
symptoms = input("Enter your symptoms (comma separated): ")

# Clean & split (handles spaces, quotes, brackets)
user_symptoms = [
    s.strip().strip("[]'\"").lower().replace(" ", "_")
    for s in symptoms.split(',')
    if s.strip()
]

#prediction
pred_disease=get_predicted_value(user_symptoms)
desc,pre,med,diet,workout_info=helper(pred_disease)

print("=============Predicted Disease=============")
print(pred_disease)
print("===============Description=================")
print(desc)
print("===============Precatuion==================")
i=1
for p_i in pre[0]:
    print(i,":",p_i)
    i+=1

print("==============Medications===============")
i = 1
for m_i in med:
    # Convert string like "['A', 'B']" to a real list
    if isinstance(m_i, str) and m_i.startswith('[') and m_i.endswith(']'):
        m_i = ast.literal_eval(m_i)
    # If still a list, iterate inside
    if isinstance(m_i, list):
        for sub_m in m_i:
            print(f"{i} : {sub_m}")
            i += 1
    else:
        print(f"{i} : {m_i}")
        i += 1

print("=================Workout====================")
i=1
for w_i in workout_info:
    print(i,":",w_i)
    i+=1

print("==============Diet===============")
i = 1
for d_i in diet:
    if isinstance(d_i, str) and d_i.startswith('[') and d_i.endswith(']'):
        d_i = ast.literal_eval(d_i)
    if isinstance(d_i, list):
        for sub_d in d_i:
            print(f"{i} : {sub_d}")
            i += 1
    else:
        print(f"{i} : {d_i}")
        i += 1






Fungal infection
Fungal infection is a common skin condition caused by fungi.
1 : bath twice
2 : use detol or neem in bathing water
3 : keep infected area dry
4 : use clean cloths
1 : Antifungal Cream
2 : Fluconazole
3 : Terbinafine
4 : Clotrimazole
5 : Ketoconazole
1 : Avoid sugary foods
2 : Consume probiotics
3 : Increase intake of garlic
4 : Include yogurt in diet
5 : Limit processed foods
6 : Stay hydrated
7 : Consume green tea
8 : Eat foods rich in zinc
9 : Include turmeric in diet
10 : Eat fruits and vegetables
1 : Antifungal Diet
2 : Probiotics
3 : Garlic
4 : Coconut oil
5 : Turmeric


