In [None]:
!pip install scikit-learn==1.4.2



In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter as ctr
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Table queried at the end of the notebook to match a predicted disease to a doctor
# (columns read there: Associated_Diseases, Arrival_Time, Departure_Time, Rating,
# Doctor_ID, Contact_Number).
dff = pd.read_csv(filepath_or_buffer='/content/dummy_dataset.csv')

In [None]:
# Labelled symptom descriptions used to train the classifier; preview ten random rows.
df = pd.read_csv(filepath_or_buffer='/content/Symptom2Disease.csv')
df.sample(n=10)

Unnamed: 0.1,Unnamed: 0,label,text
1021,121,gastroesophageal reflux disease,I have heartburn and indigestion. I regularly ...
589,289,Acne,I woke up this morning to find that I had a ma...
384,84,Common Cold,I'm coughing nonstop and am really chilly. My ...
318,18,Fungal infection,"I've been experiencing a lot of itching, which..."
207,207,Impetigo,I am suffering from extreme fever and weakness...
1168,268,diabetes,"I have trouble breathing, especially when exer..."
849,249,Jaundice,"I've been losing weight, feeling really fatigu..."
764,164,Cervical spondylosis,"Back pain, a productive cough, and limb weakne..."
436,136,Pneumonia,"Lately I've been experiencing chills, fatigue,..."
1190,290,diabetes,Both my water intake and frequency of urinatio...


In [None]:
# Fetch NLTK data packages; presumably 'punkt' backs the word_tokenize call used
# by preprocess_text.
nltk.download('punkt')
# NOTE(review): no part-of-speech tagging is visible in this notebook, so this
# tagger model may be unused — confirm before removing.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Drop the leftover index column written into the CSV. Rebinding (instead of
# inplace=True) together with errors='ignore' keeps this cell idempotent: running
# it a second time no longer raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Class balance check: number of rows per disease label.
ctr(df.loc[:, 'label'])

Counter({'Psoriasis': 50,
         'Varicose Veins': 50,
         'Typhoid': 50,
         'Chicken pox': 50,
         'Impetigo': 50,
         'Dengue': 50,
         'Fungal infection': 50,
         'Common Cold': 50,
         'Pneumonia': 50,
         'Dimorphic Hemorrhoids': 50,
         'Arthritis': 50,
         'Acne': 50,
         'Bronchial Asthma': 50,
         'Hypertension': 50,
         'Migraine': 50,
         'Cervical spondylosis': 50,
         'Jaundice': 50,
         'Malaria': 50,
         'urinary tract infection': 50,
         'allergy': 50,
         'gastroesophageal reflux disease': 50,
         'drug reaction': 50,
         'peptic ulcer disease': 50,
         'diabetes': 50})

In [None]:
# Fixed-seed preview of the raw text, for comparison after preprocessing.
df.sample(n=10, random_state=42)

Unnamed: 0,label,text
1178,diabetes,"I have respiratory issues, especially when doi..."
865,Malaria,"I have a high fever, chills, and severe itchin..."
101,Typhoid,"I've also had some diarrhea, which has been re..."
439,Pneumonia,"I've recently been suffering with chills, leth..."
58,Varicose Veins,The swelling in my legs has gotten worse over ...
1120,peptic ulcer disease,I occasionally have burning in my upper abdome...
323,Fungal infection,A rash that appears to be developing throughou...
974,allergy,"I get breathing issues and wheezing, which are..."
411,Pneumonia,"My temperature is very high, and I'm having a ..."
855,Malaria,"I've had a high fever, chills, and intense itc..."


In [None]:
# Built once and shared by every call; previously a new stemmer was constructed
# for each row passed through preprocess_text.
_SNOWBALL_STEMMER = SnowballStemmer('english')


def preprocess_text(text):
    """Reduce a free-text symptom description to a space-joined string of stems.

    Steps, in order:
      1. tokenise ``text`` with NLTK's ``word_tokenize``;
      2. keep only tokens for which ``str.isalpha()`` is true (punctuation,
         numbers and contraction fragments such as "'ve" are dropped);
      3. lower-case each kept token and stem it with the English Snowball stemmer.

    Args:
        text: the raw description (a ``str``).

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filter.
    """
    return ' '.join(
        _SNOWBALL_STEMMER.stem(token.lower())
        for token in word_tokenize(text)
        if token.isalpha()
    )


# Overwrites the raw text column with its stemmed form. Re-running this cell
# feeds already-stemmed text back through the stemmer.
df['text'] = df['text'].apply(preprocess_text)

In [None]:
# Same fixed-seed rows as before, now showing the stemmed text.
df.sample(n=10, random_state=42)

Unnamed: 0,label,text
1178,diabetes,i have respiratori issu especi when do out une...
865,Malaria,i have a high fever chill and sever itch in ad...
101,Typhoid,i also had some diarrhea which has been realli...
439,Pneumonia,i recent been suffer with chill lethargi a cou...
58,Varicose Veins,the swell in my leg has gotten wors over the p...
1120,peptic ulcer disease,i occasion have burn in my upper abdomen throu...
323,Fungal infection,a rash that appear to be develop throughout my...
974,allergy,i get breath issu and wheez which are asthma s...
411,Pneumonia,my temperatur is veri high and i have a hard t...
855,Malaria,i had a high fever chill and intens itch in ad...


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: token counts over at most 3000 vocabulary terms,
# materialised as a dense array.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split below — confirm this is acceptable for the evaluation.
cv = CountVectorizer(max_features=3000)
X = cv.fit_transform(df['text'].tolist()).toarray()

# Integer-encode the disease names; label_encoder maps predictions back to names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

In [None]:
# Hold out 20% of the rows for evaluation; the seed fixes which rows are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Estimators combined by the voting ensemble, as (name, estimator) pairs.
# The random forest is seeded with the same value as the train/test split so
# that a Restart & Run All reproduces the reported scores; unseeded, its
# bootstrap and feature sampling differ on every run.
base_models = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier(random_state=42)),
    ('lr', LogisticRegression())
]

In [None]:
# Hard voting: each base model casts one vote (its predicted class label).
voting_classifier = VotingClassifier(
    estimators=base_models,
    voting='hard',
)

In [None]:
# Fit the ensemble on the training split.
voting_classifier.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the fitted ensemble on the held-out split.
accuracy = voting_classifier.score(X=X_test, y=y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9708333333333333


In [None]:
# End-to-end check on one free-text description:
# stem -> vectorise with the fitted CountVectorizer -> predict -> decode label.
sample_text = "I have been experiencing a skin rash on my arm for the past few weeks."
sample_text_processed = preprocess_text(sample_text)
sample_text_transformed = cv.transform([sample_text_processed])
predicted_label_encoded = voting_classifier.predict(sample_text_transformed)
predicted_label = label_encoder.inverse_transform(predicted_label_encoded)

print("Predicted Label:", predicted_label)

Predicted Label: ['Psoriasis']


In [None]:
# Second check, this time on a truncated, already-stemmed snippet.
text = 'i been realli weari and ill i been suffer from..'

sample_text = text
sample_text_processed = preprocess_text(sample_text)
sample_text_transformed = cv.transform([sample_text_processed])
predicted_label_encoded = voting_classifier.predict(sample_text_transformed)
predicted_label = label_encoder.inverse_transform(predicted_label_encoded)

print("Predicted Label:", predicted_label)

Predicted Label: ['Dimorphic Hemorrhoids']


In [None]:
# Show one random (unseeded) row of the processed frame.
df.sample(n=1)

Unnamed: 0,label,text
439,Pneumonia,i recent been suffer with chill lethargi a cou...


In [None]:
# Evaluate the ensemble on the held-out split. Every metric must compare the
# true labels with the model's predictions; scoring y_test against itself
# always yields 1.0 and a purely diagonal confusion matrix.
y_pred = voting_classifier.predict(X_test)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Compute precision
precision = precision_score(y_test, y_pred, average='macro')  # 'macro' computes precision for each label, and returns the average
print("Precision:", precision)

# Compute recall
recall = recall_score(y_test, y_pred, average='macro')  # 'macro' computes recall for each label, and returns the average
print("Recall:", recall)

# Compute F1-score
f1 = f1_score(y_test, y_pred, average='macro')  # 'macro' computes F1-score for each label, and returns the average
print("F1-score:", f1)

# Compute confusion matrix (rows: true label, columns: predicted label)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0
Confusion Matrix:
[[ 7  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  7  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 12  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 12  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 12  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  7  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 11  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 11  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  

In [None]:
# Sample text (contains misspellings)
sample_text = "I hae ben experiencingggg contant emoticon fluctuatone."

# Stem the sample with the preprocess_text helper defined earlier in the
# notebook; the identical copy that used to be re-declared here is removed.
sample_text_processed = preprocess_text(sample_text)

# Vectorise with the CountVectorizer fitted earlier in this session
sample_text_transformed = cv.transform([sample_text_processed])

# Predict with the in-memory ensemble, then decode back to the disease name
predicted_label_encoded = voting_classifier.predict(sample_text_transformed)
predicted_label = label_encoder.inverse_transform(predicted_label_encoded)

# Descriptive names instead of `x` / `y`: assigning the time to `y` overwrote
# the encoded label vector, so re-running the train/test split cell afterwards
# would have split against the string '08:00'.
predicted_disease = predicted_label[0]
appointment_time = '08:00'

# Keep doctors who treat the predicted disease and whose shift covers the time.
# NOTE(review): the time columns are compared directly against the string above —
# presumably they hold zero-padded "HH:MM" strings so that lexical order matches
# chronological order; confirm against the CSV.
treats_disease = dff['Associated_Diseases'] == predicted_disease
on_shift = (dff['Arrival_Time'] <= appointment_time) & (dff['Departure_Time'] >= appointment_time)
filtered_dff = dff[treats_disease & on_shift]

# Nothing matched: report it instead of indexing into an empty frame
if filtered_dff.empty:
    print("No doctor is available at the specified time.")
else:
    # Rows tied for the highest rating; the first of them is reported
    best_doctor = filtered_dff[filtered_dff['Rating'] == filtered_dff['Rating'].max()]

    # Get the 'Doctor_ID' and 'Contact_Number' of the best doctor
    best_doctor_id = best_doctor['Doctor_ID'].values[0]
    best_doctor_contact = best_doctor['Contact_Number'].values[0]

    print(f"The doctor with the highest rating who can treat disease \"{predicted_disease}\" and is available at time \"{appointment_time}\" is Doctor {best_doctor_id}. You can contact them at {best_doctor_contact}.")


The doctor with the highest rating who can treat disease "drug reaction" and is available at time "08:00" is Doctor 921. You can contact them at 001-664-794-0561x8836.


In [None]:
import pickle

In [None]:
# Persist the fitted voting ensemble.
with open('Disease_prediction.pkl', mode='wb') as model_file:
  pickle.dump(voting_classifier, model_file)

In [None]:
# Persist the fitted CountVectorizer.
with open('Count_vector.pkl', mode='wb') as vectorizer_file:
  pickle.dump(cv, vectorizer_file)

In [None]:
import sklearn

# Record which scikit-learn version produced the pickled artefacts.
sklearn_version = sklearn.__version__
print(sklearn_version)



1.4.2


In [None]:
# Persist the fitted LabelEncoder.
with open('Encoder.pkl', mode='wb') as encoder_file:
  pickle.dump(label_encoder, encoder_file)