In [2]:
import pandas as pd
# Load the augmented disease table from the working directory using pandas'
# default encoding (later cells re-read the same file as Windows-1252).
df = pd.read_csv('augmented_diseases_extended.csv')

In [3]:
# Show the loaded frame as the cell's rich output.
df

Unnamed: 0,DiseaseID,Category,Disease,Symptom1,Symptom2,Symptom3,Symptom4,FirstLineMeds,Dosage,ProtocolStep1,...,DietRecommendation2,WorkoutRecommendation1,WorkoutRecommendation2,Symptoms,Protocol,Comorbidities,Precautions,Diet,Workout,Age
0,D001,Endocrine,Type 2 Diabetes,Thirst,Fatigue,Blurred vision,Weight loss,Metformin,500mg BID,Start Metformin,...,Low-carb intake,30 min daily walking,Resistance training,"Thirst, Fatigue, Blurred vision, Weight loss",Start Metformin\nAdd SGLT2 if CVD risk\nMonito...,"Obesity, Hypertension","Regular screenings, Avoid processed sugar","High-fiber diet, Low-carb intake","30 min daily walking, Resistance training",
1,D002,Cardiology,Hypertension,Headaches,Dizziness,Chest pain,Fatigue,Lisinopril,10mg daily,Start ACEI,...,High potassium foods,150 min/week swimming,Aerobic jogging,"Headaches, Dizziness, Chest pain, Fatigue",Start ACEI\nThiazide if Black patient\nCheck B...,"Diabetes, CKD","Monitor BP, Reduce sodium","DASH diet, High potassium foods","150 min/week swimming, Aerobic jogging",
2,D003,Psychiatry,Major Depression,Low mood,Anhedonia,Sleep issues,Fatigue,Escitalopram,10mg daily,Start SSRI,...,No processed sugars,Yoga 3x/week,Strength training,"Low mood, Anhedonia, Sleep issues, Fatigue",Start SSRI\nAdd CBT for mild cases\nPHQ-9 qvisit,"Anxiety, Chronic Pain","Maintain sleep, Avoid alcohol","Omega-3 rich foods, No processed sugars","Yoga 3x/week, Strength training",
3,D004,Infectious,Influenza,Fever,Cough,Myalgia,Headache,Oseltamivir,75mg BID,Start within 48h,...,Lean proteins,Moderate exercise,Hydration focus,"Fever, Cough, Myalgia, Headache",Start within 48h\nVaccinate annually\nMonitor ...,"Asthma, Pregnancy","Hand hygiene, Stay hydrated","Balanced diet, Lean proteins","Moderate exercise, Hydration focus",
4,D005,Hematology,Sickle Cell Anemia,Pain crises,Fatigue,Jaundice,Swelling,Hydroxyurea,15mg/kg daily,Start hydroxyurea,...,Lean proteins,Low-impact walking,Stretching daily,"Pain crises, Fatigue, Jaundice, Swelling",Start hydroxyurea\nHydrate well\nVaccinate vs ...,"Stroke, Infections","Avoid dehydration, Monitor pain","High-fluid intake, Lean proteins","Low-impact walking, Stretching daily",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17746,,,Systemic Sclerosis Variant 485,,,,,,,,...,,,,,,,,,,
17747,,,Pituitary Adenoma,,,,,,,,...,,,,,,,,,,
17748,,,Pulmonary Fibrosis,,,,,,,,...,,,,,,,,,,
17749,,,Somatization Disorder,,,,,,,,...,,,,,,,,,,


In [4]:
# List the distinct AgeGroup labels (NaN included) that the age filter must parse.
print("Unique AgeGroup values:", df['AgeGroup'].unique())

Unique AgeGroup values: ['18-64' '0-120' '0-64' '65+' '13-64' '0-17' nan]


In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

class DiseasePredictor:
    """Text-based disease classifier with demographic post-filtering.

    A TF-IDF vectorizer feeds one random forest per disease label
    (``MultiOutputClassifier``). ``predict`` scores every known disease and
    then drops those whose ``AgeGroup``, ``Gender`` or ``PregnancySafety``
    entry excludes the patient.

    Attributes:
        df: Private copy of the input frame with ``All_Symptoms`` and
            ``Context`` columns added.
        X_full: Training text per row (symptoms followed by context).
        mlb / targets / classes: Label binarizer, its one-hot matrix and the
            disease name belonging to each matrix column.
        pipeline: The unfitted (until ``train``) scikit-learn pipeline.
    """

    def __init__(self, diseases_df):
        """Prepare training text and targets from ``diseases_df``.

        Args:
            diseases_df: Frame with ``Symptom1``..``Symptom4``, ``Category``,
                ``Comorbidity1``, ``Comorbidity2`` and ``Disease`` columns.
        """
        # Copy so the derived columns are not written into the caller's frame.
        self.df = diseases_df.copy()

        # Preprocess symptoms and additional context
        symptom_cols = ['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']
        self.df['All_Symptoms'] = self.df[symptom_cols].apply(
            lambda row: ' '.join(row.dropna().astype(str)), axis=1)
        # Every part is NaN-filled: a single NaN would otherwise turn the
        # whole concatenated string (and the training sample) into NaN.
        self.df['Context'] = (
            self.df['Category'].fillna('') + ' '
            + self.df['Comorbidity1'].fillna('') + ' '
            + self.df['Comorbidity2'].fillna('')
        )

        # Combine symptoms and context for richer input
        self.X_full = self.df['All_Symptoms'] + ' ' + self.df['Context']

        # Create multi-label targets (each row carries exactly one label)
        self.mlb = MultiLabelBinarizer()
        self.targets = self.mlb.fit_transform(self.df['Disease'].apply(lambda x: [x]))
        self.classes = self.mlb.classes_

        # Build pipeline
        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
            ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators=100)))
        ])

    def train(self):
        """Fit the pipeline on the prepared text and one-hot targets."""
        self.pipeline.fit(self.X_full, self.targets)

    def predict(self, symptoms, age=None, gender=None, pregnancy_status=None, comorbidities=None, top_n=5):
        """Return the ``top_n`` most probable diseases compatible with the patient.

        Args:
            symptoms: List of symptom strings, or one pre-joined string.
            age: Patient age in years; ``None`` disables the age filter.
                ``0`` is a valid age and is filtered like any other.
            gender: Gender code compared against the ``Gender`` column
                (rows marked ``'M=F'`` always pass); falsy disables the filter.
            pregnancy_status: When equal to ``'Pregnant'``, only rows whose
                ``PregnancySafety`` is ``'N/A'``, ``'A'`` or ``'B'`` pass.
            comorbidities: Optional list of strings (or a single string)
                appended to the query text.
            top_n: Maximum number of results.

        Returns:
            List of ``(disease, probability)`` tuples, highest probability first.
        """
        symptoms_text = ' '.join(symptoms) if isinstance(symptoms, list) else symptoms
        if comorbidities:
            # A bare string would otherwise be joined character by character.
            if isinstance(comorbidities, str):
                comorbidities = [comorbidities]
            symptoms_text += ' ' + ' '.join(comorbidities)

        # One (n_samples, 2) array per disease; column 1 is P(label present).
        probas = self.pipeline.predict_proba([symptoms_text])
        disease_probs = [(self.classes[i], probas[i][0][1]) for i in range(len(probas))]

        # First row per disease, built once instead of scanning the frame
        # again for every class.
        first_rows = self.df.drop_duplicates(subset='Disease').set_index('Disease')

        filtered_diseases = []
        for disease, prob in disease_probs:
            disease_row = first_rows.loc[disease]

            age_ok = True
            # `is not None` rather than truthiness so that age 0 is honoured.
            if age is not None and pd.notna(disease_row['AgeGroup']):
                age_range = str(disease_row['AgeGroup'])
                if '+' in age_range:
                    # Open-ended group such as "65+".
                    min_age = int(age_range.replace('+', ''))
                    age_ok = min_age <= age
                elif '-' in age_range:
                    # Closed group such as "18-64".
                    min_age, max_age = map(int, age_range.split('-'))
                    age_ok = min_age <= age <= max_age

            gender_ok = True if not gender else disease_row['Gender'] in ['M=F', gender]
            pregnancy_ok = (
                True if pregnancy_status != 'Pregnant'
                else disease_row['PregnancySafety'] in ['N/A', 'A', 'B']
            )

            if age_ok and gender_ok and pregnancy_ok:
                filtered_diseases.append((disease, prob))

        filtered_diseases.sort(key=lambda x: x[1], reverse=True)
        return filtered_diseases[:top_n]

# Read the training table.
# NOTE(review): the recorded run of this cell stopped here with
# FileNotFoundError for 'diseases_350.csv' - confirm the intended file.
diseases_df = pd.read_csv("diseases_350.csv")

# Build the predictor and fit it on the whole table.
predictor = DiseasePredictor(diseases_df)
predictor.train()

# Score one example patient.
sample_symptoms, sample_age, sample_gender = ['Fever', 'Cough'], 25, 'M'
predictions = predictor.predict(
    sample_symptoms,
    age=sample_age,
    gender=sample_gender,
    top_n=3,
)

print("\nSample Prediction:")
print(f"Symptoms: {sample_symptoms}, Age: {sample_age}, Gender: {sample_gender}")
print("Top 3 Predicted Diseases:", predictions)


FileNotFoundError: [Errno 2] No such file or directory: 'diseases_350.csv'

In [7]:
# Create a simpler symptom-based disease prediction system

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset: read the CSV from the working directory, decoding it
# explicitly as Windows-1252.
diseases_df = pd.read_csv('augmented_diseases_extended.csv', encoding='Windows-1252')

# Create a symptom-based search system
class SymptomBasedDiseaseSearch:
    """Nearest-neighbour disease lookup over TF-IDF symptom vectors.

    Attributes:
        df: NaN-free copy of the input frame plus an ``All_Symptoms`` column.
        vectorizer: ``TfidfVectorizer`` fitted on ``All_Symptoms``.
        symptom_vectors: TF-IDF matrix with one row per frame row.
    """

    def __init__(self, df):
        """Index the symptom text of every row in ``df``.

        Args:
            df: Frame with ``Symptom1``..``Symptom4``, ``Disease``,
                ``Category``, ``FirstLineMeds`` and ``ProtocolStep1``..``3``.
        """
        # fillna returns a new frame, so the caller's data stays untouched and
        # no in-place mutation is needed.
        self.df = df.fillna('')

        # Create a combined symptoms column (empty cells are skipped)
        self.df['All_Symptoms'] = self.df[['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']].apply(
            lambda x: ' '.join([str(s) for s in x if str(s) != '']), axis=1
        )

        # Create TF-IDF vectorizer
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.symptom_vectors = self.vectorizer.fit_transform(self.df['All_Symptoms'])

    def search(self, symptoms, category=None, top_n=5):
        """Return the rows whose symptom text is most similar to ``symptoms``.

        Args:
            symptoms: List of symptom strings, or one pre-joined string.
            category: Optional exact ``Category`` value. When at least one row
                has that category, only such rows are returned; an unknown
                category falls back to searching every row.
            top_n: Maximum number of results; values <= 0 give an empty list.

        Returns:
            List of dicts with keys ``Disease``, ``Category``, ``Similarity``,
            ``Symptoms``, ``FirstLineMeds`` and ``Protocol``, best match first.
        """
        if top_n <= 0:
            # A plain `[-top_n:]` slice would return *every* row for 0.
            return []

        # Convert symptoms to string
        symptoms_text = ' '.join(symptoms) if isinstance(symptoms, list) else symptoms

        # Transform input symptoms and score them against every row
        input_vector = self.vectorizer.transform([symptoms_text])
        similarities = cosine_similarity(input_vector, self.symptom_vectors).flatten()

        # Restrict the candidate positions instead of zeroing scores: zeroed
        # rows tie with genuine zero-similarity rows and could leak
        # off-category diseases into the result.
        candidates = np.arange(len(similarities))
        if category:
            category_mask = (self.df['Category'] == category).to_numpy()
            if category_mask.any():
                candidates = candidates[category_mask]

        ranked = np.argsort(similarities[candidates])[-top_n:][::-1]
        top_indices = candidates[ranked]

        # Return top matches with similarity scores
        results = []
        for idx in top_indices:
            row = self.df.iloc[idx]
            results.append({
                'Disease': row['Disease'],
                'Category': row['Category'],
                'Similarity': similarities[idx],
                'Symptoms': [row[f'Symptom{i}'] for i in range(1, 5) if row[f'Symptom{i}'] != ''],
                'FirstLineMeds': row['FirstLineMeds'],
                'Protocol': [row[f'ProtocolStep{i}'] for i in range(1, 4) if row[f'ProtocolStep{i}'] != ''],
            })

        return results

# Create the search system
search_system = SymptomBasedDiseaseSearch(diseases_df)

# Example search
sample_symptoms = ['Fever', 'Cough', 'Fatigue']
results = search_system.search(sample_symptoms, top_n=5)

print("Top 5 diseases matching symptoms:", sample_symptoms)
for i, result in enumerate(results, 1):
    # Explicit "\n": the previous backslash + physical line break inside the
    # literal was a line continuation, which silently dropped the blank line
    # meant to separate the entries.
    print(f"\n{i}. {result['Disease']} ({result['Category']}) - Similarity: {result['Similarity']:.4f}")
    print(f"   Symptoms: {', '.join(result['Symptoms'])}")
    print(f"   First-line medication: {result['FirstLineMeds']}")
    print(f"   Protocol: {', '.join(result['Protocol'])}")

  self.df.fillna('', inplace=True)


Top 5 diseases matching symptoms: ['Fever', 'Cough', 'Fatigue']
1. Tuberculous Pleurisy (Pulmonology) - Similarity: 0.8238
   Symptoms: Dyspnea, Cough, Fever, Fatigue
   First-line medication: Rifampin + INH
   Protocol: Start multidrug therapy, Drain effusion, Monitor recurrence
2. Hypersensitivity Pneumonitis (Pulmonology) - Similarity: 0.8238
   Symptoms: Dyspnea, Cough, Fatigue, Fever
   First-line medication: Prednisone
   Protocol: Start steroids, Remove allergen, Monitor PFTs
3. Hypersensitivity Pneumonitis (Pulmonology) - Similarity: 0.8238
   Symptoms: Dyspnea, Cough, Fatigue, Fever
   First-line medication: Prednisone
   Protocol: Start steroids, Remove allergen, Monitor PFTs
4. Hypersensitivity Pneumonitis (Pulmonology) - Similarity: 0.8238
   Symptoms: Dyspnea, Cough, Fatigue, Fever
   First-line medication: Prednisone
   Protocol: Start steroids, Remove allergen, Monitor PFTs
5. Hypersensitivity Pneumonitis (Pulmonology) - Similarity: 0.8238
   Symptoms: Dyspnea, Cough, Fa

In [8]:
# Preprocessing the dataframe to combine symptom and protocol columns
# Final refined model.
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Load dataframe: re-read the CSV (decoded as Windows-1252) and rebind `df`,
# replacing whatever an earlier cell stored under that name.
df = pd.read_csv('augmented_diseases_extended.csv', encoding='Windows-1252')

# Columns merged into the single 'Symptoms' text field
symptom_columns = ['Symptom1', 'Symptom2', 'Symptom3', 'Symptom4']

def combine_symptoms(row):
    """Join a row's non-blank symptom strings with ', '.

    Values that are not strings (NaN, numbers, None) and strings that are
    empty after stripping are skipped; kept values are stripped.
    """
    stripped = (
        row[name].strip()
        for name in symptom_columns
        if isinstance(row[name], str)
    )
    return ', '.join(text for text in stripped if text != '')

# Columns merged into the single 'Protocol' text field
protocol_columns = ['ProtocolStep1', 'ProtocolStep2', 'ProtocolStep3']

def combine_protocol(row):
    """Join a row's non-blank protocol steps with ', '.

    Only string cells are considered; each is stripped and dropped when
    nothing remains.
    """
    stripped_steps = [
        row[name].strip()
        for name in protocol_columns
        if isinstance(row[name], str)
    ]
    # filter(None, ...) discards the empty strings left by blank cells.
    return ', '.join(filter(None, stripped_steps))

# Create new columns. The CSV already ships 'Symptoms' and 'Protocol'
# columns; both are overwritten with the comma-joined values built here.
df['Symptoms'] = df.apply(combine_symptoms, axis=1)
df['Protocol'] = df.apply(combine_protocol, axis=1)

# Clean up: remove rows where Symptoms is empty. The explicit copy detaches
# the result from the unfiltered frame, so columns added to it later do not
# trigger pandas' chained-assignment warning.
df = df[df['Symptoms'] != ''].copy()

# Define the enhanced search class
class EnhancedDiseaseSearch:
    """Symptom search with additive score boosts for matching age and gender.

    Similarity is the cosine between binary bag-of-words vectors of the query
    and of each row's ``Symptoms`` text.

    NOTE(review): ``AgeRelevance`` and ``GenderRelevance`` are randomly
    generated placeholders (fixed seed 42), not clinical data.

    Attributes:
        df: Private copy of the input frame plus the two placeholder columns.
        vectorizer: ``CountVectorizer(binary=True)`` fitted on ``Symptoms``.
        symptom_matrix: Term matrix with one row per frame row.
    """

    def __init__(self, disease_data):
        """Vectorise the ``Symptoms`` column and attach placeholder demographics.

        Args:
            disease_data: Frame with ``Symptoms``, ``Protocol``, ``Disease``,
                ``Category`` and ``FirstLineMeds`` columns.
        """
        # Copy: the frame handed in may be a filtered slice, and the columns
        # added below should not leak back into the caller's data.
        self.df = disease_data.copy()
        self.vectorizer = CountVectorizer(binary=True)

        # Build symptom vectors using the combined 'Symptoms' column
        self.symptom_matrix = self.vectorizer.fit_transform(self.df['Symptoms'])

        # Local legacy generator: yields the same stream as
        # `np.random.seed(42)` followed by the module-level functions, but
        # leaves NumPy's global random state alone.
        rng = np.random.RandomState(42)

        # Age ranges: pediatric, young adult, adult, elderly
        age_ranges = ['0-18', '19-40', '41-65', '65+']
        self.df['AgeRelevance'] = [
            rng.choice(age_ranges, size=rng.randint(1, len(age_ranges) + 1), replace=False).tolist()
            for _ in range(len(self.df))
        ]

        # Gender relevance
        gender_options = ['M', 'F', 'Both']
        self.df['GenderRelevance'] = [rng.choice(gender_options) for _ in range(len(self.df))]

    @staticmethod
    def _age_bucket(age_value):
        """Map an integer age onto one of the four ``AgeRelevance`` labels."""
        if age_value <= 18:
            return '0-18'
        if age_value <= 40:
            return '19-40'
        if age_value <= 65:
            return '41-65'
        return '65+'

    def search(self, symptoms, age=None, gender=None, top_n=5):
        """Rank rows by symptom similarity plus demographic boosts.

        Args:
            symptoms: Iterable of symptom strings, or one pre-joined string.
            age: Optional age (anything ``int()`` accepts). Rows whose
                ``AgeRelevance`` contains the matching bucket gain +0.1.
            gender: Optional gender; only its upper-cased first letter is
                used. Rows marked with that letter or ``'Both'`` gain +0.05.
                ``None`` or an empty string disables the boost.
            top_n: Number of results to return.

        Returns:
            List of dicts (``Disease``, ``Category``, ``Symptoms``,
            ``Similarity``, ``FirstLineMeds``, ``Protocol``, ``AgeRelevance``,
            ``GenderRelevance``), highest boosted ``Similarity`` first.
        """
        # A bare string must be used as-is; joining it would split it into
        # single characters, none of which the vectorizer tokenises.
        input_text = symptoms if isinstance(symptoms, str) else ', '.join(symptoms)
        input_vector = self.vectorizer.transform([input_text])

        # Calculate cosine similarity
        base = cosine_similarity(input_vector, self.symptom_matrix).flatten()

        # Age boost
        after_age = base
        if age is not None:
            age_range = self._age_bucket(int(age))
            in_range = np.fromiter(
                (age_range in relevant for relevant in self.df['AgeRelevance']),
                dtype=bool, count=len(self.df))
            after_age = base + np.where(in_range, 0.1, 0.0)

        # Gender boost (truthiness check so '' cannot hit `[0]`)
        final = after_age
        if gender:
            gender_letter = gender.upper()[0]
            relevance = self.df['GenderRelevance'].to_numpy()
            matches = (relevance == gender_letter) | (relevance == 'Both')
            final = after_age + np.where(matches, 0.05, 0.0)

        # Successive stable descending sorts on base -> age-boosted -> final
        # scores are equivalent to one sort keyed on (final, age-boosted,
        # base, row position); lexsort treats its last key as primary.
        order = np.lexsort((np.arange(len(base)), -base, -after_age, -final))

        # Materialise result dicts only for the rows actually returned.
        results = []
        for pos in order[:top_n]:
            row = self.df.iloc[pos]
            results.append({
                'Disease': row['Disease'],
                'Category': row['Category'],
                'Symptoms': row['Symptoms'].split(', '),
                'Similarity': final[pos],
                'FirstLineMeds': row['FirstLineMeds'],
                'Protocol': row['Protocol'].split(', '),
                'AgeRelevance': row['AgeRelevance'],
                'GenderRelevance': row['GenderRelevance'],
            })
        return results

def _print_results(results, show_demographics=True):
    """Print a ranked result list, one indented block per disease.

    Args:
        results: Dicts as returned by ``EnhancedDiseaseSearch.search``.
        show_demographics: Also print the age / gender relevance lines.
    """
    print('\nResults:')
    for rank, result in enumerate(results, 1):
        print(f"\n{rank}. {result['Disease']} ({result['Category']}) - Similarity: {round(result['Similarity'], 4)}")
        print('   Symptoms: ' + ', '.join(result['Symptoms']))
        print('   First-line medication: ' + str(result['FirstLineMeds']))
        print('   Protocol: ' + ', '.join(result['Protocol']))
        if show_demographics:
            print('   Age relevance: ' + str(result['AgeRelevance']))
            print('   Gender relevance: ' + str(result['GenderRelevance']))


# Instantiate the enhanced search system
enhanced_search = EnhancedDiseaseSearch(df)

# Test with symptoms, age, and gender
test_symptoms = ['Headache', 'Blurred vision', 'Weight loss']
results_with_filters = enhanced_search.search(test_symptoms, age=45, gender='F', top_n=5)

print('Top 5 diseases matching symptoms for a 45-year-old female:')
print('Symptoms:', test_symptoms)
_print_results(results_with_filters)

# Compare with a different demographic
results_different = enhanced_search.search(test_symptoms, age=12, gender='M', top_n=5)

# Explicit "\n\n": the earlier backslash + line-break sequences inside the
# literals were line continuations and printed no blank lines at all.
print('\n\nTop 5 diseases matching the same symptoms for a 12-year-old male:')
print('Symptoms:', test_symptoms)
# The original second listing omitted the demographic lines; keep that.
_print_results(results_different, show_demographics=False)
   

Top 5 diseases matching symptoms for a 45-year-old female:
Symptoms: ['Headache', 'Blurred vision', 'Weight loss']
Results:
1. Type 2 Diabetes (Endocrine) - Similarity: 0.8803
   Symptoms: Thirst, Fatigue, Blurred vision, Weight loss
   First-line medication: Metformin
   Protocol: Start Metformin, Add SGLT2 if CVD risk, Monitor A1C q3mo
   Age relevance: ['0-18', '41-65', '65+', '19-40']
   Gender relevance: F
2. Type 2 Diabetes (Endocrine) - Similarity: 0.8803
   Symptoms: Thirst, Fatigue, Blurred vision, Weight loss
   First-line medication: Metformin
   Protocol: Start Metformin, Add SGLT2 if CVD risk, Monitor A1C q3mo
   Age relevance: ['65+', '41-65']
   Gender relevance: F
3. Type 2 Diabetes (Endocrine) - Similarity: 0.8803
   Symptoms: Thirst, Fatigue, Blurred vision, Weight loss
   First-line medication: Metformin
   Protocol: Start Metformin, Add SGLT2 if CVD risk, Monitor A1C q3mo
   Age relevance: ['41-65', '19-40', '0-18']
   Gender relevance: F
4. Type 2 Diabetes (Endocri