In [1]:
import csv
from decimal import Decimal

# Per-disease symptom tallies: maps a disease name to a dict of
# {symptom name: number of rows in which that symptom was flagged}.
disease_symptom_counts = dict()


In [2]:
# Read Training.csv and tally, for every disease, how many rows flag each symptom.
#
# The tallies are rebuilt from scratch here so that re-running this cell does not
# add the same rows to disease_symptom_counts a second time. total_counts is
# recreated on every run, so stale per-disease counts would otherwise push the
# percentages derived from these two dicts above 100.
disease_symptom_counts = {}
with open('Training.csv', newline='') as csvfile:  # newline='' as the csv module recommends
    reader = csv.reader(csvfile)
    headers = next(reader)  # header row: symptom names, then the disease column
    symptom_names = headers[:-1]
    total_counts = {header: 0 for header in symptom_names}
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; row[-1] would fail on it
            continue
        disease = row[-1]
        symptoms = row[:-1]

        # one tally dict per disease, created the first time the disease is seen
        counts_for_disease = disease_symptom_counts.setdefault(disease, {})

        # a cell equal to '1' marks the symptom as present in this row
        for symptom_name, flag in zip(symptom_names, symptoms):
            if flag == '1':
                counts_for_disease[symptom_name] = counts_for_disease.get(symptom_name, 0) + 1
                total_counts[symptom_name] += 1


In [3]:
# Save the per-disease tallies as a table: same header row as the input file,
# one row per disease, symptom counts first (0 when a symptom was never
# flagged for that disease) and the disease name in the last column.
with open('SymptomCounts.csv', 'w', newline='') as counts_file:
    counts_writer = csv.writer(counts_file)
    counts_writer.writerow(headers)
    symptom_columns = headers[:-1]
    for disease_name in disease_symptom_counts:
        tallies = disease_symptom_counts[disease_name]
        counts_row = [tallies.get(column, 0) for column in symptom_columns]
        counts_row.append(disease_name)
        counts_writer.writerow(counts_row)

In [4]:
# Turn the raw tallies into percentages: for every symptom a disease was
# flagged with, its count divided by that symptom's entry in total_counts,
# scaled to 0-100.
disease_symptom_percentages = {
    disease_name: {
        symptom_name: (hits / total_counts[symptom_name]) * 100
        for symptom_name, hits in per_symptom.items()
    }
    for disease_name, per_symptom in disease_symptom_counts.items()
}

# Write the percentage table to a CSV file: header row first, then one row per
# disease with the symptom columns (0 where the disease has no entry) followed
# by the disease name.
with open('SymptomCounts%.csv', 'w', newline='') as pct_file:
    pct_writer = csv.writer(pct_file)
    pct_writer.writerow(headers)
    symptom_columns = headers[:-1]
    for disease_name, shares in disease_symptom_percentages.items():
        pct_row = [shares.get(column, 0) for column in symptom_columns] + [disease_name]
        pct_writer.writerow(pct_row)

In [5]:
# Read the percentage table back in and pick the main symptoms of each disease:
# the 5 highest-percentage symptoms, plus any further symptom whose rounded
# percentage is above 80.
with open('SymptomCounts%.csv', newline='') as csvfile:  # newline='' as the csv module recommends
    reader = csv.reader(csvfile)
    headers = next(reader)  # header row: symptom names, then the disease column
    symptom_names = headers[:-1]
    data = []
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; row[-1] would fail on it
            continue
        disease = row[-1]

        # percentages rounded to one decimal place; both the ranking and the
        # "> 80" test below operate on these rounded values
        symptom_percentages = {
            name: round(float(cell), 1) for name, cell in zip(symptom_names, row[:-1])
        }

        # get the 5 main symptoms for this disease (sorted() is stable, so ties
        # keep their column order)
        main_symptoms = sorted(symptom_percentages.items(), key=lambda x: x[1], reverse=True)[:5]

        # Any other symptom above 80 that did not make the top 5 is appended too.
        # The name set is built once instead of once per symptom.
        main_names = {name for name, _ in main_symptoms}
        main_symptoms += [
            (name, percentage)
            for name, percentage in symptom_percentages.items()
            if percentage > 80 and name not in main_names
        ]

        # add the results to the data list
        symptoms_list = [f'{symptom}: {percentage}%' for symptom, percentage in main_symptoms]
        data.append([disease] + symptoms_list)

# write the new dataset into a csv file
with open('top_5_Symptoms.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Rows can carry more than 5 symptom cells (the "> 80" extras), so the header
    # is widened to the widest row; it never has fewer than the original 5 columns.
    widest_row = max((len(entry) - 1 for entry in data), default=0)
    symptom_column_count = max(5, widest_row)
    # NOTE(review): the first column holds the disease name, not a total; the
    # 'Total Symptoms' label is kept unchanged in case other readers of this
    # file rely on it -- confirm before renaming.
    writer.writerow(['Total Symptoms'] + ['Top Symptom ' + str(i+1) for i in range(symptom_column_count)])
    writer.writerows(data)

In [6]:
# Parse top_5_Symptoms.csv back into a nested dict:
# {disease name: {symptom name: percentage as float}}.
disease_symptom_prob = {}
with open('top_5_Symptoms.csv', newline='') as csvfile:  # newline='' as the csv module recommends
    reader = csv.reader(csvfile)
    # Skip the header row without binding it to `headers`: that name holds the
    # symptom column names used by the cells that write SymptomCounts.csv and
    # SymptomCounts%.csv, and overwriting it would break re-running them.
    next(reader, None)
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; row[0] would fail on it
            continue
        disease = row[0]
        per_disease = disease_symptom_prob.setdefault(disease, {})
        for col in row[1:]:
            # each cell has the form '<symptom>: <value>%'; split on the last
            # ': ' so a symptom name that itself contains ': ' still unpacks
            symptom, value = col.rsplit(': ', 1)
            per_disease[symptom] = float(value.strip('%'))

print(disease_symptom_prob)

            

{'Fungal infection': {'nodal_skin_eruptions': 100.0, 'dischromic _patches': 100.0, 'itching': 15.9, 'skin_rash': 13.7, 'continuous_sneezing': 0.0}, 'Allergy': {'shivering': 100.0, 'watering_from_eyes': 100.0, 'continuous_sneezing': 48.6, 'chills': 13.5, 'itching': 0.0}, 'GERD': {'ulcers_on_tongue': 100.0, 'stomach_pain': 51.4, 'acidity': 48.6, 'cough': 20.2, 'chest_pain': 16.4}, 'Chronic cholestasis': {'itching': 16.8, 'yellowing_of_eyes': 14.0, 'yellowish_skin': 12.5, 'abdominal_pain': 11.0, 'nausea': 9.9}, 'Drug Reaction': {'spotting_ urination': 100.0, 'burning_micturition': 50.0, 'stomach_pain': 48.6, 'itching': 16.8, 'skin_rash': 13.7}, 'Peptic ulcer diseae': {'passage_of_gases': 100.0, 'internal_itching': 100.0, 'indigestion': 48.6, 'abdominal_pain': 11.0, 'loss_of_appetite': 9.4}, 'AIDS': {'muscle_wasting': 100.0, 'patches_in_throat': 100.0, 'extra_marital_contacts': 100.0, 'high_fever': 8.4, 'itching': 0.0}, 'Diabetes ': {'irregular_sugar_level': 100.0, 'increased_appetite': 10