### Transforming disease data

This part of the notebook shows the transformations applied to the disease and symptom data. For each unique symptom, a new column is created in the dataframe. If a symptom is typical for a particular disease, the column for that symptom is set to 1; otherwise it is set to 0.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pickle

In [2]:
# Load the raw disease/symptom table. Later cells read its "Disease" column
# and its "Symptom_1" ... "Symptom_17" columns.
df = pd.read_csv(filepath_or_buffer="dataset.csv")

In [3]:
# Discard rows that exactly repeat an earlier row, keeping the first occurrence.
df = df.drop_duplicates(keep="first")

In [4]:
# Gather the distinct values of every Symptom_N column (N = 1..17) into one
# flat array. Values are only unique within each column, so the same value can
# still occur several times in the combined array.
per_column_values = [df[f"Symptom_{position}"].unique() for position in range(1, 18)]
symptoms = np.concatenate(per_column_values)

In [5]:
# Collapse values that occur in more than one column. Going through a set
# means the resulting list has no guaranteed order.
distinct_symptom_values = set(symptoms)
symptoms_unique = list(distinct_symptom_values)

In [6]:
# Number of distinct values found across the symptom columns.
# NOTE(review): missing cells (NaN) appear to be counted as one of these
# values, so the number of real symptoms may be one lower — confirm.
len(symptoms_unique)

132

In [7]:
# Add one indicator column per distinct symptom value, initialised to 0 and
# appended after the existing columns.
#
# All indicator columns are built as a single frame and attached with one
# concat. Inserting them one by one needs a hard-coded insert position (18)
# and adds well over a hundred single-column blocks, which pandas reports as
# a fragmented frame (PerformanceWarning).
indicator_columns = pd.DataFrame(0, index=df.index, columns=symptoms_unique)
df = pd.concat([df, indicator_columns], axis=1)

In [8]:
# Replace every missing cell with 0, so that an unused symptom slot can be
# recognised by comparing its value against 0.
df = df.fillna(value=0)

In [9]:
# For every row, set to 1 the indicator column of each symptom the row lists.
#
# Only the raw Symptom_N columns are scanned. Scanning every column after the
# first would also read the indicator columns themselves: harmless while they
# are all 0, but on a second run of this cell the 1s already written would be
# treated as symptom names.
symptom_slots = [column for column in df.columns if str(column).startswith("Symptom_")]
for index, row in df[symptom_slots].iterrows():
    # A slot holding 0 is empty; any other value is a symptom name, which is
    # also the label of that symptom's indicator column.
    for each in row[row != 0]:
        df.at[index, each] = 1

In [10]:
# Remove the raw Symptom_1 ... Symptom_17 columns; only the disease label and
# the per-symptom indicator columns remain.
raw_symptom_columns = [f"Symptom_{position}" for position in range(1, 18)]
df = df.drop(columns=raw_symptom_columns)

In [11]:
# Tidy the column labels in three steps.

# 1. Keep only the columns whose label is not null.
has_label = df.columns.notnull()
df = df.loc[:, has_label]

# 2. Remove every space character from the remaining labels.
df.columns = df.columns.str.replace(' ', '')

# 3. Arrange the columns in sorted label order.
ordered_labels = sorted(df.columns)
df = df.reindex(columns=ordered_labels)

In [12]:
# Show the final column labels as a plain Python list.
df.columns.tolist()

['Disease',
 'abdominal_pain',
 'abnormal_menstruation',
 'acidity',
 'acute_liver_failure',
 'altered_sensorium',
 'anxiety',
 'back_pain',
 'belly_pain',
 'blackheads',
 'bladder_discomfort',
 'blister',
 'blood_in_sputum',
 'bloody_stool',
 'blurred_and_distorted_vision',
 'breathlessness',
 'brittle_nails',
 'bruising',
 'burning_micturition',
 'chest_pain',
 'chills',
 'cold_hands_and_feets',
 'coma',
 'congestion',
 'constipation',
 'continuous_feel_of_urine',
 'continuous_sneezing',
 'cough',
 'cramps',
 'dark_urine',
 'dehydration',
 'depression',
 'diarrhoea',
 'dischromic_patches',
 'distention_of_abdomen',
 'dizziness',
 'drying_and_tingling_lips',
 'enlarged_thyroid',
 'excessive_hunger',
 'extra_marital_contacts',
 'family_history',
 'fast_heart_rate',
 'fatigue',
 'fluid_overload',
 'foul_smell_ofurine',
 'headache',
 'high_fever',
 'hip_joint_pain',
 'history_of_alcohol_consumption',
 'increased_appetite',
 'indigestion',
 'inflammatory_nails',
 'internal_itching',
 'irr

### Description, Precaution and Symptom Severity data

In [13]:
# TODO: the disease description should be returned together with the prediction.
diseases_description = pd.read_csv(filepath_or_buffer="symptom_Description.csv")

In [14]:
# TODO: factor this into the decision on whether the user should see a doctor
# (depending on the total severity of the reported symptoms).
symptom_severity = pd.read_csv(filepath_or_buffer="Symptom-severity.csv")

In [15]:
# TODO: turn the precautions into a readable sentence to accompany the
# prediction - NLP?
disease_precaution = pd.read_csv(filepath_or_buffer="symptom_precaution.csv")

### Building the machine learning model

In [16]:
# Split the table into the target (disease name) and the feature matrix
# (one column per symptom).
#
# The target is selected by name rather than by position, so the split does
# not depend on "Disease" happening to sort ahead of every symptom column,
# and the mixed-type frame is no longer converted to an array twice.
feature_frame = df.drop(columns="Disease")

labels = df[["Disease"]].to_numpy()  # shape (n_rows, 1); the training cell flattens it with ravel()
examples = feature_frame.to_numpy()  # shape (n_rows, n_symptoms)
list_of_symptoms = feature_frame.columns.tolist()

# Persist the feature order: the prediction cell rebuilds its input vector by
# walking this list, so the position of each symptom must match the columns
# of `examples`.
with open('list_of_symptoms.pickle', 'wb') as data_file:
    pickle.dump(list_of_symptoms, data_file)

print(len(labels))
print(len(examples))
print(len(list_of_symptoms))

304
304
131


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 30 % of the rows for evaluation. The split is seeded so that the
# fitted model and the scores reported further down are the same on every
# run; without a seed each run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    examples, labels.ravel(), test_size=0.3, random_state=0
)

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

# Persist the fitted classifier. pickle.dump returns None, so its result is
# not bound to a name; the model is read back from the file when needed.
with open('fitted_model.pickle', 'wb') as modelFile:
    pickle.dump(knn, modelFile)

In [18]:
import warnings

# Symptoms reported for the single example to classify.
symptoms = ['stomach_pain', 'headache']

# NOTE: pickle.load can execute arbitrary code. Both pickle files read in this
# cell are written by earlier cells of this notebook; do not point these
# paths at files from an untrusted source.
with open('list_of_symptoms.pickle', 'rb') as data_file:
    symptoms_list = pickle.load(data_file)

# A symptom that is not in the stored list has no position in the input
# vector and cannot influence the prediction. Report it instead of dropping
# it silently, so that a misspelt name does not go unnoticed.
unknown_symptoms = sorted(set(symptoms) - set(symptoms_list))
if unknown_symptoms:
    warnings.warn(f"Ignoring symptoms missing from the stored symptom list: {unknown_symptoms}")

# 0/1 vector in the stored symptom order. Note that x_test (lower case) is
# this single hand-built example, not the X_test evaluation split.
reported = set(symptoms)
x_test = np.asarray([1 if each in reported else 0 for each in symptoms_list])

with open('fitted_model.pickle', 'rb') as modelFile:
    model = pickle.load(modelFile)

# predict() expects a 2-D array, hence the reshape to a single row.
disease = model.predict(x_test.reshape(1, -1))[0]
print(disease)

Drug Reaction


In [19]:
# Mean accuracy of the k-NN classifier on the held-out test split.
knn.score(X_test, y_test)

0.7934782608695652

In [20]:
# Look up the description text of the predicted disease. The first matching
# row is used; .iloc[0] raises IndexError when no row matches, so the disease
# label must be spelled identically in both tables.
is_predicted_disease = diseases_description['Disease'] == disease
description = diseases_description.loc[is_predicted_disease, 'Description'].iloc[0]

In [21]:
# Display the severity table ordered from the lowest weight to the highest.
symptom_severity.sort_values(by='weight', ascending=True)

Unnamed: 0,Symptom,weight
0,itching,1
21,lethargy,2
98,altered_sensorium,2
121,painful_walking,2
97,muscle_pain,2
...,...,...
25,high_fever,7
56,chest_pain,7
57,weakness_in_limbs,7
46,swelling_of_stomach,7


In [22]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the same training split as the k-NN model.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
dc_tree = clf  # fit() returns the estimator itself, so both names refer to the fitted tree

# Classify the single hand-built example. x_test (lower case) is that one
# symptom vector, not the X_test evaluation split.
single_example = x_test.reshape(1, -1)
dc_tree.predict(single_example)

array(['Drug Reaction'], dtype=object)

In [23]:
# Mean accuracy of the decision tree on the held-out test split.
dc_tree.score(X_test, y_test)

0.7065217391304348