In [18]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

Podaci za treniranje i testiranje su vec razdvojeni u datoteke `Training.csv` i `Testing.csv`.

In [19]:
# Read the pre-split training and testing tables from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('Training.csv', 'Testing.csv')
)

In [20]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis,Unnamed: 133
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,


In [21]:
# Boolean mask of the training frame: True marks a missing (NaN) cell.
df_train.isna()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis,Unnamed: 133
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4916,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4917,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4918,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


Vidimo da cela poslednja kolona ima nedostajuce vrednosti. Zbog toga je brisemo iz skupa podataka.
`axis=1` znaci da brisemo kolone koje imaju nedostajuce vrednosti, `inplace=True` da se brisanje uradi `u mestu`
bez kreiranja nove instance `DataFrame`.

In [22]:
# Remove columns that contain no data at all (the trailing empty column seen
# in the missing-value mask). `how='all'` restricts the drop to columns that
# are entirely NaN: with the default `how='any'` a single missing cell would
# silently remove a whole feature column, and could remove different columns
# from the two frames.
df_train.dropna(axis=1, how='all', inplace=True)
df_test.dropna(axis=1, how='all', inplace=True)

Enumerisemo sve kategorije kako bismo mogli da mapiramo kategoricke vrednosti naziva bolesti u jedinstveni broj.

In [23]:
# Assign every distinct prognosis label an integer id, numbered in the order
# the labels first appear in the training data.
categories = {
    label: class_id
    for class_id, label in enumerate(pd.unique(df_train['prognosis']))
}

In [24]:
# Display the label -> integer id mapping.
categories

{'Fungal infection': 0,
 'Allergy': 1,
 'GERD': 2,
 'Chronic cholestasis': 3,
 'Drug Reaction': 4,
 'Peptic ulcer diseae': 5,
 'AIDS': 6,
 'Diabetes ': 7,
 'Gastroenteritis': 8,
 'Bronchial Asthma': 9,
 'Hypertension ': 10,
 'Migraine': 11,
 'Cervical spondylosis': 12,
 'Paralysis (brain hemorrhage)': 13,
 'Jaundice': 14,
 'Malaria': 15,
 'Chicken pox': 16,
 'Dengue': 17,
 'Typhoid': 18,
 'hepatitis A': 19,
 'Hepatitis B': 20,
 'Hepatitis C': 21,
 'Hepatitis D': 22,
 'Hepatitis E': 23,
 'Alcoholic hepatitis': 24,
 'Tuberculosis': 25,
 'Common Cold': 26,
 'Pneumonia': 27,
 'Dimorphic hemmorhoids(piles)': 28,
 'Heart attack': 29,
 'Varicose veins': 30,
 'Hypothyroidism': 31,
 'Hyperthyroidism': 32,
 'Hypoglycemia': 33,
 'Osteoarthristis': 34,
 'Arthritis': 35,
 '(vertigo) Paroymsal  Positional Vertigo': 36,
 'Acne': 37,
 'Urinary tract infection': 38,
 'Psoriasis': 39,
 'Impetigo': 40}

Mapiramo nazive bolesti u `int`.

In [25]:
# Encode the disease names as integer class ids using the `categories` lookup.
# `Series.map` yields NaN for every value that is not a key of the dict, so
# unknown labels are reported immediately instead of leaking NaN into the
# targets. Re-running this cell on already-encoded frames trips the same
# check, because the integer ids are not keys of `categories`.
for _frame_name, _frame in (('df_train', df_train), ('df_test', df_test)):
    _codes = _frame['prognosis'].map(categories)
    _unmapped = _frame.loc[_codes.isna(), 'prognosis'].unique()
    if len(_unmapped) > 0:
        raise ValueError(
            f"{_frame_name}: prognosis values missing from `categories`: {list(_unmapped)}"
        )
    # Every value was mapped, so the column can be stored with an integer dtype.
    _frame['prognosis'] = _codes.astype('int64')

Delimo podatke na atribute `X`, na osnovu kojih radimo predvidjanje, i ciljnu vrednost `y` koju zelimo da predvidimo.

In [26]:
# Separate the feature columns (everything except 'prognosis') from the
# encoded 'prognosis' target. The targets are kept as one-column DataFrames.
X_train, y_train = df_train.drop(columns=['prognosis']), df_train[['prognosis']]
X_test, y_test = df_test.drop(columns=['prognosis']), df_test[['prognosis']]

Izvlacimo broj jedinstvenih kategorija kako bismo znali koliko neurona treba da ubacimo u poslednji sloj mreze.

In [27]:
# Count the distinct values of the encoded target; this number is used as the
# width of the network's output layer.
y_num_of_cat = df_train['prognosis'].nunique(dropna=False)
y_num_of_cat

41

In [28]:
# Preview the feature matrix (the 'prognosis' target column has been removed).
X_train.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Pre treniranja moramo pretvoriti numericke vrednosti ciljne promenljive u kategoricke. 
Funkcijom `to_categorical` svaka vrednost iz ciljne kolone `y_train|y_test` postaje vektor. 
Na primer, ako je vrednost ciljne promenljive bila `3` (bolest koja ima redni broj 3), onda se ona preslikava u vektor `[0, 0, 0, 1.0, 0, ..., 0]` koji predstavlja vektor verovatnoca pripadnosti instance datim kategorijama.

In [29]:
# One-hot encode the integer class ids. `num_classes` is pinned to the number
# of classes found in the training data so both matrices always have the same
# width as the model's output layer. Without it, `to_categorical` sizes each
# result from the largest id present in that array, so a test split lacking
# the highest class id would produce a narrower matrix.
y_train_cat = keras.utils.to_categorical(y_train, num_classes=y_num_of_cat)
y_test_cat = keras.utils.to_categorical(y_test, num_classes=y_num_of_cat)

In [30]:

def build_model(input_dim=None, num_classes=None):
    """Build and compile the feed-forward classifier.

    The network is two ReLU hidden layers (128 and 32 units) followed by a
    softmax output layer with one unit per class.

    Args:
        input_dim: Number of input features. Defaults to the number of
            columns of the notebook-level `X_train`.
        num_classes: Number of output classes. Defaults to the notebook-level
            `y_num_of_cat`.

    Returns:
        A compiled `keras.Sequential` model using the Adam optimizer and
        categorical cross-entropy loss, reporting categorical accuracy and
        categorical cross-entropy as metrics.
    """
    # Fall back to the notebook globals so a bare `build_model()` call keeps
    # working exactly as before.
    if input_dim is None:
        input_dim = X_train.shape[1]
    if num_classes is None:
        num_classes = y_num_of_cat

    model = keras.Sequential([
        # Declare the input explicitly instead of passing `input_shape=` to
        # the first Dense layer.
        keras.Input(shape=(input_dim,)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(num_classes, activation='softmax'),
    ])

    model.compile(
        optimizer='adam',
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=[tf.metrics.CategoricalAccuracy(), tf.metrics.CategoricalCrossentropy()],
    )

    return model

In [31]:
# Instantiate the network defined by build_model().
model = build_model()

In [32]:
# Print the layer-by-layer architecture and the parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 128)               17024     
                                                                 
 dense_4 (Dense)             (None, 32)                4128      
                                                                 
 dense_5 (Dense)             (None, 41)                1353      
                                                                 
Total params: 22,505
Trainable params: 22,505
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Hold out 20% of the training rows for validation. `validation_split` would
# take the *last* 20% of the rows exactly as they are ordered in the file
# (Keras slices before shuffling), so the hold-out is drawn here instead:
# shuffled, stratified by class id, and seeded so that the split is the same
# on every run.
X_fit, X_val, y_fit_cat, y_val_cat = train_test_split(
    X_train, y_train_cat, test_size=0.2, stratify=y_train, random_state=42
)
history = model.fit(X_fit, y_fit_cat, epochs=10, validation_data=(X_val, y_val_cat))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
# Score the trained model on the test data. The returned list follows the
# compile() configuration in build_model: the loss first, then the two metrics
# (categorical accuracy, categorical cross-entropy).
model.evaluate(X_test, y_test_cat)



[0.01705722138285637, 1.0, 0.01705722138285637]