In [2]:
# general libiraries
import pandas as pd
import numpy as np

# plotting
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
# modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Dense,Dropout,BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau

import warnings 
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('/kaggle/input/diagnosis-data-filtered-translated/my_data.csv')

In [4]:
df.columns

Index(['AGE', 'SEX', 'Bronchitis', 'Pneumonia', 'URTI', 'Bronchiectasis',
       'Tuberculosis', 'Influenza', 'HIV (initial infection)', 'Chagas',
       ...
       'Have you breastfed one of your children for more than 9 months?',
       'Have you felt confused or disorientated lately?',
       'In the last month, have you been in contact with anyone infected with the Ebola virus?',
       'Have you noticed any unusual bleeding or bruising related to your consultation today?',
       'Do you live in the suburbs?',
       'Do you ever temporarily stop breathing while you’re asleep?',
       'Do you have a decrease in appetite?',
       'Does your mother suffer from asthma?', 'Do you live in a rural area?',
       'Are you of Asian descent?'],
      dtype='object', length=274)

In [5]:
X = df.drop(columns = df.columns[2:51],axis =1)
y = df[df.columns[2:51]]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

scaler_x = StandardScaler()
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)

In [6]:
max_columns_per_row = y.idxmax(axis=1)

In [7]:
count_per_column = max_columns_per_row.value_counts().reset_index()
count_per_column.columns = ['Column', 'Count']

In [8]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080881 entries, 0 to 1080880
Data columns (total 49 columns):
 #   Column                                    Non-Null Count    Dtype  
---  ------                                    --------------    -----  
 0   Bronchitis                                1080881 non-null  float64
 1   Pneumonia                                 1080881 non-null  float64
 2   URTI                                      1080881 non-null  float64
 3   Bronchiectasis                            1080881 non-null  float64
 4   Tuberculosis                              1080881 non-null  float64
 5   Influenza                                 1080881 non-null  float64
 6   HIV (initial infection)                   1080881 non-null  float64
 7   Chagas                                    1080881 non-null  float64
 8   Scombroid food poisoning                  1080881 non-null  float64
 9   Sarcoidosis                               1080881 non-null  float64
 10  Unstab

In [9]:
y_train
y_train = np.argmax(y_train,axis =1)
y_test = np.argmax(y_test,axis =1)

In [10]:
model = Sequential(
   
    [
        Dense(units = 256,activation  = "relu",input_dim= X_train.shape[1]),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(units = 256,activation  = "relu",input_dim= X_train.shape[1]),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(units = 256,activation  = "relu",input_dim= X_train.shape[1]),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(units = 128,activation  = "relu",input_dim= X_train.shape[1]),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(units = 128,activation  = "relu",input_dim= X_train.shape[1]),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(units = 128,activation  = "relu",input_dim= X_train.shape[1]),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(units = 64,activation  = "relu"),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(units = 64,activation  = "relu"),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(units = 49,activation  = "sigmoid"),
    ]
)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy']
)

I0000 00:00:1748024019.948384      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [11]:
model.summary()

In [12]:
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

In [13]:
y_train = np.array(y_train).reshape(-1,1)
y_test = np.array(y_test).reshape(-1,1)

In [14]:
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs= 30, batch_size=700,
          callbacks=[early_stop, reduce_lr])

Epoch 1/30


I0000 00:00:1748024031.514854     102 service.cc:148] XLA service 0x7985ec009b50 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748024031.515264     102 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1748024032.342361     102 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  37/1236[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 4ms/step - accuracy: 0.0345 - loss: 4.3804 

I0000 00:00:1748024038.013982     102 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 10ms/step - accuracy: 0.3429 - loss: 2.4028 - val_accuracy: 0.6538 - val_loss: 1.1389 - learning_rate: 0.0010
Epoch 2/30
[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6082 - loss: 1.2721 - val_accuracy: 0.6937 - val_loss: 0.9510 - learning_rate: 0.0010
Epoch 3/30
[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6617 - loss: 1.0908 - val_accuracy: 0.7063 - val_loss: 0.8989 - learning_rate: 0.0010
Epoch 4/30
[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6816 - loss: 1.0153 - val_accuracy: 0.7113 - val_loss: 0.8649 - learning_rate: 0.0010
Epoch 5/30
[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6914 - loss: 0.9734 - val_accuracy: 0.7164 - val_loss: 0.8414 - learning_rate: 0.0010
Epoch 6/30
[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x79880c199c90>

In [15]:
y_pred = model.predict(X_test)


[1m6756/6756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step


In [23]:
pred_copy=y_pred.copy()


In [24]:
pred_copy

array([[3.31583291e-01, 5.33024706e-02, 7.46045932e-02, ...,
        1.35318565e-04, 2.33901925e-02, 6.65814616e-04],
       [9.96118426e-01, 9.97960210e-01, 9.29887712e-01, ...,
        2.30604201e-03, 2.88702804e-03, 5.80225060e-05],
       [3.33138674e-01, 3.49269918e-04, 1.11043315e-04, ...,
        5.63095127e-05, 6.94818755e-06, 9.29622317e-08],
       ...,
       [1.65947750e-01, 2.27717814e-04, 1.00190996e-03, ...,
        4.09854529e-03, 1.37794882e-06, 9.53300571e-07],
       [9.97085273e-01, 1.33930743e-01, 1.16393268e-01, ...,
        1.20735192e-03, 5.99561073e-03, 1.76048411e-06],
       [9.97786403e-01, 9.94645596e-01, 9.99538302e-01, ...,
        2.19653919e-03, 4.17031705e-01, 7.90078229e-06]], dtype=float32)

In [25]:
len(y.columns)

49

In [26]:


columns = [
    'Bronchitis', 'Pneumonia', 'URTI', 'Bronchiectasis', 'Tuberculosis', 'Influenza',
    'HIV (initial infection)', 'Chagas', 'Scombroid food poisoning', 'Sarcoidosis',
    'Unstable angina', 'Possible NSTEMI / STEMI', 'Boerhaave', 'Spontaneous rib fracture',
    'Pericarditis', 'Stable angina', 'GERD', 'Panic attack', 'Cluster headache',
    'Chronic rhinosinusitis', 'Acute rhinosinusitis', 'Anemia', 'Acute laryngitis',
    'Viral pharyngitis', 'Anaphylaxis', 'PSVT', 'Atrial fibrillation', 'Acute pulmonary edema',
    'Guillain-Barré syndrome', 'Myocarditis', 'Acute dystonic reactions', 'Myasthenia gravis',
    'SLE', 'Allergic sinusitis', 'Croup', 'Larygospasm', 'Pancreatic neoplasm',
    'Inguinal hernia', 'Pulmonary embolism', 'Spontaneous pneumothorax',
    'Pulmonary neoplasm', 'Epiglottitis', 'Whooping cough',
    'Bronchospasm / acute asthma exacerbation', 'Localized edema',
    'Acute COPD exacerbation / infection', 'Ebola', 'Acute otitis media', 'Bronchiolitis'
]

df_pred = pd.DataFrame(pred_copy, columns=columns)


In [27]:
max_columns_per_row = df_pred.idxmax(axis=1)

In [28]:
count_per_column = max_columns_per_row.value_counts().reset_index()
count_per_column.columns = ['Column', 'Count']


In [29]:
fig = px.pie(count_per_column, values='Count', names='Column',
             title='Distribution of Columns with Max Values per Row y_pred'
             ,height=1000)
fig.show()

In [30]:

fig = px.bar(count_per_column, x='Column', y='Count', color='Column',
             title='Distribution of Columns with Max Values per Row',
             labels={'Count': 'Count', 'Column': 'Column Name'})


fig.update_layout(
    xaxis_tickangle=-45,
    width=1200,   
    height=700    
)

fig.show()

In [31]:
y_pred = np.argmax(y_pred,axis =1)
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.15      0.45      0.23      7214
           1       0.63      0.60      0.61      6503
           2       0.75      0.73      0.74     16866
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.88      0.62      0.73      6753
           6       0.95      0.96      0.96      7420
           7       0.86      0.40      0.54      1792
           8       1.00      0.99      1.00      4711
           9       0.99      0.76      0.86      4699
          10       0.79      0.71      0.75     10624
          11       0.41      0.96      0.58      6454
          12       0.99      0.68      0.81      3873
          13       0.96      0.59      0.74      1320
          14       0.93      0.62      0.74      5769
          16       0.95      0.74      0.83      6546
          17       0.91      0.89      0.90      6528
          18       0.89    

In [32]:
y_pred = np.argmax(y_pred)

In [33]:
import joblib
joblib.dump(model, 'model.pkl')  

['model.pkl']

In [34]:
model.save("model_73%_test_accuracy.h5")

In [35]:
data_target = {
    'y': list(y.columns.values),
}

data_input = {
    'X' : list(X.columns.values)
}

Data_Target = pd.DataFrame(data = data_input)
Data_input = pd.DataFrame(data = data_target)

In [36]:
Data_Target.to_csv("Data_Target.csv")
Data_input.to_csv("Data_input.csv")