In [1]:
import sys, os, pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [2]:
current_path=os.getcwd()
loader_path=os.path.abspath(
    os.path.join(current_path, '..', 'disease_prediction','data',
        )
    )
sys.path.append(loader_path)
import datasets as ds
df=ds.load_datasets(
    subsets=['train', 'test', 'validate'],
    directory='../ddx-dataset/'
)
dp_data=pd.concat(
    [df['train'],df['test'],df['validate']],
    axis=0, 
    ignore_index=True
    )
dp_data.reset_index(drop=True,inplace=True)

Features and Labels

In [3]:
X=dp_data.drop('PATHOLOGY', axis=1)
y=dp_data['PATHOLOGY']

Label Encoding

In [4]:
label_encoder=LabelEncoder()
y_encoded=label_encoder.fit_transform(y)
pathologies=label_encoder.classes_

Features Encoding

In [5]:
numerical_features=X.select_dtypes(include='int64').columns.tolist()
categorical_features=X.select_dtypes(include='object').columns.tolist()
features_preprocessor=ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),numerical_features),
        ('cat',OneHotEncoder(handle_unknown='ignore'),categorical_features)
    ]
)

Data Split

In [6]:
X_train, X_test, y_train, y_test=train_test_split(
    X, y_encoded, test_size=0.30, random_state=42
)

In [7]:
pipeline = Pipeline(
    steps=[
            ('preprocessor', features_preprocessor),
            ('classifier', LogisticRegression(
                multi_class='multinomial', solver='lbfgs', max_iter=1000
                )
            )
        ]
)

pipeline.fit(X_train, y_train)
y_predicted=pipeline.predict(X_test)



In [8]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_predicted)
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print('Classification Report:')
print(classification_report(y_test, y_predicted))

Accuracy: 0.60
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      9198
           1       0.79      0.44      0.57     10729
           2       0.99      0.24      0.39      3408
           3       1.00      0.22      0.36       275
           4       0.39      0.53      0.45     11029
           5       0.65      0.33      0.44     10190
           6       0.53      1.00      0.69     10697
           7       0.87      0.30      0.44      4407
           8       0.58      0.72      0.64      8125
           9       0.53      0.57      0.55      6041
          10       1.00      1.00      1.00      2130

    accuracy                           0.60     76229
   macro avg       0.75      0.56      0.58     76229
weighted avg       0.66      0.60      0.59     76229

