This notebook collects all the code from the README in a single place for convenient use.

In [None]:
# Install the project package directly from its GitHub repository.
# NOTE(review): no branch, tag or commit is given, so this installs whatever the
# repository's default branch contains at run time.
!pip install git+https://github.com/nina-adhikari/disease_prediction

In [None]:
# Mount Google Drive at /content/drive (Colab only; `google.colab` is not
# available outside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Folder prefix used by the dataset-loading and model-saving cells below.
# NOTE(review): this is a relative path while Drive is mounted at /content/drive,
# so it only resolves when the working directory is /content — confirm.
DIRECTORY = 'drive/My Drive/Disease-Prediction/ddx-dataset/'

## Classification using the random forest model

In [None]:
from disease_prediction.data import datasets as ds

# Names of the splits to load; shorten this list to load fewer of them.
SUBSETS = ['train', 'validate', 'test']

# The result is indexed by subset name in the cells that follow (df[subset]).
df = ds.load_datasets(subsets=SUBSETS, directory=DIRECTORY)

# Promote each frame's 'index' column to be its row index (modifies the frame in place).
for subset in SUBSETS:
    df[subset].set_index(keys='index', inplace=True)

In [None]:
# Encoding applied to the yes/no column below.
d = {'Y': 1, 'N': 0}

# Drop the columns that have a single value in all three datasets and convert Y/N to 1/0.
#
# Both steps are written so that re-running this cell leaves the data unchanged:
#   * errors='ignore' lets the drop succeed when the columns are already gone
#     (a plain in-place drop raises KeyError on the second run);
#   * the mapping passes through any value that is not 'Y'/'N', so columns that
#     were already converted keep their 1/0 values instead of becoming NaN.
for subset in SUBSETS:
    df[subset].drop(
        columns=['pain_radiate', 'lesions_peeling'],
        inplace=True,
        errors='ignore',
    )
    df[subset]['lesion_larger_than_1cm'] = df[subset]['lesion_larger_than_1cm'].map(
        lambda value: d.get(value, value)
    )

In [None]:
# Categorical features: every object-dtype column of the training frame ...
CATEGORICAL_FEATURES = [
    name for name, dtype in df['train'].dtypes.items() if dtype == 'object'
]
# ... except the target column, which is not a feature.
CATEGORICAL_FEATURES.remove('PATHOLOGY')

# Numerical features: non-object columns whose distinct training values are
# not exactly {0, 1} (those 0/1 columns are left out of both lists).
NUMERICAL_FEATURES = [
    name
    for name, dtype in df['train'].dtypes.items()
    if not (set(df['train'][name].unique()) == {0, 1} or dtype == 'object')
]

# Per-subset feature matrices (everything but the target) and target vectors.
X = {subset: df[subset].drop(columns=['PATHOLOGY']) for subset in SUBSETS}
y = {subset: df[subset]['PATHOLOGY'].copy() for subset in SUBSETS}

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

# optimal model hyperparameters
n_estimators = 500
max_depth = 20
min_samples_leaf = 5
bootstrap = False

# Fixed seed so that repeated runs build the same forest. Even with
# bootstrap=False the trees are randomized through the per-split feature
# subsampling, so without a seed the reported scores change from run to run.
# 42 matches the seed used by the other stochastic steps in this notebook.
random_state = 42

rf = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
    random_state=random_state,
)

# Preprocessing + model in one estimator:
#   * categorical columns are one-hot encoded (categories unseen during fit are ignored),
#   * numerical columns are standardized,
#   * all remaining columns are passed through unchanged.
rf_pipeline = make_pipeline(
    ColumnTransformer(
        [
            ('categorical', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
            ('numerical', StandardScaler(), NUMERICAL_FEATURES),
        ],
        remainder='passthrough',
    ),
    rf,
)

In [None]:
# Fit the preprocessing steps and the random forest on the training subset.
rf_pipeline.fit(X['train'], y['train'])

In [None]:
from sklearn.metrics import classification_report

# Print one report per held-out subset: validation first, then test.
for subset in ('validate', 'test'):
    predictions = rf_pipeline.predict(X[subset])
    print(classification_report(y[subset], predictions, digits=4))

## Classification using the DistilBERT transformer

In [None]:
!pip install datasets transformers evaluate imblearn

In [None]:
from disease_prediction.models import text_classification as tc
from disease_prediction.models import classification_helper as ch

In [None]:
import pandas as pd

# NOTE: from here on `SUBSETS` and `df` are rebound for the text data; the middle
# split is called 'validation' here, not 'validate' as in the random forest section.
SUBSETS = ['train', 'validation', 'test']

# One DataFrame per split, read from DIRECTORY + 'text-<subset>-gpt.json'.
df = {
    subset: pd.read_json(DIRECTORY + 'text-' + subset + '-gpt.json')
    for subset in SUBSETS
}

In [None]:
from sklearn.model_selection import train_test_split

# Pool the three loaded splits, then hold out a seeded 10% of the pooled rows.
df_combined = pd.concat([df[name] for name in ('train', 'validation', 'test')])
X_train, X_test, y_train, y_test = train_test_split(
    df_combined['sentence1'],
    df_combined['label'],
    test_size=0.1,
    random_state=42,
)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Seeded random undersampling of the training portion.
rus = RandomUnderSampler(random_state=42)

# The sentences are passed as a single-column 2-D array; the labels stay 1-D.
X_resampled, y_resampled = rus.fit_resample(
    X_train.to_numpy()[:, None],
    y_train.to_numpy(),
)

In [None]:
# Flatten the single-column sentence array back to 1-D next to its labels.
df_resampled = pd.DataFrame({'sentence1': X_resampled.ravel(), 'label': y_resampled})

# Overwrite the loaded splits: the resampled data becomes 'train', and the
# held-out rows serve as both 'validation' and 'test'.
# NOTE: because df is overwritten here, reload the text datasets before
# re-running the cell that builds df_combined.
df['train'] = df_resampled
df.update({
    name: pd.DataFrame({'sentence1': X_test, 'label': y_test})
    for name in ('validation', 'test')
})

In [None]:
# Use every available row: set each sample limit to the size of its split.
ch.DATA_ARGS.max_train_samples = len(X_resampled)

# The test limit is not relevant since we are not predicting anything, but it is
# defined anyway (same size as validation) to placate the transformer.
ch.DATA_ARGS.max_val_samples = ch.DATA_ARGS.max_test_samples = len(X_test)

In [None]:
# Initialise the text-classification helper from the three DataFrames in `df`.
# NOTE(review): what exactly gets built here is defined in
# disease_prediction.models.text_classification — confirm there.
tc.setup_from_scratch(df)

In [None]:
# Run training with the state prepared by the setup call above.
# NOTE(review): presumably fine-tunes on df['train'] — confirm in the helper module.
tc.train()

In [None]:
# Evaluate the trained model.
# NOTE(review): presumably scored on df['validation'] — confirm in the helper module.
tc.evaluate()

In [None]:
# Save the model to DIRECTORY + 'model' so it can be reloaded later without retraining.
tc.WRAPPER.save_pretrained(DIRECTORY + 'model')

In [None]:
# Set the helper up from the model saved at DIRECTORY + 'model'.
# NOTE(review): presumably an alternative to setup_from_scratch + train — confirm.
tc.setup_from_finetuned(DIRECTORY + 'model')