In [1]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import pandas as pd
import os

In [2]:
def shuffle_dataframe(df, random_state=None):
    """Return a copy of ``df`` with its rows in random order and a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows should be permuted. It is not modified.
    random_state : int or numpy.random.Generator, optional
        Seed (or generator) used for the permutation. When ``None`` (the
        default) the global ``np.random`` state is used, exactly as before,
        so the order differs between runs unless the caller seeded NumPy.

    Returns
    -------
    pandas.DataFrame
        Row-shuffled frame with the old index dropped.
    """
    # Positional indices 0..n-1, permuted in place below.
    indices = np.arange(df.shape[0])
    if random_state is None:
        # Legacy behaviour: draw from the global NumPy random state.
        np.random.shuffle(indices)
    else:
        # Reproducible behaviour: a dedicated generator that leaves the
        # global random state untouched.
        np.random.default_rng(random_state).shuffle(indices)
    # Reorder positionally, then discard the original index labels.
    return df.iloc[indices].reset_index(drop=True)

In [3]:
# Initialize and fit these in a setup or training script
# Module-level preprocessors shared by fit_preprocessors() (which fits them)
# and process_non_txt_data() (which applies them).
# Standardizes the numeric columns.
scaler = StandardScaler()
# One-hot encodes the categorical columns; handle_unknown='ignore' lets
# transform() accept categories that were not present when fitting.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
# Maps label strings to integer class ids and back (inverse_transform).
label_encoder = LabelEncoder()

def fit_preprocessors(train_df_data, num_cols, cat_cols, label_col='primary_mechanism'):
    """Fit the module-level scaler, one-hot encoder and label encoder.

    Parameters
    ----------
    train_df_data : DataFrame-like
        Data the preprocessors are fitted on.
    num_cols : list
        Columns passed to ``scaler.fit``.
    cat_cols : list
        Columns passed to ``one_hot_encoder.fit``.
    label_col : str
        Column passed to ``label_encoder.fit``.

    The estimators are fitted in place; nothing is returned.
    """
    # Each estimator is paired with the column selection it learns from;
    # the order (scaler, one-hot, label) matches the original sequence.
    fit_plan = (
        (scaler, num_cols),
        (one_hot_encoder, cat_cols),
        (label_encoder, label_col),
    )
    for estimator, columns in fit_plan:
        estimator.fit(train_df_data[columns])

def process_non_txt_data(in_df, has_label=True, n_classes=7, label_col='primary_mechanism', num_cols=None, cat_cols=None):
    """Turn the tabular columns of ``in_df`` into a dense feature matrix.

    Uses the already-fitted module-level ``scaler``, ``one_hot_encoder`` and
    ``label_encoder``; call ``fit_preprocessors`` first.

    Parameters
    ----------
    in_df : DataFrame-like
        Rows to transform.
    has_label : bool
        When True, also encode ``label_col`` and return it.
    n_classes : int
        Width of the one-hot label matrix (``num_classes`` of ``to_categorical``).
    label_col : str
        Label column name; only read when ``has_label`` is True.
    num_cols : list, optional
        Numeric columns to scale. Defaults to ``['age']``.
    cat_cols : list, optional
        Categorical columns to one-hot encode.
        Defaults to ``['diagnosis', 'sex', 'body_part']``.

    Returns
    -------
    numpy.ndarray or tuple
        The feature matrix alone when ``has_label`` is False, otherwise
        ``(features, one_hot_labels)``.
    """
    # None sentinels replace the former mutable list defaults; the effective
    # default columns are unchanged.
    if num_cols is None:
        num_cols = ['age']
    if cat_cols is None:
        cat_cols = ['diagnosis', 'sex', 'body_part']

    features_scaled = scaler.transform(in_df[num_cols])
    # The encoder's transform output is converted to a dense array so it can
    # be stacked next to the scaled numeric block.
    one_hot_tab_features = one_hot_encoder.transform(in_df[cat_cols]).toarray()

    # Column layout: scaled numeric features first, then one-hot columns.
    tabular_features_scaled = np.hstack([features_scaled, one_hot_tab_features])

    if not has_label:
        return tabular_features_scaled

    # Label strings -> integer ids -> one-hot rows of width n_classes.
    y_integer_encoded = label_encoder.transform(np.array(in_df[label_col]))
    y_one_hot = to_categorical(y_integer_encoded, num_classes=n_classes)
    return (tabular_features_scaled, y_one_hot)

In [4]:
# Initialize the tokenizer globally
# Shared by fit_tokenizer() and tokenize_samples(). num_words is set to
# 100000, the same value used as max_words for the Embedding layers below.
tokenizer = Tokenizer(num_words=100000)
def fit_tokenizer(txt_data):
    """Fit the module-level ``tokenizer`` on ``txt_data``.

    ``txt_data`` is converted to a NumPy array before being handed to
    ``tokenizer.fit_on_texts``. Nothing is returned.
    """
    tokenizer.fit_on_texts(np.array(txt_data))

def tokenize_samples(txt_data, maxlen=207):
    """Convert texts to padded integer sequences with the module-level tokenizer.

    Parameters
    ----------
    txt_data : array-like of str
        Texts to encode; converted to a NumPy array first.
    maxlen : int
        Sequence length passed to ``pad_sequences``.

    Returns
    -------
    The padded sequence array. Its shape is also printed.
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences(np.array(txt_data)),
        maxlen=maxlen,
    )
    print('Shape of data tensor:', padded.shape)
    return padded

In [5]:
# Input CSV locations, given relative to the notebook's working directory.
# Narrative table: merged onto each of the other tables by cpsc_case_number.
narrative_path = '../../data_to_model/narrative_tbl_all.csv'
# Train / test splits; both are later processed with the
# 'primary_mechanism_bin' label column.
train_path = '../../data_to_model/Train and Test Sets/train_df_count_bin.csv'
test_path = '../../data_to_model/Train and Test Sets/test_df_count_bin.csv'
# New data to score; processed later with has_label=False.
new_path = '../../data_to_model/new_data_to_model_count.csv'

In [6]:
narrative_df = pd.read_csv(narrative_path)


def _read_with_narrative(csv_path):
    """Read ``csv_path``, left-join the narrative table on case number, shuffle rows."""
    merged = pd.read_csv(csv_path).merge(narrative_df, how='left', on=['cpsc_case_number'])
    return shuffle_dataframe(merged)


# Loaded (and therefore shuffled) in the order train, test, new.
train_df, test_df, new_df = (
    _read_with_narrative(csv_path) for csv_path in (train_path, test_path, new_path)
)

In [7]:
# Row/column counts of the train, test and new frames, in that order.
for frame in (train_df, test_df, new_df):
    print(frame.shape)

(1592, 23)
(399, 23)
(989, 22)


In [8]:
# Fit the scaler, one-hot encoder and label encoder on the training frame only,
# using the binary label column.
fit_preprocessors(train_df, num_cols =['age'], cat_cols =['diagnosis', 'sex', 'body_part'], label_col='primary_mechanism_bin')
# Build the tokenizer vocabulary from the training narratives only.
fit_tokenizer(train_df['narrative'])

In [9]:
# Label settings shared by the two labelled splits.
binary_label_kwargs = dict(n_classes=2, label_col='primary_mechanism_bin')

train_tabular_input, y_train_one_hot = process_non_txt_data(in_df=train_df, **binary_label_kwargs)
test_tabular_input, y_test_one_hot = process_non_txt_data(in_df=test_df, **binary_label_kwargs)
# The new data is processed without labels, so only the feature matrix comes back.
new_tabular_input = process_non_txt_data(in_df=new_df, has_label=False)

for tabular_matrix in (train_tabular_input, test_tabular_input, new_tabular_input):
    print(tabular_matrix.shape)

(1592, 36)
(399, 36)
(989, 36)


In [None]:
#y_train_one_hot

In [None]:
#y_test_one_hot

In [10]:
# Shapes of the one-hot label matrices for the train and test splits.
for one_hot_labels in (y_train_one_hot, y_test_one_hot):
    print(one_hot_labels.shape)

(1592, 2)
(399, 2)


In [11]:
# Encode the narrative column of each frame (train, test, new - in that order)
# with the tokenizer fitted on the training narratives.
train_txt_tokens, test_txt_tokens, new_txt_tokens = (
    tokenize_samples(frame['narrative']) for frame in (train_df, test_df, new_df)
)

Shape of data tensor: (1592, 207)
Shape of data tensor: (399, 207)
Shape of data tensor: (989, 207)


In [12]:
from keras.layers import Input, Embedding, Flatten, Dense, concatenate
from keras.models import Model

# Derived from the objects that actually produce the text input, so the model
# cannot drift from the tokenizer settings or the padded sequence length.
maxlen = train_txt_tokens.shape[1]   # padded sequence length
max_words = tokenizer.num_words      # vocabulary cap used by the tokenizer

# Text input branch: token ids -> 8-dim embeddings -> one flat vector.
text_input = Input(shape=(maxlen,), dtype='int32', name='text_input')
embedded_text = Embedding(max_words, 8)(text_input)
flattened_text = Flatten()(embedded_text)

# Tabular input branch: scaled numeric + one-hot features -> small dense layer.
tabular_input = Input(shape=(train_tabular_input.shape[1],), name='tabular_input')
dense_tabular = Dense(16, activation='relu')(tabular_input)

# Combine branches
concatenated = concatenate([flattened_text, dense_tabular], axis=-1)
# The labels are one-hot rows over mutually exclusive classes and predictions
# are decoded with argmax, so the head is a softmax trained with categorical
# cross-entropy. The previous sigmoid + binary_crossentropy pairing scored each
# output unit independently and made 'accuracy' resolve to element-wise binary
# accuracy rather than class accuracy.
output = Dense(y_train_one_hot.shape[1], activation='softmax')(concatenated)

# Create and compile the model
model_combined = Model(inputs=[text_input, tabular_input], outputs=output)
model_combined.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model_combined.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_input (InputLayer)         [(None, 207)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 207, 8)       800000      text_input[0][0]                 
__________________________________________________________________________________________________
tabular_input (InputLayer)      [(None, 36)]         0                                            
__________________________________________________________________________________________________
flatten (Flatten)               (None, 1656)         0           embedding[0][0]                  
______________________________________________________________________________________________

In [13]:
# Train the two-branch model; a 10% fraction of the training arrays is held
# out for validation via validation_split.
history_combined = model_combined.fit(
    x=[train_txt_tokens, train_tabular_input],
    y=y_train_one_hot,
    batch_size=32,
    epochs=80,
    validation_split=0.1,
)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [14]:
from keras.layers import Dropout

In [15]:
# Text input branch: same layout as the first model (ids -> embeddings -> flat).
text_input_deeper = Input(shape=(maxlen,), dtype='int32', name='text_input')
embedded_text_deeper = Embedding(max_words, 8)(text_input_deeper)
flattened_text_deeper = Flatten()(embedded_text_deeper)

# Tabular input branch: two dense layers with dropout in between.
tabular_input_deeper = Input(shape=(train_tabular_input.shape[1],), name='tabular_input')
dense_tabular_p1 = Dense(64, activation='relu')(tabular_input_deeper)
dropout1 = Dropout(0.5)(dense_tabular_p1)  # Dropout for regularization
dense_tabular_deeper = Dense(32, activation='relu')(dropout1)

# Combine branches
concatenated_deeper = concatenate([flattened_text_deeper, dense_tabular_deeper], axis=-1)
# One-hot, mutually exclusive labels decoded with argmax call for a softmax
# head and categorical cross-entropy. The previous sigmoid + binary_crossentropy
# pairing treated the two outputs as independent and reported element-wise
# binary accuracy instead of class accuracy.
output_deeper = Dense(y_train_one_hot.shape[1], activation='softmax')(concatenated_deeper)

# Create and compile the model
model_combined_deeper = Model(inputs=[text_input_deeper, tabular_input_deeper], outputs=output_deeper)
model_combined_deeper.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model_combined_deeper.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tabular_input (InputLayer)      [(None, 36)]         0                                            
__________________________________________________________________________________________________
text_input (InputLayer)         [(None, 207)]        0                                            
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 64)           2368        tabular_input[0][0]              
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 207, 8)       800000      text_input[0][0]                 
____________________________________________________________________________________________

In [16]:
# Train the deeper variant with the same inputs and schedule as the first model.
history_combined_deeper = model_combined_deeper.fit(
    x=[train_txt_tokens, train_tabular_input],
    y=y_train_one_hot,
    batch_size=32,
    epochs=80,
    validation_split=0.1,
)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [19]:
# Score the test split with the first (shallower) model.
test_predictions = model_combined.predict([test_txt_tokens, test_tabular_input])
# Highest-scoring output unit per row -> integer class id.
test_predicted_classes = test_predictions.argmax(axis=1)
# Integer ids back to the original label strings.
test_class_names = label_encoder.inverse_transform(test_predicted_classes)

In [20]:
# Preview the first five predicted label names for the test split.
test_class_names[0:5]

array(['Other', 'Other', 'Other', 'Other', 'Other'], dtype=object)

In [21]:
# Score the unlabelled new data with the first (shallower) model.
new_predictions = model_combined.predict([new_txt_tokens, new_tabular_input])
# Highest-scoring output unit per row -> integer class id.
new_predicted_classes = new_predictions.argmax(axis=1)
# Integer ids back to the original label strings.
new_class_names = label_encoder.inverse_transform(new_predicted_classes)

In [22]:
# Preview the first five predicted label names for the new data.
new_class_names[0:5]

array(['Other', 'Other', 'Other', 'Other', 'Other'], dtype=object)

In [23]:
# Sanity check: one predicted label per row of new_df.
new_class_names.shape

(989,)

In [24]:
# Pair each case number with its predicted label name; rows stay in the
# (shuffled) order of the source frame the predictions were computed from.
test_pred_df = test_df[['cpsc_case_number']].assign(pred=test_class_names)
new_pred_df = new_df[['cpsc_case_number']].assign(pred=new_class_names)



In [28]:
# Write the test-split predictions (case number + predicted label) to CSV.
# index=False is not passed, so pandas also writes the row index as a leading column.
test_pred_df.to_csv('../../Results/Prediction Data/keras_binary_test_out.csv')

In [29]:
# Write the new-data predictions (case number + predicted label) to CSV.
# index=False is not passed, so pandas also writes the row index as a leading column.
new_pred_df.to_csv('../../Results/Prediction Data/keras_binary_new_out.csv')

In [27]:
# Persist the weights of both models to separate files.
# Previously both calls wrote to the same path, so the second call overwrote
# the first and the weights of model_combined (the model used for the exported
# predictions) were lost.
# NOTE(review): the original path therefore used to end up holding the deeper
# model's weights; any loader that relied on that must now read the
# '_deeper' file instead.
model_combined.save_weights('../../Model Objects/keras_out_new_vars_binv1.h5')
model_combined_deeper.save_weights('../../Model Objects/keras_out_new_vars_binv1_deeper.h5')