In [1]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import pandas as pd
import os

In [2]:
def process_non_txt_data(in_df, has_label = True, n_classes = 7
                 , label_col = 'primary_mechanism', num_cols = ['age']
                 , cat_cols = ['diagnosis', 'sex', 'body_part' ]
                 , scaler = None, dummy_columns = None, label_encoder = None ):
    """Build the tabular feature matrix (and optionally one-hot labels) from a frame.

    The numeric columns are scaled, the categorical columns are passed through
    ``pd.get_dummies``, and the two results are stacked side by side.

    With the default arguments every transformer is fitted on ``in_df`` itself.
    That is appropriate for the training frame only: a frame that lacks one of
    the category levels yields fewer dummy columns, and a frame that lacks one
    of the labels gets different integer codes.  When processing test or new
    data, pass the train-fitted ``scaler`` / ``dummy_columns`` /
    ``label_encoder`` so all matrices share the same layout and scaling.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Frame holding ``num_cols``, ``cat_cols`` and, if ``has_label``, ``label_col``.
    has_label : bool
        When True the labels are encoded and returned as well.
    n_classes : int
        Width of the one-hot label matrix.
    label_col : str
        Name of the label column.
    num_cols, cat_cols : sequence of str
        Numeric and categorical feature columns.
    scaler : fitted scaler with a ``transform`` method, optional
        Reused as-is (not refitted).  ``None`` fits a new ``StandardScaler``
        on ``in_df``.
    dummy_columns : sequence of str, optional
        Column layout to align the dummy matrix to, e.g.
        ``pd.get_dummies(train_df[cat_cols]).columns``.  Missing columns are
        filled with 0 and unknown ones are dropped.  ``None`` keeps whatever
        columns ``in_df`` produces.
    label_encoder : fitted ``LabelEncoder``, optional
        Reused as-is (not refitted).  ``None`` fits a new encoder on ``in_df``.

    Returns
    -------
    numpy.ndarray
        The feature matrix when ``has_label`` is False.
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(features, one_hot_labels)`` when ``has_label`` is True.
    """
    # list(...) so that tuples are accepted too; pandas treats a tuple as a single key.
    numeric_values = in_df[list(num_cols)].values
    if scaler is None:
        features_scaled = StandardScaler().fit_transform(numeric_values)
    else:
        features_scaled = scaler.transform(numeric_values)

    one_hot_tab_features = pd.get_dummies(in_df[list(cat_cols)])
    if dummy_columns is not None:
        # Align to the reference layout; cast so the added zero columns do not
        # leave a mixed-dtype frame behind.
        one_hot_tab_features = one_hot_tab_features.reindex(
            columns=dummy_columns, fill_value=0).astype(float)
    one_hot_tab_features = one_hot_tab_features.values

    tabular_features_scaled = np.hstack([features_scaled, one_hot_tab_features])

    if not has_label:
        return tabular_features_scaled

    labels_np = np.array(in_df[label_col])
    if label_encoder is None:
        y_integer_encoded = LabelEncoder().fit_transform(labels_np)
    else:
        y_integer_encoded = label_encoder.transform(labels_np)
    y_one_hot = to_categorical(y_integer_encoded, num_classes=n_classes)
    return tabular_features_scaled, y_one_hot

In [3]:
def tokenize_samples(txt_data, maxlen = 207, max_words = 100000, tokenizer = None):
    """Turn raw texts into a padded integer matrix of shape ``(n_samples, maxlen)``.

    With the default ``tokenizer=None`` a new ``Tokenizer`` is fitted on
    ``txt_data``.  That is only appropriate for the training texts: a tokenizer
    fitted separately on test or new data assigns different integers to the
    same words, so an embedding trained on the training indices would see
    unrelated ids.  Pass the tokenizer fitted on the training texts to reuse
    its vocabulary.

    Parameters
    ----------
    txt_data : sequence of str
        Texts to tokenize.
    maxlen : int
        Length every sequence is padded/truncated to by ``pad_sequences``.
    max_words : int
        ``num_words`` cap used when a new tokenizer is created; ignored when
        ``tokenizer`` is supplied.
    tokenizer : fitted ``Tokenizer``, optional
        Reused as-is (not refitted).

    Returns
    -------
    numpy.ndarray
        The padded sequences.
    """
    text_np = np.array(txt_data)

    if tokenizer is None:
        tokenizer = Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(text_np)
    sequences = tokenizer.texts_to_sequences(text_np)

    print('Found %s unique tokens.' % len(tokenizer.word_index))
    text_input_clean = pad_sequences(sequences, maxlen=maxlen)
    print('Shape of data tensor:', text_input_clean.shape)

    return text_input_clean

In [4]:
# Input CSV locations (relative to this notebook): the narrative table, the
# train/test splits, and the unlabeled new data.
narrative_path, train_path, test_path, new_path = (
    '../../data_to_model/narrative_tbl_all.csv',
    '../../data_to_model/Train and Test Sets/train_df_count.csv',
    '../../data_to_model/Train and Test Sets/test_df_count.csv',
    '../../data_to_model/new_data_to_model_count.csv',
)

In [5]:
# Load the narrative table; it is left-joined onto the train/test/new frames
# on cpsc_case_number in the next cell.
narrative_df = pd.read_csv(narrative_path)

In [6]:
def _read_with_narrative(csv_path):
    """Read ``csv_path`` and left-join ``narrative_df`` on ``cpsc_case_number``."""
    base_df = pd.read_csv(csv_path)
    return base_df.merge(narrative_df, how='left', on=['cpsc_case_number'])

train_df = _read_with_narrative(train_path)
test_df = _read_with_narrative(test_path)
new_df = _read_with_narrative(new_path)

In [15]:
# Report (rows, columns) of each merged frame: train, test, then new.
for merged_frame in (train_df, test_df, new_df):
    print(merged_frame.shape)

(1591, 23)
(400, 23)
(989, 22)


In [14]:
# Build the tabular feature matrices; only the train and test frames carry labels.
train_tabular_input, y_train_one_hot = process_non_txt_data(train_df)
test_tabular_input, y_test_one_hot = process_non_txt_data(test_df)
new_tabular_input = process_non_txt_data(new_df, has_label=False)

# NOTE(review): the recorded output shows 36 columns for train/test but 35 for
# the new data, so the new-data matrix does not line up with the other two.
for tabular_matrix in (train_tabular_input, test_tabular_input, new_tabular_input):
    print(tabular_matrix.shape)

(1591, 36)
(400, 36)
(989, 35)


In [None]:
# Tokenize and pad the narrative column of each frame (train, test, new — in that order).
# NOTE(review): each call below fits its own Tokenizer, so the three integer
# vocabularies are independent of one another.
train_txt_tokens, test_txt_tokens, new_txt_tokens = (
    tokenize_samples(source_df['narrative'])
    for source_df in (train_df, test_df, new_df)
)

In [None]:
from keras.layers import Input, Embedding, Flatten, Dense, concatenate
from keras.models import Model

# Input dimensions. These were previously referenced without ever being defined
# at notebook scope; take them from the prepared training arrays so the model
# stays in sync with the data it is trained on.
maxlen = train_txt_tokens.shape[1]                  # padded narrative length
max_words = 100000                                  # must match the num_words cap in tokenize_samples
n_tabular_features = train_tabular_input.shape[1]   # width of the tabular feature matrix

# Text input branch
text_input = Input(shape=(maxlen,), dtype='int32', name='text_input')
embedded_text = Embedding(max_words, 8)(text_input)
flattened_text = Flatten()(embedded_text)

# Tabular input branch
tabular_input = Input(shape=(n_tabular_features,), name='tabular_input')
dense_tabular = Dense(16, activation='relu')(tabular_input)

# Combine branches
concatenated = concatenate([flattened_text, dense_tabular], axis=-1)
# The labels are one-hot vectors over 7 mutually exclusive classes, so the head
# is a softmax trained with categorical cross-entropy (sigmoid + binary
# cross-entropy would treat the 7 outputs as independent yes/no problems).
output = Dense(7, activation='softmax')(concatenated)

# Create and compile the model
model_combined = Model(inputs=[text_input, tabular_input], outputs=output)
model_combined.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model_combined.summary()

In [None]:
# Train the baseline model. Inputs are ordered [text, tabular] to match the
# Model definition; the arrays are the ones produced by the preprocessing cells
# (the previously referenced names were never defined).
# validation_split holds out the last 10% of the samples, taken before shuffling.
history_combined = model_combined.fit(
    [train_txt_tokens, train_tabular_input], y_train_one_hot,
    epochs=80,
    batch_size=32,
    validation_split=.1)

In [None]:
from keras.layers import Dropout

In [None]:
# Input dimensions. These were previously referenced without ever being defined
# at notebook scope; take them from the prepared training arrays so the model
# stays in sync with the data it is trained on.
maxlen = train_txt_tokens.shape[1]                  # padded narrative length
max_words = 100000                                  # must match the num_words cap in tokenize_samples
n_tabular_features = train_tabular_input.shape[1]   # width of the tabular feature matrix

# Text input branch
text_input_deeper = Input(shape=(maxlen,), dtype='int32', name='text_input')
embedded_text_deeper = Embedding(max_words, 8)(text_input_deeper)
flattened_text_deeper = Flatten()(embedded_text_deeper)

# Tabular input branch
tabular_input_deeper = Input(shape=(n_tabular_features,), name='tabular_input')
dense_tabular_p1 = Dense(64, activation='relu')(tabular_input_deeper)
dropout1 = Dropout(0.5)(dense_tabular_p1)  # Dropout for regularization
dense_tabular_deeper = Dense(32, activation='relu')(dropout1)

# Combine branches
concatenated_deeper = concatenate([flattened_text_deeper, dense_tabular_deeper], axis=-1)
# The labels are one-hot vectors over 7 mutually exclusive classes, so the head
# is a softmax trained with categorical cross-entropy (sigmoid + binary
# cross-entropy would treat the 7 outputs as independent yes/no problems).
output_deeper = Dense(7, activation='softmax')(concatenated_deeper)

# Create and compile the model
model_combined_deeper = Model(inputs=[text_input_deeper, tabular_input_deeper], outputs=output_deeper)
model_combined_deeper.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model_combined_deeper.summary()

In [None]:
# Train the deeper model. Inputs are ordered [text, tabular] to match the Model
# definition; the arrays are the ones produced by the preprocessing cells
# (the previously referenced names were never defined).
# validation_split holds out the last 10% of the samples, taken before shuffling.
history_combined_deeper = model_combined_deeper.fit(
    [train_txt_tokens, train_tabular_input], y_train_one_hot,
    epochs=80,
    batch_size=32,
    validation_split=.1)

In [None]:
# Persist the weights of both trained models, baseline first.
for fitted_model, weights_path in (
    (model_combined, '../Model Objects/keras_out_new_varsv1.h5'),
    (model_combined_deeper, '../Model Objects/keras_out_new_vars_deeperv1.h5'),
):
    fitted_model.save_weights(weights_path)