In [1]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import pandas as pd
import os

In [2]:
# Input CSV locations, relative to the notebook's working directory.
# train_path     -> table read first and used as the left side of the merge
# narrative_path -> table merged onto it by case number
train_path = '../data_to_model/Train and Test Sets/train_df_count.csv'
narrative_path = '../data_to_model/keras_data/model_df_full_narrative.csv'

In [3]:
# Load the narrative table and discard its `primary_mechanism` column in one
# expression (no in-place mutation), so re-running the cell is idempotent.
# The label column read later comes from the training table after the merge.
narrative_df = pd.read_csv(narrative_path).drop(columns=['primary_mechanism'])

In [4]:
# Read the training table and attach the narrative columns by case number.
# A left join keeps every training row; rows without a match in
# `narrative_df` get NaN in the joined columns.
train_df = pd.merge(
    pd.read_csv(train_path),
    narrative_df,
    how='left',
    on=['cpsc_case_number'],
)

In [5]:
from sklearn.preprocessing import StandardScaler

# Standardise the `age` column. Double brackets keep it 2-D (n_rows, 1),
# which is the layout StandardScaler expects.
features = train_df[['age']].to_numpy()

# Fit the scaler on the training ages, then apply it to the same array.
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)


In [6]:
# Dummy-encode the three categorical tabular columns and keep only the
# underlying array (column labels are discarded).
one_hot_tab_features = pd.get_dummies(
    train_df[['diagnosis', 'sex', 'body_part']]
).to_numpy()

In [7]:
# Place the scaled age column and the one-hot block side by side
# (column-wise), giving one feature row per training example.
tabular_features_scaled = np.concatenate(
    (features_scaled, one_hot_tab_features),
    axis=1,
)

# Quick visual check of the values and the final (rows, columns) shape.
print(tabular_features_scaled)
print(tabular_features_scaled.shape)

[[ 1.62316007  0.          0.         ...  0.          0.
   0.        ]
 [-1.25396467  0.          0.         ...  0.          0.
   0.        ]
 [ 0.39010661  0.          0.         ...  1.          0.
   0.        ]
 ...
 [ 0.39010661  0.          1.         ...  0.          0.
   0.        ]
 [ 0.59561552  0.          0.         ...  0.          0.
   0.        ]
 [ 0.69836998  0.          0.         ...  0.          0.
   0.        ]]
(1597, 36)


In [8]:
# Pull the model's text input and its target labels out of the merged frame
# as standalone NumPy arrays (copies, independent of `train_df`).
text_np = train_df['narrative'].to_numpy(copy=True)
train_labels_np = train_df['primary_mechanism'].to_numpy(copy=True)

In [9]:
# Disabled scratch code: collects len(txt) for every narrative and takes the
# maximum. NOTE(review): len() on a string counts characters, while the
# `maxlen` passed to pad_sequences is measured in tokens. If this scan was the
# source of the `maxlen` value, it over-estimates the needed length - confirm.
#l_list = []
#for txt in text_np:
#    l_list.append( len(txt) )
    
#l_list[0:10]

#max(l_list)

In [10]:
# Text vectorisation settings.
max_words = 100000  # vocabulary cap for the Tokenizer; also the Embedding input size
maxlen = 207        # length every token sequence is padded/truncated to

# Build the vocabulary from the training narratives, then turn each
# narrative into a list of integer token ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(text_np)
sequences = tokenizer.texts_to_sequences(text_np)


In [11]:
# Bring every token sequence to exactly `maxlen` entries so the text input
# is a rectangular integer matrix.
text_input_train = pad_sequences(sequences, maxlen=maxlen)

# Vocabulary learned by the tokenizer (word -> integer id).
word_index = tokenizer.word_index

# Report vocabulary size and confirm inputs and labels have matching row counts.
print('Found %s unique tokens.' % len(word_index))
print('Shape of data tensor:', text_input_train.shape)
print('Shape of label tensor:', train_labels_np.shape)


Found 1635 unique tokens.
Shape of data tensor: (1597, 207)
Shape of label tensor: (1597,)


In [12]:
# Step 1: map each class label to an integer id. The fitted encoder is kept
# so ids can be translated back to label names via `label_encoder.classes_`.
label_encoder = LabelEncoder()
y_train_integer_encoded = label_encoder.fit_transform(train_labels_np)

# Step 2: expand the ids into one-hot rows. The width is fixed at 7 and must
# match the size of the models' output layers.
y_train_one_hot = to_categorical(
    y_train_integer_encoded,
    num_classes=7,
)

In [13]:
# Disabled manual shuffle + train/validation split. It refers to `data`,
# `training_samples` and `validation_samples`, none of which is defined by
# active code visible in this notebook, so it cannot be re-enabled as-is.
# The models are instead fitted with `validation_split` on the full arrays.
#indices = np.arange(data.shape[0])
#np.random.shuffle(indices)
#data = data[indices]
#train_labels_np = train_labels_np[indices]
#x_train = data[:training_samples]
#y_train = train_labels_np[:training_samples]
#x_val = data[training_samples: training_samples + validation_samples]
#y_val = train_labels_np[training_samples: training_samples + validation_samples]

In [14]:
from keras.layers import Input, Embedding, Flatten, Dense, concatenate
from keras.models import Model

# Two-input classifier: an embedded-text branch and a dense tabular branch are
# concatenated and fed to a single 7-way output layer.

# Text input branch: padded token ids -> 8-dim embedding per token -> flat vector
text_input = Input(shape=(maxlen,), dtype='int32', name='text_input')
embedded_text = Embedding(max_words, 8)(text_input)
flattened_text = Flatten()(embedded_text)

# Tabular input branch: one row of scaled/one-hot features -> small dense layer
tabular_input = Input(shape=(tabular_features_scaled.shape[1],), name='tabular_input')
dense_tabular = Dense(16, activation='relu')(tabular_input)

# Combine branches
concatenated = concatenate([flattened_text, dense_tabular], axis=-1)

# The targets are one-hot rows built from a single label column, so exactly one
# class is correct per example. Softmax produces one probability distribution
# over the 7 classes; the previous sigmoid head scored each class independently,
# so its outputs did not sum to 1.
output = Dense(7, activation='softmax')(concatenated)

# Create and compile the model.
# categorical_crossentropy is the loss that pairs with a softmax head and
# one-hot targets; binary_crossentropy would treat the task as 7 unrelated
# yes/no problems.
model_combined = Model(inputs=[text_input, tabular_input], outputs=output)
model_combined.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model_combined.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_input (InputLayer)         [(None, 207)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 207, 8)       800000      text_input[0][0]                 
__________________________________________________________________________________________________
tabular_input (InputLayer)      [(None, 36)]         0                                            
__________________________________________________________________________________________________
flatten (Flatten)               (None, 1656)         0           embedding[0][0]                  
______________________________________________________________________________________________

In [15]:
# Train the baseline two-branch model. Inputs are passed in the same order as
# the model's `inputs` list: text first, tabular second.
# NOTE(review): Keras takes the `validation_split` fraction from the END of the
# arrays, before any shuffling. No shuffle of the rows is visible earlier in
# this notebook, so the validation set is the last 10% of rows in file order -
# confirm that ordering is not correlated with the label.
history_combined = model_combined.fit(
    x=[text_input_train, tabular_features_scaled],
    y=y_train_one_hot,
    batch_size=32,
    epochs=80,
    validation_split=0.1,
)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [16]:
from keras.layers import Dropout

In [17]:
# Deeper variant of the two-input classifier: the tabular branch gets two
# dense layers with dropout in between; the text branch is unchanged.

# Text input branch: padded token ids -> 8-dim embedding per token -> flat vector
text_input_deeper = Input(shape=(maxlen,), dtype='int32', name='text_input')
embedded_text_deeper = Embedding(max_words, 8)(text_input_deeper)
flattened_text_deeper = Flatten()(embedded_text_deeper)

# Tabular input branch: 64 -> dropout -> 32
tabular_input_deeper = Input(shape=(tabular_features_scaled.shape[1],), name='tabular_input')
dense_tabular_p1 = Dense(64, activation='relu')(tabular_input_deeper)
dropout1 = Dropout(0.5)(dense_tabular_p1)  # Dropout for regularization
dense_tabular_deeper = Dense(32, activation='relu')(dropout1)

# Combine branches
concatenated_deeper = concatenate([flattened_text_deeper, dense_tabular_deeper], axis=-1)

# The targets are one-hot rows built from a single label column, so exactly one
# class is correct per example. Softmax produces one probability distribution
# over the 7 classes; the previous sigmoid head scored each class independently,
# so its outputs did not sum to 1.
output_deeper = Dense(7, activation='softmax')(concatenated_deeper)

# Create and compile the model.
# categorical_crossentropy is the loss that pairs with a softmax head and
# one-hot targets; binary_crossentropy would treat the task as 7 unrelated
# yes/no problems.
model_combined_deeper = Model(inputs=[text_input_deeper, tabular_input_deeper], outputs=output_deeper)
model_combined_deeper.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model_combined_deeper.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tabular_input (InputLayer)      [(None, 36)]         0                                            
__________________________________________________________________________________________________
text_input (InputLayer)         [(None, 207)]        0                                            
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 64)           2368        tabular_input[0][0]              
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 207, 8)       800000      text_input[0][0]                 
____________________________________________________________________________________________

In [18]:
# Train the deeper variant on exactly the same inputs, targets and schedule as
# the baseline model so the two training histories are directly comparable.
# NOTE(review): `validation_split` holds out the LAST 10% of the rows (taken
# before shuffling), not a random sample - confirm row order is label-neutral.
history_combined_deeper = model_combined_deeper.fit(
    x=[text_input_train, tabular_features_scaled],
    y=y_train_one_hot,
    batch_size=32,
    epochs=80,
    validation_split=0.1,
)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [19]:
# Persist the trained weights of both models (weights only, not the
# architecture), baseline first and deeper variant second.
for _net, _weights_path in (
    (model_combined, '../Model Objects/keras_out_new_varsv1.h5'),
    (model_combined_deeper, '../Model Objects/keras_out_new_vars_deeperv1.h5'),
):
    _net.save_weights(_weights_path)