Importing Required Libraries

In [9]:
import pandas as pd
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Bidirectional, Concatenate
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import nltk

In [None]:
# Downloading stopwords & punkt
nltk.download('stopwords')
nltk.download('punkt')

In [10]:
# Reading the preprocessed dataset
df = pd.read_parquet("preprocessed_data.parquet")

Tokenization & Padding

In [11]:
max_features = 5000  # Maximum number of words in tokenizer
maxlen_articles = 400  # Maximum length of articles
maxlen_abstracts = 100  # Maximum length of abstracts

In [12]:
tokenizer_articles = Tokenizer(num_words=max_features)
tokenizer_articles.fit_on_texts(df['article'])
sequences_articles = tokenizer_articles.texts_to_sequences(df['article'])
tokenized_articles = pad_sequences(sequences_articles, maxlen=maxlen_articles, padding='post')

In [13]:
tokenizer_abstracts = Tokenizer(num_words=max_features)
tokenizer_abstracts.fit_on_texts(df['abstract'])
sequences_abstracts = tokenizer_abstracts.texts_to_sequences(df['abstract'])
tokenized_abstracts = pad_sequences(sequences_abstracts, maxlen=maxlen_abstracts, padding='post')

LSTM - Model Architecture

In [10]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(tokenized_articles, tokenized_abstracts, test_size=0.2, random_state=42)

In [11]:
# Define the LSTM model architecture
embedding_dim = 100  # Dimension of word embeddings
latent_dim = 300  # Dimension of LSTM output

In [12]:
# Encoder(in which the input sequence and produces a context vector [or hidden state], 
# which summarizes the input information and is passed to the decoder)

## Defining the encoding layers
encoder_inputs = Input(shape=(maxlen_articles,))
enc_emb = Embedding(max_features, embedding_dim, trainable=True)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(enc_emb)
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

In [13]:
# Decoder(in which the output sequence, one token at a time, 
# using the context vector from the encoder and its own previous outputs)

## Defining the decoding layers
decoder_inputs = Input(shape=(maxlen_abstracts,))
dec_emb_layer = Embedding(max_features, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim*2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
decoder_dense = Dense(max_features, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [14]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [15]:
# Compile the model with sparse categorical crossentropy loss
model.compile(optimizer='adam', loss=sparse_categorical_crossentropy, metrics=['accuracy'])

In [16]:
# Print model summary
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 400)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 400, 100)     500000      ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 bidirectional (Bidirectional)  [(None, 400, 600),   962400      ['embedding[0][0]']              
                                 (None, 300),                                                 

In [17]:
# Train the model
epochs = 25
batch_size = 32

In [18]:
# Early stopping is used to halt training when the validation loss stops improving, helping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

In [18]:
# Adjust maxlen_abstracts and maxlen_articles if necessary
maxlen_abstracts = 100
maxlen_articles = 400

# Pad sequences
X_train_padded = pad_sequences(X_train, maxlen=maxlen_articles, padding='post')
X_val_padded = pad_sequences(X_val, maxlen=maxlen_articles, padding='post')

y_train_padded_input = pad_sequences(y_train[:, :-1], maxlen=maxlen_abstracts, padding='post')
y_train_padded_target = pad_sequences(y_train[:, 1:], maxlen=maxlen_abstracts, padding='post')

y_val_padded_input = pad_sequences(y_val[:, :-1], maxlen=maxlen_abstracts, padding='post')
y_val_padded_target = pad_sequences(y_val[:, 1:], maxlen=maxlen_abstracts, padding='post')

# Example shapes check
print("Shapes after padding:")
print(f"X_train_padded shape: {X_train_padded.shape}")
print(f"y_train_padded_input shape: {y_train_padded_input.shape}")
print(f"y_train_padded_target shape: {y_train_padded_target.shape}")
print(f"X_val_padded shape: {X_val_padded.shape}")
print(f"y_val_padded_input shape: {y_val_padded_input.shape}")
print(f"y_val_padded_target shape: {y_val_padded_target.shape}")

Shapes after padding:
X_train_padded shape: (8000, 400)
y_train_padded_input shape: (8000, 100)
y_train_padded_target shape: (8000, 100)
X_val_padded shape: (2000, 400)
y_val_padded_input shape: (2000, 100)
y_val_padded_target shape: (2000, 100)


In [19]:
# Train the model
history = model.fit(
    [X_train_padded, y_train_padded_input],         # Input: X_train_padded and y_train_padded_input
    y_train_padded_target,                          # Target: y_train_padded_target
    epochs=epochs,
    callbacks=[early_stopping],
    batch_size=batch_size,
    validation_data=(
        [X_val_padded, y_val_padded_input],         # Validation input: X_val_padded and y_val_padded_input
        y_val_padded_target                         # Validation target: y_val_padded_target
    )
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 18: early stopping


In [20]:
# Evaluate the model on validation data
loss, accuracy = model.evaluate([X_val_padded, y_val_padded_input], y_val_padded_target, verbose=0)

print(f'Evaluation loss: {loss}, Accuracy: {accuracy}')

Evaluation loss: 4.618181228637695, Accuracy: 0.2696850001811981


In [21]:
# Save the trained model
model.save('text_summarization_model.h5')

In [14]:
# Saving the article & abstract tokenizers as pickle file
with open('tokenizer_articles.pkl', 'wb') as f:
    pickle.dump(tokenizer_articles, f)

with open('tokenizer_abstracts.pkl', 'wb') as f:
    pickle.dump(tokenizer_abstracts, f)

Testing Sample Data (Test Case)

In [39]:
# Example text data
texts = [
    "abdominal cystic lymphangiomas rare occur secondary congenital malformation lymphatics mostly mesenterium acute chronic volvulus small bowel may occur traction lymphangioma transverse supraumbilical laparotomy performed volvulus small bowel seen lead point volvulus seven cm benign cystic lymphangioma located fifteen cm distal treitz ligament vital bowel repositioned cyst resected including small section jejunum anastomosed end end",
    "key clinical messageabdominal cystic lymphangiomas are rare and occur secondary to congenital malformation of the lymphatics mostly in the mesenterium acute or chronic volvulus of the small bowel may occur by traction of the lymphangioma therapy includes resection of the lymphangioma and of the small bowel involved",
    "maldi tof ms spectrum bacillus massiliogabonensis available hundred fifty six titre urms database rrna gene sequence strain marseille deposited genbank database accession number lt strain marseille deposited collection de souches de lunit des rickettsies registered number",
    "the discovery of new bacteria from the human gut using culturomics method is novel field of increasing interest in microbiology here the main characteristics of bacillus massiliogabonensis strain marseille new gram negative bacterium isolated from the stool sample of healthy sixteen year old gabonese boy are reported",
    "authors report conflicts interest authors alone responsible content writing article",
    "abstractthe exposure of prosthetic vascular graft is dangerous complication in revascularization procedures in this case report we describe successful coverage of an exposed prosthetic femorofemoral vascular graft in the suprapubic area with vertical rectus abdominis myocutaneous island flap"
]

# Initialize and fit the tokenizer on your text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Generate index_to_word dictionary
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

# Save the tokenizer and index_to_word dictionary
with open('tokenizer_summary.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open('index_to_word.pkl', 'wb') as f:
    pickle.dump(index_to_word, f)

# Verify the contents
print(f"Vocabulary Size: {len(tokenizer.word_index)}")
for i, (index, word) in enumerate(index_to_word.items()):
    if i < 10:  # Print first 10 entries
        print(f"Index {index}: Word '{word}'")


Vocabulary Size: 154
Index 1: Word 'of'
Index 2: Word 'the'
Index 3: Word 'small'
Index 4: Word 'bowel'
Index 5: Word 'in'
Index 6: Word 'occur'
Index 7: Word 'volvulus'
Index 8: Word 'lymphangioma'
Index 9: Word 'cystic'
Index 10: Word 'strain'


In [30]:
def decode_sequence(predicted_summary):
    # Assuming predicted_summary is in text format
    return predicted_summary  # Adjust based on actual output format

# Load model
model = tf.keras.models.load_model('text_summarization_model.h5', custom_objects={'_TextVectorization': TextVectorization})

# Tokenize text
tokenized_article = tokenizer_articles.texts_to_sequences([article])
tokenized_article = pad_sequences(tokenized_article, maxlen=max_len_articles, padding='post')

tokenized_abstract = tokenizer_abstracts.texts_to_sequences([abstract])
tokenized_abstract = pad_sequences(tokenized_abstract, maxlen=max_len_abstracts, padding='post')

# Predict the summary
predicted_summary = model.predict([tokenized_article, tokenized_abstract])


print("Original Article:", article)
print("Generated Summary:", predicted_summary)


Original Article: abdominal cystic lymphangiomas rare occur secondary congenital malformation lymphatics mostly mesenterium acute chronic volvulus small bowel may occur traction lymphangioma transverse supraumbilical laparotomy performed volvulus small bowel seen lead point volvulus seven cm benign cystic lymphangioma located fifteen cm distal treitz ligament vital bowel repositioned cyst resected including small section jejunum anastomosed end end
Generated Summary: [[[3.5144818e-05 1.1678024e-03 1.7694887e-02 ... 2.3339448e-05
   1.3893372e-05 1.7346711e-06]
  [1.7188948e-04 5.1575415e-03 1.1363861e-02 ... 7.3858696e-06
   7.9560930e-05 9.5291955e-07]
  [2.8256234e-05 3.7976238e-03 1.7489752e-02 ... 2.9629905e-06
   3.1072301e-05 6.8925465e-07]
  ...
  [9.9995112e-01 6.2463062e-07 4.1071835e-06 ... 6.8799607e-14
   4.4407125e-15 1.5501881e-11]
  [9.9995124e-01 6.1989510e-07 4.0945865e-06 ... 6.8927808e-14
   4.4086604e-15 1.5542385e-11]
  [9.9995136e-01 6.1284385e-07 4.0793011e-06 ..

In [40]:
# Load tokenizers
with open('tokenizer_articles.pkl', 'rb') as f:
    tokenizer_articles = pickle.load(f)

with open('tokenizer_abstracts.pkl', 'rb') as f:
    tokenizer_abstracts = pickle.load(f)

# Define or load your index_to_word dictionary
with open('index_to_word.pkl', 'rb') as f:
    index_to_word = pickle.load(f)

# Define padding token if applicable
padding_token = 0  # Adjust based on your model's padding token index

# Function to decode the predicted summary
def decode_sequence(predicted_summary, index_to_word, padding_token=0):
    # Get the index of the maximum probability for each token position
    token_indices = np.argmax(predicted_summary, axis=-1)
    
    # Convert token indices to words, excluding padding tokens
    decoded_summary = ' '.join([index_to_word.get(idx, '') for idx in token_indices[0] if idx != padding_token])
    
    return decoded_summary

# Load the model
model = tf.keras.models.load_model('text_summarization_model.h5', custom_objects={'_TextVectorization': TextVectorization})

# Example text
article = "abdominal cystic lymphangiomas are rare and occur secondary to congenital malformation lymphatics mostly in the mesenterium acute or chronic volvulus of the small bowel may occur by traction of the lymphangioma therapy includes resection of the lymphangioma and of the small bowel involved"
abstract = "key clinical message abdominal cystic lymphangiomas are rare and occur secondary to congenital malformation of the lymphatics mostly in the mesenterium acute or chronic volvulus of the small bowel may occur by traction of the lymphangioma therapy includes resection of the lymphangioma and of the small bowel involved"

# Tokenize text
max_len_articles = 400  # Set the max length based on your model
max_len_abstracts = 100  # Set the max length based on your model

tokenized_article = tokenizer_articles.texts_to_sequences([article])
tokenized_article = pad_sequences(tokenized_article, maxlen=max_len_articles, padding='post')

tokenized_abstract = tokenizer_abstracts.texts_to_sequences([abstract])
tokenized_abstract = pad_sequences(tokenized_abstract, maxlen=max_len_abstracts, padding='post')

# Predict the summary
predicted_summary = model.predict([tokenized_article, tokenized_abstract])

# Decode the predicted summary
decoded_summary = decode_sequence(predicted_summary, index_to_word, padding_token)

print("Original Article:", article)
print("Generated Summary:", decoded_summary)

Original Article: abdominal cystic lymphangiomas are rare and occur secondary to congenital malformation lymphatics mostly in the mesenterium acute or chronic volvulus of the small bowel may occur by traction of the lymphangioma therapy includes resection of the lymphangioma and of the small bowel involved
Generated Summary: point strain strain  island  of bowel in of  strain of   of   bowel of of   small are bowel of the of strain strain of the of seven of of strain  small bowel
