In [1]:
!pip install numpy==1.23.3 

Collecting numpy==1.23.3
  Downloading numpy-1.23.3-cp311-cp311-win_amd64.whl.metadata (2.3 kB)
Downloading numpy-1.23.3-cp311-cp311-win_amd64.whl (14.6 MB)
   ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
   ----------- ---------------------------- 4.2/14.6 MB 22.9 MB/s eta 0:00:01
   ----------------- ---------------------- 6.3/14.6 MB 20.3 MB/s eta 0:00:01
   --------------------- ------------------ 7.9/14.6 MB 13.1 MB/s eta 0:00:01
   ---------------------------- ----------- 10.5/14.6 MB 13.1 MB/s eta 0:00:01
   ------------------------------------- -- 13.6/14.6 MB 14.0 MB/s eta 0:00:01
   ---------------------------------------- 14.6/14.6 MB 14.1 MB/s eta 0:00:00
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-1.23.3


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autogluon-multimodal 1.1.1 requires lightning<2.4,>=2.2, which is not installed.
autogluon-multimodal 1.1.1 requires nlpaug<1.2.0,>=1.1.10, which is not installed.
autogluon-multimodal 1.1.1 requires nptyping<2.5.0,>=1.4.4, which is not installed.
autogluon-multimodal 1.1.1 requires nvidia-ml-py3==7.352.0, which is not installed.
autogluon-multimodal 1.1.1 requires openmim<0.4.0,>=0.3.7, which is not installed.
autogluon-multimodal 1.1.1 requires pytesseract<0.3.11,>=0.3.9, which is not installed.
autogluon-multimodal 1.1.1 requires pytorch-metric-learning<2.4,>=1.3.0, which is not installed.
autogluon-multimodal 1.1.1 requires scikit-image<0.21.0,>=0.19.1, which is not installed.
autogluon-multimodal 1.1.1 requires seqeval<1.3.0,>=1.2.2, which is not installed.
autogluon-multimodal 1.1.1 requires text-unidecode<1

In [2]:
# %%
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
import time
import pandas as pd

dataset_dir = 'datasets'

# Read the dataset
df = pd.read_csv(f'{dataset_dir}/podcast_with_summary.csv')



In [3]:

# %%
# Preprocess text data
max_input_length = 1024
max_output_length = 200
num_words = 20000  # Vocabulary size

# Tokenizer for text
text_tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
text_tokenizer.fit_on_texts(df['text_short'])

# Tokenizer for summaries
summary_tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
summary_tokenizer.fit_on_texts(df['summary'])

# Convert text to sequences
text_sequences = text_tokenizer.texts_to_sequences(df['text_short'])
summary_sequences = summary_tokenizer.texts_to_sequences(df['summary'])

# Pad sequences
X = pad_sequences(text_sequences, maxlen=max_input_length, padding='post', truncating='post')
Y = pad_sequences(summary_sequences, maxlen=max_output_length, padding='post', truncating='post')

# %%
# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)


In [4]:

# %%
# Define the LSTM model
embedding_dim = 128
hidden_units = 256

model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=max_input_length))
model.add(Bidirectional(LSTM(hidden_units, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dropout(0.3))
model.add(LSTM(hidden_units, return_sequences=False, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(hidden_units, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(num_words, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# %%
# Train the model
checkpoint = ModelCheckpoint("lstm_summarizer.keras", monitor='val_loss', save_best_only=True, verbose=1)




In [5]:
history = model.fit(
    X_train,
    np.expand_dims(Y_train[:, 0], axis=-1),  # Use the first token of each sequence for simplicity
    validation_split=0.1,
    batch_size=64,
    epochs=10,
    callbacks=[checkpoint]
)

Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - accuracy: 0.0421 - loss: 9.8946 
Epoch 1: val_loss improved from inf to 9.58593, saving model to lstm_summarizer.keras
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 5s/step - accuracy: 0.0448 - loss: 9.8930 - val_accuracy: 0.1034 - val_loss: 9.5859
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - accuracy: 0.1053 - loss: 9.2429
Epoch 2: val_loss improved from 9.58593 to 7.72678, saving model to lstm_summarizer.keras
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 5s/step - accuracy: 0.1039 - loss: 9.2078 - val_accuracy: 0.1034 - val_loss: 7.7268
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.0741 - loss: 6.5095
Epoch 3: val_loss improved from 7.72678 to 6.39750, saving model to lstm_summarizer.keras
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 4s/step - accuracy: 0.0759 - lo

In [6]:
# %%
# Inference function
def run_inference(text):
    input_seq = text_tokenizer.texts_to_sequences([text])
    input_seq = pad_sequences(input_seq, maxlen=max_input_length, padding='post', truncating='post')

    predicted_sequence = []
    for _ in range(max_output_length):
        predicted_probs = model.predict(input_seq, verbose=0)
        predicted_token = np.argmax(predicted_probs, axis=-1)[0]
        predicted_sequence.append(predicted_token)
        if predicted_token == 0:  # Stop at padding token
            break

        # Update input_seq with the predicted token
        input_seq = np.roll(input_seq, -1, axis=-1)
        input_seq[0, -1] = predicted_token

    return summary_tokenizer.sequences_to_texts([predicted_sequence])[0]

# %%
# Evaluate the model
def evaluate_df(df, name):
    reference_summaries = []
    predicted_summaries = []
    total_time = 0
    
    for i, row in df.iterrows():
        test_text = row['text_short']
        reference_summary = row['summary']
        
        start_time = time.time()
        predicted_summary = run_inference(test_text)
        end_time = time.time()
        elapsed_time = end_time - start_time
        total_time += elapsed_time
        
        reference_summaries.append(reference_summary)
        predicted_summaries.append(predicted_summary)
    
    # Save results
    results_df = pd.DataFrame({
        'summary': reference_summaries,
        'summary_tuned': predicted_summaries
    })
    results_df.to_csv(f"./results/lstm/{name}_summaries.csv")
    
    print(f"Evaluation completed for {name}.")
    print(f"Total time (seconds): {total_time}")
    print(f"Total time (minutes): {total_time / 60}")


In [8]:
# test inference
test_df = pd.read_csv(f'{dataset_dir}/podcast_with_summary_test.csv')
print(run_inference(test_df['text_short'][0]))

a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a


In [7]:

# %%
test_df = pd.read_csv(f'{dataset_dir}/podcast_with_summary_test.csv')
evaluate_df(test_df, "test_dataset")

train_df = pd.read_csv(f'{dataset_dir}/podcast_with_summary_train.csv')
evaluate_df(train_df, 'train_dataset')

whole_df = pd.read_csv(f'{dataset_dir}/podcast_with_summary.csv')
evaluate_df(whole_df, 'whole_dataset')

KeyboardInterrupt: 