# Imports

In [22]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split



# Data Loading

In [23]:
def read_dataset(file_path):
    """Read one story file and return its ``(story, highlights)`` pair.

    The file is split on lines that contain only ``@highlight``. The text
    before the first such line is the story; every following segment is one
    highlight. Newlines inside a segment become spaces and the segment is
    stripped. Highlights are joined into a single string with ``'. '``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    def flatten(segment):
        # Collapse a multi-line segment onto one trimmed line.
        return segment.replace("\n", " ").strip()

    # str.split always yields at least one piece, so `body` is always bound.
    body, *summary_parts = raw_text.split("\n@highlight\n")

    story = flatten(body)
    highlights = '. '.join(flatten(part) for part in summary_parts)
    return story, highlights

def read_all_stories(directories):
    """Load every ``.story`` file found directly inside *directories*.

    Args:
        directories: iterable of directory paths to scan (non-recursive).

    Returns:
        A DataFrame with one row per successfully read file and the columns
        ``'story'`` and ``'highlights'``. The columns are present even when no
        file could be read.

    Files that fail to load are reported on stdout and skipped.
    """
    file_paths = []
    for directory in directories:
        # os.listdir returns names in arbitrary order; sorting makes the row
        # order (and therefore any seeded shuffle/split applied later)
        # reproducible across machines and runs.
        for filename in sorted(os.listdir(directory)):
            if filename.endswith(".story"):
                file_paths.append(os.path.join(directory, filename))

    data = []  # One dict per story, converted to a DataFrame at the end
    for path in tqdm(file_paths, desc="Processing files"):
        try:
            story, highlights = read_dataset(path)
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the load.
            print(f"Error processing file {path}: {e}")
        else:
            data.append({'story': story, 'highlights': highlights})

    # Explicit columns keep the schema stable when `data` is empty.
    return pd.DataFrame(data, columns=['story', 'highlights'])


In [24]:
# # Data for testing the model and evaluation
# file_paths = ['Data/sample']
# raw_data = read_all_stories(file_paths)

Processing files: 100%|████████████████████████████████████████████████████████| 21288/21288 [00:02<00:00, 7910.82it/s]


In [25]:
# file_paths = ['Data/testread1', 'Data/testread2']
# raw_data = read_all_stories(file_paths)

In [26]:
# Load both corpora into a single DataFrame with 'story' and 'highlights' columns.
file_paths = ['Data/cnn', 'Data/dailymail']
raw_data = read_all_stories(file_paths)

In [27]:
# raw_data.head(5)

In [28]:
# raw_data.highlights[0]

In [29]:
# num_rows, num_columns = raw_data.shape
# print(f"Number of stories: {num_rows}")
# print(f"Number of columns: {num_columns}")

## Data Pre-processing

#### Pre-processing class

In [30]:
# class DataPreprocessor:
#     def __init__(self):
#         self.tokenizer = Tokenizer(oov_token="<OOV>", num_words=15500) # determined by word count analysis


#     def clean_df(self, df):
#         for i in tqdm(range(len(df)), desc="Cleaning Dataframe"):
#             df.at[i, 'story'] = self.clean_text(df.at[i, 'story'])
#             df.at[i, 'highlights'] = [self.clean_text(h) for h in df.at[i, 'highlights']]
#         return df

#     def clean_text(self, text):
#         lowercase_text = text.lower()
#         cleaned_text = self.remove_special(lowercase_text)
#         return cleaned_text
    
#     def remove_special(self, text):
#         # Replace special characters with spaces but keep .,!,?
#         cleaned_text = re.sub(r'[^a-zA-Z0-9\s.,!?]', ' ', text)
#         # Replace multiple spaces with a single space
#         cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
#         return cleaned_text

#     def get_texts_list(self, df):
#         # Convert the story column to a list
#         story_texts = df['story'].tolist()
#         # Flatten the list of lists in the highlights column
#         story_texts = [text for text in tqdm(df['story'], desc="Text Concatenation")]
#         highlight_texts = [highlight for sublist in df['highlights'] for highlight in sublist]
#         # Combine the two lists
#         all_texts = story_texts + highlight_texts
#         return all_texts

#     def tokenize(self, df):
#         # Will need tokenization parameters
#         all_texts_list = self.get_texts_list(df)
#         self.tokenizer.fit_on_texts(all_texts_list)
        

#     def df_to_seq(self, df):
#         for i in tqdm(range(len(df)), desc="Tokenizing Dataframe"):
#             df.at[i, 'story'] = self.tokenizer.texts_to_sequences([df.at[i, 'story']])[0]
#             highlights = df.at[i, 'highlights']
             
#             # Add start and end tokens to each highlight
#             highlights = ['<start> ' + highlight + ' <end>' for highlight in highlights]
        
#             tokenized_highlights = [self.tokenizer.texts_to_sequences([highlight])[0] for highlight in highlights]
#             df.at[i, 'highlights'] = tokenized_highlights
#         return df


#     def data_padding(self, df):
#         max_story_len = 1250
#         max_highlight_len = 50
        
#         for i in tqdm(range(len(df)), desc="Padding sequences"):
#             # Padding for story
#             df.at[i, 'story'] = pad_sequences([df.at[i, 'story']], maxlen=max_story_len, padding='post', truncating='post')[0]
            
#             # Padding for highlights
#             df.at[i, 'highlights'] = [pad_sequences([highlight], maxlen=max_highlight_len, padding='post', truncating='post')[0] for highlight in df.at[i, 'highlights']]
            
#         return df

#     def data_splitter(self, df):
#         X = df['story']
#         y = df['highlights']

#         # Splitting data into train, test and validation subsets
#         X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
#         X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, shuffle=True)

#         return X_train, y_train, X_val, X_test, y_val, y_test





        

In [31]:
class DataPreprocessor:
    """Text pipeline for the story/highlight DataFrame: clean, tokenize, pad, split.

    ``clean_df``, ``df_to_seq`` and ``data_padding`` rewrite the ``'story'``
    and ``'highlights'`` columns of the DataFrame they receive in place and
    return that same object.
    """

    # Markers wrapped around every highlight so the decoder can learn where a
    # summary begins and ends.
    START_TOKEN = '<start>'
    END_TOKEN = '<end>'

    # Keras' default filter set minus '<' and '>'. With the default filters the
    # markers would be reduced to the ordinary words "start" and "end" and
    # become indistinguishable from those words in the text.
    TOKEN_FILTERS = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

    _SPECIAL_CHARS = re.compile(r'[^a-zA-Z0-9\s.,!?]')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self):
        self.tokenizer = Tokenizer(
            oov_token="<OOV>",
            num_words=15500,  # determined by word count analysis
            filters=self.TOKEN_FILTERS,
        )

    def clean_df(self, df):
        """Clean both text columns of *df* in place and return *df*."""
        # Iterate over the real index labels so a non-default index still works.
        for idx in tqdm(df.index, desc="Cleaning Dataframe"):
            df.at[idx, 'story'] = self.clean_text(df.at[idx, 'story'])
            df.at[idx, 'highlights'] = self.clean_text(df.at[idx, 'highlights'])
        return df

    def clean_text(self, text):
        """Return *text* lower-cased with special characters removed."""
        return self.remove_special(text.lower())

    def remove_special(self, text):
        """Replace everything except letters, digits, whitespace and ``.,!?``
        with a space, then collapse whitespace runs and trim the ends."""
        cleaned_text = self._SPECIAL_CHARS.sub(' ', text)
        return self._WHITESPACE.sub(' ', cleaned_text).strip()

    def _mark_highlight(self, text):
        """Wrap one highlight string in the start/end markers."""
        return f"{self.START_TOKEN} {text} {self.END_TOKEN}"

    def get_texts_list(self, df, add_markers=False):
        """Return all stories followed by all highlights as one flat list.

        Args:
            df: DataFrame with string ``'story'`` and ``'highlights'`` columns.
            add_markers: when True, each highlight is wrapped in the start/end
                markers, exactly as ``df_to_seq`` will encode it.
        """
        story_texts = df['story'].tolist()
        highlight_texts = df['highlights'].tolist()
        if add_markers:
            highlight_texts = [self._mark_highlight(h) for h in highlight_texts]
        return story_texts + highlight_texts

    def tokenize(self, df):
        """Fit the tokenizer vocabulary on the stories and marked highlights.

        The markers are included here so they receive their own vocabulary
        entries instead of falling back to the OOV id at encoding time.
        """
        self.tokenizer.fit_on_texts(self.get_texts_list(df, add_markers=True))

    def df_to_seq(self, df):
        """Replace each text in *df* with its list of token ids, in place.

        Highlights are wrapped in the start/end markers before encoding.
        Call ``tokenize`` first so the vocabulary exists.
        """
        for idx in tqdm(df.index, desc="Tokenizing Dataframe"):
            df.at[idx, 'story'] = self.tokenizer.texts_to_sequences([df.at[idx, 'story']])[0]

            highlight = self._mark_highlight(df.at[idx, 'highlights'])
            df.at[idx, 'highlights'] = self.tokenizer.texts_to_sequences([highlight])[0]
        return df

    def data_padding(self, df, max_story_len=1800, max_highlight_len=70):
        """Pad or truncate every sequence in *df* to a fixed length, in place.

        Both padding and truncation happen at the end of the sequence, so a
        highlight longer than *max_highlight_len* loses its end marker.
        """
        for idx in tqdm(df.index, desc="Padding sequences"):
            df.at[idx, 'story'] = pad_sequences(
                [df.at[idx, 'story']], maxlen=max_story_len,
                padding='post', truncating='post')[0]
            df.at[idx, 'highlights'] = pad_sequences(
                [df.at[idx, 'highlights']], maxlen=max_highlight_len,
                padding='post', truncating='post')[0]
        return df

    def data_splitter(self, df):
        """Split *df* 80/10/10 into train, validation and test subsets.

        Returns, in this order:
            ``X_train, y_train, X_val, X_test, y_val, y_test``
        (note that ``X_test`` comes before ``y_val``).
        """
        X = df['story']
        y = df['highlights']

        # First carve off 20 %, then halve it into validation and test.
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, shuffle=True)

        return X_train, y_train, X_val, X_test, y_val, y_test





        

#### Data cleaning

In [32]:
# clean_df rewrites the columns of the DataFrame it is given and returns that
# same object, so `cleaned_data` and `raw_data` refer to the same DataFrame.
preprocessor = DataPreprocessor()
cleaned_data = preprocessor.clean_df(raw_data)

Cleaning Dataframe: 100%|██████████████████████████████████████████████████████| 21288/21288 [00:06<00:00, 3539.22it/s]


#### Tokenization

In [33]:
# Fit the vocabulary first, then replace every text with its token-id sequence.
# df_to_seq works in place: `tokenized_df` is the same object as `cleaned_data`.
preprocessor.tokenize(cleaned_data)
tokenized_df = preprocessor.df_to_seq(cleaned_data)

Tokenizing Dataframe: 100%|████████████████████████████████████████████████████| 21288/21288 [00:06<00:00, 3389.54it/s]


#### Padding

In [34]:
# Pad/truncate all sequences to fixed lengths (in place; same object as `tokenized_df`).
padded_df = preprocessor.data_padding(tokenized_df)

Padding sequences: 100%|███████████████████████████████████████████████████████| 21288/21288 [00:03<00:00, 6957.39it/s]


### Splitting data into subsets 

In [35]:
# The unpacking order must match data_splitter's return order, where X_test comes before y_val.
X_train, y_train, X_val, X_test, y_val, y_test = preprocessor.data_splitter(padded_df)

In [36]:
# X_train

## Defining the Model

In [None]:
# Model imports
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [38]:
# Size of the embedding and softmax tables. Tokenizer(num_words=15500) only
# emits ids below 15500 (0 is the padding value and the OOV token has its own
# id inside that range), so 15500 + 1 is a safe upper bound with a spare row.
VOCAB_SIZE = 15500 + 1
EMBEDDING_DIM = 256
LSTM_UNITS = 512

# Encoder

# 1. Input Layer: a batch of token-id sequences of any length
encoder_inputs = Input(shape=(None,))

# 2. Embedding Layer
# mask_zero=True marks the 0 padding as "no data". Without it the LSTM keeps
# stepping through the trailing padding of every post-padded story, and the
# state handed to the decoder is the one left after the padding rather than
# after the last real token.
# NOTE(review): a story that is empty after cleaning becomes a fully masked
# row; confirm the LSTM backend in use accepts that, or drop such rows first.
encoder_embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(encoder_inputs)

# 3. LSTM Layer: only the final hidden/cell states are passed to the decoder
encoder_lstm, state_h, state_c = LSTM(LSTM_UNITS, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder

# 1. Input Layer
decoder_inputs = Input(shape=(None,))

# 2. Embedding Layer
# Masking here propagates to the output, so padded target positions no longer
# count towards the loss or inflate the accuracy metric.
decoder_embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)
dec_emb = decoder_embedding(decoder_inputs)

# 3. LSTM Layer: starts from the encoder's final states, emits one output per step
decoder_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# 4. Dense Layer: per-step probability distribution over the vocabulary
decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
output = decoder_dense(decoder_outputs)

# Model

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
# The sparse loss takes integer token ids as targets, not one-hot vectors.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [18]:
# len(y_train[0][1])

In [19]:
# print(encoder_input_data.shape)
# print(np.array(decoder_input_data).shape)
# print(np.array(decoder_target_data).shape)


### Model Training

In [39]:
def _teacher_forcing_pairs(targets):
    """Build (decoder input, decoder target) arrays from a Series of sequences.

    The input drops the last token of each sequence; the target drops the
    first token and gains a trailing axis of size 1.
    """
    sequences = targets.tolist()
    shifted_in = np.array([seq[:-1] for seq in sequences])
    shifted_out = np.expand_dims(np.array([seq[1:] for seq in sequences]), -1)
    return shifted_in, shifted_out


# Hyper-parameters for the training run
EPOCHS = 20
BATCH_SIZE = 64

# Training arrays: encoder input plus the shifted decoder input/target pair
encoder_input_data = np.array(X_train.tolist())
decoder_input_data, decoder_target_data = _teacher_forcing_pairs(y_train)

# Validation arrays, built the same way
encoder_input_val = np.array(X_val.tolist())
decoder_input_val, decoder_target_val = _teacher_forcing_pairs(y_val)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)

# Fit the model
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=([encoder_input_val, decoder_input_val], decoder_target_val),
    callbacks=[early_stopping],
)


Epoch 1/10

KeyboardInterrupt: 

### Evaluations

In [None]:
# Per-epoch loss on the training and validation sets
for metric_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss Value')
plt.legend()
plt.show()


In [None]:
# Per-epoch accuracy on the training and validation sets
for metric_key, curve_label in (('accuracy', 'Training Accuracy'), ('val_accuracy', 'Validation Accuracy')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy Value')
plt.legend()
plt.show()


### Word limit exploration

In [None]:
# # Step 1: Get Word Frequencies
# word_freq = preprocessor.tokenizer.word_counts

# # Step 2: Sort Word Frequencies in descending order
# sorted_word_freq = dict(sorted(word_freq.items(), key=lambda item: item[1], reverse=True))

# # Step 3: Calculate Cumulative Frequency
# word_counts = list(sorted_word_freq.values())
# cumulative_word_counts = np.cumsum(word_counts)
# total_word_count = cumulative_word_counts[-1]

# # Normalize the cumulative sum to get a distribution
# cumulative_distribution = cumulative_word_counts / total_word_count

# # Step 4: Visualize Cumulative Frequency Distribution
# plt.figure(figsize=(10, 6))
# plt.plot(cumulative_distribution)
# plt.xlabel("Number of Unique Words")
# plt.ylabel("Cumulative Frequency")
# plt.title("Cumulative Distribution of Word Frequencies")
# plt.grid(True)
# plt.show()

# index_95_percent = np.argmax(cumulative_distribution > 0.95)

# # Word corresponding to that index
# word_at_95_percent = list(sorted_word_freq.keys())[index_95_percent]

# print(f"95% of the cumulative word count is covered by the first {index_95_percent} words.")
# print(f"The word at the 95% threshold is: {word_at_95_percent}")


### Max length exploration

In [None]:
# # Sequence lengths for stories
# story_lengths = tokenized_df['story'].apply(len)

# # Sequence lengths for highlights
# highlight_lengths = tokenized_df['highlights'].apply(len)

# # Plotting the length distributions
# plt.figure(figsize=(15, 6))

# # For highlights
# plt.subplot(1, 2, 1)
# plt.hist(highlight_lengths, bins=100, alpha=0.6, color='r', label='Highlights')
# plt.xlabel('Length')
# plt.ylabel('Frequency')
# plt.title('Highlights Length Distribution')
# plt.legend()

# # For stories
# plt.subplot(1, 2, 2)
# plt.hist(story_lengths, bins=100, alpha=0.6, color='b', label='Stories')
# plt.xlabel('Length')
# plt.ylabel('Frequency')
# plt.title('Stories Length Distribution')
# plt.legend()

# plt.tight_layout(pad=4.0)
# plt.show()


In [None]:
# # Sequence lengths for stories
# story_lengths = tokenized_df['story'].apply(len)

# # Sequence lengths for highlights
# highlight_lengths = tokenized_df['highlights'].apply(lambda x: [len(h) for h in x]).explode()


In [None]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 5))

# # Histogram for story lengths
# plt.subplot(1, 2, 1)
# plt.hist(story_lengths, bins=50, color='blue', alpha=0.7)
# plt.title('Distribution of Story Lengths')
# plt.xlabel('Story Length')
# plt.ylabel('Number of Instances')

# # Histogram for highlight lengths
# plt.subplot(1, 2, 2)
# plt.hist(highlight_lengths, bins=50, color='green', alpha=0.7)
# plt.title('Distribution of Highlight Lengths')
# plt.xlabel('Highlight Length')
# plt.ylabel('Number of Instances')

# plt.tight_layout()
# plt.show()
