<a href="https://colab.research.google.com/github/lumenintellects/seq2seq-chatbot/blob/main/CSK507_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import json
import os
import pandas as pd
import torch.nn as nn
import datetime

PATH_WORKSPACE_ROOT = r'.' # Set the workspace root path here
FOLDER_DATASET = 'dataset' # Set the name of the folder containing the datasets here
FOLDER_LOG = FOLDER_DATASET # Set the name of the folder containing the log files here

EXTENSION_CSV = '.csv'
EXTENSION_PT = '.pt'
EXTENSION_PKL = '.pkl'
EXTENSION_JSON = '.json'
EXTENSION_LOG = '.log'
EXTENSION_PTH = '.pth'

FILE_NAME_DELIMITER = '_'

def to_csv_filename(name):
    """
    Create a CSV filename with the given name.

    Parameters:
        name (str): The name of the CSV file.

    Returns:
        str: The full path to the CSV file.
    """
    return f"{name}{EXTENSION_CSV}"

def to_pt_filename(name):
    """
    Create a PT filename with the given name.

    Parameters:
        name (str): The name of the PT file.

    Returns:
        str: The full path to the PT file.
    """
    return f"{name}{EXTENSION_PT}"

def to_pkl_filename(name):
    """
    Create a PKL filename with the given name.

    Parameters:
        name (str): The name of the PKL file.

    Returns:
        str: The full path to the PKL file.
    """
    return f"{name}{EXTENSION_PKL}"

def to_json_filename(name):
    """
    Create a JSON filename with the given name.

    Parameters:
        name (str): The name of the JSON file.

    Returns:
        str: The full path to the JSON file.
    """
    return f"{name}{EXTENSION_JSON}"

def to_log_filename(name):
    """
    Create a log filename with the given name.

    Parameters:
        name (str): The name of the log file.

    Returns:
        str: The full path to the log file.
    """
    return f"{name}{EXTENSION_LOG}"

def to_pth_filename(name):
    """
    Create a PTH filename with the given name.

    Parameters:
        name (str): The name of the PTH file.

    Returns:
        str: The full path to the PTH file.
    """
    return f"{name}{EXTENSION_PTH}"

def compose_filename(base, name_tokens):
    """
    Compose a filename with the given base and name tokens.

    Parameters:
        base (str): The base filename.
        name_tokens (list): A list of name tokens.

    Returns:
        str: The composed filename.
    """
    return FILE_NAME_DELIMITER.join([base] + name_tokens)

NAME_TOKEN_OUTLIERS = 'outliers'
NAME_TOKEN_INPUT_OUTPUT_PAIRS = 'input_output_pairs'
NAME_TOKEN_INPUT = 'input'
NAME_TOKEN_OUTPUT = 'output'
NAME_TOKEN_VOCAB = 'vocab'
NAME_TOKEN_SEQ = 'seq'
NAME_TOKEN_PADDED = 'padded'
NAME_TOKEN_BATCH = 'batch'

def get_path_log(base, dataset_name, timestamp_token):
    """
    Get the path to the log file.

    Parameters:
        base (str): The base filename.
        timestamp_token (str): The timestamp token.

    Returns:
        str: The path to the log file.
    """
    base_filename = compose_filename(base, [dataset_name, timestamp_token])
    log_filename = to_log_filename(base_filename)
    return os.path.join(FOLDER_LOG, log_filename)

def get_path_source_csv(base):
    """
    Get the path to the source CSV file.

    Parameters:
        base (str): The base filename.

    Returns:
        str: The path to the source CSV file.
    """

    base_filename = compose_filename(base, [])
    csv_filename = to_csv_filename(base_filename)
    return os.path.join(FOLDER_DATASET, csv_filename)

def get_path_outliers(base):
    """
    Get the path to the outliers CSV file.

    Parameters:
        base (str): The base filename.

    Returns:
        str: The path to the outliers CSV file.
    """
    base_filename = compose_filename(base, [NAME_TOKEN_OUTLIERS])
    csv_filename = to_csv_filename(base_filename)
    return os.path.join(FOLDER_DATASET, csv_filename)

def get_path_input_output_pairs(base):
    """
    Get the path to the input-output pairs file.

    Parameters:
        base (str): The base filename.

    Returns:
        str: The path to the input-output pairs file.
    """
    base_filename = compose_filename(base, [NAME_TOKEN_INPUT_OUTPUT_PAIRS])
    csv_filename = to_csv_filename(base_filename)
    return os.path.join(FOLDER_DATASET, csv_filename)

def get_path_input_sequences(base):
    """
    Get the path to the input sequences file.

    Parameters:
        base (str): The base filename.

    Returns:
        str: The path to the input sequences file.
    """
    base_filename = compose_filename(base, [NAME_TOKEN_INPUT, NAME_TOKEN_SEQ])
    pt_filename = to_pt_filename(base_filename)
    return os.path.join(FOLDER_DATASET, pt_filename)

def get_path_output_sequences(base):
    """
    Get the path to the output sequences file.

    Parameters:
        base (str): The base filename.

    Returns:
        str: The path to the output sequences file.
    """
    base_filename = compose_filename(base, [NAME_TOKEN_OUTPUT, NAME_TOKEN_SEQ])
    pt_filename = to_pt_filename(base_filename)
    return os.path.join(FOLDER_DATASET, pt_filename)

def get_path_vocab(base):
    """
    Get the path to the vocabulary file.

    Parameters:
        base (str): The base filename.

    Returns:
        str: The path to the vocabulary file.
    """
    base_filename = compose_filename(base, [NAME_TOKEN_VOCAB])
    pkl_filename = to_pkl_filename(base_filename)
    return os.path.join(FOLDER_DATASET, pkl_filename)

def get_path_input_sequences_padded_batch(base, batch_number):
    """
    Get the path to the input sequences padded batch file.

    Parameters:
        base (str): The base filename.
        batch_number (int): The batch number.

    Returns:
        str: The path to the input sequences padded batch file.
    """
    base_filename = compose_filename(base, [NAME_TOKEN_INPUT, NAME_TOKEN_SEQ, NAME_TOKEN_PADDED, NAME_TOKEN_BATCH, str(batch_number)])
    pt_filename = to_pt_filename(base_filename)
    return os.path.join(FOLDER_DATASET, pt_filename)

def get_path_output_sequences_padded_batch(base, batch_number):
    """
    Get the path to the output sequences padded batch file.

    Parameters:
        base (str): The base filename.
        batch_number (int): The batch number.

    Returns:
        str: The path to the output sequences padded batch file.
    """
    base_filename = compose_filename(base, [NAME_TOKEN_OUTPUT, NAME_TOKEN_SEQ, NAME_TOKEN_PADDED, NAME_TOKEN_BATCH, str(batch_number)])
    pt_filename = to_pt_filename(base_filename)
    return os.path.join(FOLDER_DATASET, pt_filename)

def get_path_input_sequences_padded_batch_pattern(base):
    """
    Get the path pattern for the input sequences padded batch files.

    Parameters:
        base (str): The base filename.

    Returns:
        str: The path pattern for the input sequences padded batch files.
    """
    path_pattern = compose_filename(base, [NAME_TOKEN_INPUT, NAME_TOKEN_SEQ, NAME_TOKEN_PADDED, NAME_TOKEN_BATCH])
    return os.path.join(FOLDER_DATASET, f"{path_pattern}{FILE_NAME_DELIMITER}*.pt")

def get_path_output_sequences_padded_batch_pattern(base):
    """
    Get the path pattern for the output sequences padded batch files.

    Parameters:
        base (str): The base filename.

    Returns:
        str: The path pattern for the output sequences padded batch files.
    """
    path_pattern = compose_filename(base, [NAME_TOKEN_OUTPUT, NAME_TOKEN_SEQ, NAME_TOKEN_PADDED, NAME_TOKEN_BATCH])
    return os.path.join(FOLDER_DATASET, f"{path_pattern}{FILE_NAME_DELIMITER}*.pt")

def get_path_model(base, version):
    """
    Get the path to the model file.

    Parameters:
        base (str): The base filename.
        version (str): The version of the model.

    Returns:
        str: The path to the model file.
    """
    base_filename = compose_filename(base, [version])
    pth_filename = to_pth_filename(base_filename)
    return os.path.join(FOLDER_DATASET, pth_filename)

MODE_READONLY = 'r'

BASE_FILENAME_SETTINGS = 'settings' # Set the base filename for the settings JSON file here
BASE_FILENAME_MODEL = 'seq2seq_model'

# Define the settings keys inside the settings JSON file
SETTING_ENABLE_LOGGING = 'enableLogging'
SETTING_TRAINING_LOOP_CONTINUE = 'trainingLoopContinue'
SETTING_NEXT_SUBSET_CONTINUE = 'nextSubsetContinue'
SETTING_ANALYZE_SEQUENCES = 'analyzeSequences'
SETTING_DEBUG_MODE = 'debugMode'
SETTING_TRAINING_SUBSET_SIZE = 'trainingSubsetSize'
SETTING_EVALUATION_SUBSET_SIZE = 'evaluationSubsetSize'
SETTING_EVALUATION_LOOP_CONTINUE = 'evaluationLoopContinue'
SETTING_EVALUATION_RELOAD_MODEL_IN_LOOP = 'evaluationReloadModelInLoop'

# Define the Encoder
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src shape: (batch_size, src_len)
        embedded = self.dropout(self.embedding(src))  # (batch_size, src_len, emb_dim)
        outputs, hidden = self.rnn(embedded)  # outputs: (batch_size, src_len, hidden_dim), hidden: (n_layers, batch_size, hidden_dim)
        return hidden  # Return only the hidden state

# Define the Decoder
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, hidden):
        # trg shape: (batch_size, trg_len)
        # hidden shape: (n_layers, batch_size, hidden_dim)
        embedded = self.dropout(self.embedding(trg))  # (batch_size, trg_len, emb_dim)
        outputs, hidden = self.rnn(embedded, hidden)  # (batch_size, trg_len, hidden_dim), hidden: (n_layers, batch_size, hidden_dim)
        predictions = self.fc_out(outputs).float()  # Ensure predictions are float32
        return predictions, hidden

# Define the Seq2Seq model
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg):
        # src: (batch_size, src_len), trg: (batch_size, trg_len)
        hidden = self.encoder(src)  # Get the context vector
        outputs, _ = self.decoder(trg, hidden)  # Decode based on the context vector
        return outputs

class Seq2SeqWithAttention(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src: [src_len, batch_size]
        # trg: [trg_len, batch_size]
        # teacher_forcing_ratio: probability to use teacher forcing

        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # Tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        # Encode the source sequence
        encoder_outputs, hidden = self.encoder(src)

        # First input to the decoder is the <bos> token
        input = trg[0, :]

        for t in range(1, trg_len):
            # Decode
            output, hidden = self.decoder(input, hidden, encoder_outputs)

            outputs[t] = output

            # Decide whether to use teacher forcing
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio

            # Get the highest predicted token
            top1 = output.argmax(1)

            # Decide the next input
            input = trg[t] if teacher_force else top1

        return outputs

def get_setting(setting_name):
    """
    Get the value of a setting from the settings file.

    Parameters:
        setting_name (str): The name of the setting.

    Returns:
        str: The value of the setting.
    """
    filename_settings = to_json_filename(BASE_FILENAME_SETTINGS)
    relative_path_settings = os.path.join(filename_settings)
    path_settings = os.path.join(PATH_WORKSPACE_ROOT, relative_path_settings)

    # Load the settings JSON file
    with open(path_settings, MODE_READONLY) as file:
        settings = json.load(file)
        return settings[setting_name]

def get_setting_training_subset_size():
    """
    Get the value of the setting 'trainingSubsetSize' from the settings file.

    Returns:
        int: The value of the setting 'trainingSubsetSize'.
    """
    return get_setting(SETTING_TRAINING_SUBSET_SIZE)


def get_setting_evaluation_subset_size():
    """
    Get the value of the setting 'evaluationSubsetSize' from the settings file.

    Returns:
        int: The value of the setting 'evaluationSubsetSize'.
    """
    return get_setting(SETTING_EVALUATION_SUBSET_SIZE)

def get_setting_evaluation_loop_continue():
    """
    Get the value of the setting 'evaluationLoopContinue' from the settings file.

    Returns:
        bool: The value of the setting 'evaluationLoopContinue'.
    """
    return get_setting(SETTING_EVALUATION_LOOP_CONTINUE)

def get_setting_evaluation_reload_model_in_loop():
    """
    Get the value of the setting 'evaluationReloadModelInLoop' from the settings file.

    Returns:
        bool: The value of the setting 'evaluationReloadModelInLoop'.
    """
    return get_setting(SETTING_EVALUATION_RELOAD_MODEL_IN_LOOP)

def get_setting_evaluation_subset_size():
    """
    Get the value of the setting 'evaluationSubsetSize' from the settings file.

    Returns:
        int: The value of the setting 'evaluationSubsetSize'.
    """
    return get_setting(SETTING_EVALUATION_SUBSET_SIZE)

def get_setting_evaluation_loop_continue():
    """
    Get the value of the setting 'evaluationLoopContinue' from the settings file.

    Returns:
        bool: The value of the setting 'evaluationLoopContinue'.
    """
    return get_setting(SETTING_EVALUATION_LOOP_CONTINUE)

def get_setting_evaluation_reload_model_in_loop():
    """
    Get the value of the setting 'evaluationReloadModelInLoop' from the settings file.

    Returns:
        bool: The value of the setting 'evaluationReloadModelInLoop'.
    """
    return get_setting(SETTING_EVALUATION_RELOAD_MODEL_IN_LOOP)

def get_setting_debug_mode():
    """
    Get the value of the setting 'debugMode' from the settings file.

    Returns:
        bool: The value of the setting 'debugMode'.
    """
    return get_setting(SETTING_DEBUG_MODE)

def get_setting_analyze_sequences():
    """
    Get the value of the setting 'analyzeSequences' from the settings file.

    Returns:
        bool: The value of the setting 'analyzeSequences'.
    """
    return get_setting(SETTING_ANALYZE_SEQUENCES)

def get_setting_enable_logging():
    """
    Get the value of the setting 'enableLogging' from the settings file.

    Returns:
        bool: The value of the setting 'enableLogging'.
    """
    return get_setting(SETTING_ENABLE_LOGGING)

def get_setting_training_loop_continue():
    """
    Get the value of the setting 'trainingLoopContinue' from the settings file.

    Returns:
        bool: The value of the setting 'trainingLoopContinue'.
    """
    return get_setting(SETTING_TRAINING_LOOP_CONTINUE)

def get_setting_next_subset_continue():
    """
    Get the value of the setting 'nextSubsetContinue' from the settings file.

    Returns:
        bool: The value of the setting 'nextSubsetContinue'.
    """
    return get_setting(SETTING_NEXT_SUBSET_CONTINUE)

def extract_rows(input_csv, output_csv, num_rows):
    """
    Extract the first X rows from a CSV file and save to a new file.

    Parameters:
        input_csv (str): Path to the input CSV file.
        output_csv (str): Path to save the output CSV file.
        num_rows (int): Number of rows to extract.
    """
    try:
        # Load the CSV file
        df = pd.read_csv(input_csv)

        # Extract the first num_rows rows
        sampled_df = df.head(num_rows)

        # Save the extracted rows to a new CSV file
        sampled_df.to_csv(output_csv, index=False)

        print(f"Successfully extracted {num_rows} rows to {output_csv}")
    except Exception as e:
        print(f"Error: {e}")

# for each row in the input csv, evaluate each cell against the filter_predicate and write to output csv
def filter_csv(input_csv, output_csv, filter_predicate):
    try:
        # Load the CSV file
        df = pd.read_csv(input_csv)

        # Filter the dataset
        filtered_df = df[df.apply(filter_predicate, axis=1)]

        # Save the filtered dataset to a new CSV file
        filtered_df.to_csv(output_csv, index=False)

        print(f"Successfully filtered dataset to {output_csv}")
    except Exception as e:
        print(f"Error: {e}")

def clean_text(text):
    """
    Clean the text by:
    1. removing leading/trailing spaces
    2. converting to lowercase.
    3. remove excess whitespace chars

    Parameters:
        text (str): The input text.

    Returns:
        str: The cleaned text.
    """
    return " ".join(text.strip().lower().split())

In [67]:
if 'df_time' not in locals():
       df_time = pd.DataFrame(columns=['method','Time_Taken'])

In [68]:
import glob
import os
import time
import logging
import pandas as pd
import spacy
#from common import PATH_WORKSPACE_ROOT, get_path_log, get_path_input_output_pairs, get_path_vocab
#from common import get_path_input_sequences, get_path_output_sequences
#from common import get_path_input_sequences_padded_batch, get_path_output_sequences_padded_batch
#from common import get_path_input_sequences_padded_batch_pattern, get_path_output_sequences_padded_batch_pattern
import torch
import pickle
import numpy as np

DATASET_NAME = 'ubuntu_dialogue_corpus_input_output_pairs'
LOG_BASE_FILENAME = "3_tokenize_dataset"

N_PROCESS_VALUE = 10
BATCH_SIZE = 500000
TRAINING_SUBSET_SIZE = 200
SETTING_ANALYZE_SEQUENCES = False
LOSS_THRESHOLD = 1.0

# Tokenizer using spaCy with multithreading
def spacy_tokenizer_pipe(texts, nlp, n_process=4):
    """
    Tokenizes a list of texts using SpaCy's nlp.pipe for multithreaded tokenization.

    Parameters:
        texts (list): List of text strings to tokenize.
        nlp: SpaCy language model.
        n_process (int): Number of processes for parallel processing.

    Returns:
        list: List of tokenized texts.
    """
    print(f"Tokenizing {len(texts)} texts using {n_process} processes...")
    tokenized_texts = []
    for doc in nlp.pipe(texts, n_process=n_process):
        tokenized_texts.append([token.text for token in doc if not token.is_space])
    return tokenized_texts

# Build vocabulary from spaCy tokens
def build_vocab(tokens_iterable, specials=["<unk>", "<pad>", "<bos>", "<eos>"]):
    vocab = {"<unk>": 0, "<pad>": 1, "<bos>": 2, "<eos>": 3}
    for tokens in tokens_iterable:
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

# Process text into sequences of indices with SpaCy pipeline
def process_text_spacy_pipe(texts, vocab, nlp, n_process=4):
    """
    Tokenizes a list of texts using SpaCy's nlp.pipe for multithreaded processing
    and converts them to sequences of indices.

    Parameters:
        texts (list): List of text strings to process.
        vocab (dict): Vocabulary mapping tokens to indices.
        nlp: SpaCy language model.
        n_process (int): Number of processes for parallel processing.

    Returns:
        list: List of sequences of indices.
    """
    print(f"Processing {len(texts)} texts using {n_process} processes...")
    tokenized_sequences = []
    for doc in nlp.pipe(texts, n_process=n_process):
        tokens = ["<bos>"] + [token.text for token in doc if not token.is_space] + ["<eos>"]
        tokenized_sequences.append([vocab.get(token, vocab["<unk>"]) for token in tokens])
    return tokenized_sequences

def analyze_sequences(sequences):
    sequence_lengths = [len(seq) for seq in sequences]
    max_length = max(sequence_lengths)
    mean_length = sum(sequence_lengths) / len(sequence_lengths)
    median_length = sorted(sequence_lengths)[len(sequence_lengths) // 2]

    # Percentiles
    import numpy as np
    percentile_95 = np.percentile(sequence_lengths, 95)

    print(f"Max Length: {max_length}")
    print(f"Mean Length: {mean_length}")
    print(f"Median Length: {median_length}")
    print(f"95th Percentile: {percentile_95}")

# Explicitly pad sequences to the global maximum length
def pad_to_length(sequences, max_length, padding_value):
    """
    Pads all sequences to a specified maximum length.

    Parameters:
        sequences (list of lists): Sequences to pad.
        max_length (int): Desired maximum length.
        padding_value (int): Padding value.

    Returns:
        torch.Tensor: Tensor of padded sequences.
    """
    padded_sequences = []
    for seq in sequences:
        if len(seq) > max_length:
            seq = seq[:max_length]  # Truncate if longer than max_length
        else:
            seq = seq + [padding_value] * (max_length - len(seq))  # Pad if shorter
        padded_sequences.append(seq)
    return torch.tensor(padded_sequences, dtype=torch.int64)



In [69]:
# ==========================
start = datetime.datetime.now()
print(start)

if __name__ == "__main__":

    # Set the current working directory
    os.chdir(PATH_WORKSPACE_ROOT)

    log_start_time = time.strftime('%Y%m%d_%H%M%S')
    path_log = get_path_log(LOG_BASE_FILENAME, DATASET_NAME, log_start_time)

    # Set up logging configuration
    logging.basicConfig(
        level=logging.INFO,  # Set the minimum log level (DEBUG, INFO, WARNING, etc.)
        format="%(asctime)s - %(levelname)s - %(message)s",  # Log format with timestamps
        handlers=[
            logging.FileHandler(path_log),  # Log to a file
            logging.StreamHandler()  # Log to the console
        ]
    )

    logger = logging.getLogger(__name__)

    print("Running main script...")
    print(f"Current Working Directory: {os.getcwd()}")

    # Load spaCy language model
    nlp = spacy.load("en_core_web_sm")
    print("spaCy model loaded.")

    # ==========================

    path_input_output_pairs = get_path_input_output_pairs(DATASET_NAME)
    path_vocab = get_path_vocab(DATASET_NAME)
    path_input_sequences = get_path_input_sequences(DATASET_NAME)
    path_output_sequences = get_path_output_sequences(DATASET_NAME)
    path_input_sequences_padded_batch_pattern = get_path_input_sequences_padded_batch_pattern(DATASET_NAME)
    path_output_sequences_padded_batch_pattern = get_path_output_sequences_padded_batch_pattern(DATASET_NAME)

    # Define the save path
    path_model = os.path.join(PATH_WORKSPACE_ROOT, "seq2seq_model.pth")

    # ==========================

    # Load the dataset#####################################################################################################################################
    df = pd.read_csv(path_input_output_pairs,on_bad_lines='skip',engine='python')
    print(f"Loaded csv into dataframe: {path_input_output_pairs}")
    df_main = df
    df = df.loc[0:100]
    # Replace NaN in 'input' and 'output' columns
    df['input'] = df['input'].fillna("")
    df['output'] = df['output'].fillna("")
    print("NaN replaced with empty strings.")

    # Check for existing vocabulary
    if os.path.exists(path_vocab):
        print("Vocabulary file found. Loading vocabulary...")
        with open(path_vocab, "rb") as vocab_file:
            vocab = pickle.load(vocab_file)
        print(f"Vocabulary loaded. Size: {len(vocab)}")
    else:
        print("Vocabulary file not found. Generating vocabulary...")

        # Tokenize using SpaCy's multithreading
        print("Tokenizing input and output texts...")
        time_input_sequences_start = time.time()
        time_input_sequences_start_hh_mm_ss = time.strftime('%H:%M:%S', time.localtime(time_input_sequences_start))
        print(f"The time is: {time_input_sequences_start_hh_mm_ss}")
        texts_combined = df['input'].tolist() + df['output'].tolist()
        combined_tokens = spacy_tokenizer_pipe(texts_combined, nlp, n_process=N_PROCESS_VALUE)
        time_input_sequences_end = time.time()
        print(f"Tokenization completed in {time_input_sequences_end - time_input_sequences_start:.2f} seconds.")

        # Build vocabulary from tokenized texts
        print("Building vocabulary...")
        time_build_vocab_start = time.time()
        time_build_vocab_start_hh_mm_ss = time.strftime('%H:%M:%S', time.localtime(time_build_vocab_start))
        print(f"The time is: {time_build_vocab_start_hh_mm_ss}")
        vocab = build_vocab(combined_tokens)
        time_build_vocab_end = time.time()
        print(f"Vocabulary built in {time_build_vocab_end - time_build_vocab_start} seconds.")
        print(f"Vocabulary built. Size: {len(vocab)}")

        # Save the vocabulary to file
        with open(path_vocab, "wb") as vocab_file:
            pickle.dump(vocab, vocab_file)
        print("Vocabulary saved to file.")

    padding_value = vocab["<pad>"]

    # Check for previous serialized input sequences
    if os.path.exists(path_input_sequences):
        print("Serialized input sequences found.")
    else:
        print("Serialized input sequences not found, generating input sequences...")

        # Tokenize and convert to sequences
        print("Tokenizing and converting to input sequences...")

        input_texts = df['input'].tolist()

        # Process input sequences in parallel
        time_input_sequences_start = time.time()
        time_input_sequences_start_hh_mm_ss = time.strftime('%H:%M:%S', time.localtime(time_input_sequences_start))
        print(f"The time is: {time_input_sequences_start_hh_mm_ss}")
        input_sequences = process_text_spacy_pipe(input_texts, vocab, nlp, n_process=N_PROCESS_VALUE)
        torch.save(input_sequences, path_input_sequences)
        time_input_sequences_end = time.time()
        print(f"Input sequences completed in {time_input_sequences_end - time_input_sequences_start} seconds.")

    # Check for previous serialized padded input sequences matching batch file name pattern
    if len(glob.glob(path_input_sequences_padded_batch_pattern)) > 0:
        print("Serialized padded input sequences found.")
    else:
        input_sequences = torch.load(path_input_sequences, weights_only=True)

        input_lengths = [len(seq) for seq in input_sequences]
        input_max_length = max(input_lengths)
        print(f"Max input length: {input_max_length}")

        input_mean_length = sum(input_lengths) / len(input_lengths)
        print(f"Mean input length: {input_mean_length}")

        input_median_length = sorted(input_lengths)[len(input_lengths) // 2]
        print(f"Median input length: {input_median_length}")

        input_percentile_95 = np.percentile(input_lengths, 95)
        print(f"95th percentile input length: {input_percentile_95}")

        # truncate input sequences longer than the 95th percentile
        print("Truncating input sequences longer than the 95th percentile...")
        input_max_length = int(input_percentile_95)
        input_sequences = [seq[:input_max_length] for seq in input_sequences]

        print("Serialized padded input sequences not found, padding input sequences...")
        time_pad_input_sequences_start = time.time()
        time_pad_input_sequences_start_hh_mm_ss = time.strftime('%H:%M:%S', time.localtime(time_pad_input_sequences_start))
        print(f"The time is: {time_pad_input_sequences_start_hh_mm_ss}")

        # Process sequences in batches to avoid memory issues
        for i in range(0, len(input_sequences), BATCH_SIZE):
            batch = input_sequences[i:i + BATCH_SIZE]
            print(f"Padding sequences in batch {i // BATCH_SIZE} to {input_max_length}")
            padded_batch = pad_to_length(batch, input_max_length, padding_value)  # Use explicit padding

            # Examine the padded batch
            print(f"Batch {i // BATCH_SIZE} shape: {padded_batch.shape}")

            batch_file_path = get_path_input_sequences_padded_batch(DATASET_NAME, i // BATCH_SIZE)
            torch.save(padded_batch, batch_file_path)
            print(f"Saved batch {i // BATCH_SIZE} to {batch_file_path}")

        time_pad_input_sequences_end = time.time()
        print(f"Padding input sequences completed in {time_pad_input_sequences_end - time_pad_input_sequences_start} seconds.")

    # Check for previous serialized output sequences
    if os.path.exists(path_output_sequences):
        print("Serialized output sequences found.")
    else:
        print("Serialized output sequences not found, generating output sequences...")

        # Tokenize and convert to sequences
        print("Tokenizing and converting to output sequences...")

        output_texts = df['output'].tolist()

        # Process output sequences in parallel
        time_output_sequences_start = time.time()
        time_output_sequences_start_hh_mm_ss = time.strftime('%H:%M:%S', time.localtime(time_output_sequences_start))
        print(f"The time is: {time_output_sequences_start_hh_mm_ss}")
        output_sequences = process_text_spacy_pipe(output_texts, vocab, nlp, n_process=N_PROCESS_VALUE)
        torch.save(output_sequences, path_output_sequences)
        time_output_sequences_end = time.time()
        print(f"Output sequences completed in {time_output_sequences_end - time_output_sequences_start} seconds.")

    # Check for previous serialized padded output sequences matching batch file name pattern
    if len(glob.glob(path_output_sequences_padded_batch_pattern)) > 0:
        print("Serialized padded output sequences found.")
    else:
        output_sequences = torch.load(path_output_sequences, weights_only=True)

        output_lengths = [len(seq) for seq in output_sequences]
        output_max_length = max(output_lengths)
        print(f"Max output length: output_max_length")

        output_mean_length = sum(output_lengths) / len(output_lengths)
        print(f"Mean output length: {output_mean_length}")

        output_median_length = sorted(output_lengths)[len(output_lengths) // 2]
        print(f"Median output length: {output_median_length}")

        output_percentile_95 = np.percentile(output_lengths, 95)
        print(f"95th percentile output length: {output_percentile_95}")

        # truncate output sequences longer than the 95th percentile
        print("Truncating output sequences longer than the 95th percentile...")
        output_max_length = int(output_percentile_95)
        output_sequences = [seq[:output_max_length] for seq in output_sequences]

        print("Serialized padded output sequences not found, padding output sequences...")
        time_pad_output_sequences_start = time.time()
        time_pad_output_sequences_start_hh_mm_ss = time.strftime('%H:%M:%S', time.localtime(time_pad_output_sequences_start))
        print(f"The time is: {time_pad_output_sequences_start_hh_mm_ss}")

        # Process sequences in batches to avoid memory issues
        for i in range(0, len(output_sequences), BATCH_SIZE):
            batch = output_sequences[i:i + BATCH_SIZE]
            print(f"Padding sequences in batch {i // BATCH_SIZE} to {output_max_length}")
            padded_batch = pad_to_length(batch, output_max_length, padding_value)  # Use explicit padding

            # Examine the padded batch
            print(f"Batch {i // BATCH_SIZE} shape: {padded_batch.shape}")

            batch_file_path = get_path_output_sequences_padded_batch(DATASET_NAME, i // BATCH_SIZE)
            torch.save(padded_batch, batch_file_path)
            print(f"Saved batch {i // BATCH_SIZE} to {batch_file_path}")

        time_pad_output_sequences_end = time.time()
        print(f"Padding output sequences completed in {time_pad_output_sequences_end - time_pad_output_sequences_start} seconds.")
        print("Exiting program.")
        exit()

    input_sequences_padded = torch.cat([torch.load(file, weights_only=True) for file in glob.glob(path_input_sequences_padded_batch_pattern)], dim=0)
    print("Loaded input sequences from files.")

    output_sequences_padded = torch.cat([torch.load(file, weights_only=True) for file in glob.glob(path_output_sequences_padded_batch_pattern)], dim=0)
    print("Loaded output sequences from file.")

    # Analyze sequences
    if SETTING_ANALYZE_SEQUENCES:
        print("Analyzing input and output sequences...")
        analyze_sequences(input_sequences_padded)
        analyze_sequences(output_sequences_padded)

        print(f"Input shape: {input_sequences_padded.shape}")
        print(f"Output shape: {output_sequences_padded.shape}")




end = datetime.datetime.now()
print(end)

print('Time taken to tokenize: ',end-start)
time_taken = end - start
new_row = pd.DataFrame({'Time_Taken': [time_taken], 'Method': ['multi']})
df_time = pd.concat([df_time, new_row], ignore_index=True)


2024-12-08 19:50:06.194863
Running main script...
Current Working Directory: /content
spaCy model loaded.
Loaded csv into dataframe: dataset/ubuntu_dialogue_corpus_input_output_pairs_input_output_pairs.csv
NaN replaced with empty strings.
Vocabulary file found. Loading vocabulary...
Vocabulary loaded. Size: 739
Serialized input sequences found.
Serialized padded input sequences found.
Serialized output sequences found.
Serialized padded output sequences found.
Loaded input sequences from files.
Loaded output sequences from file.
2024-12-08 19:50:18.322518
Time taken to tokenize:  0:00:12.127655


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['input'] = df['input'].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['output'] = df['output'].fillna("")


#non multi threaded

In [None]:
import datetime
overallStart = datetime.datetime.now()
start = datetime.datetime.now()
print(start)
import locale
print('Prefferred encoding: ',locale.getpreferredencoding())
locale.getpreferredencoding = lambda: "UTF-8"
import os #needed to manipulate the downloaded files within the collab environment and to pull data
import json
from google.colab import userdata
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from matplotlib import pyplot as plt
%matplotlib inline
import spacy
from spacy import displacy
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import zipfile
import spacy.cli
from spacy.lang.en import English # updated
gpu = spacy.prefer_gpu()
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from ipywidgets import IntProgress
from IPython.display import display
from google.colab import drive
from matplotlib import pyplot as plt
import seaborn as sns
drive.mount('/content/drive')




start = datetime.datetime.now()
print(start)

#because of the complex corpus, use the larger en_core_web_lg
gpu = spacy.prefer_gpu()
try:
  nlp = spacy.load("en_core_web_lg")
except OSError:
  print('Downloading en_core_web_lg')
  spacy.cli.download("en_core_web_lg")
  nlp = spacy.load("en_core_web_lg")

dfu = pd.read_csv('/content/drive/MyDrive/ubuntu_dialogue_corpus_input_output_pairs.csv',on_bad_lines='skip',engine='python')



#print('length of df before splitting:', len(dfu))

#first_20PCT = int(len(dfu)*0.005)
#df = df_main

#print('length of df after splitting:', len(df))


ques = df['input']
ans = df['output']
#df = pd.DataFrame({'input': ques, 'output': ans})


ques_words = Counter()
ans_words = Counter()

ques_inputs = []
ans_inputs = []

f = IntProgress(min=0, max=len(df)) # instantiate the bar  [[McAteer, S (2017) Stackoverflow: How do I implement a progress bar, avaialable at https://stackoverflow.com/a/41457700 (accessed 25/11/2024)]]
display(f) # display the bar

for i in range(len(df)):
    ques_tokens = nlp(str(ques[i]))
    ans_tokens = nlp(str(ans[i]))




    if (len(ques_tokens)!=0 and len(ans_tokens)!=0):


      for token in ques_tokens:
          ques_words.update([token.text]) #this is the counter for the question frequency, update it


      ques_inputs.append([token.text for token in ques_tokens] + ['_EOS'])

      for token in ans_tokens:
          ans_words.update([token.text]) #this is the counter for the answer frequency, update it

      ans_inputs.append([token.text for token in ans_tokens] + ['_EOS'])
    f.value += 1


end = datetime.datetime.now()

print(end)

print('Time taken to tokenize: ',end-start)
time_taken = end - start
new_row = pd.DataFrame({'Time_Taken': [time_taken], 'Method': 'non'})
df_time = pd.concat([df_time, new_row], ignore_index=True)


2024-12-08 19:50:18.352244
Prefferred encoding:  UTF-8
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2024-12-08 19:50:22.646675


In [None]:
df_time

In [None]:


# Calculate average time for each method
average_times = df_time.groupby('Method')['Time_Taken'].mean().reset_index()
title = 'Average Time to Tokenize Comparison over 100 row corpus ('+str(int(len(df_time)/2))+' itterations)'

plt.figure(figsize=(8, 4))
sns.barplot(x='Time_Taken', y='Method', data=average_times, palette='husl', orient='h')
plt.xlabel('Average Time Taken (s)')
plt.ylabel('Method')
plt.title(title)
plt.gca().spines[['top', 'right']].set_visible(False)
plt.show()