## Preprocessing

To use this for data preparation, 
- Prepare a single text file with all training materials.  
- Remove as much unneccesary detail as possible (copyrights, tables of contents etc.) 
- Blank lines will be automatically deleted, but chapter names or other details will remain.

In [2]:
#!pip install tensorflow spacy
#!python -m spacy download en_core_web_sm

import os
import pickle
import json
import numpy as np

from shared_project_functions import get_target_subdirectory

import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
#Intermediate functions

def clean_sentences(corpus_name, dir, reset=False) -> list:
    import re
    
    #load from pkl in subdirectory if exists
    pkl_file = f"{corpus_name}_cleaned_sentences.pkl"
    os_path = os.path.join(dir, pkl_file)
    if not reset and os.path.exists(os.path.join(dir, pkl_file)):
        with open(os.path.join(dir, pkl_file), "rb") as f:
            cleaned_sentences = pickle.load(f)
        print(f"\tLoaded cleaned sentences from {os.path.join(dir, pkl_file)}.")
    else:
        # Create a new list to hold cleaned sentences
        print("\tNo cleaned sentences found - processing corpus...")
        cleaned_sentences = []
        file_name = f"{corpus_name}.txt"
        # Load corpus
        try: # Load corpus from main directory
            with open(os.path.join(file_name), "r") as f:
                print(f"\tOpened {file_name} from main directory.")
                corpus = f.read()
                move_file_flag = True
        except FileNotFoundError:
            print(f"\tCorpus file {file_name} not found in main directory - trying subdirectory.")
            try: # Load corpus from subdirectory dir
                with open(os.path.join(dir, os.path.basename(file_name)), "r") as f:
                    print(f"\tOpened {os.path.join(dir, os.path.basename(file_name))} from subdirectory.")
                    corpus = f.read()
            except FileNotFoundError:
                print(f"\tCorpus file {os.path.join(dir, os.path.basename(file_name))} not found in subdirectory either.")
                return []

        nlp = spacy.load("en_core_web_sm")
        nlp.max_length = len(corpus) + 10007    

        print(f"\t\tCorpus length (characters): {len(corpus)}")
        print("\t\tStarting spaCy sentence segmentation...")
        split_lines = corpus.split('\n')  # or another delimiter
        
        #remove short lines such as blanks, "Chapter 1" etc.
        split_lines = [line for line in split_lines if len(line.strip()) >= 3] 
        
        print(f"\t\tNumber of split lines: {len(split_lines)}")
        sentences = []
        for doc in nlp.pipe(split_lines, batch_size=20, n_process=-1):
            sentences.extend([sent.text.lower().strip() for sent in doc.sents])
        print("\t\tFinished spaCy sentence segmentation.")
    
        # Remove short sentences and clean
        cleaned_sentences = []
        for idx, s in enumerate(sentences):
            words = [w for w in s.split() if not re.search(r'\d', w)]
            if len(words) > 1:
                s_clean = ' '.join(words).lower()
                cleaned_sentences.append(s_clean)

        # Save cleaned sentences to pkl file in subdirectory dir
        with open(os.path.join(dir, pkl_file), "wb") as f:
            pickle.dump(cleaned_sentences, f)
        print(f"\tSaved cleaned sentences to {os.path.join(dir, pkl_file)}.")

    return cleaned_sentences

def process_lemmas(corpus_name, sentences, nlp, dir, reset=False):
    pkl_file = f"{corpus_name}_lemmas.pkl"
    
    #load from pkl if exists
    if not reset and os.path.exists(os.path.join(dir, pkl_file)):
        with open(os.path.join(dir, pkl_file), "rb") as f:
            all_lemmas = pickle.load(f)
        print(f"\tLoaded lemmas from {os.path.join(dir, pkl_file)}.")
    else:
        print("\tProcessing lemmas...")
        processed_docs = nlp.pipe(sentences, batch_size=100, n_process=-1)
        all_lemmas = []
        for doc in processed_docs:
            lemmas = [token.lemma_.lower() for token in doc if token.is_alpha]
            all_lemmas.extend(lemmas)
        
        #save lemmas to pkl file in subdirectory dir
        with open(os.path.join(dir, pkl_file), "wb") as f:
            pickle.dump(all_lemmas, f)
        print(f"\tSaved lemmas to {os.path.join(dir, pkl_file)}.")
    return all_lemmas

def create_mappings(corpus_name, all_lemmas, dir, reset=False):
    word_to_id_file = f"{corpus_name}_word_to_id.json"
    id_to_word_file = f"{corpus_name}_id_to_word.json"
    
    #load from json files if exist
    if not reset and os.path.exists(os.path.join(dir, word_to_id_file)) and os.path.exists(os.path.join(dir, id_to_word_file)):
        with open(os.path.join(dir, word_to_id_file), "r") as f:
            word_to_id = json.load(f)
        with open(os.path.join(dir, id_to_word_file), "r") as f:
            id_to_word = json.load(f)
        print(f"\tLoaded mappings from {word_to_id_file} and {id_to_word_file}.")
    else:
        special_tokens = ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]
        word_to_id = {token: i for i, token in enumerate(special_tokens)}
        next_id = len(special_tokens)
        for lemma in set(all_lemmas):
            if lemma not in word_to_id:
                word_to_id[lemma] = next_id
                next_id += 1
        id_to_word = {str(id): word for word, id in word_to_id.items()}
        with open(os.path.join(dir, word_to_id_file), "w") as f:
            json.dump(word_to_id, f)
        with open(os.path.join(dir, id_to_word_file), "w") as f:
            json.dump(id_to_word, f)
        print(f"\tSaved mappings to {os.path.join(dir, word_to_id_file)} and {os.path.join(dir, id_to_word_file)}.")
    return word_to_id, id_to_word

def create_training_sequences(corpus_name, sentences, nlp, word_to_id, dir, max_seq_length=20, reset=False):
    npz_file = f"{corpus_name}_training_sequences.npz"
    
    #load from file if exists
    if not reset and os.path.exists(os.path.join(dir, npz_file)):
        data = np.load(os.path.join(dir, npz_file))
        X_train = data["X_train"]
        y_train = data["y_train"]
        print(f"\tLoaded training data from {os.path.join(dir, npz_file)}.")
    else:
        X_train = []
        y_train = []
        for sentence in sentences:
            lemmatized_sentence = [token.lemma_.lower() for token in nlp(sentence) if token.is_alpha]
            lemmatized_sentence = ["<SOS>"] + lemmatized_sentence + ["<EOS>"]
            numerical_sentence = [word_to_id.get(word, word_to_id["<UNK>"]) for word in lemmatized_sentence]
            for i in range(1, len(numerical_sentence)):
                input_seq = numerical_sentence[:i]
                target_token = numerical_sentence[i]
                if target_token == word_to_id["<UNK>"]:
                    continue  # Skip sequences with <UNK> target
                 # Pad input_seq to max_seq_length
                padded_input_seq = input_seq[:max_seq_length] + [word_to_id["<PAD>"]] * (max_seq_length - len(input_seq))
                X_train.append(padded_input_seq)
                y_train.append(target_token)
        X_train_np = np.array(X_train)
        y_train_np = np.array(y_train)
        #save to npz file in subdirectory dir
        np.savez_compressed(os.path.join(dir, npz_file), X_train=X_train_np, y_train=y_train_np, max_seq_length=max_seq_length)
        print(f"\tSaved training data to {os.path.join(dir, npz_file)}.")
    return X_train, y_train

In [33]:
 #To do:
# - Use Tokenizer from Tenensorflow rather than spacy for tokenization

In [34]:
#Optional: concatenate multiple text files into one corpus

def concatenate_texts(directory, output_file_name):
    import os
    #concatenate all texts in directory
    file_list = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".txt")]
    combined_text = ""
    for file_name in file_list:
        with open(file_name, "r", encoding="utf-8") as f:
            combined_text += f.read() + "\n"
    with open(output_file_name, "w", encoding="utf-8") as f:
        f.write(combined_text)
    print(f"Combined {len(file_list)} files into {output_file_name}.")

#concatenate_texts("model_1_sherlock/texts", "doyle.txt")

In [35]:
def preprocess(file_name: str, 
               lemmatize: bool = False,
               max_seq_length: int = 20,
               reset: bool = False,
               move_file_flag: bool = True) -> None:
    """
    Preprocess the text corpus for training a language model.

    Arguments:
        file_name (str): Path to the corpus .txt file.
        method (str): Preprocessing method. "default" uses spaCy and step-by-step processing.
                      (Future methods can be added.)
        max_seq_length (int): Maximum sequence length for training samples, needed by model.
        reset (bool): If False, uses previously-saved intermediate files to save time.
                      If True, starts over and overwrites any files.

    Outputs:
        Saves a single .pkl file with training data (X, y) and bidirectional mappings (word <-> id).

    Examples:
        - preprocess("shakespeare.txt")
        - preprocess("taylor_swift.txt", reset=True)

    To use pkl files in other notebooks:
        ```python
        with open("preprocessed_data.pkl", "rb") as f:
            data = pickle.load(f)
            X_train = data["X_train"]
            y_train = data["y_train"]
            word_to_id = data["word_to_id"]
            id_to_word = data["id_to_word"]
            max_seq_length = data["max_seq_length"]
        ```
    """
    import spacy
    import pickle
    import os
    import numpy as np
    import json    
    import re
    
    corpus_name = file_name.replace(".txt", "")
    print(f"Preprocessing {corpus_name}")
    dir = get_target_subdirectory(corpus_name)
    
    # Sentence cleaning
    cleaned_sentences = clean_sentences(corpus_name, dir, reset=reset)

    print(f"\t\tTotal number of sentences in corpus: {len(cleaned_sentences)}")
    #example sentence
    random_sentence_index = np.random.randint(0, len(cleaned_sentences))
    print(f"\t\tExample cleaned sentence [{random_sentence_index}]: {cleaned_sentences[random_sentence_index]}")
    
    # Lemmatization
    if lemmatize:
        all_lemmas = process_lemmas(corpus_name, cleaned_sentences, nlp, dir, reset)
        print(f"\t\tTotal lemmas extracted: {len(all_lemmas)}")
    else:
        print("\tSkipping lemmatization")

    # Mapping
    if lemmatize:
        word_to_id, id_to_word = create_mappings(corpus_name, all_lemmas, dir, reset)
    else:
        # Flatten cleaned_sentences into a list of words
        all_words = []
        for s in cleaned_sentences:
            all_words.extend([w for w in s.split() if w.isalpha()])
        word_to_id, id_to_word = create_mappings(corpus_name, all_words, dir, reset)
    print(f"\t\tVocabulary size (including special tokens): {len(word_to_id)}")

    # Training Sequences
    X_train, y_train = create_training_sequences(corpus_name, cleaned_sentences, nlp, word_to_id, dir, max_seq_length, reset=reset)
    print(f"\t\tNumber of training samples: {len(X_train)}")

    # Save all training data to pkl
    data = {
        "X_train": X_train,
        "y_train": y_train,
        "word_to_id": word_to_id,
        "id_to_word": id_to_word,
        "max_seq_length": max_seq_length
    }
    
    # Save to pkl file in subdirectory dir
    output_filename = f"{corpus_name}_preprocessed_data.pkl"
    with open(os.path.join(dir, output_filename), "wb") as f:
        pickle.dump(data, f)
    print(f"\tSaved preprocessed data to {os.path.join(dir, output_filename)}")
    
    # Move source file to subdirectory dir
    if move_file_flag:
        try:
            os.rename(file_name, os.path.join(dir, os.path.basename(file_name)))
            print(f"\tMoved {file_name} to {os.path.join(dir, os.path.basename(file_name))}")
        except Exception as e:
            print(f"\tCould not move {file_name}: {e}")

In [37]:
preprocess("doyle.txt", reset=False)

Preprocessing doyle
Corpus: doyle   Directory: model_2_doyle, reset: False
pklfile: doyle_cleaned_sentences.pkl
os_path: model_2_doyle/doyle_cleaned_sentences.pkl
	Loaded cleaned sentences from model_2_doyle/doyle_cleaned_sentences.pkl.
		Total number of sentences in corpus: 189298
		Example cleaned sentence [41571]: i hope that you will continue your duties here until you have found a place elsewhere.”
	Skipping lemmatization
	Loaded mappings from doyle_word_to_id.json and doyle_id_to_word.json.
		Vocabulary size (including special tokens): 38177
	Saved training data to model_2_doyle/doyle_training_sequences.npz.
		Number of training samples: 3672672
	Saved preprocessed data to model_2_doyle/doyle_preprocessed_data.pkl
	Could not move doyle.txt: [Errno 2] No such file or directory: 'doyle.txt' -> 'model_2_doyle/doyle.txt'
