## 1. The Vocabulary

In [24]:
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
import re 
import os 
import shutil
import torch

#### collecting the vocabulary

##### This can be any text data, taken from the internet or from your own files. Here I use the book *Pride and Prejudice*.

In [3]:
# File paths
# Every file name is derived from `doc_name`, so pointing the notebook at a
# different document only requires changing this single value.
doc_name = "pride_and_prejudice"

path = f"{doc_name}.pdf"                  # source PDF
text_filename = f"{doc_name}.txt"         # raw text extracted from the PDF
filepath_clean = f"{doc_name}_clean.txt"  # cleaned text
filepath_words = f"{doc_name}_words.txt"  # unique words, one per line


## 2. Data Cleaning

In [4]:
# Extract the text of every PDF page and save it to a plain-text file

import os
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text

# Keep the PDF open only while its pages are being read, so the file handle is
# closed even if extraction fails part-way through.
with open(path, "rb") as pdf_file:  # replace with your PDF file
    document = PdfReader(pdf_file)
    # `or ""` guards against a page yielding no text, so the literal string
    # "None" can never end up in the output file.
    all_pages_text = [(page.extract_text() or "") for page in document.pages]

# One page per write, each followed by a newline
with open(text_filename, 'w', encoding="utf-8") as file:
    for page_text in all_pages_text:
        file.write(page_text + '\n')

In [5]:
# PRE-PROCESS THE TEXT
import os
import pandas as pd
import nltk

# Download the Punkt sentence-tokenizer data needed by sent_tokenize.
# Recent NLTK releases (3.9 and later) load the "punkt_tab" resource, while
# older releases load "punkt"; requesting both keeps the notebook runnable on
# either. nltk.download reports an unavailable package instead of raising.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

def file_to_sentences(filepath):
    """
    Load a UTF-8 text file and break its contents into sentences.

    Parameters:
    - filepath: Path of the text file to read.

    Returns:
    - The list of sentences produced by nltk.tokenize.sent_tokenize.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return nltk.tokenize.sent_tokenize(raw_text)

def add_sentences_to_dataframe(sentences, dataframe):
    """
    Return `dataframe` extended with one row per sentence.

    Parameters:
    - sentences: Iterable of sentence strings to add.
    - dataframe: The DataFrame to extend; it is not modified in place.

    Returns:
    - A new DataFrame holding the existing rows followed by the new ones in
      the 'Sentence' column, with a fresh 0..n-1 index. When `sentences` is
      empty the input dataframe is returned unchanged.
    """
    sentences = list(sentences)
    if not sentences:
        return dataframe
    # DataFrame.append was removed in pandas 2.0; build all new rows at once
    # and concatenate a single time instead of growing the frame row by row.
    new_rows = pd.DataFrame({'Sentence': sentences})
    return pd.concat([dataframe, new_rows], ignore_index=True)

# Path of the raw text file to split into sentences
filepath = f"{doc_name}.txt"

# Split the whole document into sentences
all_sentences = file_to_sentences(filepath)

# One row per sentence in a single "Sentence" column
df = pd.DataFrame(all_sentences, columns=["Sentence"])

# Save the DataFrame to a CSV file
df.to_csv(f"{doc_name}_sentences.csv", index=False)


[nltk_data] Downloading package punkt to /Users/mansoor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
import re
import unicodedata

# Function to convert to ASCII
def to_ascii(text):
    """Decompose `text` with NFKD, then drop every character that is not ASCII."""
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

# Function to clean the text
def clean_text(text):
    """
    Reduce `text` to lower-case ASCII letters separated by single spaces.

    Parameters:
    - text: The raw text to clean.

    Returns:
    - The cleaned string, with no leading or trailing whitespace.
    """
    lowered_ascii = to_ascii(text.lower())
    # Each run of characters other than a-z or whitespace becomes one space
    letters_only = re.sub(r'[^a-z\s]+', ' ', lowered_ascii)
    # Collapse whitespace runs to a single space and trim both ends
    return re.sub(r'\s+', ' ', letters_only).strip()

# Load the raw text
with open(filepath, 'r', encoding='utf-8') as raw_file:
    content = raw_file.read()

# Run it through the cleaning function
cleaned_content = clean_text(content)

# Write the cleaned text to its own file
with open(filepath_clean, 'w', encoding='ascii') as clean_file:
    clean_file.write(cleaned_content)

print(f"The text has been cleaned and saved to {filepath_clean}.")



The text has been cleaned and saved to pride_and_prejudice_clean.txt.


In [27]:
# TOKENIZE THE TEXT

# Function to extract unique words from the text
def extract_unique_words(text):
    """Return the set of distinct whitespace-separated words found in `text`."""
    return set(text.split())

# Load the cleaned text
with open(filepath_clean, 'r', encoding='utf-8') as clean_file:
    content = clean_file.read()

# Distinct words, then the same words in sorted order
unique_words_set = extract_unique_words(content)
unique_words_list = sorted(unique_words_set)

# Write the vocabulary to disk, one word per line
with open(filepath_words, 'w', encoding='utf-8') as words_file:
    words_file.writelines(word + '\n' for word in unique_words_list)

print(f"Total unique words: {len(unique_words_list)}")
print(f"The list of unique words has been saved to {filepath_words}")


Total unique words: 7526
The list of unique words has been saved to pride_and_prejudice_words.txt


In [8]:
# Read the saved vocabulary file back in, one list entry per line
with open(filepath_words, 'r', encoding='utf-8') as words_file:
    lines_list = list(words_file)

# Strip the trailing newline (and any surrounding whitespace) from each entry
unique_words = list(map(str.strip, lines_list))


### Special Tokens

In [29]:
# The whole vocabulary as one space-separated string
list_as_text = ' '.join(unique_words)

# Special tokens: start, end, unknown and padding
start_token, end_token, unknown_token, pad_token = '[S]', '[EOS]', '[UNK]', '[PAD]'

In [31]:
# Deliberately simple tokenizer: any character the pattern below does not match is dropped.
# A production tokenizer would need to handle those characters as well.

def tokenize(text):
    """Lower-case `text` and split it into word tokens and . , ! ? punctuation tokens."""
    lowered = text.lower()
    return re.findall(r"[\w']+|[.,!?]", lowered)

In [32]:
# Quick check of the tokenizer on a sample sentence
text = "This is a text of our tokenizer"
tokenized_text = tokenize(text)
print(tokenized_text)

['this', 'is', 'a', 'text', 'of', 'our', 'tokenizer']


In [33]:
tokens = tokenize(list_as_text)
special_tokens = [start_token, end_token, unknown_token, pad_token]
# Regular tokens in sorted order first; the special tokens are appended last,
# so they receive the four highest ids.
unique_tokens = [tok for tok in sorted(set(tokens)) if tok not in special_tokens] + special_tokens


In [34]:
# Lookup tables: token -> id, and the inverse id -> token
word2idx = dict(zip(unique_tokens, range(len(unique_tokens))))

idx2word = dict(zip(word2idx.values(), word2idx.keys()))


In [16]:
# Show both ends of the vocabulary: the first 10 and the last 10 entries
for label, window in (("First 10 tokens:", unique_tokens[:10]),
                      ("Last 10 tokens:", unique_tokens[-10:])):
    print(label, window)


First 10 tokens: ['a', 'ab', 'abatement', 'abeth', 'abhor', 'abhorrence', 'abhorrent', 'abide', 'abiding', 'abilities']
Last 10 tokens: ['youunable', 'youwant', 'youwas', 'youwere', 'youwish', 'zle', '[S]', '[EOS]', '[UNK]', '[PAD]']


In [17]:
# Look up the id of each special token (start, end, unknown, padding) once
start_token_id, end_token_id, unknown_token_id, pad_token_id = (
    word2idx[special] for special in (start_token, end_token, unknown_token, pad_token)
)

In [18]:
# Report each special token next to its id; the spacing inside the strings is
# hand-tuned so the output lines up in columns.
print(f"start_token:   {start_token}   - Id: {start_token_id}")
print(f"end_token:     {end_token} - Id: {end_token_id}")
print(f"unknown_token: {unknown_token} - Id: {unknown_token_id}")
print(f"pad_token:     {pad_token} - Id: {pad_token_id}")

start_token:   [S]   - Id: 7526
end_token:     [EOS] - Id: 7527
unknown_token: [UNK] - Id: 7528
pad_token:     [PAD] - Id: 7529


### ENCODE function

In [19]:
def encode(text, max_length=None, truncation=False, return_tensors=False):
    """
    Encode the text into a sequence of token IDs, with optional truncation.

    The sequence always starts with the start-token ID and ends with the
    end-token ID; tokens missing from the vocabulary map to the unknown-token ID.

    Parameters:
    - text: The text to encode.
    - max_length: The maximum length of the token sequence after encoding.
    - truncation: Whether to truncate the sequence to max_length. Sequences
      that already fit are returned unchanged.
    - return_tensors: If True, return a torch tensor of shape (1, sequence
      length) instead of a list.

    Returns:
    - A list of token IDs representing the encoded text, or a tensor holding
      that list in a batch of one when return_tensors is True.
    """
    start_id = word2idx[start_token]
    end_id = word2idx[end_token]
    unk_id = word2idx[unknown_token]

    encoded_tokens = [start_id] + [word2idx.get(token, unk_id) for token in tokenize(text)] + [end_id]

    # Truncate only when the sequence is actually too long; otherwise a second
    # end token would be appended to a sequence that already ends with one.
    if truncation and max_length is not None and len(encoded_tokens) > max_length:
        # Keep room for the end token; max_length below 1 still yields the end token alone
        encoded_tokens = encoded_tokens[:max(max_length - 1, 0)] + [end_id]

    # Convert to tensor if return_tensors is True
    if return_tensors:
        encoded_tokens = torch.tensor([encoded_tokens])  # Adding a batch dimension

    return encoded_tokens

## 3. Tokenizer

### Decode Function

In [39]:
def decode(indices, skip_special_tokens=False):
    """
    Convert token IDs back into a space-separated string.

    Parameters:
    - indices: A sequence of token IDs, a tensor of IDs, or a list of scalar
      tensors. A batch holding a single row (the shape produced by
      encode(..., return_tensors=True)) is unwrapped; batches with several
      rows are not supported.
    - skip_special_tokens: If True, leave out the start, end, unknown and pad
      tokens, as well as IDs that are not in the vocabulary.

    Returns:
    - The decoded tokens joined by single spaces. IDs missing from the
      vocabulary are rendered as the unknown token unless they are skipped.
    """
    # Normalise the input to a list of plain Python integers
    if isinstance(indices, torch.Tensor):
        indices = indices.tolist()
    else:
        indices = [idx.tolist() if isinstance(idx, torch.Tensor) else idx for idx in indices]

    # Unwrap a batch of one, e.g. the output of encode(..., return_tensors=True)
    if len(indices) == 1 and isinstance(indices[0], (list, tuple)):
        indices = list(indices[0])

    # IDs to drop from the output; empty unless skipping was requested
    special_token_ids = set()
    if skip_special_tokens:
        special_token_ids = {start_token_id, end_token_id, unknown_token_id, pad_token_id}

    tokens = []
    for idx in indices:
        if idx in special_token_ids:
            continue
        token = idx2word.get(idx)
        if token is None:
            # Same fallback as convert_ids_to_tokens: unknown IDs become the
            # unknown token, which is itself skipped when skipping is on.
            if skip_special_tokens:
                continue
            token = unknown_token
        tokens.append(token)

    # Join the tokens into a single string with spaces
    return ' '.join(tokens)

In [40]:
def convert_ids_to_tokens(token_ids):
    """
    Map each token ID to its token string.

    Parameters:
    - token_ids: A list of integers representing token IDs.

    Returns:
    - tokens: A list of string tokens, one per input ID; IDs that are not in
      the vocabulary map to the unknown token.
    """
    tokens = []
    for token_id in token_ids:
        tokens.append(idx2word.get(token_id, unknown_token))
    return tokens

## Tokenizer Class

In [41]:
class SimpleTokenizer:
    """
    Minimal word-level tokenizer whose vocabulary is built from a text.

    The vocabulary is the sorted set of tokens found in `text`, followed by the
    special tokens [S], [EOS], [UNK] and [PAD], which therefore receive the
    four highest ids.
    """

    def __init__(self, text):
        """
        Build the vocabulary and the lookup tables.

        Parameters:
        - text: The text whose tokens form the vocabulary.
        """
        # Define special tokens
        self.start_token = '[S]'
        self.end_token = '[EOS]'
        self.unknown_token = '[UNK]'
        self.pad_token = '[PAD]'

        special_tokens = [self.start_token, self.end_token, self.unknown_token, self.pad_token]

        # Sorted unique tokens give the same ids on every run
        regular_tokens = [token for token in sorted(set(self.tokenize(text)))
                          if token not in special_tokens]
        # Special tokens are appended last, after every regular token
        unique_tokens = regular_tokens + special_tokens

        # Assign an index to each unique token
        self.word2idx = {token: idx for idx, token in enumerate(unique_tokens)}
        self.idx2word = {idx: token for token, idx in self.word2idx.items()}

        # Cache the id of each special token
        self.start_token_id = self.word2idx[self.start_token]
        self.end_token_id = self.word2idx[self.end_token]
        self.unknown_token_id = self.word2idx[self.unknown_token]
        self.pad_token_id = self.word2idx[self.pad_token]

    def tokenize(self, text):
        """Lower-case `text` and split it into word tokens and . , ! ? punctuation tokens."""
        return re.findall(r"[\w']+|[.,!?]", text.lower())

    def encode(self, text, max_length=None, truncation=False, return_tensors=False):
        """
        Encode the text into a sequence of token IDs, with optional truncation.

        The sequence always starts with the start-token ID and ends with the
        end-token ID; tokens missing from the vocabulary map to the
        unknown-token ID.

        Parameters:
        - text: The text to encode.
        - max_length: The maximum length of the token sequence after encoding.
        - truncation: Whether to truncate the sequence to max_length. Sequences
          that already fit are returned unchanged.
        - return_tensors: If True, return a torch tensor of shape (1, sequence
          length) instead of a list.

        Returns:
        - A list of token IDs representing the encoded text, or a tensor holding
          that list in a batch of one when return_tensors is True.
        """
        body = [self.word2idx.get(token, self.unknown_token_id) for token in self.tokenize(text)]

        # Prepend the start token ID and append the end token ID
        encoded_tokens = [self.start_token_id] + body + [self.end_token_id]

        # Truncate only when the sequence is actually too long; otherwise a
        # second end token would be appended to a sequence that already ends
        # with one.
        if truncation and max_length is not None and len(encoded_tokens) > max_length:
            # Keep room for the end token; max_length below 1 still yields the end token alone
            encoded_tokens = encoded_tokens[:max(max_length - 1, 0)] + [self.end_token_id]

        # Convert to tensor if return_tensors is True
        if return_tensors:
            encoded_tokens = torch.tensor([encoded_tokens])  # Adding a batch dimension

        return encoded_tokens

    def decode(self, indices, skip_special_tokens=False):
        """
        Convert token IDs back into a space-separated string.

        Parameters:
        - indices: A sequence of token IDs, a tensor of IDs, or a list of scalar
          tensors. A batch holding a single row (the shape produced by
          encode(..., return_tensors=True)) is unwrapped; batches with several
          rows are not supported.
        - skip_special_tokens: If True, leave out the start, end, unknown and
          pad tokens, as well as IDs that are not in the vocabulary.

        Returns:
        - The decoded tokens joined by single spaces. IDs missing from the
          vocabulary are rendered as the unknown token unless they are skipped.
        """
        # Normalise the input to a list of plain Python integers
        if isinstance(indices, torch.Tensor):
            indices = indices.tolist()
        else:
            indices = [idx.tolist() if isinstance(idx, torch.Tensor) else idx for idx in indices]

        # Unwrap a batch of one, e.g. the output of encode(..., return_tensors=True)
        if len(indices) == 1 and isinstance(indices[0], (list, tuple)):
            indices = list(indices[0])

        # IDs to drop from the output; empty unless skipping was requested
        special_token_ids = set()
        if skip_special_tokens:
            special_token_ids = {
                self.start_token_id,
                self.end_token_id,
                self.unknown_token_id,
                self.pad_token_id,
            }

        tokens = []
        for idx in indices:
            if idx in special_token_ids:
                continue
            token = self.idx2word.get(idx)
            if token is None:
                # Same fallback as convert_ids_to_tokens: unknown IDs become the
                # unknown token, which is itself skipped when skipping is on.
                if skip_special_tokens:
                    continue
                token = self.unknown_token
            tokens.append(token)

        # Join the tokens into a single string with spaces
        return ' '.join(tokens)

    def convert_ids_to_tokens(self, token_ids):
        """
        Convert a list of token IDs to their corresponding tokens.

        Parameters:
        - token_ids: A list of integers representing token IDs.

        Returns:
        - tokens: A list of string tokens corresponding to the input IDs; IDs
          that are not in the vocabulary map to the unknown token.
        """
        tokens = [self.idx2word.get(token_id, self.unknown_token) for token_id in token_ids]

        return tokens

    def convert_tokens_to_ids(self, tokens):
        """
        Convert a list of tokens to their corresponding token IDs.

        Parameters:
        - tokens: A list of string tokens.

        Returns:
        - token_ids: A list of integers representing the token IDs; tokens that
          are not in the vocabulary map to the unknown-token ID.
        """
        token_ids = [self.word2idx.get(token, self.unknown_token_id) for token in tokens]

        return token_ids

In [42]:

# Usage example
# Build the tokenizer from the space-joined vocabulary, then round-trip one sentence.
tokenizer = SimpleTokenizer(list_as_text)
# Tokens that are not in the vocabulary are encoded with the [UNK] id; in the
# recorded run that happens to "color", "?" and ".".
encoded = tokenizer.encode("color ? A rose is red.")
# Decoding without skip_special_tokens keeps [S], [UNK] and [EOS] in the text
decoded = tokenizer.decode(encoded)

print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")
print(f"PAD token ID: {tokenizer.pad_token_id}")


Encoded: [7526, 7528, 7528, 0, 5742, 3678, 5453, 7528, 7527]
Decoded: [S] [UNK] [UNK] a rose is red [UNK] [EOS]
PAD token ID: 7529
