<a href="https://colab.research.google.com/github/nonyeezeh/Research-Project-Code/blob/main/Untitled5_NLP2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
from collections import Counter, defaultdict, OrderedDict
import random
import math

In [2]:
# Function to clean text by removing specified characters and stop words
def clean_all_text(text, removed_chars, stop_words):
    for char in removed_chars:
        text = text.replace(char, '')

    words = text.split()

    cleaned_words = [word for word in words if word.lower() not in stop_words]

    cleaned_text = ' '.join(cleaned_words)

    return cleaned_text

# List of characters to remove
removed_characters = ['.', ',', '!', '?', ';', ':', '“', '”', '"', "'", '’', '(', ')', '[', ']', '{', '}', '-', '_', '…', '—',
                      '`', '~', '/', '\\', '|', '@', '#', '$', '%', '^', '&', '*', '+', '=', '<', '>', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

# Read stop words from file
with open('stopwords.txt', 'r') as file:
    stop_words = set(line.strip().lower() for line in file)

# Read and clean text from the first book (HP1)
with open("HP1.txt", 'r', encoding='utf-8') as file:
    all_text_HP1 = file.read()

new_text_HP1 = clean_all_text(all_text_HP1, removed_characters, stop_words)

# Read and clean text from the second book (HP2)
with open("HP2.txt", 'r', encoding='utf-8') as file:
    all_text_HP2 = file.read()

new_text_HP2 = clean_all_text(all_text_HP2, removed_characters, stop_words)

In [3]:
# Function to get BPE statistics
def get_stats(vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

# Function to merge the most frequent BPE pair
def merge_vocab(pair, v_in):
    v_out = {}
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    for word in v_in:
        w_out = word.replace(bigram, replacement)
        v_out[w_out] = v_in[word]
    return v_out

# Initialize vocabulary with words split into characters for HP1
words_HP1 = new_text_HP1.split()
vocab_HP1 = {' '.join(word): words_HP1.count(word) for word in set(words_HP1)}

# Initialize vocabulary with words split into characters for HP2
words_HP2 = new_text_HP2.split()
vocab_HP2 = {' '.join(word): words_HP2.count(word) for word in set(words_HP2)}

# Set the number of BPE merges
num_merges = 100

# Perform BPE for HP1
for i in range(num_merges):
    pairs = get_stats(vocab_HP1)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab_HP1 = merge_vocab(best, vocab_HP1)

# Perform BPE for HP2
for i in range(num_merges):
    pairs = get_stats(vocab_HP2)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab_HP2 = merge_vocab(best, vocab_HP2)

In [4]:
# Convert vocabularies to lists of tokenized pages
pages_HP1 = list(vocab_HP1.keys())
pages_HP2 = list(vocab_HP2.keys())

# Labels for the books
labels_HP1 = ["HP1"] * len(pages_HP1)
labels_HP2 = ["HP2"] * len(pages_HP2)

# Combine the pages and labels from both books
X_combined = pages_HP1 + pages_HP2
y_combined = labels_HP1 + labels_HP2

In [10]:
# Seed for reproducibility
random.seed(42)

# Combine X and y into a list of tuples to keep tokens and labels together
data_combined = list(zip(X_combined, y_combined))

# Shuffle the combined data
random.shuffle(data_combined)

# Calculate the sizes for the training, validation, and test sets
train_size = int(0.7 * len(data_combined))  # 70% for training
val_size = int(0.15 * len(data_combined))   # 15% for validation
test_size = len(data_combined) - train_size - val_size  # Remaining 15% for test

# Split the data based on the calculated sizes
train_data = data_combined[:train_size]
val_data = data_combined[train_size:train_size + val_size]
test_data = data_combined[train_size + val_size:]

# Unzip the data back into X and y components
X_train, y_train = zip(*train_data)
X_val, y_val = zip(*val_data)
X_test, y_test = zip(*test_data)

# Convert to lists (optional)
X_train, y_train = list(X_train), list(y_train)
X_val, y_val = list(X_val), list(y_val)
X_test, y_test = list(X_test), list(y_test)

In [11]:
# Step 1: Calculate Prior Probabilities
prior_probs = defaultdict(float)
total_pages = len(y_train)
for book in y_train:
    prior_probs[book] += 1
for book in prior_probs:
    prior_probs[book] /= total_pages

# Step 2: Calculate Conditional Probabilities
cond_probs = defaultdict(lambda: defaultdict(float))
token_counts = defaultdict(lambda: defaultdict(int))
total_tokens_per_book = defaultdict(int)

for tokens, book in zip(X_train, y_train):
    for token in tokens.split():  # Assuming tokens are space-separated strings
        token_counts[book][token] += 1
        total_tokens_per_book[book] += 1

# Convert counts to probabilities
for book in token_counts:
    for token in token_counts[book]:
        cond_probs[book][token] = token_counts[book][token] / total_tokens_per_book[book]

# Step 3: Prediction Function
def predict(tokens):
    max_prob = float('-inf')
    best_book = None
    for book in prior_probs:
        log_prob = math.log(prior_probs[book])  # Log to prevent underflow
        for token in tokens.split():
            if token in cond_probs[book]:
                log_prob += math.log(cond_probs[book][token])
            else:
                log_prob += math.log(1 / (total_tokens_per_book[book] + 1))  # Laplace smoothing
        if log_prob > max_prob:
            max_prob = log_prob
            best_book = book
    return best_book

# Example Prediction with a Validation Set
correct = 0
for tokens, true_book in zip(X_val, y_val):
    predicted_book = predict(tokens)
    if predicted_book == true_book:
        correct += 1

accuracy = correct / len(y_val)
print(f"Validation Accuracy: {accuracy:.2%}")

# Example usage with a new page
test_page = "Hermione snatched the schedule back, blushing furiously."
predicted_book = predict(test_page)
print(f"The predicted book for the given text is: {predicted_book}")

Validation Accuracy: 67.06%
The predicted book for the given text is: HP1
