**Mateo Alexander**

**Homework 3**

**Natural Language Processing: QMSS 5067**

**Professor Patrick Houlihan**

**Homework Due 11/08/2024**

In [None]:
"""
Build a function, called word_prob, that outputs probabilities, for every possible topic [all, fishing, hiking, machinelearning, mathematics], that a token or sequential token combination (the user input to the function) shows up in an arbitrary text-based column (body, body_sw, body_sw_stem), dictated by the user, from the dataframe, the_data, we have been using in class. The output dictionary of the function needs to have the following keys:

all: <probability the sequential input token(s) shows up in ALL the corpora>
fishing: <probability the sequential input token(s) shows up in the fishing corpus>
hiking: <probability the sequential input token(s) shows up in the hiking corpus>
machinelearning: <probability the sequential input token(s) shows up in the machinelearning corpus>
mathematics: <probability the sequential input token(s) shows up in the mathematics corpus>
 
The 'value' field of a dictionary is to have a value of None if the token(s) do not show up
NOTE: If there are a total of 100 tokens, and the count of a specific token is 12, the probability of that token showing up is 12/100=.12
"""

In [36]:
!pip install vaderSentiment

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels


In [223]:
import os
import pandas as pd
import multiprocessing
from multiprocessing import Pool
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import urllib.request
from collections import Counter
import itertools
from itertools import islice

In [203]:
from utils import read_file, clean_txt
from utils import stem_fun

In [205]:
# Resolve the data folder against the current working directory and print it as a sanity check.
# NOTE(review): the printed path repeats "Natural Language Processing", so the working
# directory apparently already is that folder — confirm this is the intended location.
# file_path_data is not referenced again in the visible part of the notebook.
file_path_data = os.path.abspath('Natural Language Processing/Data')
print(file_path_data)

/home/37b4d573-02a2-4c75-b69b-8f21f8c5d212/Natural Language Processing/Natural Language Processing/Data


In [207]:
# Establish the corpus directory relative to the notebook's working directory rather
# than a machine-specific absolute home path, so the notebook also runs for other users.
# When the kernel is started from the course folder this resolves to the same
# ".../Natural Language Processing/Data" location that was previously hard-coded.
directory = os.path.abspath('Data')

In [225]:
# Version 6: Implement Clean_Txt, Multiprocessing, Stemming, Batch Processing, and DataFrame Output
def process_batch(batch):
    """Count raw and stemmed tokens for one batch of ``(topic, file_path)`` pairs.

    Each file is read as UTF-8, passed through ``clean_txt`` and split on
    whitespace; every resulting token is also passed through
    ``stem_fun(token, "stem")``. Counts are accumulated both overall and per topic.

    Args:
        batch: Iterable of ``(topic, file_path)`` tuples. ``topic`` must be one
            of "Fishing", "Hiking", "MachineLearning" or "Mathematics";
            any other value raises ``KeyError`` on the per-topic lookup.

    Returns:
        A 4-tuple ``(overall_raw, overall_stemmed, per_topic_raw, per_topic_stemmed)``
        where the first two are ``Counter`` objects and the last two are dicts
        mapping each topic name to a ``Counter``.
    """
    topic_names = ("Fishing", "Hiking", "MachineLearning", "Mathematics")
    overall_raw, overall_stemmed = Counter(), Counter()
    per_topic_raw = {name: Counter() for name in topic_names}
    per_topic_stemmed = {name: Counter() for name in topic_names}

    for topic, file_path in batch:
        # Read, clean and tokenize in one pass; the handle is only needed for the read.
        with open(file_path, 'r', encoding='utf-8') as handle:
            words = clean_txt(handle.read()).split()

        # Raw counts first, then stemmed counts, mirroring the per-file processing order.
        overall_raw.update(words)
        per_topic_raw[topic].update(words)

        stems = [stem_fun(word, "stem") for word in words]
        overall_stemmed.update(stems)
        per_topic_stemmed[topic].update(stems)

    return overall_raw, overall_stemmed, per_topic_raw, per_topic_stemmed

def batch_iterator(iterable, batch_size):
    """Yield successive lists of up to ``batch_size`` items from ``iterable``.

    Items keep their original order within and across batches; the final batch
    may be shorter. An empty iterable yields nothing.

    Args:
        iterable: Any iterable of items to group.
        batch_size: Maximum number of items per batch; must be at least 1.

    Yields:
        list: The next batch of items.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when iteration
            starts, because this is a generator).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    iterator = iter(iterable)
    while True:
        # islice pulls at most batch_size items and leaves the rest for the next pass.
        batch = list(islice(iterator, batch_size))
        if not batch:
            return
        yield batch

def word_prob(directory, column_name, token, batch_size=10):
    """Tabulate the relative frequency of ``token`` overall and per topic.

    Every ``.txt`` file found under ``directory/<topic>`` for the topics
    Fishing, Hiking, MachineLearning and Mathematics is tokenized by
    ``process_batch`` in a worker pool. The result reports, for the whole
    corpus and for each topic, the share of tokens equal to the cleaned query
    and the share of stemmed tokens equal to the stemmed query.

    Args:
        directory: Folder that contains one sub-folder per topic.
        column_name: Accepted for interface compatibility but currently not
            used; the raw and the stemmed figures are always both returned.
        token: Query text. It is passed through ``clean_txt`` and then
            ``stem_fun(..., "stem")`` before lookup.
            NOTE(review): the counters hold single whitespace-delimited
            tokens, so a multi-word query presumably never matches — confirm
            against the behaviour of ``clean_txt``.
        batch_size: Number of files handed to each ``process_batch`` call.

    Returns:
        pandas.DataFrame with the columns "Topic",
        "Probability Without Stemming" and "Probability With Stemming". The
        first row is "All", followed by one row per topic. A cell is None
        (which pandas may render as NaN) when the token does not occur or the
        corresponding corpus has no tokens.

    Raises:
        ValueError: If a topic sub-folder does not exist under ``directory``.
    """
    topics = ("Fishing", "Hiking", "MachineLearning", "Mathematics")

    # Normalize the query the same way process_batch normalizes the corpus text.
    token_cleaned = clean_txt(token)
    token_stemmed = stem_fun(token_cleaned, "stem")

    # Collect (topic, path) pairs for every .txt file of every topic.
    files_to_process = []
    for topic in topics:
        topic_path = os.path.join(directory, topic)
        if not os.path.exists(topic_path):
            raise ValueError(f"Directory '{topic_path}' does not exist.")

        for root, _, files in os.walk(topic_path):
            files_to_process.extend(
                (topic, os.path.join(root, file))
                for file in files
                if file.endswith(".txt")
            )

    batches = list(batch_iterator(files_to_process, batch_size))

    # Only start worker processes when there is work, and never more than batches.
    results = []
    if batches:
        workers = min(multiprocessing.cpu_count(), len(batches))
        with Pool(processes=workers) as pool:
            results = pool.map(process_batch, batches)

    # Merge the per-batch counters into corpus-wide and per-topic totals.
    all_tokens = Counter()
    all_tokens_stemmed = Counter()
    topic_tokens = {topic: Counter() for topic in topics}
    topic_tokens_stemmed = {topic: Counter() for topic in topics}
    for batch_all, batch_all_stemmed, batch_topics, batch_topics_stemmed in results:
        all_tokens.update(batch_all)
        all_tokens_stemmed.update(batch_all_stemmed)
        for topic in topics:
            topic_tokens[topic].update(batch_topics[topic])
            topic_tokens_stemmed[topic].update(batch_topics_stemmed[topic])

    probabilities = _token_probabilities(token_cleaned, all_tokens, topic_tokens)
    probabilities_stemmed = _token_probabilities(
        token_stemmed, all_tokens_stemmed, topic_tokens_stemmed
    )

    # Create a DataFrame for better display: "All" first, then topics in fixed order.
    rows = [["All", probabilities["all"], probabilities_stemmed["all"]]]
    rows.extend(
        [topic, probabilities[topic], probabilities_stemmed[topic]]
        for topic in topics
    )
    return pd.DataFrame(
        rows,
        columns=["Topic", "Probability Without Stemming", "Probability With Stemming"],
    )


def _token_probabilities(token, overall_counts, topic_counts):
    """Return ``{"all": p, <topic>: p, ...}`` for ``token``; None where it is absent.

    Each probability is the token's count divided by the total token count of
    the respective counter. A zero count or an empty counter yields None.
    """
    def share(counter):
        total = sum(counter.values())
        hits = counter[token]  # Counter returns 0 for unseen keys
        return hits / total if total > 0 and hits > 0 else None

    result = {"all": share(overall_counts)}
    for topic, counter in topic_counts.items():
        result[topic] = share(counter)
    return result

In [227]:
# Example usage: look up the token "hiking" in every topic folder under `directory`.
# The second argument ("body") is accepted by word_prob but is not used by it.
result = word_prob(directory, "body", "hiking")
print(result)

             Topic  Probability Without Stemming  Probability With Stemming
0              All                      0.002114                   0.004733
1          Fishing                      0.000415                   0.000586
2           Hiking                      0.006753                   0.015361
3  MachineLearning                           NaN                        NaN
4      Mathematics                           NaN                        NaN


In [209]:
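# Version 1: Basic Algorithm (superseded draft kept for reference; wrapped in a string literal so the definitions inside are not executed)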
"""
def word_prob(directory, column_name, token):
    # Initialize counters
    all_tokens = Counter()
    topic_tokens = {
        "Fishing": Counter(),
        "Hiking": Counter(),
        "MachineLearning": Counter(),
        "Mathematics": Counter()
    }
    
    # Walk through the directory and collect token counts
    for topic in topic_tokens.keys():
        topic_path = os.path.join(directory, topic)
        if not os.path.exists(topic_path):
            raise ValueError(f"Directory '{topic_path}' does not exist.")
        
        for root, _, files in os.walk(topic_path):
            for file in files:
                if file.endswith(".txt"):
                    file_path = os.path.join(root, file)
                    with open(file_path, 'r', encoding='utf-8') as f:
                        body_text = f.read()
                        tokens = body_text.split()
                        # Update the counters for "all" tokens and topic tokens
                        all_tokens.update(tokens)
                        topic_tokens[topic].update(tokens)
    
    # Calculate probabilities
    def calc_probability(count, total):
        return count / total if total > 0 else None

    all_count = sum(all_tokens.values())
    token_count_all = all_tokens[token]
    probability_all = calc_probability(token_count_all, all_count)
    
    # Calculate topic-specific probabilities
    probabilities = {
        "All": probability_all,
        "Fishing": None,
        "Hiking": None,
        "MachineLearning": None,
        "Mathematics": None
    }
    
    for topic, counter in topic_tokens.items():
        total_tokens_topic = sum(counter.values())
        token_count_topic = counter[token]
        probabilities[topic] = calc_probability(token_count_topic, total_tokens_topic)
    
    # Replace probabilities with None if the token count is 0
    for key, value in probabilities.items():
        if value == 0:
            probabilities[key] = None
    
    return probabilities
"""

'\ndef word_prob(directory, column_name, token):\n    # Initialize counters\n    all_tokens = Counter()\n    topic_tokens = {\n        "Fishing": Counter(),\n        "Hiking": Counter(),\n        "MachineLearning": Counter(),\n        "Mathematics": Counter()\n    }\n    \n    # Walk through the directory and collect token counts\n    for topic in topic_tokens.keys():\n        topic_path = os.path.join(directory, topic)\n        if not os.path.exists(topic_path):\n            raise ValueError(f"Directory \'{topic_path}\' does not exist.")\n        \n        for root, _, files in os.walk(topic_path):\n            for file in files:\n                if file.endswith(".txt"):\n                    file_path = os.path.join(root, file)\n                    with open(file_path, \'r\', encoding=\'utf-8\') as f:\n                        body_text = f.read()\n                        tokens = body_text.split()\n                        # Update the counters for "all" tokens and topic tokens\n     

In [211]:
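# Version 2: Multiprocessing Implemented (superseded draft kept for reference; wrapped in a string literal so the definitions inside are not executed)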
"""
def process_file(args):
    topic, file_path = args
    with open(file_path, 'r', encoding='utf-8') as f:
        body_text = f.read()
        tokens = body_text.split()
        return topic, tokens

def word_prob(directory, column_name, token):
    # Initialize counters
    all_tokens = Counter()
    topic_tokens = {
        "Fishing": Counter(),
        "Hiking": Counter(),
        "MachineLearning": Counter(),
        "Mathematics": Counter()
    }
    
    # Collect files for all topics
    files_to_process = []
    for topic in topic_tokens.keys():
        topic_path = os.path.join(directory, topic)
        if not os.path.exists(topic_path):
            raise ValueError(f"Directory '{topic_path}' does not exist.")
        
        for root, _, files in os.walk(topic_path):
            for file in files:
                if file.endswith(".txt"):
                    file_path = os.path.join(root, file)
                    files_to_process.append((topic, file_path))
    
    # Use multiprocessing to process files in parallel
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.map(process_file, files_to_process)
    
    # Update counters with results from parallel processing
    for topic, tokens in results:
        all_tokens.update(tokens)
        topic_tokens[topic].update(tokens)
    
    # Calculate probabilities
    def calc_probability(count, total):
        return count / total if total > 0 else None

    all_count = sum(all_tokens.values())
    token_count_all = all_tokens[token]
    probability_all = calc_probability(token_count_all, all_count)
    
    # Calculate topic-specific probabilities
    probabilities = {
        "all": probability_all,
        "Fishing": None,
        "Hiking": None,
        "MachineLearning": None,
        "Mathematics": None
    }
    
    for topic, counter in topic_tokens.items():
        total_tokens_topic = sum(counter.values())
        token_count_topic = counter[token]
        probabilities[topic] = calc_probability(token_count_topic, total_tokens_topic)
    
    # Replace probabilities with None if the token count is 0
    for key, value in probabilities.items():
        if value == 0:
            probabilities[key] = None
    
    return probabilities
"""

'\ndef process_file(args):\n    topic, file_path = args\n    with open(file_path, \'r\', encoding=\'utf-8\') as f:\n        body_text = f.read()\n        tokens = body_text.split()\n        return topic, tokens\n\ndef word_prob(directory, column_name, token):\n    # Initialize counters\n    all_tokens = Counter()\n    topic_tokens = {\n        "Fishing": Counter(),\n        "Hiking": Counter(),\n        "MachineLearning": Counter(),\n        "Mathematics": Counter()\n    }\n    \n    # Collect files for all topics\n    files_to_process = []\n    for topic in topic_tokens.keys():\n        topic_path = os.path.join(directory, topic)\n        if not os.path.exists(topic_path):\n            raise ValueError(f"Directory \'{topic_path}\' does not exist.")\n        \n        for root, _, files in os.walk(topic_path):\n            for file in files:\n                if file.endswith(".txt"):\n                    file_path = os.path.join(root, file)\n                    files_to_process.append(

In [213]:
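# Version 3: Clean Text Implemented (superseded draft kept for reference; wrapped in a string literal so the definitions inside are not executed)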
"""
def process_file(args):
    topic, file_path = args
    with open(file_path, 'r', encoding='utf-8') as f:
        body_text = f.read()
        # Clean the text
        body_text = clean_txt(body_text)
        tokens = body_text.split()
        return topic, tokens

def word_prob(directory, column_name, token):
    # Normalize the token
    token = clean_txt(token)
    
    # Initialize counters
    all_tokens = Counter()
    topic_tokens = {
        "Fishing": Counter(),
        "Hiking": Counter(),
        "MachineLearning": Counter(),
        "Mathematics": Counter()
    }
    
    # Collect files for all topics
    files_to_process = []
    for topic in topic_tokens.keys():
        topic_path = os.path.join(directory, topic)
        if not os.path.exists(topic_path):
            raise ValueError(f"Directory '{topic_path}' does not exist.")
        
        for root, _, files in os.walk(topic_path):
            for file in files:
                if file.endswith(".txt"):
                    file_path = os.path.join(root, file)
                    files_to_process.append((topic, file_path))
    
    # Use multiprocessing to process files in parallel
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.map(process_file, files_to_process)
    
    # Update counters with results from parallel processing
    for topic, tokens in results:
        all_tokens.update(tokens)
        topic_tokens[topic].update(tokens)
    
    # Calculate probabilities
    def calc_probability(count, total):
        return count / total if total > 0 else None

    all_count = sum(all_tokens.values())
    token_count_all = all_tokens[token]
    probability_all = calc_probability(token_count_all, all_count)
    
    # Calculate topic-specific probabilities
    probabilities = {
        "all": probability_all,
        "Fishing": None,
        "Hiking": None,
        "MachineLearning": None,
        "Mathematics": None
    }
    
    for topic, counter in topic_tokens.items():
        total_tokens_topic = sum(counter.values())
        token_count_topic = counter[token]
        probabilities[topic] = calc_probability(token_count_topic, total_tokens_topic)
    
    # Replace probabilities with None if the token count is 0
    for key, value in probabilities.items():
        if value == 0:
            probabilities[key] = None
    
    return probabilities
"""

'\ndef process_file(args):\n    topic, file_path = args\n    with open(file_path, \'r\', encoding=\'utf-8\') as f:\n        body_text = f.read()\n        # Clean the text\n        body_text = clean_txt(body_text)\n        tokens = body_text.split()\n        return topic, tokens\n\ndef word_prob(directory, column_name, token):\n    # Normalize the token\n    token = clean_txt(token)\n    \n    # Initialize counters\n    all_tokens = Counter()\n    topic_tokens = {\n        "Fishing": Counter(),\n        "Hiking": Counter(),\n        "MachineLearning": Counter(),\n        "Mathematics": Counter()\n    }\n    \n    # Collect files for all topics\n    files_to_process = []\n    for topic in topic_tokens.keys():\n        topic_path = os.path.join(directory, topic)\n        if not os.path.exists(topic_path):\n            raise ValueError(f"Directory \'{topic_path}\' does not exist.")\n        \n        for root, _, files in os.walk(topic_path):\n            for file in files:\n              

In [215]:
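# Version 4: Stemming Algorithm Implemented (superseded draft kept for reference; wrapped in a string literal so the definitions inside are not executed)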
"""
def process_file(args):
    topic, file_path = args
    with open(file_path, 'r', encoding='utf-8') as f:
        body_text = f.read()
        # Clean the text
        body_text = clean_txt(body_text)
        tokens = body_text.split()
        return topic, tokens

def word_prob(directory, column_name, token):
    # Normalize the token
    token_cleaned = clean_txt(token)
    token_stemmed = stem_fun(token_cleaned, "stem")
    
    # Initialize counters
    all_tokens = Counter()
    all_tokens_stemmed = Counter()
    topic_tokens = {
        "Fishing": Counter(),
        "Hiking": Counter(),
        "MachineLearning": Counter(),
        "Mathematics": Counter()
    }
    topic_tokens_stemmed = {
        "Fishing": Counter(),
        "Hiking": Counter(),
        "MachineLearning": Counter(),
        "Mathematics": Counter()
    }
    
    # Collect files for all topics
    files_to_process = []
    for topic in topic_tokens.keys():
        topic_path = os.path.join(directory, topic)
        if not os.path.exists(topic_path):
            raise ValueError(f"Directory '{topic_path}' does not exist.")
        
        for root, _, files in os.walk(topic_path):
            for file in files:
                if file.endswith(".txt"):
                    file_path = os.path.join(root, file)
                    files_to_process.append((topic, file_path))
    
    # Use multiprocessing to process files in parallel
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.map(process_file, files_to_process)
    
    # Update counters with results from parallel processing
    for topic, tokens in results:
        all_tokens.update(tokens)
        topic_tokens[topic].update(tokens)
        # Apply stemming to tokens
        stemmed_tokens = [stem_fun(token, "stem") for token in tokens]
        all_tokens_stemmed.update(stemmed_tokens)
        topic_tokens_stemmed[topic].update(stemmed_tokens)
    
    # Calculate probabilities
    def calc_probability(count, total):
        return count / total if total > 0 else None

    # Probabilities without stemming
    all_count = sum(all_tokens.values())
    token_count_all = all_tokens[token_cleaned]
    probability_all = calc_probability(token_count_all, all_count)
    
    probabilities = {
        "all": probability_all,
        "Fishing": None,
        "Hiking": None,
        "MachineLearning": None,
        "Mathematics": None
    }
    
    for topic, counter in topic_tokens.items():
        total_tokens_topic = sum(counter.values())
        token_count_topic = counter[token_cleaned]
        probabilities[topic] = calc_probability(token_count_topic, total_tokens_topic)
    
    # Probabilities with stemming
    all_count_stemmed = sum(all_tokens_stemmed.values())
    token_count_all_stemmed = all_tokens_stemmed[token_stemmed]
    probability_all_stemmed = calc_probability(token_count_all_stemmed, all_count_stemmed)
    
    probabilities_stemmed = {
        "all": probability_all_stemmed,
        "Fishing": None,
        "Hiking": None,
        "MachineLearning": None,
        "Mathematics": None
    }
    
    for topic, counter in topic_tokens_stemmed.items():
        total_tokens_topic_stemmed = sum(counter.values())
        token_count_topic_stemmed = counter[token_stemmed]
        probabilities_stemmed[topic] = calc_probability(token_count_topic_stemmed, total_tokens_topic_stemmed)
    
    # Replace probabilities with None if the token count is 0
    for key, value in probabilities.items():
        if value == 0:
            probabilities[key] = None
    for key, value in probabilities_stemmed.items():
        if value == 0:
            probabilities_stemmed[key] = None
    
    return {
        "without_stemming": probabilities,
        "with_stemming": probabilities_stemmed
    }
"""

'\ndef process_file(args):\n    topic, file_path = args\n    with open(file_path, \'r\', encoding=\'utf-8\') as f:\n        body_text = f.read()\n        # Clean the text\n        body_text = clean_txt(body_text)\n        tokens = body_text.split()\n        return topic, tokens\n\ndef word_prob(directory, column_name, token):\n    # Normalize the token\n    token_cleaned = clean_txt(token)\n    token_stemmed = stem_fun(token_cleaned, "stem")\n    \n    # Initialize counters\n    all_tokens = Counter()\n    all_tokens_stemmed = Counter()\n    topic_tokens = {\n        "Fishing": Counter(),\n        "Hiking": Counter(),\n        "MachineLearning": Counter(),\n        "Mathematics": Counter()\n    }\n    topic_tokens_stemmed = {\n        "Fishing": Counter(),\n        "Hiking": Counter(),\n        "MachineLearning": Counter(),\n        "Mathematics": Counter()\n    }\n    \n    # Collect files for all topics\n    files_to_process = []\n    for topic in topic_tokens.keys():\n        topic_

In [217]:
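# Version 5: Clean Up Results in Table (superseded draft kept for reference; wrapped in a string literal so the definitions inside are not executed)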
"""
def process_file(args):
    topic, file_path = args
    with open(file_path, 'r', encoding='utf-8') as f:
        body_text = f.read()
        # Clean the text
        body_text = clean_txt(body_text)
        tokens = body_text.split()
        return topic, tokens

def word_prob(directory, column_name, token):
    # Normalize the token
    token_cleaned = clean_txt(token)
    token_stemmed = stem_fun(token_cleaned, "stem")
    
    # Initialize counters
    all_tokens = Counter()
    all_tokens_stemmed = Counter()
    topic_tokens = {
        "Fishing": Counter(),
        "Hiking": Counter(),
        "MachineLearning": Counter(),
        "Mathematics": Counter()
    }
    topic_tokens_stemmed = {
        "Fishing": Counter(),
        "Hiking": Counter(),
        "MachineLearning": Counter(),
        "Mathematics": Counter()
    }
    
    # Collect files for all topics
    files_to_process = []
    for topic in topic_tokens.keys():
        topic_path = os.path.join(directory, topic)
        if not os.path.exists(topic_path):
            raise ValueError(f"Directory '{topic_path}' does not exist.")
        
        for root, _, files in os.walk(topic_path):
            for file in files:
                if file.endswith(".txt"):
                    file_path = os.path.join(root, file)
                    files_to_process.append((topic, file_path))
    
    # Use multiprocessing to process files in parallel
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.map(process_file, files_to_process)
    
    # Update counters with results from parallel processing
    for topic, tokens in results:
        all_tokens.update(tokens)
        topic_tokens[topic].update(tokens)
        # Apply stemming to tokens
        stemmed_tokens = [stem_fun(token, "stem") for token in tokens]
        all_tokens_stemmed.update(stemmed_tokens)
        topic_tokens_stemmed[topic].update(stemmed_tokens)
    
    # Calculate probabilities
    def calc_probability(count, total):
        return count / total if total > 0 else None

    # Probabilities without stemming
    all_count = sum(all_tokens.values())
    token_count_all = all_tokens[token_cleaned]
    probability_all = calc_probability(token_count_all, all_count)
    
    probabilities = {
        "all": probability_all,
        "Fishing": None,
        "Hiking": None,
        "MachineLearning": None,
        "Mathematics": None
    }
    
    for topic, counter in topic_tokens.items():
        total_tokens_topic = sum(counter.values())
        token_count_topic = counter[token_cleaned]
        probabilities[topic] = calc_probability(token_count_topic, total_tokens_topic)
    
    # Probabilities with stemming
    all_count_stemmed = sum(all_tokens_stemmed.values())
    token_count_all_stemmed = all_tokens_stemmed[token_stemmed]
    probability_all_stemmed = calc_probability(token_count_all_stemmed, all_count_stemmed)
    
    probabilities_stemmed = {
        "all": probability_all_stemmed,
        "Fishing": None,
        "Hiking": None,
        "MachineLearning": None,
        "Mathematics": None
    }
    
    for topic, counter in topic_tokens_stemmed.items():
        total_tokens_topic_stemmed = sum(counter.values())
        token_count_topic_stemmed = counter[token_stemmed]
        probabilities_stemmed[topic] = calc_probability(token_count_topic_stemmed, total_tokens_topic_stemmed)
    
    # Replace probabilities with None if the token count is 0
    for key, value in probabilities.items():
        if value == 0:
            probabilities[key] = None
    for key, value in probabilities_stemmed.items():
        if value == 0:
            probabilities_stemmed[key] = None
    
    # Create a DataFrame for better display
    data = []
    data.append(["All", probabilities["all"], probabilities_stemmed["all"]])
    for topic in topic_tokens.keys():
        data.append([topic, probabilities[topic], probabilities_stemmed[topic]])
    
    df = pd.DataFrame(data, columns=["Topic", "Probability Without Stemming", "Probability With Stemming"])
    return df
"""

'\ndef process_file(args):\n    topic, file_path = args\n    with open(file_path, \'r\', encoding=\'utf-8\') as f:\n        body_text = f.read()\n        # Clean the text\n        body_text = clean_txt(body_text)\n        tokens = body_text.split()\n        return topic, tokens\n\ndef word_prob(directory, column_name, token):\n    # Normalize the token\n    token_cleaned = clean_txt(token)\n    token_stemmed = stem_fun(token_cleaned, "stem")\n    \n    # Initialize counters\n    all_tokens = Counter()\n    all_tokens_stemmed = Counter()\n    topic_tokens = {\n        "Fishing": Counter(),\n        "Hiking": Counter(),\n        "MachineLearning": Counter(),\n        "Mathematics": Counter()\n    }\n    topic_tokens_stemmed = {\n        "Fishing": Counter(),\n        "Hiking": Counter(),\n        "MachineLearning": Counter(),\n        "Mathematics": Counter()\n    }\n    \n    # Collect files for all topics\n    files_to_process = []\n    for topic in topic_tokens.keys():\n        topic_