In [1]:
# Define functions to read files and clean text for analyis

## Clean text function 
def clean_text(var_in):
    import re 
    tmp_t = re.sub("[^A-Za-z']+", " ", var_in).strip().lower()
    return tmp_t

## Read file function
def read_file(full_path_in):
    f_t = open(full_path_in, "r", encoding = "UTF-8", errors="ignore")
    text_t = f_t.read() #read the whole file
    text_t = clean_text(text_t)
    f_t.close()
    return text_t 
    
## Files to crawl from the a centralfile path fucntion
def file_crawler(path_in):
    import os
    import pandas as pd 
    my_pd_t = pd.DataFrame()
    for root, dirs, files in os.walk(path_in, topdown = False):
        for name in files:
            try:
                txt_t = read_file(root + "/" + name)
                if len(txt_t) > 0:
                    the_lab = root.split("/")[-1]
                    tmp_pd = pd.DataFrame(
                        {"body": txt_t, "label": the_lab}, index = [0])
                    my_pd_t = pd.concat(
                        [my_pd_t, tmp_pd], ignore_index = True)
            except: 
                print(root + "/" + name)
                pass 
    return my_pd_t

## Remove stop words from the corpus function 
def rem_sw(str_in):
    from nltk.corpus import stopwords
    sw = stopwords.words('english')
    tmp = [word for word in str_in.split() if word not in sw]
    tmp = ' '.join(tmp)
    return tmp 

## Stem words fucntion 
def stem_fun(var_in, sw_in):
    if sw_in == "stem":
        from nltk.stem import PorterStemmer
        ps = PorterStemmer()
    else:
        from nltk.stem import WordNetLemmatizer
        ps = WordNetLemmatizer()
    split_ex = var_in.split()
    t_l = list()
    for word in split_ex:
        if sw_in == "stem":
            tmp = ps.stem(word)
        else:
            tmp = ps.lemmatize(word)
        t_l.append(tmp)
    tmp = ' '.join(t_l)
    return tmp

In [2]:
# Extract the data files 
the_path = "../data/"
data = file_crawler(the_path)
data

Unnamed: 0,body,label
0,fishing kdwpt kdwpt main menu search hunting f...,fishing
1,get your deep sea fishing trip tickets daveys ...,fishing
2,njdep division of fish wildlife fishing in new...,fishing
3,search key west fishing charters and informati...,fishing
4,fishing palisades interstate park in new jerse...,fishing
...,...,...
258,what n j thinks of hiking minimum wage legaliz...,hiking
259,santa barbara hiking trails and scenic vistas ...,hiking
260,danner danner men's hiking boots free shipping...,hiking
261,provo hikes and trails utah com hikes in provo...,hiking


In [3]:
# Data preparation
data["body_sw"] = data["body"].apply(rem_sw)
data["body_sw_stem"] = data["body_sw"].apply(lambda x: stem_fun(x, "stem"))
data

Unnamed: 0,body,label,body_sw,body_sw_stem
0,fishing kdwpt kdwpt main menu search hunting f...,fishing,fishing kdwpt kdwpt main menu search hunting f...,fish kdwpt kdwpt main menu search hunt fee lic...
1,get your deep sea fishing trip tickets daveys ...,fishing,get deep sea fishing trip tickets daveys locke...,get deep sea fish trip ticket davey locker hom...
2,njdep division of fish wildlife fishing in new...,fishing,njdep division fish wildlife fishing new jerse...,njdep divis fish wildlif fish new jersey altho...
3,search key west fishing charters and informati...,fishing,search key west fishing charters information f...,search key west fish charter inform fla key co...
4,fishing palisades interstate park in new jerse...,fishing,fishing palisades interstate park new jersey h...,fish palisad interst park new jersey home albu...
...,...,...,...,...
258,what n j thinks of hiking minimum wage legaliz...,hiking,n j thinks hiking minimum wage legalizing pot ...,n j think hike minimum wage legal pot nj com n...
259,santa barbara hiking trails and scenic vistas ...,hiking,santa barbara hiking trails scenic vistas visi...,santa barbara hike trail scenic vista visit sa...
260,danner danner men's hiking boots free shipping...,hiking,danner danner men's hiking boots free shipping...,danner danner men' hike boot free ship free re...
261,provo hikes and trails utah com hikes in provo...,hiking,provo hikes trails utah com hikes provo provo ...,provo hike trail utah com hike provo provo ove...


In [4]:
# Function to calculate the input token probabilites on topics 

def word_prob(input_tokens, col_name, topic_col, data):
    import pandas as pd
    topics = data[topic_col].unique().tolist()
    topic_probabilities = {'all': None, }

    try:
        for topic in topics:
            topic_probabilities[topic] = None
            
        # Total token count across all data for 'all' key
        total_token_count = data[col_name].str.split().str.len().sum()
        all_count = data[col_name].str.contains(input_tokens, regex=True).sum()
        if all_count > 0:
            topic_probabilities['all'] = all_count / total_token_count
        
        # Calculate probabilities for each specific topic
        for topic in topics:
            topic_data = data[data[topic_col] == topic]
            topic_token_count = topic_data[col_name].str.split().str.len().sum()
            token_count = topic_data[col_name].str.contains(input_tokens, regex=True).sum()
            if token_count > 0:
                topic_probabilities[topic] = token_count / topic_token_count
                
    except KeyError:
        raise ValueError(f"Column '{col_name}' or '{topic_col}' not found in dataframe")
        
    return topic_probabilities


In [5]:
# Testcase
test = word_prob("data science", "body", "label", data)
print(test)

{'all': 0.0001264181068186181, 'fishing': None, 'mathematics': 0.0001371328268560928, 'machinelearning': 0.00037710622396681465, 'hiking': None, '': None}
