## TODO
- Clean metadata - correct talks inline

In [1]:
import polars as pl 
import re
import subprocess   
from pathlib import Path
# Root folder; every other location in this notebook is derived from it.
data_path = Path("./data")

# Default text file searched by fuzzy_search.
output_txt_path = data_path.joinpath("combined_output.txt")

# Two names for the same "output" folder.
text_path = data_path.joinpath("output")
processed_word_files = data_path.joinpath("output")

# Default folder whose "*.md" files search_all_keywords scans.
output_path = data_path.joinpath("text")


def load_metadata():
    """
    Load the metadata CSV and normalise its location and date columns.

    Reads ``metadata/metadata.csv`` below ``data_path``. The "Location"
    text is split on ", ": the first piece replaces "Location" and the
    second piece is stored as "Country". "Date" is parsed as a date, trying
    the day-first format ("%d %B %Y") and then the month-first format
    ("%B %d %Y"); both parses are non-strict and the first non-null result
    is kept.

    Returns:
        The polars DataFrame with the columns described above.
    """
    frame = pl.read_csv(data_path / "metadata/metadata.csv")

    # NOTE(review): "Country" is the *second* comma-separated piece, not the
    # last one - confirm that Location never has more than two parts.
    location_parts = pl.col("Location").str.split(", ")
    frame = frame.with_columns([
        location_parts.list.get(0).alias("Location"),
        location_parts.list.get(1).alias("Country"),
    ])

    date_text = pl.col("Date")
    day_first = date_text.str.strptime(dtype=pl.Date, format="%d %B %Y", strict=False)
    month_first = date_text.str.strptime(dtype=pl.Date, format="%B %d %Y", strict=False)

    # Presumably the coalesced column keeps the name "Date" and so replaces
    # the raw text column - verify against the polars version in use.
    return frame.with_columns(pl.coalesce(day_first, month_first))

def parse_docx_into_md(processed_word_files, output_txt_path, text_path, output_path):
    """
    Convert Word documents to Markdown and concatenate them into one file.

    Every "*.docx" file in ``text_path`` is converted by running the
    ``docx2md`` module in a subprocess, producing "<stem>.md" inside
    ``output_path``. Afterwards every "*.md" file found in ``output_path``
    (including files left over from earlier runs) is appended to
    ``output_txt_path``, each followed by a "###" separator line surrounded
    by blank lines.

    Files are processed in sorted name order so that repeated runs produce
    the same combined file.

    Args:
        processed_word_files: Unused; kept so existing callers keep working.
        output_txt_path (Path): Combined text file; overwritten on each call.
        text_path (Path): Directory that is searched for "*.docx" files.
        output_path (Path): Directory receiving the Markdown files; created
            (with parents) when it does not exist.
    """
    # Local import: only this function needs to locate the running interpreter.
    import sys

    output_path.mkdir(parents=True, exist_ok=True)

    for file in sorted(text_path.glob("*.docx")):
        src = str(file)
        dst = str(output_path / f"{file.stem}.md")
        # Use the interpreter running this code rather than whatever "python"
        # resolves to on PATH, which may be a different environment without
        # docx2md installed.
        # NOTE(review): the converter's exit status is still not checked, so a
        # failed conversion is silently skipped - confirm that is intended.
        subprocess.run([sys.executable, "-m", "docx2md", src, dst])

    # NOTE(review): files are opened with the platform default encoding, as
    # before - confirm which encoding docx2md writes.
    with open(output_txt_path, "w") as outfile:
        for file in sorted(output_path.glob("*.md")):
            with open(file, "r") as infile:
                outfile.write(infile.read() + "\n\n###\n\n")
            


In [8]:
# Scratch sample of one talk: a "### <path>" heading line followed by
# blank-line separated header fields (date, title, event, location,
# language/transcript status) and the start of the talk body.
test_str = """
### path/test/to/file.md

23 March 1973 

How the Divine is Working Within Us

Public Programme, Day 1

Jehangir Hall, Mumbai, India

Talk Language: English | Transcript: Verified 

I had told you the last time what is a soul (Atma), how it is Sachidanand (Sat + Chit + Anand; Truth + Attention + Bliss), and how after being made aware of the soul only humans can achieve these three things. Without self-realization, you cannot know the truth. You cannot achieve bliss. You should have the sensitivity of the soul. Now you can converse with your soul. You can ask the soul. You people are seated, you should ask by stretching your hands like this, is there a God in the world? Does his will re..."""

# Because the pattern has a capturing group, re.split keeps the captured
# path in the result: [text before the heading, path, text after it].
re.split(r'### (.*?)\n\n', test_str)

['\n',
 'path/test/to/file.md',
 '23 March 1973 \n\nHow the Divine is Working Within Us\n\nPublic Programme, Day 1\n\nJehangir Hall, Mumbai, India\n\nTalk Language: English | Transcript: Verified \n\nI had told you the last time what is a soul (Atma), how it is Sachidanand (Sat + Chit + Anand; Truth + Attention + Bliss), and how after being made aware of the soul only humans can achieve these three things. Without self-realization, you cannot know the truth. You cannot achieve bliss. You should have the sensitivity of the soul. Now you can converse with your soul. You can ask the soul. You people are seated, you should ask by stretching your hands like this, is there a God in the world? Does his will re...']

In [2]:

def fuzzy_search(keywords, file_path=output_txt_path):
    """
    Find the paragraphs of a text file that mention any of the keywords.

    The file is split into paragraphs on blank lines and each paragraph is
    tested on its own. A hit therefore never includes the paragraphs that
    precede it, and the last paragraph is found even when the file does not
    end with a blank line. Matching is case-insensitive and on whole words
    only; despite the function name no approximate matching is performed.

    Args:
        keywords (list | str): Keywords to search for. A single string is
            treated as one keyword, not as a sequence of characters.
        file_path (Path): The path to the text file to search in.

    Returns:
        list: The paragraphs containing at least one keyword, in file
            order. Empty when no (non-empty) keyword is given.
    """
    if isinstance(keywords, str):
        keywords = [keywords]
    # An empty keyword would turn the alternation into "match anything".
    keywords = [keyword for keyword in keywords if keyword]
    if not keywords:
        return []

    # One alternation of the escaped keywords, compiled once for all paragraphs.
    pattern = re.compile(
        r'\b(?:' + '|'.join(map(re.escape, keywords)) + r')\b',
        re.IGNORECASE,
    )

    with open(file_path, "r") as file:
        content = file.read()

    return [
        paragraph
        for paragraph in content.split("\n\n")
        if pattern.search(paragraph)
    ]

def search_all_keywords(keywords, output_path=output_path):
    """
    Search every markdown file in a directory for each keyword, in parallel.

    Each "*.md" file in ``output_path`` is handled by one joblib task; within
    a file every keyword is searched separately with ``fuzzy_search`` so that
    each hit can be labelled with the keyword that produced it.

    Args:
        keywords (list | str): Keywords to search for. A single string is
            treated as one keyword.
        output_path (Path): The path to the directory containing markdown files.

    Returns:
        list: A list of dictionaries with keys "source_file", "keyword", and "paragraph".
    """
    from joblib import Parallel, delayed

    if isinstance(keywords, str):
        keywords = [keywords]

    def process_file(file, keywords):
        """Collect the hits of every keyword within one file."""
        file_results = []
        for keyword in keywords:
            # fuzzy_search expects a list of keywords; a bare string would be
            # iterated character by character when its pattern is built.
            paragraphs = fuzzy_search([keyword], file_path=str(file))
            file_results.extend({
                "source_file": str(file),
                "keyword": keyword,
                "paragraph": paragraph,
            } for paragraph in paragraphs)
        return file_results

    per_file = Parallel(n_jobs=-1)(
        delayed(process_file)(file, keywords) for file in output_path.glob("*.md")
    )
    # Flatten the per-file lists into one list of records.
    return [item for file_results in per_file for item in file_results]

# fuzzy_search(["Eart", "disrespect"])



KeyboardInterrupt: 

In [8]:
def fuzzy_search(keywords, file_path=output_txt_path):
    """
    Find the paragraphs of the combined talks file that mention every keyword.

    The file is split into talks on the "###" separator line, and each talk
    into paragraphs on blank lines. The first five paragraphs of a talk are
    treated as its header and joined with newlines to label the talk; only
    the remaining paragraphs are searched. Matching is case-insensitive and
    on whole words only.

    Args:
        keywords (list | str): Keywords that must all occur in a paragraph.
            A single string is treated as one keyword. With no keywords
            every body paragraph is returned.
        file_path (Path): The path to the text file to search in.

    Returns:
        list: One dictionary per matching paragraph, with keys "talk" (the
            talk header) and "paragraph", in file order.
    """
    if isinstance(keywords, str):
        keywords = [keywords]
    # Compile once instead of rebuilding the patterns for every paragraph.
    patterns = [
        re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
        for keyword in keywords
    ]

    with open(file_path, "r") as file:
        content = file.read()

    paragraphs_with_keyword = []
    # Walk the talks directly rather than keying them by header in a dict:
    # talks sharing a header (or shorter than five paragraphs) would
    # otherwise overwrite one another.
    for talk in content.split("###\n\n"):
        blocks = talk.split('\n\n')
        header = "\n".join(blocks[:5])
        for paragraph in blocks[5:]:
            if all(pattern.search(paragraph) for pattern in patterns):
                paragraphs_with_keyword.append({"talk": header, "paragraph": paragraph})

    return paragraphs_with_keyword

# Show each hit as: talk header, matching paragraph, then an empty line.
for result in fuzzy_search(["kundalini", "earth"]):
    print(f"{result['talk']}\n{result['paragraph']}\n")


7 March 1990
I Will Ask Mother for Yoga 
Public Programme
Camberwell Civic Centre, Melbourne, Australia
Talk Language: English | Transcript: Draft
So, this energy is settled within us in the triangular bone. And it has to pass through six subtle centres. While the seventh centre is below the Kundalini. And this is the centre which is for all our excretion inclusive of sex. So, sex has nothing to do with your yoga. On the contrary when the Kundalini is rising all your excretory movements and excretory things just become dull or sleep. And then the Kundalini rises. It passes through all these six centres. She nourishes them. She integrates them and ultimately passes through here into limbic area which is a hollow place and pierces through the sixth centre here which we call as the brahmarandra meaning the hole of the Brahma the all-pervading Power and then enters into the subtle energy, which is controlling us, which is organising us which is doing all living work on this earth and in th

In [None]:
import re

def fuzzy_search(keywords, file_path=output_txt_path):
    """
    Return the paragraphs of a text file that contain any of the keywords.

    Paragraphs are the blank-line separated pieces of the file. Each one is
    checked individually, so a result is exactly one paragraph and a final
    paragraph without a trailing blank line is still considered. Keywords
    match case-insensitively as whole words; no approximate matching is
    done despite the function name.

    Args:
        keywords (list | str): Keywords to search for. A single string is
            treated as one keyword, not as a sequence of characters.
        file_path (Path): The path to the text file to search in.

    Returns:
        list: The paragraphs containing at least one keyword, in file
            order. Empty when no (non-empty) keyword is given.
    """
    if isinstance(keywords, str):
        keywords = [keywords]
    # Drop empty keywords: they would make the alternation match everything.
    keywords = [keyword for keyword in keywords if keyword]
    if not keywords:
        return []

    alternation = '|'.join(map(re.escape, keywords))
    pattern = re.compile(r'\b(?:' + alternation + r')\b', re.IGNORECASE)

    with open(file_path, "r") as file:
        content = file.read()

    paragraphs_with_keyword = []
    for paragraph in content.split("\n\n"):
        if pattern.search(paragraph):
            paragraphs_with_keyword.append(paragraph)

    return paragraphs_with_keyword

def search_all_keywords(keywords, output_path=output_path):
    """
    Search for multiple keywords in all markdown files in the output path.

    Every "*.md" file in ``output_path`` is searched once per keyword with
    ``fuzzy_search``, so each returned record names the keyword that
    produced it. A paragraph containing several keywords is reported once
    for each of them.

    Args:
        keywords (list | str): Keywords to search for. A single string is
            treated as one keyword.
        output_path (Path): The path to the directory containing markdown files.

    Returns:
        list: A list of dictionaries with keys "source_file", "keyword", and "paragraph".
    """
    if isinstance(keywords, str):
        keywords = [keywords]

    results = []

    for file in output_path.glob("*.md"):
        # Search one keyword at a time: a combined search cannot tell which
        # keyword matched, and the "keyword" field needs that information.
        for keyword in keywords:
            for paragraph in fuzzy_search([keyword], file_path=str(file)):
                results.append({
                    "source_file": str(file),
                    "keyword": keyword,
                    "paragraph": paragraph
                })

    return results

# Ad-hoc run over the default markdown directory; the notebook displays the
# returned list as the cell output.
search_all_keywords(["soul", "god"])
