In [None]:
# get all files from ../python/converter/temp_files/uploaded

import os
import random
import tiktoken

# Directory holding the converted documents to inspect
directory_path = "../python/converter/temp_files/uploaded"

# List every entry in that directory (names only, not full paths)
files = os.listdir(directory_path)

# Keep only the files with the .mmd extension and count them
md_files = [file for file in files if file.endswith(".mmd")]
md_files_count = len(md_files)
print(f"Number of .mmd files: {md_files_count}")

# Heading variants that mark the start of an abstract. They are checked in
# this order and a file is attributed to the first variant it contains.
search_terms = ["# Abstract", "# abstract", "#Abstract"]

# Number of files attributed to each heading variant
abstract_counts = {term: 0 for term in search_terms}

# Names of the files that contain any of the heading variants
abstract_files = []

# Number of abstracts shorter than min_abstract_length
short_abstracts_count = 0

# Abstracts with fewer characters than this are counted as "short"
min_abstract_length = 500

# Initialize the tokenizer (using cl100k_base which is used for GPT-4)
tokenizer = tiktoken.get_encoding("cl100k_base")

# (file name, token count) for every file above TOKEN_LIMIT
files_exceeding_limit = []
TOKEN_LIMIT = 128000  # 128k tokens

# Scan every .mmd file: count its tokens and look for an abstract heading
for file in md_files:
    file_path = os.path.join(directory_path, file)
    # Decode explicitly as UTF-8 instead of relying on the platform locale;
    # undecodable bytes are replaced so one bad file cannot abort the scan.
    with open(file_path, 'r', encoding="utf-8", errors="replace") as f:
        content = f.read()

    # disallowed_special=() makes tiktoken encode literal special-token text
    # (e.g. "<|endoftext|>" quoted in a document) as ordinary text rather
    # than raising ValueError.
    num_tokens = len(tokenizer.encode(content, disallowed_special=()))
    if num_tokens > TOKEN_LIMIT:
        files_exceeding_limit.append((file, num_tokens))

    for term in search_terms:
        if term not in content:
            continue
        abstract_counts[term] += 1
        abstract_files.append(file)
        # The abstract is the text between the first occurrence of the
        # heading and the next "#" character, with outer whitespace removed.
        abstract = content.partition(term)[2].partition("#")[0].strip()
        if len(abstract) < min_abstract_length:
            short_abstracts_count += 1
        # Count each file only once, under the first variant found
        break


# Print the number of files exceeding the token limit. The share is reported
# as 0 when there are no .mmd files, avoiding a ZeroDivisionError and matching
# the guarded percentages below.
exceeding_count = len(files_exceeding_limit)
exceeding_percentage = exceeding_count / md_files_count if md_files_count > 0 else 0
print(f"Total files exceeding token limit: {exceeding_count} ({exceeding_percentage:.2%} of all files)")

# Print, for each heading variant, how many files were attributed to it
for term, count in abstract_counts.items():
    percentage = count / md_files_count if md_files_count > 0 else 0
    print(f"Number of files with '{term}': {count} ({percentage:.2%})")

# Print the total number of files containing any of the search terms
total_abstract_files = sum(abstract_counts.values())
total_percentage = total_abstract_files / md_files_count if md_files_count > 0 else 0
print(f"Total number of files with any abstract term: {total_abstract_files} ({total_percentage:.2%})")

# Print the number of abstracts below the specified length
print(f"Number of abstracts below {min_abstract_length} characters: {short_abstracts_count}")

# Show one randomly chosen abstract together with up to 100 characters of
# context before the heading and after the abstract.
if abstract_files:
    random_file = random.choice(abstract_files)
    file_path = os.path.join(directory_path, random_file)
    # Decode explicitly as UTF-8 instead of relying on the platform locale
    with open(file_path, 'r', encoding="utf-8", errors="replace") as f:
        content = f.read()

    # Use the first heading variant actually present in this file. Always
    # using search_terms[0] printed nothing for files that only contain one
    # of the other variants.
    term = next((t for t in search_terms if t in content), None)
    if term is not None:
        start_index = content.find(term)
        pre_abstract = content[max(0, start_index - 100):start_index]
        # Everything after the first heading; the abstract runs up to the
        # next "#" character.
        after_heading = content[start_index + len(term):]
        example_abstract = after_heading.partition("#")[0]
        post_abstract = after_heading[len(example_abstract):len(example_abstract) + 100]
        print(f"\nExample abstract from '{random_file}' with term '{term}':\n{pre_abstract}[---]\n{term}{example_abstract}[---]\n{post_abstract}")
