In [None]:
# --- Split Large TXT File into 100MB Chunks ---

import os

# ======== CONFIGURATION ========
input_file = "ne.txt"   # Path to your large .txt file
output_dir = "split_files"           # Folder to store output files
chunk_size_mb = 100                  # Chunk size in megabytes
# =================================


def split_text_file(input_file, output_dir, chunk_size):
    """Split a text file into line-aligned parts of about ``chunk_size`` bytes.

    Lines are never cut in half: a part is closed only after the line that
    pushed it to (or past) ``chunk_size`` has been written, so a part may
    exceed the limit by at most one line. Sizes are measured on the decoded
    text re-encoded as UTF-8, which means bytes dropped by
    ``errors="ignore"`` while reading are not counted.

    Args:
        input_file: Path of the text file to split (read as UTF-8).
        output_dir: Directory that receives ``part_1.txt``, ``part_2.txt``,
            ...; it is created if it does not exist.
        chunk_size: Target size of each part, in bytes.

    Returns:
        The list of part paths that were created, in order. An empty input
        still yields a single empty ``part_1.txt``.

    Raises:
        OSError: If the input cannot be opened or a part cannot be written.
    """
    os.makedirs(output_dir, exist_ok=True)

    file_number = 1
    bytes_written = 0
    output_path = os.path.join(output_dir, f"part_{file_number}.txt")
    created = [output_path]

    # Open the input first so a missing input does not leave a stray,
    # empty part_1.txt behind.
    with open(input_file, "r", encoding="utf-8", errors="ignore") as infile:
        out_file = open(output_path, "w", encoding="utf-8")
        try:
            for line in infile:
                # Rotate lazily: start the next part only when there is
                # another line to put in it. Rotating right after the write
                # that filled the part would leave an empty trailing file
                # whenever the input ends exactly on a chunk boundary.
                if bytes_written > 0 and bytes_written >= chunk_size:
                    out_file.close()
                    print(f"Created: {output_path}")
                    file_number += 1
                    bytes_written = 0
                    output_path = os.path.join(output_dir, f"part_{file_number}.txt")
                    out_file = open(output_path, "w", encoding="utf-8")
                    created.append(output_path)

                out_file.write(line)
                bytes_written += len(line.encode("utf-8"))
        finally:
            # Always release the current part, even if reading/writing failed.
            out_file.close()

    print(f"Created: {output_path}")
    return created


# In a notebook cell or when run as a script __name__ is "__main__", so the
# split runs exactly as before; importing this code only defines the function.
if __name__ == "__main__":
    chunk_size = chunk_size_mb * 1024 * 1024  # Convert MB to bytes
    split_text_file(input_file, output_dir, chunk_size)
    print("✅ Splitting complete.")


In [1]:
# --- Clean Text Files, Count Character Occurrences & Report Missing Symbols ---

import os
from collections import Counter

# ======== CONFIGURATION ========
input_dir = "split_files"    # Directory containing the split .txt files
output_dir = "cleaned_files" # Where cleaned files will be saved
# =================================

# Whitelist of characters that survive cleaning, grouped by category.
allowed_numbers = '०१२३४५६७८९0123456789'
allowed_symbols = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{}~।॥—‘’“”… "
allowed_lang_chars = 'अआइईउऊऋएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहक्षत्रज्ञािीुूृेैोौंःँॅॉ'
# Unpacking the strings puts every individual code point into the set, so
# multi-code-point entries contribute each of their parts. The newline is
# whitelisted as well so the line structure of the input is preserved.
allowed_chars = {*allowed_numbers, *allowed_symbols, *allowed_lang_chars, '\n'}

# Make sure the destination folder exists before anything is written.
os.makedirs(output_dir, exist_ok=True)

# Running tally of every character kept, across all files.
char_counter = Counter()
is_allowed = allowed_chars.__contains__

# Only .txt entries are cleaned; they are visited in sorted (lexicographic) order.
txt_names = sorted(name for name in os.listdir(input_dir) if name.endswith(".txt"))
for filename in txt_names:
    input_path = os.path.join(input_dir, filename)
    output_path = os.path.join(output_dir, filename)

    print(f"Processing: {filename}")
    # Undecodable bytes are silently skipped while reading (errors="ignore").
    with open(input_path, "r", encoding="utf-8", errors="ignore") as src:
        with open(output_path, "w", encoding="utf-8") as dst:
            for raw_line in src:
                # Drop every character that is not whitelisted.
                kept = "".join(filter(is_allowed, raw_line))
                dst.write(kept)
                char_counter.update(kept)

print("✅ Cleaning complete. All files saved in:", output_dir)

# Character frequency report, most frequent first.
freq_output_path = os.path.join(output_dir, "character_frequency.txt")
with open(freq_output_path, "w", encoding="utf-8") as report:
    report.writelines(
        f"{ch!r}: {count}\n" for ch, count in char_counter.most_common()
    )

print("✅ Character frequency file created:", freq_output_path)

# Whitelisted symbols that never appeared in any cleaned file.
symbols_not_found = [sym for sym in allowed_symbols if not char_counter[sym]]

missing_symbols_path = os.path.join(output_dir, "symbols_not_found.txt")
with open(missing_symbols_path, "w", encoding="utf-8") as report:
    report.writelines(f"{sym!r}\n" for sym in symbols_not_found)

print("✅ Symbols not found file created:", missing_symbols_path)
print("Symbols not found:", symbols_not_found)


Processing: part_1.txt
Processing: part_10.txt
Processing: part_11.txt
Processing: part_12.txt
Processing: part_13.txt
Processing: part_14.txt
Processing: part_15.txt
Processing: part_16.txt
Processing: part_17.txt
Processing: part_18.txt
Processing: part_19.txt
Processing: part_2.txt
Processing: part_20.txt
Processing: part_21.txt
Processing: part_22.txt
Processing: part_23.txt
Processing: part_24.txt
Processing: part_25.txt
Processing: part_26.txt
Processing: part_27.txt
Processing: part_28.txt
Processing: part_29.txt
Processing: part_3.txt
Processing: part_30.txt
Processing: part_31.txt
Processing: part_32.txt
Processing: part_33.txt
Processing: part_34.txt
Processing: part_35.txt
Processing: part_36.txt
Processing: part_37.txt
Processing: part_38.txt
Processing: part_39.txt
Processing: part_4.txt
Processing: part_5.txt
Processing: part_6.txt
Processing: part_7.txt
Processing: part_8.txt
Processing: part_9.txt
✅ Cleaning complete. All files saved in: cleaned_files
✅ Character freque