In [1]:
import json
import os
import random
import pandas as pd
import math
import shutil
from itertools import combinations
from collections import defaultdict, Counter, OrderedDict
import torch


In [None]:
# Length that every padded token sequence is padded/truncated to.
MAX_LEN = 31

# Individual special tokens.
PAD_TOKEN, MASK_TOKEN = '[PAD]', '_'
BOS_TOKEN, EOS_TOKEN = "[START]", "[END]"

# Special tokens grouped by role; used as the default `token` argument of
# load_and_convert_json_with_padding.
SPECIAL_TOKENS = dict(
    pad_token=PAD_TOKEN,
    bos_token=BOS_TOKEN,
    eos_token=EOS_TOKEN,
    mask_token=MASK_TOKEN,
)

# Prefix spliced into every dataset path below. A non-empty value also limits
# the vocabulary to its first 10 entries. Values used so far: 'test-', 'new-'.
isTEST = ''

In [None]:
def load_vocabulary(file_path):
    """Return the non-blank lines of a text file, whitespace-stripped, in file order.

    Args:
        file_path: Path of the text file to read (one entry per line).

    Returns:
        list[str]: Every line that is non-empty after stripping.
    """
    words = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                words.append(entry)
    return words


def clearFile(path):
    """Delete ``path`` when it is a regular file and report the deletion.

    Anything else (missing path, directory) is left untouched and nothing is printed.
    """
    if not os.path.isfile(path):
        return
    os.remove(path)
    print(f"File {path} has been deleted.")


def delete_intermediate_directory(intermediate_dir):
    """Recursively remove ``intermediate_dir`` and everything inside it.

    This is best-effort: any exception raised during removal is printed
    rather than propagated, so the caller never sees a failure.
    """
    try:
        # rmtree removes the directory itself together with all nested content.
        shutil.rmtree(intermediate_dir)
        print(f"Deleted intermediate directory: {intermediate_dir}")
    except Exception as removal_error:
        print(f"Error deleting directory {intermediate_dir}: {removal_error}")


def _pad_token_sequence(symbols, max_len, token):
    """Wrap ``symbols`` in BOS/EOS, right-pad with PAD and cut to ``max_len`` items."""
    framed = [token['bos_token']] + list(symbols) + [token['eos_token']]
    # A negative count multiplies to an empty list, so over-long sequences get no padding.
    framed += [token['pad_token']] * (max_len - len(framed))
    # Truncation drops trailing items (EOS included) when symbols exceed max_len - 2.
    return framed[:max_len]


def load_and_convert_json_with_padding(json_file, max_len=MAX_LEN, token=SPECIAL_TOKENS ):
    """Pad every record of a JSON-lines dataset and rewrite it as JSON plus CSV.

    Each non-blank line of ``json_file`` must be a JSON object with an
    ``"input"`` list and a ``"target"`` string. Both are turned into token
    lists of the form ``[BOS] + items + [EOS] + [PAD]...`` and cut to
    ``max_len`` entries; the target is split into single characters first.

    Side effects:
        * ``json_file`` is overwritten IN PLACE with one JSON array holding
          all padded records. The file is no longer line-delimited afterwards,
          so it cannot be fed to this function a second time.
        * A ``|``-separated CSV with the same records is written next to it,
          using the same path with the extension replaced by ``.csv``.

    Args:
        json_file: Path to the JSON-lines file to convert.
        max_len: Exact length of every padded sequence.
        token: Mapping with ``bos_token``, ``eos_token`` and ``pad_token`` keys.

    Returns:
        None.
    """
    print("\nPadding dataset")

    data = []
    with open(json_file, 'r') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            data.append({
                "input": _pad_token_sequence(record["input"], max_len, token),
                "target": _pad_token_sequence(record["target"], max_len, token),
            })

    clearFile(json_file)

    with open(json_file, 'w') as js_file:
        json.dump(data, js_file)

    df = pd.DataFrame(data)
    # splitext strips only the final extension. The previous split('.')[0]
    # broke on any path containing an earlier dot, e.g. './data/x.json'.
    csv_path = os.path.splitext(json_file)[0] + '.csv'
    df.to_csv(csv_path, sep='|', index=False)
    

def convert_json_to_parquet_in_directory(directory_path):
    """Convert every ``*.json`` file in a directory to Parquet, deleting the source.

    Each file is read as line-delimited JSON records with pandas, written next
    to the original with a ``.parquet`` extension via the pyarrow engine, and
    the JSON file is removed afterwards. A failure on one file is printed and
    the loop carries on; any other error is printed as well. Nothing is raised
    and nothing is returned.

    Args:
        directory_path: Directory to scan (not recursive).
    """
    try:
        if not os.path.isdir(directory_path):
            print(f"The directory '{directory_path}' does not exist.")
            return

        json_files = list(filter(lambda name: name.endswith('.json'), os.listdir(directory_path)))
        if len(json_files) == 0:
            print(f"No JSON files found in the directory '{directory_path}'.")
            return

        for json_name in json_files:
            source_path = os.path.join(directory_path, json_name)
            stem, _extension = os.path.splitext(source_path)
            parquet_path = stem + '.parquet'

            try:
                frame = pd.read_json(source_path, orient='records', lines=True)
                frame.to_parquet(parquet_path, engine='pyarrow', index=False)
                # Only reached when the Parquet write succeeded.
                os.remove(source_path)
                print(f"Converted '{json_name}' to Parquet and deleted the original JSON file.")
            except Exception as file_error:
                print(f"Error processing file '{json_name}': {file_error}")
    except Exception as outer_error:
        print(f"An error occurred: {outer_error}")



In [4]:
# BRUTE FORCE -- ALL COMBINATIONS
def masked_combinations(word):
    """Enumerate every masking pattern of ``word``.

    Patterns are ordered by the number of revealed characters (0 first, then
    1, ...), and within each count by the order of ``itertools.combinations``
    over the revealed positions. Hidden positions hold ``'_'``.

    Returns:
        list[list[str]]: ``2 ** len(word)`` patterns, each a list of
        ``len(word)`` single characters.
    """
    length = len(word)
    patterns = []

    for reveal_count in range(length + 1):
        for revealed in combinations(range(length), reveal_count):
            shown = set(revealed)
            patterns.append(
                [char if position in shown else '_' for position, char in enumerate(word)]
            )

    return patterns



# GENERATE TOP K MASKED SAMPLES
def generate_random_masked_combinations(word, MAX_PERMUTATIONS=60):
    """Draw distinct random masking patterns of ``word``.

    The number of patterns aimed for is ``MAX_PERMUTATIONS`` capped at half of
    the ``2 ** len(word)`` possible patterns (rounded). Each draw first picks
    how many characters stay visible (uniformly from 0 to ``len(word)``) and
    then which positions; hidden positions hold ``'_'``.

    Draws that repeat an earlier pattern are redrawn, so the result holds the
    full target count rather than however many survived de-duplication.
    Patterns are returned in the order they were first drawn, which makes the
    output reproducible once ``random.seed`` is set.

    Args:
        word: The word to mask.
        MAX_PERMUTATIONS: Upper bound on the number of patterns returned.

    Returns:
        list[tuple[str, ...]]: Unique patterns, each of length ``len(word)``.
        Empty for an empty word or a non-positive ``MAX_PERMUTATIONS``.
    """
    word_length = len(word)

    # Same cap as before: never ask for more than half of all possible patterns.
    target = round(min(MAX_PERMUTATIONS, 0.5 * (2 ** word_length)))
    if target <= 0:
        return []

    # A dict doubles as an insertion-ordered set.
    unique_masks = {}
    # Generous draw budget so the loop is guaranteed to terminate. The target
    # is at most half the pattern space, so in practice it is never exhausted.
    attempts_left = 100 * target

    while len(unique_masks) < target and attempts_left > 0:
        attempts_left -= 1

        # Randomly decide how many characters stay visible, then which ones.
        num_visible = random.randint(0, word_length)
        visible = set(random.sample(range(word_length), num_visible))

        mask = tuple(char if idx in visible else '_' for idx, char in enumerate(word))
        unique_masks[mask] = None

    return list(unique_masks)


def makeMaskedDataset(df, PATH='./datasets/maskedCombinations'):
    """Write one JSON file of random masked variants per word.

    For every word shorter than 30 characters, the patterns returned by
    ``generate_random_masked_combinations`` are dumped to ``{PATH}/{word}.json``.
    Longer words are skipped; their positions in ``df`` are collected and, if
    there are any, written to ``{PATH}/bigwords-pending.json``.

    NOTE: the pending-index file sits in the same directory and carries the
    same ``.json`` extension as the per-word files, so anything that scans
    ``PATH`` for ``*.json`` will see it too.

    Args:
        df: Iterable of words (despite the name, any iterable of strings works).
        PATH: Output directory; created if missing.

    Returns:
        None.
    """
    MAX_WORD_LEN = 30
    big_words_idx = []

    # Create the directory once, before any write. Previously it was only
    # created inside the short-word branch, so an input consisting solely of
    # long words crashed when writing the pending-index file.
    os.makedirs(PATH, exist_ok=True)

    for idx, word in enumerate(df):
        if len(word) >= MAX_WORD_LEN:
            big_words_idx.append(idx)
            continue

        masked_list = generate_random_masked_combinations(word)

        output_file = f'{PATH}/{word}.json'
        with open(output_file, 'w') as json_file:
            json.dump(masked_list, json_file)
        print(f"STORED WORD : {word}")

    if big_words_idx:
        # Built the same way as the per-word paths. The old './' prefix turned
        # an absolute PATH into a relative one pointing at the wrong location.
        pending_file = f'{PATH}/bigwords-pending.json'
        with open(pending_file, 'w') as json_file:
            json.dump(big_words_idx, json_file)
        print("STORED BIG WORDS IDX")

    print("STORED ALL WORDS !!")



In [5]:
def split_files_into_batches(directory_path, batch_size, exclude=('bigwords-pending.json',)):
    """Split the ``*.json`` files of a directory into fixed-size batches.

    File names are sorted so the batches are the same on every run
    (``os.listdir`` gives no ordering guarantee). Names listed in ``exclude``
    are skipped. By default this leaves out ``bigwords-pending.json``, the
    index file ``makeMaskedDataset`` writes into the same directory, which
    would otherwise be batched as if it were a word file.

    Args:
        directory_path: Directory to scan (not recursive).
        batch_size: Maximum number of files per batch; must be >= 1.
        exclude: File names (not paths) to leave out.

    Returns:
        list[list[str]]: Batches of file paths; the last one may be shorter.
        Empty when no matching files exist.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    files = [
        os.path.join(directory_path, name)
        for name in sorted(os.listdir(directory_path))
        if name.endswith('.json') and name not in exclude
    ]
    return [files[start:start + batch_size] for start in range(0, len(files), batch_size)]



def process_batch(batch_files, batch_output_file):
    """Flatten a batch of per-word JSON files into JSON-lines records.

    Every file in ``batch_files`` is expected to hold a JSON list. Each
    element becomes one output line ``{"input": element, "target": name}``,
    where ``name`` is the file name without its extension. Records are
    APPENDED to ``batch_output_file``; existing content is kept.
    """
    with open(batch_output_file, 'a') as sink:
        for source_path in batch_files:
            with open(source_path, 'r') as source:
                entries = json.load(source)

            # The file name (minus extension) is the word the entries were derived from.
            file_name = os.path.basename(source_path)
            target, _extension = os.path.splitext(file_name)

            sink.writelines(
                json.dumps({'input': entry, 'target': target}) + '\n'
                for entry in entries
            )



def merge_intermediate_files(intermediate_files, final_output_file):
    """Concatenate JSON-lines files into ``final_output_file``.

    Any existing output file is removed first. Each input line is stripped,
    parsed and re-serialised so only valid JSON reaches the output, one object
    per line. Lines that fail to parse are reported on stdout and skipped.
    """
    clearFile(final_output_file)

    with open(final_output_file, 'a') as merged:
        for part_path in intermediate_files:
            with open(part_path, 'r') as part:
                for raw_line in part:
                    candidate = raw_line.strip()
                    try:
                        normalized = json.dumps(json.loads(candidate))
                    except json.JSONDecodeError:
                        print(f"Skipping invalid JSON line: {candidate}")
                        continue
                    merged.write(normalized + '\n')

    print(f"Merged intermediate files into {final_output_file}.")


def process_files_in_batches(directory_path, batch_size, intermediate_dir, final_output_file):
    """Flatten a directory of per-word JSON files into one JSON-lines dataset.

    Steps:
        1. Split the JSON files of ``directory_path`` into batches.
        2. Write each batch to ``intermediate_dir/batch_<i>.json``.
        3. Merge all batch files into ``final_output_file``.
        4. Delete ``intermediate_dir`` and everything in it.

    WARNING: step 4 removes the whole ``intermediate_dir`` tree, including
    anything that was already in it before the call. Pass a dedicated scratch
    directory.

    Args:
        directory_path: Directory holding the per-word JSON files.
        batch_size: Number of files per batch.
        intermediate_dir: Scratch directory; created if missing, deleted at the end.
        final_output_file: Path of the merged JSON-lines file.
    """
    # Create the scratch directory here so the function works on its own and
    # can be called again after a previous call has deleted the directory.
    os.makedirs(intermediate_dir, exist_ok=True)

    # Step 1: Split files into batches
    batches = split_files_into_batches(directory_path, batch_size)
    intermediate_files = []

    # Step 2: Process each batch and store intermediate results
    for i, batch_files in enumerate(batches):
        batch_output_file = os.path.join(intermediate_dir, f'batch_{i}.json')
        # process_batch appends. A batch file left behind by an interrupted
        # run would therefore duplicate records, so start from a clean file.
        if os.path.exists(batch_output_file):
            os.remove(batch_output_file)
        process_batch(batch_files, batch_output_file)
        intermediate_files.append(batch_output_file)
        print(f"Processed batch {i + 1}/{len(batches)}")

    # Step 3: Merge intermediate files into the final output file
    merge_intermediate_files(intermediate_files, final_output_file)
    print(f"Merged all batches into {final_output_file}")

    # Step 4: Delete the intermediate directory to free up disk space
    delete_intermediate_directory(intermediate_dir)
    print("Deleted intermediate directory to free up memory.")


## LOAD VOCAB

In [6]:
# Tiny hand-written vocabulary, handy for quick experiments:
# vocabulary = ["a", "an", "ant", "bat", "cat", "me", "do", "see", "tree", "hangman", "meith"]

# Index of the first word to process; raise it to skip words already handled.
CHECKPOINT = 0
vocabulary = load_vocabulary("words_250000_train.txt")

# With a test prefix set, only the words before index 10 are kept; otherwise
# everything from CHECKPOINT onwards.
vocabulary = vocabulary[CHECKPOINT:10] if isTEST else vocabulary[CHECKPOINT:]

## MAKE MASKED SETS

In [7]:
# Folder holding one JSON file of masked variants per word.
directory_path = f'datasets/{isTEST}maskedCombinations-topk'

## COMBINE FILES
# Number of per-word files flattened into each intermediate batch file.
batch_size = 10_000
# Scratch directory for the batch files; it is deleted once merging is done.
intermediate_dir = f'datasets/{isTEST}merged-intermediates'
# Destination of the merged dataset.
final_output_file = f'datasets/{isTEST}dataset.json'


In [8]:
# Write one JSON file of masked variants per vocabulary word into directory_path.
makeMaskedDataset(df=vocabulary, PATH=directory_path)

STORED WORD : aaa
STORED WORD : aaaaaa
STORED WORD : aaas
STORED WORD : aachen
STORED WORD : aaee
STORED WORD : aag
STORED WORD : aahed
STORED WORD : aahs
STORED WORD : aal
STORED WORD : aalesund
STORED ALL WORDS !!


## COMBINE FILES

In [9]:
# The batch files are written into intermediate_dir, so it has to exist first.
os.makedirs(intermediate_dir, exist_ok=True)

# Flatten the per-word files batch by batch and merge them into final_output_file.
process_files_in_batches(
    directory_path=directory_path,
    batch_size=batch_size,
    intermediate_dir=intermediate_dir,
    final_output_file=final_output_file,
)


Processed batch 1/1
File datasets/test-dataset.json has been deleted.
Merged intermediate files into datasets/test-dataset.json.
Merged all batches into datasets/test-dataset.json
Deleted intermediate directory: datasets/test-merged-intermediates
Deleted intermediate directory to free up memory.


In [10]:
# Pad every record to MAX_LEN tokens; this rewrites final_output_file in place
# and writes a CSV copy next to it.
load_and_convert_json_with_padding(
    final_output_file,
    max_len=MAX_LEN,
    token=SPECIAL_TOKENS,
)


Padding dataset
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
29 29
File datasets/test-dataset.json has been deleted.


In [None]:
# Release cached GPU memory and reset the peak-memory counters, but only when a
# CUDA device is usable. Nothing in this notebook needs a GPU, and
# NOTE(review): the unguarded reset call appears to raise on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.empty_cache()  # clear GPU cache
    # reset_max_memory_allocated() is deprecated; PyTorch documents it as
    # forwarding to reset_peak_memory_stats().
    torch.cuda.reset_peak_memory_stats()

print("RAN PROCESS DATA")

print("\n\n ****ENDED SESSION !!*** \n\n")

RAN PROCESS DATA


 ****ENDED SESSION !!*** 


