In [4]:
import os

def concatenate_txt_files(input_folder, output_file):
    """Concatenate every ``.txt`` file in ``input_folder`` into ``output_file``.

    Files are processed in sorted filename order. The content of each file is
    copied unchanged and followed by one extra newline as a separator. Input
    is decoded as ISO-8859-1 and the result is written as UTF-8.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.txt`` files.
        output_file: Path of the combined file; created or overwritten. It may
            live inside ``input_folder``; it is then excluded from the inputs.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
    """
    # Resolve once so the output can be recognised if it sits in input_folder.
    output_path = os.path.realpath(output_file)

    with open(output_file, 'w', encoding='utf-8') as outfile:
        # The listing is taken after the output file has been created, so the
        # output itself can show up here and must be filtered out below.
        for filename in sorted(os.listdir(input_folder)):
            if not filename.endswith('.txt'):
                continue
            file_path = os.path.join(input_folder, filename)
            # Skip directories (or dangling links) that merely carry a .txt name.
            if not os.path.isfile(file_path):
                continue
            # Never feed the output back into itself: it is open for writing
            # right now and reading it would duplicate or endlessly grow it.
            if os.path.realpath(file_path) == output_path:
                continue
            # ISO-8859-1 maps every possible byte to a character, so decoding
            # cannot fail and no error handler is needed for files with
            # unknown or mixed encodings.
            with open(file_path, 'r', encoding='ISO-8859-1') as infile:
                outfile.writelines(infile)
            # Separator newline after each file's content.
            outfile.write('\n')

# Usage
# Both paths are absolute and specific to one machine; edit them before running elsewhere.
input_folder = '/Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/input'  # Folder scanned for .txt files
output_file = '/Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/concatenated.txt'  # Combined file; created or overwritten
# Build the combined file, then report where it was written.
concatenate_txt_files(input_folder, output_file)

print(f"All files concatenated into {output_file}")


All files concatenated into /Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/concatenated.txt


In [None]:
# Re-assigns the same two paths that the concatenation cell already defines.
# NOTE(review): nothing after this point in the visible notebook reads input_folder/output_file
# (the tokenization code uses its own input_dir/output_json) — confirm whether this cell is still needed.
input_folder = '/Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/input'  # Folder containing the source .txt files
output_file = '/Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/concatenated.txt'  # Path of the concatenated text file

In [6]:
import os
import json
from transformers import AutoTokenizer

# Load the tokenizer once at module level; tokenize_instructions() reads this global by name.
# NOTE(review): from_pretrained presumably downloads or reads cached files for this model id — confirm availability offline.
tokenizer = AutoTokenizer.from_pretrained("Lianglab/PharmBERT-uncased")

# Helper: read a text file's lines, tolerating non-UTF-8 content
def _read_text_lines(file_path):
    """Return all lines of ``file_path``, trying UTF-8 first and ISO-8859-1 second.

    ISO-8859-1 maps every byte to a character, so the fallback cannot fail on
    decoding; it only changes how non-UTF-8 bytes are interpreted.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.readlines()
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='ISO-8859-1') as f:
            return f.readlines()


# Function to load raw instructions from the input directory
def load_raw_instructions(input_dir):
    """Load raw instructions from the ``.txt`` files in ``input_dir``, line by line.

    Files are visited in sorted filename order so that the resulting entry
    order and numbering do not depend on the platform's directory listing
    order. Every non-empty line (after stripping surrounding whitespace)
    becomes one entry.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.txt`` files.

    Returns:
        dict mapping ``"<file_name>_<index>"`` to the stripped line, where
        ``index`` is a single counter running across all files, so keys are
        unique even when two files contain identical lines.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
    """
    input_data = {}
    entry_index = 0  # One counter shared by all files keeps every key unique

    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(input_dir, file_name)
        # Skip directories (or dangling links) that merely carry a .txt name.
        if not os.path.isfile(file_path):
            continue
        for line in _read_text_lines(file_path):
            cleaned_line = line.strip()
            if not cleaned_line:  # Ignore empty / whitespace-only lines
                continue
            input_data[f"{file_name}_{entry_index}"] = cleaned_line
            entry_index += 1
    return input_data

# Function to tokenize the instructions
def tokenize_instructions(raw_instructions, max_length=48):
    """Tokenize raw instructions without alignment.

    Each instruction is split on whitespace and handed to the module-level
    ``tokenizer`` as pre-split words, with truncation and ``'max_length'``
    padding requested. The resulting ids are converted back to token strings.

    Args:
        raw_instructions: Mapping of entry name -> instruction text. Only the
            values are used; the keys are ignored.
        max_length: Length passed to the tokenizer for truncation and padding.
            Defaults to 48, the value that was previously hard-coded.

    Returns:
        A list with one dict per instruction, in the mapping's iteration
        order, each holding ``"raw_instruction"`` (the original text) and
        ``"tokenized_instruction"`` (the token strings).
    """
    tokenized_data = []

    for instruction in raw_instructions.values():
        # Tokenize the raw instruction
        tokenized_inputs = tokenizer(
            instruction.split(),
            is_split_into_words=True,
            truncation=True,
            return_tensors="pt",
            max_length=max_length,
            padding='max_length',
        )

        # input_ids is indexed at [0] to take the row for this one instruction,
        # then the ids are mapped back to their token strings.
        tokens = tokenizer.convert_ids_to_tokens(tokenized_inputs["input_ids"][0])

        # Prepare entry for saving
        tokenized_data.append({
            "raw_instruction": instruction,
            "tokenized_instruction": tokens,
        })

    return tokenized_data

# Function to save the data to JSON
def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as JSON indented by four spaces.

    The target is opened in text write mode (created or overwritten), and a
    confirmation line naming the file is printed once writing has finished.

    Args:
        data: Any JSON-serializable object.
        output_file: Destination path of the JSON document.
    """
    with open(output_file, mode='w') as sink:
        json.dump(
            data,
            sink,
            indent=4,
        )
    # Confirmation is emitted only after the file handle has been closed.
    confirmation = f"Tokenized data has been saved to '{output_file}'."
    print(confirmation)

# Main execution
# Pipeline driver: load lines from the input folder, tokenize them, and write the result as JSON.
# NOTE(review): in a notebook kernel __name__ is normally "__main__", so this guard presumably
# always passes and the names assigned below become kernel-level globals — confirm if that is intended.
if __name__ == "__main__":
    # Define input directory and output JSON file (absolute, machine-specific paths)
    input_dir = '/Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/input'  # Same folder the concatenation cell reads from
    output_json = '/Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/concatenated.json'

    # Load raw instructions: dict of "<file_name>_<index>" -> stripped non-empty line
    raw_instructions = load_raw_instructions(input_dir)

    # Tokenize instructions: list of {"raw_instruction", "tokenized_instruction"} dicts
    tokenized_data = tokenize_instructions(raw_instructions)

    # Save to JSON (also prints a confirmation line naming the output file)
    save_to_json(tokenized_data, output_json)


Tokenized data has been saved to '/Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/concatenated.json'.
