<a href="https://colab.research.google.com/github/maneeha/KGLLM/blob/main/Dataset%20F500_words_sentence_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wcmatch

Collecting wcmatch
  Downloading wcmatch-8.5.2-py3-none-any.whl (39 kB)
Collecting bracex>=2.1.1 (from wcmatch)
  Downloading bracex-2.4-py3-none-any.whl (11 kB)
Installing collected packages: bracex, wcmatch
Successfully installed bracex-2.4 wcmatch-8.5.2


In [None]:
from wcmatch import wcmatch
import os
from tqdm import tqdm

In [14]:
# function to extract sentences from text files
def extract_sentences(file_path, stride=500, show_progress=True):
    """Split a text file into ``<s>...</s>`` sequences of at most ``stride`` words.

    Lines whose stripped length is 2 characters or fewer (including blank
    lines) are skipped. The remaining lines are split on whitespace and the
    resulting words are grouped into consecutive chunks of ``stride`` words.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        stride: Maximum number of words per generated sequence.
        show_progress: When True, wrap the chunk loop in a ``tqdm`` progress
            bar; pass False to silence it.

    Returns:
        One string holding every sequence formatted as ``"<s>" + words +
        "</s> \\n"``, or the sentinel string ``"Empty"`` when ``file_path`` is
        not a file or no words were extracted.

    Raises:
        ValueError: If ``stride`` is smaller than 1.
    """
    # Sentinel returned whenever nothing could be extracted
    final_content = "Empty"

    if stride < 1:
        raise ValueError("stride must be a positive integer, got {}".format(stride))

    # Validate file path and return "Empty" if not valid
    if not os.path.isfile(file_path):
        print("{} does not exist ".format(file_path))
        return final_content

    # Collect the words line by line. Splitting each line separately keeps the
    # last word of one line from being glued to the first word of the next.
    # The dataset is written as UTF-8, so the input is read with the same encoding.
    words = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if len(line.strip()) > 2:
                words.extend(line.split())

    seq_len = len(words)
    if seq_len == 0:
        return final_content

    # Start offset of every chunk, optionally wrapped in a progress bar
    chunk_starts = range(0, seq_len, stride)
    if show_progress:
        chunk_starts = tqdm(chunk_starts)

    # Each sequence is terminated by a new line; slicing past the end of the
    # list is safe, so the final (shorter) chunk needs no special handling.
    sequences = [
        "<s>" + ' '.join(words[begin_loc:begin_loc + stride]) + "</s> \n"
        for begin_loc in chunk_starts
    ]
    return ''.join(sequences)

In [15]:
def create_dataset(folder_path, file_ext, folder_destination, dataset_name="dataset.txt"):
    """Build a single dataset file from every matching file under ``folder_path``.

    Each matched file is passed to ``extract_sentences`` and the returned
    sequences are appended, UTF-8 encoded, to the dataset file. Files for
    which ``extract_sentences`` returns ``"Empty"`` are skipped. A failure on
    one file is printed and does not stop the remaining files.

    Args:
        folder_path: Source folder, searched recursively.
        file_ext: File pattern handed to ``wcmatch`` (e.g. ``"*.txt"``).
        folder_destination: Folder in which the dataset is saved; created if
            it does not exist yet.
        dataset_name: File name of the dataset (default ``"dataset.txt"``).
    """
    print("Start processing ...")
    matches = wcmatch.WcMatch(root_dir=folder_path, file_pattern=file_ext, flags=wcmatch.RECURSIVE).match()

    # Join with a real path separator so a destination given without a
    # trailing slash is treated as a folder, not as a file-name prefix.
    dataset_path = os.path.join(folder_destination, dataset_name)

    # The destination may live inside the source tree and match file_ext;
    # skip the dataset itself so a re-run never feeds it back into its own input.
    dataset_abspath = os.path.abspath(dataset_path)
    files = [path for path in matches if os.path.abspath(path) != dataset_abspath]
    print(str(len(files)) + " files to be processed!")

    # Make sure the destination folder exists before anything is appended
    if folder_destination:
        try:
            os.makedirs(folder_destination, exist_ok=True)
        except OSError as e:
            print("Error creating destination folder " + str(e))
            return

    # Loop through and process each file
    for path in files:
        try:
            # Get extracted sentences
            contents = extract_sentences(path)
            # Ignore "Empty" sentences
            if contents != 'Empty':
                # Append in byte mode; the context manager closes the file
                # even if the write fails.
                with open(dataset_path, "ab") as f:
                    f.write(contents.encode('utf-8'))
        except Exception as e:
            # Log any issues encountered for further investigation
            print(path)
            print ("Error saving extraction to file " + str(e))

    print("Finished processing ...")

In [18]:
# Source folder: create_dataset searches it recursively for input files
folder_path = "/content/drive/My Drive/Colab Notebooks/pdf"
# Only process files matching this pattern; per the wcmatch documentation, multiple
# patterns are separated with "|" (e.g. "*.txt|*.md"), not with commas
file_ext = "*.txt"
# Save to the following path
folder_destination = "/content/drive/My Drive/Colab Notebooks/pdf/extracted"
# Dataset name (can be omitted to use the default, "dataset.txt")
dataset_name="custom-llama2-dataset.txt"

In [19]:
# Build the dataset from the configured source folder and file pattern
create_dataset(
    folder_path=folder_path,
    file_ext=file_ext,
    folder_destination=folder_destination,
    dataset_name=dataset_name,
)

Start processing ...
0 files to be processed!
Finished processing ...


In [11]:
!pip install huggingface_hub

