In [1]:
from datasets import load_dataset, load_from_disk
import json
import os
import pandas as pd

In [2]:
def load_and_filter_sustainability_data(directory, min_length=50):
    all_qa_pairs = []  
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:  
                data = json.load(file)
                for entry in data.values(): 
                    for qa in entry:
                        question = qa['question'].strip()
                        answer = qa['answer'].strip()
                        # Filter by minimum length
                        if len(question) >= min_length and len(answer) >= min_length:
                            all_qa_pairs.append({'question': question, 'answer': answer})
    return all_qa_pairs

In [15]:
def get_format_data_hf(dataset):
    df = pd.DataFrame(dataset)
    # Dynamically check the column names to adjust accordingly
    if 'answer' in df.columns:
        df['formatted'] = df.apply(lambda row: f"[INST] {row['question']} [/INST] {row['answer']}", axis=1)
    elif 'response' in df.columns:
        df['formatted'] = df.apply(lambda row: f"[INST] {row['question']} [/INST] {row['response']}", axis=1)
    else:
        raise ValueError("Dataset must have 'answer' or 'response' column")
    df = df[['formatted']].rename(columns={'formatted': 'text'})
    return df

In [5]:
def get_format_data_json(dataset_name):
    with open(dataset_name, encoding="utf8") as file:
        data = json.load(file)
    formatted_data = [f"[INST] {item['question']} [/INST] {item['answer']}" for item in data]
    df = pd.DataFrame(formatted_data, columns=['text'])
    return df

In [18]:
def get_format_data_xsum(dataset):
    df = pd.DataFrame(dataset)

    # Check if necessary columns are present
    if 'documents' in df.columns and 'summary' in df.columns:
        # Apply formatting directly to create the desired instruction format
        df['formatted'] = df.apply(lambda row: f"[INST] You are an AI assistant. You need to make a concise summary from the following text: {row['documents']} [/INST] {row['summary']}", axis=1)
    else:
        raise ValueError("Dataset must have 'documents' and 'summary' columns")

    # Select the formatted column and rename it for consistency
    df = df[['formatted']].rename(columns={'formatted': 'text'})
    return df

In [20]:
def get_format_data_wyvern(dataset_name):
    # Load the dataset
    dataset = load_dataset(dataset_name, split="train")
    df = pd.DataFrame(dataset)

    # Check if necessary columns are present
    if 'instruction' in df.columns and 'response' in df.columns:
        # Apply formatting directly to create the desired instruction format
        df['formatted'] = df.apply(lambda row: f"[INST] {row['instruction']} [/INST] {row['response']}", axis=1)
    else:
        raise ValueError("Dataset must have 'instruction' and 'response' columns")

    # Select the formatted column and rename it for consistency
    df = df[['formatted']].rename(columns={'formatted': 'text'})
    return df

In [6]:
def save_to_json(output_path, data):
    with open(output_path, 'w', encoding='utf-8') as f: 
        json.dump(data, f, ensure_ascii=False, indent=4)

In [7]:
base_dir = os.getcwd()
results_dir = os.path.join(base_dir, 'pdfs', 'sustainability', 'results')

# LOADING SUSATAINABILITY QA DATA
qa_pairs = load_and_filter_sustainability_data(results_dir, min_length=10)
formatted_data = [f"[INST] {item['question']} [/INST] {item['answer']}" for item in qa_pairs]
final_output_path = os.path.join(results_dir, 'final', 'final_results.json')
# save_to_json(final_output_path, qa_pairs)
print(f"Filtered data saved to {final_output_path}")

Filtered data saved to C:\Users\nessa\Documents\Thesis\code\data_generation\pdfs\sustainability\results\final\final_results.json


In [9]:
df_sustainability_qa = pd.DataFrame(formatted_data, columns=['text'])
print(f"Sustainability QA data formated, the amount of rows is: {len(df_sustainability_qa)}")
print(f"There are {len(df_sustainability_qa)} questions")
df_sustainability_qa.head()

Sustainability QA data formated, the amount of rows is: 9473
There are 9473 questions


Unnamed: 0,text
0,[INST] How does the author approach sustainabl...
1,[INST] What are some challenges faced by compa...
2,[INST] What challenges will future business le...
3,"[INST] Why do 21st century directors, managers..."
4,[INST] Why is it important for companies and t...


In [17]:
dataset = load_dataset("Open-Orca/OpenOrca", split="train")
df_general_qa = get_format_data_hf(dataset)
print("General QA data loaded and formatted, the amount of rows is: ", len(df_general_qa))
df_general_qa.head()

General QA data loaded and formatted, the amount of rows is:  4233923


Unnamed: 0,text
0,[INST] You will be given a definition of a tas...
1,[INST] Generate an approximately fifteen-word ...
2,[INST] What happens next in this paragraph?\n\...
3,[INST] Please answer the following question: I...
4,[INST] James runs a TV show and there are 5 ma...


In [None]:
dataset = load_dataset("EdinburghNLP/xsum", split="train")
df_summary_qa = get_format_data_xsum(dataset)
print("Summaries QA data loaded and formatted, the amount of rows is: ", len(df_summary_qa))
df_summary_qa.head()

In [None]:
dataset = load_dataset("StudentLLM/Open-Wyvern-74k", split="train")
df_general_qa_2 = get_format_data_wyvern(dataset)
print("General QA data 2 loaded and formatted, the amount of rows is: ", len(df_general_qa_2))
df_general_qa_2.head()

In [21]:
dataset = load_dataset("Open-Orca/SlimOrca", split="train")
df = pd.DataFrame(dataset)

Downloading readme:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/986M [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device