In [1]:
split= "train"

In [11]:
import json
from datasets import load_dataset

# Load the SST dataset from Hugging Face datasets
sst_dataset = load_dataset("sst")

# Extract the necessary information from the SST dataset
reformatted_data = []
for example in sst_dataset[split]:
    sentence = example["sentence"]
    label = example["label"]
    
    # Create a new JSON object following the SQuAD format
    squad_example = {
        "context": sentence,
        "qas": [
            {
                "question": "What is the sentiment of this sentence?",
                "id": f"sst-{len(reformatted_data)}",
                "answers": [
                    {
                        "text": "Positive" if label == 1 else "Negative",
                        "answer_start": 0
                    }
                ]
            }
        ]
    }
    
    reformatted_data.append(squad_example)

# Create the final JSON object
squad_formatted_data = {
    "version": "1.1",
    "data": [
        {
            "title": "SST Dataset",
            "paragraphs": reformatted_data
        }
    ]
}

# Save the reformatted data to a JSON file
with open(f"/home/ouranos/Documents/Classes/NLP/HMIGenerativeReplay/training_data/sst_to_squad-{split}-v2.0.json", "w") as file:
    json.dump(squad_formatted_data, file, indent=2)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [12]:
import json
from datasets import load_dataset

# Load the QA-SRL dataset from Hugging Face datasets
qasrl_dataset = load_dataset("qa_srl")

# Extract the necessary information from the QA-SRL dataset
reformatted_data = []
for example in qasrl_dataset[split]:
    sentence = example["sentence"]
    sent_id = example["sent_id"]
    predicate = example["predicate"]
    question_template = example["question"]
    answers = example["answers"]
    
    # Format the question
    question = " ".join([token for token in question_template if token != "_"]).capitalize()
    
    # Create answer objects
    squad_answers = []
    for answer in answers:
        answer_start = sentence.find(answer)
        squad_answers.append({"text": answer, "answer_start": answer_start})
    
    # Create a new JSON object following the SQuAD format
    squad_example = {
        "context": sentence,
        "qas": [
            {
                "question": question,
                "id": f"{sent_id}-{predicate}",
                "answers": squad_answers
            }
        ]
    }
    
    reformatted_data.append(squad_example)

# Create the final JSON object
squad_formatted_data = {
    "version": "1.1",
    "data": [
        {
            "title": "QA-SRL Dataset",
            "paragraphs": reformatted_data
        }
    ]
}

# Save the reformatted data to a JSON file
with open(f"/home/ouranos/Documents/Classes/NLP/HMIGenerativeReplay/training_data/srl_to_squad-{split}-v2.0.json", "w") as file:
    json.dump(squad_formatted_data, file, indent=2)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [13]:
import json
from datasets import load_dataset

# Load the DBpedia dataset from Hugging Face datasets
dbpedia_dataset = load_dataset("dbpedia_14")

# Extract the necessary information from the DBpedia dataset
reformatted_data = []
for example in dbpedia_dataset[split]:
    text = example["content"]
    label = example["label"]
    
    # Create a new JSON object following the SQuAD format
    squad_example = {
        "context": text,
        "qas": [
            {
                "question": "What is the category of this text?",
                "id": f"dbpedia-{len(reformatted_data)}",
                "answers": [
                    {
                        "text": str(label),
                        "answer_start": 0
                    }
                ]
            }
        ]
    }
    
    reformatted_data.append(squad_example)

# Create the final JSON object
squad_formatted_data = {
    "version": "1.1",
    "data": [
        {
            "title": "DBpedia Dataset",
            "paragraphs": reformatted_data
        }
    ]
}

# Save the reformatted data to a JSON file
with open(f"/home/ouranos/Documents/Classes/NLP/HMIGenerativeReplay/training_data/dbpedia_to_squad-{split}-v2.0.json", "w") as file:
    json.dump(squad_formatted_data, file, indent=2)

In [14]:
import json
from datasets import load_dataset

# Load the AG News dataset from Hugging Face datasets
ag_news_dataset = load_dataset("ag_news")

# Extract the necessary information from the AG News dataset
reformatted_data = []
for example in ag_news_dataset[split]:
    text = example["text"]
    label = example["label"]
    
    # Map the label to the corresponding category
    label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
    category = label_map[label]
    
    # Create a new JSON object following the SQuAD format
    squad_example = {
        "context": text,
        "qas": [
            {
                "question": "What is the category of this news article?",
                "id": f"ag_news-{len(reformatted_data)}",
                "answers": [
                    {
                        "text": category,
                        "answer_start": 0
                    }
                ]
            }
        ]
    }
    
    reformatted_data.append(squad_example)

# Create the final JSON object
squad_formatted_data = {
    "version": "1.1",
    "data": [
        {
            "title": "AG News Dataset",
            "paragraphs": reformatted_data
        }
    ]
}

# Save the reformatted data to a JSON file
with open(f"/home/ouranos/Documents/Classes/NLP/HMIGenerativeReplay/training_data/ag_to_squad-{split}-v2.0.json", "w") as file:
    json.dump(squad_formatted_data, file, indent=2)

In [15]:
import json
from datasets import load_dataset

# Load the Yelp Review Full dataset from Hugging Face datasets
yelp_dataset = load_dataset("codyburker/yelp_review_sampled")

# Extract the necessary information from the Yelp dataset
reformatted_data = []
for example in yelp_dataset["train"]:
    text = example["text"]
    label = example["stars"]
    
    # Map the label to the corresponding star rating
    label_map = {1: "1 star", 2: "2 stars", 3: "3 stars", 4: "4 stars", 5: "5 stars"}
    star_rating = label_map[label]
    
    # Create a new JSON object following the SQuAD format
    squad_example = {
        "context": text,
        "qas": [
            {
                "question": "What is the star rating of this Yelp review?",
                "id": f"yelp-{len(reformatted_data)}",
                "answers": [
                    {
                        "text": star_rating,
                        "answer_start": 0
                    }
                ]
            }
        ]
    }
    
    reformatted_data.append(squad_example)

# Create the final JSON object
squad_formatted_data = {
    "version": "1.1",
    "data": [
        {
            "title": "Yelp Review Full Dataset",
            "paragraphs": reformatted_data
        }
    ]
}

# Save the reformatted data to a JSON file
with open(f"/home/ouranos/Documents/Classes/NLP/HMIGenerativeReplay/training_data/yelp_to_squad-{split}-v2.0.json", "w") as file:
    json.dump(squad_formatted_data, file, indent=2)

In [22]:
import json
from datasets import load_dataset

# Load the WikiSQL dataset from Hugging Face datasets
wikisql_dataset = load_dataset("wikisql")

# Extract the necessary information from the WikiSQL dataset
reformatted_data = []
for example in wikisql_dataset[split]:
    question = example["question"]
    table = example["table"]
    sql = example["sql"]
    
    # Extract the table header and rows
    header = table["header"]
    rows = table["rows"]
    
    # Create a context string from the table information
    context = f"Table:\n"
    context += " | ".join(header) + "\n"
    for row in rows:
        context += " | ".join(str(cell) for cell in row) + "\n"
    
    # Extract the SQL query components
    sel_col = sql["sel"]
    agg_op = sql["agg"]
    cond_col_index = sql["conds"]["column_index"]
    cond_op_index = sql["conds"]["operator_index"]
    cond_value = sql["conds"]["condition"]
    
    # Generate the SQL query string
    sql_query = f"SELECT {header[sel_col]} FROM table"
    if cond_col_index and cond_op_index and cond_value:
        sql_query += f" WHERE {header[cond_col_index[0]]} = {cond_value[0]}"
    
    # Create a new JSON object following the SQuAD format
    squad_example = {
        "context": context,
        "qas": [
            {
                "question": question,
                "id": f"wikisql-{len(reformatted_data)}",
                "answers": [
                    {
                        "text": sql_query,
                        "answer_start": 0
                    }
                ]
            }
        ]
    }
    
    reformatted_data.append(squad_example)

# Create the final JSON object
squad_formatted_data = {
    "version": "1.1",
    "data": [
        {
            "title": "WikiSQL Dataset",
            "paragraphs": reformatted_data
        }
    ]
}

with open(f"/home/ouranos/Documents/Classes/NLP/HMIGenerativeReplay/training_data/wikisql_to_squad-{split}-v2.0.json", "w") as file:
    json.dump(squad_formatted_data, file, indent=2)

In [2]:
import json
from datasets import load_dataset

# Load the WoZ dataset from Hugging Face datasets
woz_dataset = load_dataset("woz_dialogue",'en')

# Extract the necessary information from the WoZ dataset
reformatted_data = []
for example in woz_dataset["train"]:
    dialogue = example["dialogue"]
    
    # Extract the user utterances and system responses
    context = ""
    qas = []
    for turn in dialogue:
        user_utterance = turn["transcript"]
        system_response = turn["system_transcript"]
        
        # Append the user utterance and system response to the context
        context += f"User: {user_utterance}\n"
        context += f"System: {system_response}\n"
        
        # Create a question-answer pair for each turn
        qa = {
            "question": user_utterance,
            "id": f"woz-{example['dialogue_idx']}-{turn['turn_idx']}",
            "answers": [
                {
                    "text": system_response,
                    "answer_start": context.find(system_response)
                }
            ]
        }
        qas.append(qa)
    
    # Create a new JSON object following the SQuAD format
    squad_example = {
        "context": context,
        "qas": qas
    }
    
    reformatted_data.append(squad_example)

# Create the final JSON object
squad_formatted_data = {
    "version": "1.1",
    "data": [
        {
            "title": "WoZ Dataset",
            "paragraphs": reformatted_data
        }
    ]
}

with open(f"/home/ouranos/Documents/Classes/NLP/HMIGenerativeReplay/training_data/woz.en_to_squad-{split}-v2.0.json", "w") as file:
    json.dump(squad_formatted_data, file, indent=2)

  from .autonotebook import tqdm as notebook_tqdm
