In [4]:
import pandas as pd
from pathlib import Path

# Repository root: the parent of the directory the kernel was started in
# (the notebook is expected to be run from a first-level subfolder of the repo).
REPO_ROOT = Path.cwd().resolve().parent

In [6]:
def format_data_for_InstructABSA(
    df, instances=None, n_samples=10, output_directory=None
):
    """Write the annotations as InstructABSA-style CSV files, one per split.

    Rows of ``df`` are grouped by ``quote_id``; every group becomes one output
    row holding the quote text and the list of its
    ``{"term": entity, "polarity": sentiment}`` dictionaries. A constant
    ``aspectCategories`` placeholder column is appended, and ``sentenceId``
    (the quote id) is written as the CSV index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``quote_id``, ``quote_string``, ``entity``,
        ``sentiment`` and ``set``. Rows whose ``set`` value is ``"train"``,
        ``"test"`` or ``"validation"`` go to the corresponding file; rows with
        any other value are ignored.
    instances : int, optional
        When given, the train split is shuffled (fixed seed 42) and written as
        ``n_samples`` consecutive, non-overlapping slices of ``instances`` rows
        each (``mbio_Train-<i>.csv``) instead of a single ``mbio_Train.csv``.
        If the train split holds fewer than ``n_samples * instances`` rows the
        trailing files are short or contain only the header.
    n_samples : int, default 10
        Number of train slices to write when ``instances`` is given.
    output_directory : str or pathlib.Path, optional
        Destination folder, created if necessary. Defaults to
        ``REPO_ROOT / "datasets" / "InstructABSA"``.

    Raises
    ------
    KeyError
        If ``df`` lacks any of the required columns. The check runs before
        anything is written to disk.
    """
    required_columns = ("quote_id", "quote_string", "entity", "sentiment", "set")
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        # Fail early with the full list instead of a bare KeyError deep inside
        # the loop (and before creating any output directory).
        raise KeyError(
            f"Input DataFrame is missing required column(s): {missing_columns}"
        )

    def format_dataframe_InstructABSA(split_df):
        """Collapse one split to a single row per quote."""
        records = []
        # groupby sorts by quote_id, so the output order is deterministic.
        for _, group in split_df.groupby("quote_id"):
            aspect_terms = [
                {"term": term, "polarity": polarity}
                for term, polarity in zip(
                    group["entity"].tolist(), group["sentiment"].tolist()
                )
            ]
            records.append(
                [
                    group["quote_id"].iloc[0],
                    group["quote_string"].iloc[0],
                    aspect_terms,
                ]
            )

        transformed_df = pd.DataFrame(
            records, columns=["sentenceId", "raw_text", "aspectTerms"]
        )
        # Constant placeholder: the same string is written for every row.
        transformed_df["aspectCategories"] = (
            "[{'category': 'noaspectcategory', 'polarity': 'none'}]"
        )
        return transformed_df

    if output_directory is None:
        output_directory = REPO_ROOT / "datasets" / "InstructABSA"
    output_directory = Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    # Split label in the ``set`` column -> output file stem.
    paper_names = {
        "train": "mbio_Train",
        "test": "mbio_Test",
        "validation": "mbio_Validation",
    }
    for split, file_stem in paper_names.items():
        result = format_dataframe_InstructABSA(df[df["set"] == split])
        result = result.set_index("sentenceId")

        if instances is not None and split == "train":
            # Shuffle once with a fixed seed, then cut consecutive slices so
            # the samples never share a row.
            shuffled_df = result.sample(frac=1, random_state=42)
            for i in range(n_samples):
                sample = shuffled_df.iloc[i * instances : (i + 1) * instances]
                sample.to_csv(output_directory / f"{file_stem}-{i}.csv")
        else:
            result.to_csv(output_directory / f"{file_stem}.csv")

In [7]:
# Annotation file sits directly in the repository root.
PATH_TO_ANNOTATIONS = REPO_ROOT.joinpath("BioMAISx-CIKM.csv")

annotations = pd.read_csv(filepath_or_buffer=PATH_TO_ANNOTATIONS)

# The formatter selects rows via a "set" column ("train" / "test" / "validation").
# NOTE(review): the recorded output of this cell is KeyError: 'set', so this CSV
# apparently has no such column - confirm and add the split assignment first.
format_data_for_InstructABSA(df=annotations)

KeyError: 'set'