## Preparing Data to train with InstructABSA

To use this data for to train InstructABSA [[GitHub](https://github.com/kevinscaria/InstructABSA), [paper](https://arxiv.org/abs/2302.08624)], run the following cells. This will output the data in the correct format to train the model.

## Training with InstructABSA

Clone the InstructABSA Github repository, place the formatted BioMAISx data in the Dataset folder, and train with the joint task. 

In [18]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split


REPO_ROOT = Path().resolve().parent

In [19]:
def format_data_for_InstructABSA(df, instances=None):
    """Df should have 'proposed_entity_type',... 'set'"""

    def format_dataframe_InstructABSA(df):
        # Create a DataFrame to hold the transformed data
        transformed_data = []

        for _, group in df.groupby("quote_id"):
            sentence_id = group["quote_id"].iloc[0]
            raw_text = group["quote_string"].iloc[0]
            aspect_terms = []

            for index, row in group.iterrows():
                aspect_term = {"term": row["proposed_entity"], "polarity": row["sentiment"]}
                aspect_terms.append(aspect_term)

            transformed_data.append([sentence_id, raw_text, aspect_terms])

        # Create the DataFrame with the desired format
        transformed_df = pd.DataFrame(
            transformed_data, columns=["sentenceId", "raw_text", "aspectTerms"]
        )

        # Add the aspectCategories column
        transformed_df["aspectCategories"] = (
            "[{'category': 'noaspectcategory', 'polarity': 'none'}]"
        )

        return transformed_df

    output_directory = REPO_ROOT / "datasets" / "InstructABSA"
    output_directory.mkdir(parents=True, exist_ok=True)
    paper_names = {
        "train": "mbio_Train",
        "test": "mbio_Test",
        "validation": "mbio_Validation",
    }
    for set in ["train", "test", "validation"]:
        temp_df = df[df["set"] == set]
        result = format_dataframe_InstructABSA(temp_df)
        result = result.set_index("sentenceId")
        if instances is not None and set == "train":
            # Shuffle the DataFrame
            shuffled_df = result.sample(
                frac=1, random_state=42
            )  # Use a random_state for reproducibility

            # Create N disjoint samples of K rows
            samples = []
            for i in range(10):
                sample = shuffled_df.iloc[i * instances : (i + 1) * instances]
                samples.append(sample)
            for i, sample in enumerate(samples):
                sample.to_csv(output_directory / f"{paper_names[set]}-{i}.csv")
        else:
            result.to_csv(output_directory / f"{paper_names[set]}.csv")

In [20]:
def find_values_above_threshold(df: pd.DataFrame, n: int) -> list[(str, str)]:
    """Given a df, return all row, column pairs by name with value > n"""
    name_pairs = []
    for column in df.columns:
        for index, value in df[df[column] > n].iterrows():
            name_pairs.append((index, column))
    return name_pairs


def filter_dataset(df: pd.DataFrame, min_instances=50) -> pd.DataFrame:
    """Eliminate samples with under min_instances"""
    pivot_table = df.pivot_table(
        index="aspect", columns="entity_type", aggfunc="size", fill_value=0
    )
    entity_type_aspect_pairs = find_values_above_threshold(pivot_table, min_instances)
    print(entity_type_aspect_pairs)
    filtered_dataset = df[
        df.apply(
            lambda row: (row["aspect"], row["entity_type"]) in entity_type_aspect_pairs,
            axis=1,
        )
    ]
    return filtered_dataset

def assign_sets_to_dataframe(df, train, test, validation):
    def assign_by_row(row):
        if row["quote_id"] in train:
            return "train"
        elif row["quote_id"] in test:
            return "test"
        elif row["quote_id"] in validation:
            return "validation"
        else:
            print(row)
            return "unknown"

    df["set"] = df.apply(assign_by_row, axis=1)
    return df

def assign_sets(filtered_dataset: pd.DataFrame) -> pd.DataFrame:    
    temp_df = filtered_dataset.groupby("quote_id")[["entity_type", "aspect"]].first()
    X = temp_df.index
    y = temp_df[["entity_type", "aspect"]]
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )
    X_test, X_validation, y_test, y_validation = train_test_split(
        X_temp, y_temp, test_size=0.33, stratify=y_temp, random_state=42
    )

    formatted_data = assign_sets_to_dataframe(filtered_dataset, X_train, X_test, X_validation)
    return formatted_data

In [21]:
PATH_TO_ANNOTATIONS = REPO_ROOT / "BioMAISx-CIKM.csv"

annotations = pd.read_csv(PATH_TO_ANNOTATIONS)
filtered_annoatations = filter_dataset(annotations, min_instances=50)
annotations_split = assign_sets(filtered_annoatations)

format_data_for_InstructABSA(annotations_split)

[('Consumer Perception', 'Crops'), ('Economic Impact', 'Crops'), ('Food Security', 'Crops'), ('Productivity', 'Crops'), ('Resistance', 'Crops'), ('Economic Impact', 'Economic Factors'), ('Productivity', 'Environmental Conditions'), ('Consumer Perception/Nutrition', 'GM Crop'), ('Economic Impact', 'GM Crop'), ('Food Security', 'GM Crop'), ('Miscellaneous', 'GM Crop'), ('Productivity', 'GM Crop'), ('Resistance', 'GM Crop'), ('Productivity', 'Geographical Location'), ('Economic Impact', 'Legal Aspects and Politics'), ('Productivity', 'Legal Aspects and Politics'), ('Economic Impact', 'Organizations'), ('Environment and Ethical Concerns', 'Organizations'), ('Productivity', 'Organizations'), ('Economic Impact', 'Technology'), ('Productivity', 'Technology'), ('Productivity', 'Weather/Climate')]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["set"] = df.apply(assign_by_row, axis=1)
