In [None]:
%load_ext autoreload
%autoreload 2

from ast import literal_eval
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

from utils import initialize_seeds

In [None]:
# Called once before any data is loaded or split.
# NOTE(review): presumably seeds the global RNGs (helper lives in utils) — confirm.
initialize_seeds()

In [None]:
def df_to_dataset(df, seed=123):
    """Split a test-suite frame into train/validation/test datasets.

    The split is made separately for each functionality and over distinct
    ``test_id`` values, so every row sharing a test id within a functionality
    lands in the same split. Roughly half of the ids go to train; the other
    half is halved again into validation and test.

    Args:
        df: DataFrame with at least ``functionality`` and ``test_id`` columns.
        seed: ``random_state`` passed to both ``train_test_split`` calls.

    Returns:
        A ``DatasetDict`` with ``train``, ``validation`` and ``test`` entries.
        Each dataset exposes the original DataFrame index as an ``id`` column
        and is sorted by it.
    """
    split_names = ("train", "validation", "test")
    frames = {name: [] for name in split_names}
    for func in np.unique(df.functionality):
        # Filter the frame that was passed in, not a notebook-level global.
        func_df = df[df.functionality == func]
        ids = np.unique(func_df.test_id)
        train_ids, valtest_ids = train_test_split(ids, test_size=0.5, random_state=seed)
        val_ids, test_ids = train_test_split(valtest_ids, test_size=0.5, random_state=seed)
        for name, split_ids in zip(split_names, (train_ids, val_ids, test_ids)):
            frames[name].append(func_df[func_df["test_id"].isin(split_ids)])

    def to_dataset(parts):
        # Concatenate once per split instead of growing a frame inside the loop.
        merged = pd.concat(parts, axis=0) if parts else pd.DataFrame()
        # The filtered frames keep their original index, which from_pandas
        # stores as "__index_level_0__"; expose it as "id" and order by it.
        return Dataset.from_pandas(merged).rename_columns({"__index_level_0__": "id"}).sort("id")

    return DatasetDict(**{name: to_dataset(parts) for name, parts in frames.items()})

## Sentiment Analysis

In [None]:
# Directory of the sentiment-analysis suite: the CSV is read from here and the
# split dataset is saved beneath it.
data_path =  Path("./data/sa/")

In [None]:
# dtype=object reads every column as object dtype (no numeric inference).
suite_df = pd.read_csv(data_path/"sa.csv", dtype=object)

In [None]:
suite_df

In [None]:
# Number of distinct functionalities and of distinct capabilities.
len(pd.unique(suite_df.functionality)), len(pd.unique(suite_df.capability))

In [None]:
pd.unique(suite_df.capability)

In [None]:
# Keep one row per (test_id, functionality) pair before tallying `type`.
suite_df.drop_duplicates(["test_id", "functionality"]).type.value_counts()

In [None]:
# test_id was loaded as object; convert it to a numeric dtype in place.
suite_df.test_id = pd.to_numeric(suite_df.test_id)

In [None]:
suite_df.dtypes

In [None]:
# Build the train/validation/test DatasetDict (see df_to_dataset).
datasets = df_to_dataset(suite_df)

In [None]:
datasets

In [None]:
# Written to ./data/sa/sa.
datasets.save_to_disk(data_path/"sa")

## QQP

In [None]:
# Directory of the QQP suite; rebinding data_path and suite_df replaces the
# values left over from the previous section.
data_path =  Path("./data/qqp/")

In [None]:
# dtype=object reads every column as object dtype (no numeric inference).
suite_df = pd.read_csv(data_path/"qqp.csv", dtype=object)

In [None]:
# Show one randomly sampled row per functionality.
suite_df.groupby("functionality").sample(1)

In [None]:
# test_case was read from the CSV as text; parse each cell back into the
# Python literal it spells out. `literal_eval` is imported in the notebook's
# top import cell, and is passed directly rather than wrapped in a lambda.
suite_df['test_case'] = suite_df.test_case.apply(literal_eval)

In [None]:
# Transpose the per-row two-item test cases into two parallel tuples:
# every first element in q1, every second element in q2.
q1, q2 = list(zip(*suite_df['test_case']))

In [None]:
q1[:3], q2[:3]

In [None]:
suite_df["question1"] = q1
suite_df["question2"] = q2

In [None]:
# The combined column is redundant once it has been split into two.
del suite_df["test_case"]

In [None]:
# Show one randomly sampled row per functionality, now with the new columns.
suite_df.groupby("functionality").sample(1)

In [None]:
# Spot check: rows whose first question contains this phrase.
suite_df[suite_df["question1"].str.contains("What will happen if Donald Trump gets elected")]

In [None]:
suite_df.dtypes

In [None]:
# test_id was loaded as object; convert it to a numeric dtype in place.
suite_df.test_id = pd.to_numeric(suite_df.test_id)

In [None]:
# Number of distinct functionalities and of distinct capabilities.
len(pd.unique(suite_df.functionality)), len(pd.unique(suite_df.capability))

In [None]:
pd.unique(suite_df.capability)

In [None]:
# Keep one row per (test_id, functionality) pair before tallying `type`.
suite_df.drop_duplicates(["test_id", "functionality"]).type.value_counts()

In [None]:
# Build the train/validation/test DatasetDict (see df_to_dataset).
datasets = df_to_dataset(suite_df)

In [None]:
datasets

In [None]:
# Written to ./data/qqp/qqp.
datasets.save_to_disk(data_path/"qqp")

## SQuAD

In [None]:
# Directory of the Squad suite; rebinding data_path and suite_df replaces the
# values left over from the previous section.
data_path =  Path("./data/squad/")

In [None]:
# dtype=object reads every column as object dtype (no numeric inference).
suite_df = pd.read_csv(data_path/"squad.csv", dtype=object)

In [None]:
# Show one randomly sampled row per functionality.
suite_df.groupby("functionality").sample(1)

In [None]:
# test_case was read from the CSV as text; parse each cell back into the
# Python literal it spells out. `literal_eval` comes from the notebook's top
# import cell, so this section runs without the QQP section having been run.
suite_df['test_case'] = suite_df.test_case.apply(literal_eval)

In [None]:
# Transpose the per-row two-item test cases into two parallel tuples:
# c receives every first element (stored as "context" below) and q every
# second element (stored as "question").
c, q = list(zip(*suite_df['test_case']))

In [None]:
c[:3], q[:3]

In [None]:
suite_df["context"] = c
suite_df["question"] = q

In [None]:
# The combined column is redundant once it has been split into two.
del suite_df["test_case"]

In [None]:
# Show one randomly sampled row per functionality, now with the new columns.
suite_df.groupby("functionality").sample(1)

In [None]:
# Missing labels become the literal string 'NaN' so that every label is a str
# and can be passed to str.find in the next cell.
suite_df.label = suite_df.label.fillna('NaN')

In [None]:
# One answers dict per row, with single-element "text" and "answer_start" lists.
# str.find returns -1 when the label does not occur in the context
# (presumably the case for the 'NaN' placeholder rows — verify).
answers = [{"text": [label], 'answer_start': [context.find(label)]} for label, context in zip(suite_df.label, suite_df.context)]

In [None]:
suite_df["answers"] = answers

In [None]:
# The label text now lives inside `answers`.
del suite_df["label"]

In [None]:
# test_id was loaded as object; convert it to a numeric dtype in place.
suite_df.test_id = pd.to_numeric(suite_df.test_id)

In [None]:
suite_df.dtypes

In [None]:
# Number of distinct functionalities and of distinct capabilities.
len(pd.unique(suite_df.functionality)), len(pd.unique(suite_df.capability))

In [None]:
pd.unique(suite_df.capability)

In [None]:
# Keep one row per (test_id, functionality) pair before tallying `type`.
suite_df.drop_duplicates(["test_id", "functionality"]).type.value_counts()

In [None]:
# Build the train/validation/test DatasetDict (see df_to_dataset).
datasets = df_to_dataset(suite_df)

In [None]:
datasets

In [None]:
# Written to ./data/squad/squad.
datasets.save_to_disk(data_path/"squad")