In [None]:
import numpy as np
def load_glove_embeddings(path, dim=None):
    """Load a whitespace-separated text embedding file into a dict.

    Each line is expected to look like ``<token> <v1> <v2> ... <vN>``.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded embedding file.
    dim : int, optional
        Expected vector length (must be positive). When given, the last
        ``dim`` fields of a line are the vector and everything before them is
        the token, so tokens that themselves contain spaces are kept and lines
        with too few fields are skipped. When ``None`` (default), the first
        field is the token and all remaining fields are the vector.

    Returns
    -------
    dict
        Mapping ``token -> numpy.ndarray`` with dtype ``float32``. If a token
        occurs more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If ``dim`` is given but is not positive.

    Notes
    -----
    Loading is best effort: blank lines and lines whose vector part cannot be
    parsed as floats are skipped silently.
    """
    if dim is not None and dim <= 0:
        raise ValueError("dim must be a positive integer")

    word2vec_dict = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if dim is None:
                # A blank line yields no fields; values[0] would raise IndexError.
                if not values:
                    continue
                word, raw_vector = values[0], values[1:]
            else:
                # Need at least one token field in front of the vector.
                if len(values) <= dim:
                    continue
                word, raw_vector = ' '.join(values[:-dim]), values[-dim:]
            try:
                vector = np.asarray(raw_vector, dtype='float32')
            except ValueError:
                # Non-numeric vector field: skip the line.
                continue
            word2vec_dict[word] = vector
    return word2vec_dict

# Parse the embedding text file into a {token: vector} dict and report how many
# tokens were loaded.
path = "glove.840B.300d.txt"
word2vec = load_glove_embeddings(path)
print(len(word2vec))


import pickle

# Persist the parsed dict to 'w2v.pkl' (written to the current working directory).
with open('w2v.pkl', 'wb') as f:
    pickle.dump(word2vec, f)

## --- create the 50% training subset of each dataset ---

In [None]:
import pandas as pd
# Dataset folder names under data/original/.
namelist = ['cr', 'trec', 'agnews', 'pc', 'yelp', 'kaggle_med', 'cardio', 'bbc', 'sst2','subj']
for name in namelist:
    df = pd.read_csv(f'data/original/{name}/train.csv')
    # Fixed random_state so the same 50% sample is drawn on every run.
    df = df.sample(frac=0.5, random_state=100)
    # Written next to the source file as train_50.csv; the original index is not saved.
    df.to_csv(f'data/original/{name}/train_50.csv', index=False)


    

In [None]:
import torch
# Report whether PyTorch can use CUDA in this kernel (value shown as the cell output).
torch.cuda.is_available()

## --- preprocess the data ---

In [None]:
# Goal: load the data and remove every row whose text is missing or only a placeholder
# (empty or whitespace-only string, None, NaN, NaT, N/A, NULL, nil, NA, n.a. - in any casing),
# and print the rows that were removed.
import pandas as pd
import numpy as np
import re

def clean(path, save_path=None):
    """Drop rows whose ``text`` is missing or only a placeholder value.

    A row is dropped when its ``text`` is NaN (``read_csv`` already parses
    many placeholder spellings to NaN) or when, after stripping whitespace and
    lower-casing, it equals one of the placeholder spellings below. The
    removed rows are printed.

    Parameters
    ----------
    path : str
        CSV file with at least a ``text`` column.
    save_path : str, optional
        If given, the cleaned frame is written there (without the index).
        Nothing is written by default.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with a fresh 0..n-1 index.

    Notes
    -----
    The earlier ``len(text) < 4`` condition is gone: it made every placeholder
    of four or more characters ('NONE', 'null', 'n.a.', ...) unreachable.
    A later cell in this notebook defines another function with the same name.
    """
    df = pd.read_csv(path)
    # Placeholder spellings, lower-cased; matching is case-insensitive.
    invalid_values = {'', 'none', 'nan', 'nat', 'n/a', 'null', 'nil', 'na', 'n.a.', 'n.a'}

    text = df['text']
    normalised = text.astype(str).str.strip().str.lower()
    invalid_mask = text.isna() | normalised.isin(invalid_values)

    print("Removed rows:")
    print(df[invalid_mask])

    cleaned = df[~invalid_mask].reset_index(drop=True)
    if save_path is not None:
        cleaned.to_csv(save_path, index=False)
    return cleaned

In [None]:
# Run clean() on every dataset's training file; the return value is discarded here.
namelist = ['cr', 'trec', 'agnews', 'pc', 'yelp', 'kaggle_med', 'cardio', 'bbc', 'sst2','subj']
for name in namelist:
    clean(f'data/original/{name}/train.csv')

In [None]:
import pandas as pd
import numpy as np
import re

def clean(path):
    """Print and remove rows whose ``text`` or ``class`` is missing or a placeholder.

    A value counts as invalid when it is NaN (``read_csv`` already parses many
    placeholder spellings to NaN) or when, after stripping whitespace and
    lower-casing, its string form equals one of the placeholder spellings below.

    Parameters
    ----------
    path : str
        CSV file with ``text`` and ``class`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows that were kept, with a fresh 0..n-1 index. The file itself is
        not modified.

    Notes
    -----
    The earlier ``len(x) < 4`` condition is gone: it made every placeholder of
    four or more characters ('NONE', 'null', 'n.a.', ...) unreachable.
    """
    df = pd.read_csv(path)
    # Placeholder spellings, lower-cased; matching is case-insensitive.
    invalid_values = {'', 'none', 'nan', 'nat', 'n/a', 'null', 'nil', 'na', 'n.a.', 'n.a'}

    def _is_invalid(column):
        # NaN check first, then compare the normalised string form.
        normalised = column.astype(str).str.strip().str.lower()
        return column.isna() | normalised.isin(invalid_values)

    invalid_mask = _is_invalid(df['text']) | _is_invalid(df['class'])
    invalid_rows = df[invalid_mask]

    # Print the removed rows
    print("Removed rows:")
    print(invalid_rows)

    return df[~invalid_mask].reset_index(drop=True)

In [None]:
import pandas as pd
# Encode the PubMed 'class' labels as integers and write the files back in place.
train_path = 'data/original/pubmed/train.csv'
test_path = 'data/original/pubmed/test.csv'

# Label string -> integer id stored in the 'class' column.
pubmed_label_ids = {'OBJECTIVE': 0, 'METHODS': 1, 'RESULTS': 2, 'CONCLUSIONS': 3, 'BACKGROUND': 4}

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
for split_df, split_path in ((train_df, train_path), (test_df, test_path)):
    # Series.map turns every value that is not a key of the mapping into NaN,
    # so re-running this cell on an already-encoded file would wipe all labels
    # and overwrite the CSV. Only encode (and rewrite) while the column still
    # holds the string labels.
    if pd.api.types.is_numeric_dtype(split_df['class']):
        continue
    split_df['class'] = split_df['class'].map(pubmed_label_ids)
    split_df.to_csv(split_path, index=False)


In [None]:
# Displays whatever `df` currently is in the kernel; this cell does not define it.
# NOTE(review): the cell directly above defines train_df/test_df, not df - confirm which frame was meant.
df

In [None]:
import pandas as pd
# Encode the 'class' labels of the 10/20/50 PubMed training subsets in place.
for i in ['10', '20', '50']:
    train_path = f'data/original/pubmed/train_{i}.csv'
    df = pd.read_csv(train_path)
    # Series.map sends values that are not keys of the mapping to NaN, so a
    # second run over an already-encoded file would erase every label. Encode
    # and rewrite only while the column still holds the string labels.
    if not pd.api.types.is_numeric_dtype(df['class']):
        df['class'] = df['class'].map({'OBJECTIVE': 0, 'METHODS': 1, 'RESULTS': 2, 'CONCLUSIONS': 3, 'BACKGROUND': 4})
        df.to_csv(train_path, index=False)



## exploring the results

In [17]:
import pandas as pd

def load_results_into_df(path, model_list, dataset_list, percentage_list, numbers):
    """Collect F1 and accuracy from per-run result files into one DataFrame.

    For every (number, percentage) pair - ``numbers`` and ``percentage_list``
    are zipped - and every model and dataset, the file
    ``{path}/{model}/{percentage}/{dataset}_{number}_results.txt`` is read.
    Each line is expected to look like ``<metric>: <value>``. Lines for
    ``f1``/``f1_score`` and ``acc``/``accuracy`` are kept; if a metric occurs
    more than once, the last value wins.

    Parameters
    ----------
    path : str
        Root directory of the result files.
    model_list, dataset_list : iterable of str
        Model and dataset names used to build the file paths.
    percentage_list : iterable of str
        Sub-directory names, e.g. ``'10_percent'``.
    numbers : iterable of str
        File-name tags paired position-wise with ``percentage_list``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'{model}_{dataset}_{percentage}'`` with object-dtype
        columns ``f1_score`` and ``accuracy``. A value stays ``None`` when
        the file is missing or has no line for that metric.

    Notes
    -----
    Reading is best effort: a file that cannot be read is reported with
    ``print`` (silently for the ``kaggle_med`` dataset). Blank lines, lines
    without a colon and values that are not numbers are skipped instead of
    aborting the whole load.
    """
    # Metric spelling used in the result files -> output column.
    metric_columns = {'f1': 'f1_score', 'f1_score': 'f1_score',
                      'acc': 'accuracy', 'accuracy': 'accuracy'}
    results = {}

    for number, percentage in zip(numbers, percentage_list):
        for model in model_list:
            for dataset in dataset_list:
                scores = {'f1_score': None, 'accuracy': None}
                # Registered before reading so missing files still get a row.
                results[f'{model}_{dataset}_{percentage}'] = scores
                try:
                    with open(f'{path}/{model}/{percentage}/{dataset}_{number}_results.txt', 'r') as f:
                        for line in f:
                            metric, sep, value = line.partition(':')
                            column = metric_columns.get(metric.strip())
                            if not sep or column is None:
                                continue
                            try:
                                scores[column] = float(value)
                            except ValueError:
                                continue
                except Exception as e:
                    if dataset != 'kaggle_med':
                        print(f"Error occurred: {e}")

    df = pd.DataFrame(
        [[scores['f1_score'], scores['accuracy']] for scores in results.values()],
        index=list(results),
        columns=['f1_score', 'accuracy'],
        dtype=object,
    )

    return df

## exploring the results

In [2]:

import pandas as pd

# Raw result lines per run, keyed by '{model}_{dataset}_{percentage}'.
new_dict = {}
path = 'results/original'
model_list = ['bert', 'lstm','cnn'] # 3 models
dataset_list = ['cr', 'trec', 'agnews', 'pc', 'yelp', 'cardio', 'bbc', 'sst2','subj', 'pubmed'] # 10 datasets
percentage_list = ['10_percent', '20_percent', '50_percent','full'] # 4 percentages
numbers = ['10', '20', '50', 'full']

# numbers and percentage_list are paired position-wise.
for number, percentage in zip(numbers, percentage_list):
    for model in model_list:
        for dataset in dataset_list:
            result_key = f'{model}_{dataset}_{percentage}'
            # The entry exists even when the file cannot be read.
            new_dict[result_key] = []
            try:
                with open(f'{path}/{model}/{percentage}/{dataset}_{number}_results.txt', 'r') as f:
                    new_dict[result_key].extend(line.strip() for line in f)
            except Exception as e:
                # Failures for 'kaggle_med' are not reported.
                if dataset == 'kaggle_med':
                    continue
                print(f"Error occurred: {e}")

In [3]:
# One row per run holding the raw "metric: value" strings, plus two empty
# columns that a later cell fills in.
df = pd.DataFrame.from_dict(new_dict, orient='index').assign(f1_score=None, accuracy=None)

In [4]:
# Preview the first 20 rows of the raw results frame.
df.head(20)

Unnamed: 0,0,1,2,3,f1_score,accuracy
bert_cr_10_percent,acc: 0.8511,f1: 0.8866,prec: 0.8833,rec: 0.8909,,
bert_trec_10_percent,acc: 0.668,f1: 0.557,prec: 0.5605,rec: 0.6375,,
bert_agnews_10_percent,acc: 0.8883,f1: 0.8882,prec: 0.8883,rec: 0.8883,,
bert_pc_10_percent,acc: 0.8824,f1: 0.8796,prec: 0.8639,rec: 0.8959,,
bert_yelp_10_percent,acc: 0.446,f1: 0.2984,prec: 0.3118,rec: 0.3616,,
bert_cardio_10_percent,acc: 0.3682,f1: 0.309,prec: 0.2966,rec: 0.3702,,
bert_bbc_10_percent,acc: 0.6338,f1: 0.5657,prec: 0.6224,rec: 0.7381,,
bert_sst2_10_percent,acc: 0.8669,f1: 0.8627,prec: 0.8376,rec: 0.8894,,
bert_subj_10_percent,acc: 0.94,f1: 0.9386,prec: 0.9316,rec: 0.946,,
bert_pubmed_10_percent,acc: 0.841,f1: 0.7812,prec: 0.7762,rec: 0.7922,,


In [5]:
# Fill f1_score / accuracy from the raw "metric: value" strings in each row.
for index, row in df.iterrows():
    for item in row:
        # Only raw strings are parsed. Skipping everything else (None padding,
        # floats already written to the result columns) lets this cell be
        # re-run after `df` has been narrowed below without raising
        # AttributeError on float values.
        if not isinstance(item, str):
            continue
        # partition never fails to unpack, unlike split(":") on a line with
        # no colon or more than one.
        metric, sep, value = item.partition(":")
        if not sep:
            continue
        if metric.strip() in ['f1', 'f1_score']:
            df.at[index, 'f1_score'] = float(value)
        elif metric.strip() in ['acc', 'accuracy']:
            df.at[index, 'accuracy'] = float(value)

# keep only relevant columns
df = df[['f1_score', 'accuracy']]

## Creating the 1- and 2-example augmented datasets from the 4-example files

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Build the 1- and 2-augmentation AG News (EDA) files from the 4-augmentation file.
# The input path previously contained the unformatted placeholders
# '{dataset_name}' / '{method_name}' and therefore pointed at a non-existent file;
# it now matches the agnews/eda paths used for the original data and the outputs.
aug_path = 'data/augmented/agnews/meth_eda_pctwts_0.5_example_4.csv'
org_path = 'data/original/agnews/train.csv'
df_aug = pd.read_csv(aug_path)
df_org = pd.read_csv(org_path)

# Flag rows whose text also appears in the original training set.
df_aug["org"] = df_aug["text"].isin(df_org["text"])
# The next two rows are taken as the augmentations of a row
# (assumes each original row is directly followed by its augmentations - TODO confirm).
# A candidate identical to the row's own text is blanked out.
df_aug['first_aug'] = df_aug['text'].shift(-1)
df_aug['first_aug'] = df_aug['first_aug'].where(df_aug['first_aug'] != df_aug['text'], None)
df_aug['second_aug'] = df_aug['text'].shift(-2)
df_aug['second_aug'] = df_aug['second_aug'].where(df_aug['second_aug'] != df_aug['text'], None)
na_indices = df_aug[df_aug['first_aug'].isna() | df_aug['second_aug'].isna()].index
df_aug = df_aug.drop(na_indices)

# Selecting rows where 'org' is True and only keeping 'class', 'text', 'first_aug', and 'second_aug' columns
df_result = df_aug[df_aug['org'] == True][['class', 'text', 'first_aug', 'second_aug']]

# Remove rows where 'first_aug' or 'second_aug' is itself an original text
original_texts = df_aug[df_aug['org'] == True]['text']
df_result = df_result[~df_result['first_aug'].isin(original_texts)]
# .copy() so the column assignment below works on an independent frame
# instead of a filtered slice (avoids SettingWithCopyWarning).
df_result = df_result[~df_result['second_aug'].isin(original_texts)].copy()

df_first_aug = df_result[['class', 'first_aug']].rename(columns={'first_aug': 'text'})
df_second_aug = df_result[['class', 'second_aug']].rename(columns={'second_aug': 'text'})

df_result['aug_number'] = 'original'
df_first_aug['aug_number'] = 'first_aug'
df_second_aug['aug_number'] = 'second_aug'

df_all = pd.concat([df_result, df_first_aug, df_second_aug])

# The three frames share index labels; a stable sort keeps every original
# directly in front of its first and second augmentation.
df_all = df_all.sort_index(kind='mergesort')

df_one_example = df_all[df_all['aug_number'].isin(['original', 'first_aug'])]
df_two_examples = df_all[df_all['aug_number'].isin(['original', 'first_aug', 'second_aug'])]

df_one_example = df_one_example[['class', 'text']]
df_two_examples = df_two_examples[['class', 'text']]

df_one_example.to_csv('data/augmented/agnews/meth_eda_pctwts_0.5_example_1.csv', index=False)
df_two_examples.to_csv('data/augmented/agnews/meth_eda_pctwts_0.5_example_2.csv', index=False)


def create_aug_df_from_4_example(dataset_name,method_name):
    """Derive the 1- and 2-augmentation CSVs from the 4-augmentation CSV.

    Reads ``data/augmented/{dataset_name}/meth_{method_name}_pctwts_0.5_example_4.csv``
    and ``data/original/{dataset_name}/train.csv``. Rows whose text occurs in
    the original file are treated as originals, and the two rows following an
    original are taken as its first and second augmentation (assumes each
    original is directly followed by its augmentations - TODO confirm).
    Originals are dropped when either augmentation is missing, equals the
    original text, or is itself an original text.

    Parameters
    ----------
    dataset_name : str
        Dataset folder name.
    method_name : str
        Augmentation method name used in the file names.

    Returns
    -------
    None
        Writes ``..._example_1.csv`` (original + first augmentation) and
        ``..._example_2.csv`` (original + both augmentations) next to the
        input file, each with columns ``class`` and ``text``.
    """
    aug_path = f'data/augmented/{dataset_name}/meth_{method_name}_pctwts_0.5_example_4.csv'
    org_path = f'data/original/{dataset_name}/train.csv'
    df_aug = pd.read_csv(aug_path)
    df_org = pd.read_csv(org_path)

    df_aug["org"] = df_aug["text"].isin(df_org["text"])
    for column, offset in (('first_aug', 1), ('second_aug', 2)):
        following = df_aug['text'].shift(-offset)
        # Blank out a candidate that merely repeats the row's own text.
        df_aug[column] = following.where(following != df_aug['text'], None)
    df_aug = df_aug[df_aug['first_aug'].notna() & df_aug['second_aug'].notna()]

    original_texts = df_aug.loc[df_aug['org'], 'text']
    df_result = df_aug.loc[df_aug['org'], ['class', 'text', 'first_aug', 'second_aug']]
    # An "augmentation" that is really another original text is not usable.
    df_result = df_result[~df_result['first_aug'].isin(original_texts)]
    # .copy() so the column assignment below does not act on a filtered slice.
    df_result = df_result[~df_result['second_aug'].isin(original_texts)].copy()

    df_first_aug = df_result[['class', 'first_aug']].rename(columns={'first_aug': 'text'})
    df_second_aug = df_result[['class', 'second_aug']].rename(columns={'second_aug': 'text'})

    df_result['aug_number'] = 'original'
    df_first_aug['aug_number'] = 'first_aug'
    df_second_aug['aug_number'] = 'second_aug'

    # The three frames share index labels; a stable sort keeps every original
    # directly in front of its first and second augmentation.
    df_all = pd.concat([df_result, df_first_aug, df_second_aug]).sort_index(kind='mergesort')

    df_one_example = df_all.loc[df_all['aug_number'].isin(['original', 'first_aug']), ['class', 'text']]
    df_two_examples = df_all.loc[
        df_all['aug_number'].isin(['original', 'first_aug', 'second_aug']), ['class', 'text']
    ]

    # These must be f-strings: without the prefix the files were written to a
    # literal '{dataset_name}' directory.
    df_one_example.to_csv(f'data/augmented/{dataset_name}/meth_{method_name}_pctwts_0.5_example_1.csv', index=False)
    df_two_examples.to_csv(f'data/augmented/{dataset_name}/meth_{method_name}_pctwts_0.5_example_2.csv', index=False)
    



In [13]:
from tqdm import tqdm

def remove_na_augmentations(df_aug, df_org):
    """Annotate rows with their following texts and drop rows lacking two augmentations.

    Adds three columns to a copy of ``df_aug``:

    - ``org``: True when the row's ``text`` occurs in ``df_org['text']``.
    - ``first_aug`` / ``second_aug``: the ``text`` one and two rows further
      down, blanked out when it equals the row's own ``text``.

    Rows where either of the two columns is missing are removed.

    Parameters
    ----------
    df_aug : pandas.DataFrame
        Augmented data with a ``text`` column. It is not modified.
    df_org : pandas.DataFrame
        Original data with a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_aug`` with the three extra columns and only the rows that
        have both ``first_aug`` and ``second_aug``.
    """
    # Work on a copy: the caller's frame used to gain three columns as a side effect.
    annotated = df_aug.copy()
    annotated["org"] = annotated["text"].isin(df_org["text"])
    for column, offset in (('first_aug', 1), ('second_aug', 2)):
        following = annotated['text'].shift(-offset)
        annotated[column] = following.where(following != annotated['text'], None)

    has_both = annotated['first_aug'].notna() & annotated['second_aug'].notna()
    return annotated[has_both]


def select_org_and_aug_cols(df_aug):
    """
    Keep only the original rows together with their two augmentation columns.

    Parameters:
    - df_aug: DataFrame with at least the columns 'org', 'class', 'text',
      'first_aug' and 'second_aug'

    Returns:
    - DataFrame holding the rows where 'org' equals True, restricted to the
      columns 'class', 'text', 'first_aug' and 'second_aug'
    """
    wanted_columns = ['class', 'text', 'first_aug', 'second_aug']
    is_original = df_aug['org'] == True
    return df_aug.loc[is_original, wanted_columns]


def create_augmentations(df_result, df_aug):
    """
    Build the "original + 1 augmentation" and "original + 2 augmentations" frames.

    Rows of ``df_result`` whose 'first_aug' or 'second_aug' is itself an
    original text (a 'text' of ``df_aug`` where 'org' is True) are discarded.
    The remaining originals and their augmentations are stacked so that every
    original is directly followed by its first and then its second augmentation.

    Parameters:
    - df_result: DataFrame with columns 'class', 'text', 'first_aug', 'second_aug'
      (not modified)
    - df_aug: DataFrame with columns 'org' and 'text'

    Returns:
    - df_one_example: DataFrame ('class', 'text') with originals and first augmentations
    - df_two_examples: DataFrame ('class', 'text') with originals and both augmentations
    """
    original_texts = df_aug.loc[df_aug['org'] == True, 'text']
    usable = ~df_result['first_aug'].isin(original_texts) & ~df_result['second_aug'].isin(original_texts)
    # .copy() so the column assignment below works on an independent frame
    # instead of a filtered slice (avoids SettingWithCopyWarning).
    df_result = df_result[usable].copy()

    df_first_aug = df_result[['class', 'first_aug']].rename(columns={'first_aug': 'text'})
    df_second_aug = df_result[['class', 'second_aug']].rename(columns={'second_aug': 'text'})

    df_result['aug_number'] = 'original'
    df_first_aug['aug_number'] = 'first_aug'
    df_second_aug['aug_number'] = 'second_aug'

    # The three frames share index labels. A stable sort keeps the concat order
    # (original, first, second) within each label; the default quicksort does
    # not guarantee that.
    df_all = pd.concat([df_result, df_first_aug, df_second_aug]).sort_index(kind='mergesort')

    df_one_example = df_all.loc[df_all['aug_number'].isin(['original', 'first_aug']), ['class', 'text']]
    df_two_examples = df_all.loc[
        df_all['aug_number'].isin(['original', 'first_aug', 'second_aug']), ['class', 'text']
    ]

    return df_one_example, df_two_examples


def create_aug_df_from_4_example(dataset_name,method_name):
    """
    Write the one- and two-augmentation CSV files derived from the
    four-augmentation CSV of a dataset/method pair.

    Parameters:
    - dataset_name: str, dataset folder name used in the input and output paths
    - method_name: str, augmentation method name used in the file names

    Returns:
    - None. Two CSV files (example_1 and example_2) are written without index.
    """
    augmented = pd.read_csv(f'data/augmented/{dataset_name}/meth_{method_name}_pctwts_0.5_example_4.csv')
    originals = pd.read_csv(f'data/original/{dataset_name}/train.csv')

    # Annotate/filter first, then split into the two output frames.
    augmented = remove_na_augmentations(augmented, originals)
    selected = select_org_and_aug_cols(augmented)
    one_example, two_examples = create_augmentations(selected, augmented)

    one_example.to_csv(f'data/augmented/{dataset_name}/meth_{method_name}_pctwts_0.5_example_1.csv', index=False)
    two_examples.to_csv(f'data/augmented/{dataset_name}/meth_{method_name}_pctwts_0.5_example_2.csv', index=False)


In [37]:
# Generate the example_1 / example_2 CSVs for dataset 'yelp' and method 'checklist'
# (files are written under data/augmented/yelp/).
create_aug_df_from_4_example('yelp','checklist')

# draft

In [18]:
# Inputs for load_results_into_df: root folder, models, datasets, and the
# percentage sub-folders with their matching file-name tags.
path = 'results/original'
model_list = ['bert', 'lstm','cnn']
dataset_list = ['cr', 'trec', 'agnews', 'pc', 'yelp', 'cardio', 'bbc', 'sst2','subj', 'pubmed']
percentage_list = ['10_percent', '20_percent', '50_percent','full']
# numbers[i] is paired with percentage_list[i] (the function zips the two lists).
numbers = ['10', '20', '50', 'full']

df = load_results_into_df(path, model_list, dataset_list, percentage_list, numbers)


In [19]:
# Show the f1_score / accuracy table loaded in the previous cell.
df

Unnamed: 0,f1_score,accuracy
bert_cr_10_percent,0.8866,0.8511
bert_trec_10_percent,0.557,0.668
bert_agnews_10_percent,0.8882,0.8883
bert_pc_10_percent,0.8796,0.8824
bert_yelp_10_percent,0.2984,0.446
...,...,...
cnn_cardio_full,0.2882,0.3083
cnn_bbc_full,0.9522,0.9522
cnn_sst2_full,0.8365,0.8367
cnn_subj_full,0.9066,0.9067
