In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score
import numpy as np

## create predictions file

In [60]:
epochs = [3, 3]
batch_sizes = [8]
lrs = [5e-5, 1e-6, 1e-5]  # [5e-5, 1e-6]
seeds = [42]

results = {}
results[1,2, 3, 5] = 6
results[2, 3, 3, 6] = 8
results

{(1, 2, 3, 5): 6, (2, 3, 3, 6): 8}

In [62]:
results_file_path = 'Data/output/results/{model_name}_on_{dataset}_{date}'.format(
    model_name='t5',
    dataset='igg',
    date='23_08_2023'
)

with open(results_file_path, 'a') as f:
    for k, v in results.items():
        f.write(f'ep: {k[0]}, bs: {k[1]}, lr: {k[2]}, seed: {k[3]}\n')
        f.write(f'accuracy = {v}\n')

# find smallest test size

In [74]:
path = './Data/humor_datasets/{dataset}/with_val_fixed_train/test.csv'
test_size = None
datasets = ['amazon', 'headlines', 'igg', 'twss']
for dataset in datasets:
    df = pd.read_csv(path.format(dataset=dataset))
    print(f'test size of {dataset} is {len(df)}')
test_size

test size of amazon is 8359
test size of headlines is 5150
test size of igg is 519
test size of twss is 788


In [76]:
path = './Data/humor_datasets/{dataset}/with_val_fixed_train/train.csv'
train_size = None
datasets = ['amazon', 'headlines', 'igg', 'twss']
for dataset in datasets:
    df = pd.read_csv(path.format(dataset=dataset))
    print(f'train size of {dataset} is {len(df)}')
    print(f'label 1 count is {len(df[df.label==1])}')
train_size

train size of amazon is 2376
label 1 count is 1188
train size of headlines is 2376
label 1 count is 1188
train size of igg is 2376
label 1 count is 1188
train size of twss is 2376
label 1 count is 1188


In [77]:
path = './Data/humor_datasets/{dataset}/with_val_fixed_train/val.csv'
val_size = None
datasets = ['amazon', 'headlines', 'igg', 'twss']
for dataset in datasets:
    df = pd.read_csv(path.format(dataset=dataset))
    print(f'val size of {dataset} is {len(df)}')
val_size

val size of amazon is 8359
val size of headlines is 5150
val size of igg is 519
val size of twss is 788


# distribution of test labels

In [85]:
path = './Data/humor_datasets/{dataset}/with_val_fixed_train/test.csv'
max_test_size = 3500
datasets = ['amazon', 'headlines', 'igg', 'twss']
for dataset in datasets:
    df = pd.read_csv(path.format(dataset=dataset))
    df = df.iloc[:min(len(df), max_test_size)]
    df_1 = df[df.label == 1]
    df_0 = df[df.label == 0]

    # print(f'test size of {dataset} is {len(df)}')
    print(f'for {dataset}:')
    print(f'%label 1 = {"%.2f" % (100 * len(df_1) / len(df))}, %label 0 = {"%.2f" % (100 * len(df_0) / len(df))}')


for amazon:
%label 1 = 50.77, %label 0 = 49.23
for headlines:
%label 1 = 48.86, %label 0 = 51.14
for igg:
%label 1 = 52.79, %label 0 = 47.21
for twss:
%label 1 = 47.84, %label 0 = 52.16


# distribution of paired val label

In [24]:
path = './Data/humor_datasets/paired_datasets/{dataset}/with_val_fixed_train/val.csv'
max_val_size = 3500
datasets = ["amazon_headlines", "amazon_igg", "amazon_twss", "headlines_igg", "headlines_twss", "igg_twss"]
for dataset in datasets:
    df = pd.read_csv(path.format(dataset=dataset))
    df = df.iloc[:min(len(df), max_val_size)]
    df_1 = df[df.label == 1]
    df_0 = df[df.label == 0]

    print(f'for {dataset}:')
    print(f'val size = {len(df)}')
    print(f'%label 1 = {"%.2f" % (100 * len(df_1) / len(df))}, %label 0 = {"%.2f" % (100 * len(df_0) / len(df))}')


for amazon_headlines:
val size = 1038
%label 1 = 48.84, %label 0 = 51.16
for amazon_igg:
val size = 1038
%label 1 = 48.55, %label 0 = 51.45
for amazon_twss:
val size = 1038
%label 1 = 49.90, %label 0 = 50.10
for headlines_igg:
val size = 1038
%label 1 = 47.50, %label 0 = 52.50
for headlines_twss:
val size = 1038
%label 1 = 48.84, %label 0 = 51.16
for igg_twss:
val size = 1038
%label 1 = 48.55, %label 0 = 51.45


# distribution of val labels

In [22]:
path = './Data/humor_datasets/{dataset}/with_val_fixed_train/val.csv'
max_val_size = 3500
datasets = ['amazon', 'headlines', 'igg', 'twss']
for dataset in datasets:
    df = pd.read_csv(path.format(dataset=dataset))
    df = df.iloc[:min(len(df), max_val_size)]
    df_1 = df[df.label == 1]
    df_0 = df[df.label == 0]

    print(f'for {dataset}:')
    print(f'%label 1 = {"%.2f" % (100 * len(df_1) / len(df))}, %label 0 = {"%.2f" % (100 * len(df_0) / len(df))}')


FileNotFoundError: [Errno 2] No such file or directory: './Data/humor_datasets/amazon_headlines/with_val_fixed_train/val.csv'

# compute performance of the model

In [14]:
def get_run_details(run_name):
    run_data = run_name.split('_')
    model = run_data[0]
    dataset_name = run_data[2]
    seed = run_data[3][run_data[3].index('=') + 1:]

    # return model, dataset_name, float(seed)
    return dataset_name, float(seed)

In [26]:
from os.path import exists
from sklearn.metrics import precision_score, recall_score, accuracy_score
import glob
import os

output_path = './Data/output/results/'
# dataset_names = ['amazon', 'headlines', 'igg', 'twss']
dataset_names = ["amazon-headlines", "amazon-igg", "amazon-twss", "headlines-igg", "headlines-twss", "igg-twss"]
data_path = './Data/humor_datasets/'
split_type = 'with_val_fixed_train'
models_path = './Model/SavedModels/Bert-paired'
base_model = 'bert'
models_name = [glob.glob(f'{models_path}/{base_model}_on_{dataset}*')[0] for dataset in dataset_names]

df = pd.read_excel(output_path + 'humor_results_template.xlsx')
df.fillna(method='ffill', axis=0, inplace=True)
df.set_index(['performance', 'model', 'trained on', 'seed'], inplace=True)

for model_name in models_name:
    # base_model, dataset_name, seed = get_run_details(model_name)
    dataset_name, seed = get_run_details(model_name)
    pred_path = model_name + '/predictions/'
    accuracies = {}
    recall = {}
    precision = {}
    predict_dataset_names = ['amazon', 'headlines', 'igg', 'twss']
    for dataset in predict_dataset_names:
        pred_labels_path = pred_path + f'{dataset}_preds.csv'
        test_labels_path = data_path + f'{dataset}/{split_type}/test.csv'
        if not (exists(pred_labels_path) and exists(test_labels_path)):
            print('didnt find preds/test path')
            continue

        _preds = pd.read_csv(pred_labels_path)
        _test = pd.read_csv(test_labels_path)
        _test = _test.iloc[:len(_preds)]
        if (len(_preds[_preds.label == -1]) > 0):
            illegal_indices = _preds[_preds.label == -1].index
            print(f'there are {len(illegal_indices)} illegal indices in {dataset_name} predictions on {dataset}')
            _preds = _preds.drop(labels=illegal_indices, axis=0)
            _test = _test.drop(labels=illegal_indices, axis=0)
        accuracies[dataset] = float("%.4f" % accuracy_score(_test.label, _preds.label))
        recall[dataset] = float("%.4f" % recall_score(_test.label, _preds.label))
        precision[dataset] = float("%.4f" % precision_score(_test.label, _preds.label))

    print(f'performance for {model_name}')
    print(f'accuracies = {accuracies}')
    print(f'recall = {recall}')
    print(f'precision = {precision}')

    df.loc[('accuracy', base_model, dataset_name, seed)] = accuracies
    df.loc[('recall', base_model, dataset_name, seed)] = recall
    df.loc[('precision', base_model, dataset_name, seed)] = precision

# save performance to output file
i = 0
while os.path.exists(output_path + f'humor_results_{i}.xlsx'):
    i += 1

df.to_excel(output_path + f'humor_results_{i}.xlsx')

performance for ./Model/SavedModels/Bert-paired\bert_on_amazon-headlines_seed=5
accuracies = {'amazon': 0.8329, 'headlines': 0.5674, 'igg': 0.6301, 'twss': 0.2538}
recall = {'amazon': 0.8362, 'headlines': 0.5988, 'igg': 0.3285, 'twss': 0.3581}
precision = {'amazon': 0.8348, 'headlines': 0.5529, 'igg': 0.9184, 'twss': 0.2807}
performance for ./Model/SavedModels/Bert-paired\bert_on_amazon-igg_seed=5
accuracies = {'amazon': 0.8346, 'headlines': 0.4957, 'igg': 0.8882, 'twss': 0.3566}
recall = {'amazon': 0.8244, 'headlines': 0.9912, 'igg': 0.8905, 'twss': 0.7082}
precision = {'amazon': 0.8458, 'headlines': 0.492, 'igg': 0.8971, 'twss': 0.4021}
performance for ./Model/SavedModels/Bert-paired\bert_on_amazon-twss_seed=18
accuracies = {'amazon': 0.812, 'headlines': 0.5034, 'igg': 0.8651, 'twss': 0.9937}
recall = {'amazon': 0.8351, 'headlines': 0.7544, 'igg': 0.8394, 'twss': 0.9947}
precision = {'amazon': 0.8026, 'headlines': 0.4946, 'igg': 0.8984, 'twss': 0.9921}
performance for ./Model/SavedMo

# compute T5 models mean & std accuracy

In [18]:
acc_igg = [0.9347826086956522, 0.9391304347826087, 0.9376811594202898, 0.936231884057971]
acc_amazon = [0.8557142857142858, 0.8542857142857143, 0.8551428571428571, 0.8554285714285714]
acc_headlines = [0.5831428571428572, 0.5805714285714285, 0.5822857142857143, 0.5805714285714285]
acc_twss = [0.45634920634920634, 0.4777636594663278, 0.4885786802030457, 0.799492385786802]
accs = {'amazon': acc_amazon, 'headlines': acc_headlines, 'igg': acc_igg, 'twss': acc_twss}
for k,v in accs.items():
    print(f'{k}: {"%.4f" % np.mean(v)} +- {"%.3f" % np.std(v)}')

amazon: 0.8551 +- 0.001
headlines: 0.5816 +- 0.001
igg: 0.9370 +- 0.002
twss: 0.5555 +- 0.141


# compute Bert models mean & std accuracy


In [7]:
acc_igg = [0.9094412331406551, 0.9113680154142582, 0.9036608863198459, 0.8747591522157996]
acc_amazon = [0.8414285714285714, 0.8368571428571429, 0.8305714285714285, 0.8354285714285714]
acc_headlines = [0.5834285714285714, 0.586, 0.6031428571428571, 0.6091428571428571]
acc_twss = [0.9949238578680203, 0.9898477157360406, 0.9961928934010152, 0.9949238578680203]
accs = {'amazon': acc_amazon, 'headlines': acc_headlines, 'igg': acc_igg, 'twss': acc_twss}
for k,v in accs.items():
    print(f'{k}: {"%.4f" % np.mean(v)} +- {"%.3f" % np.std(v)}')

amazon: 0.8361 +- 0.004
headlines: 0.5954 +- 0.011
igg: 0.8998 +- 0.015
twss: 0.9940 +- 0.002


# compute T5 models on pairs mean & std

In [19]:
amazon_headlines_on_amazon = [0.7637142857142857, 0.8462857142857143, 0.8562857142857143, 0.8534285714285714]
amazon_igg_on_amazon = [0.8071428571428572, 0.842, 0.8505714285714285, 0.8548571428571429]
amazon_twss_on_amazon = [0.8431428571428572, 0.8505714285714285, 0.8531428571428571, 0.8531428571428571]
headlines_igg_on_headlines = [0.5791428571428572, 0.5842857142857143, 0.5828571428571429, 0.5785714285714286]
headlines_twss_on_twss = [0.9822335025380711, 0.983502538071066, 0.983502538071066, 0.9873096446700508]
igg_twss_on_igg = [0.905587668593449, 0.905587668593449, 0.905587668593449, 0.9132947976878613]
accs = {'amazon_healines': amazon_headlines_on_amazon,
        'amazon_igg': amazon_igg_on_amazon,
        'amazon_twss': amazon_twss_on_amazon,
        'headlines_igg': headlines_igg_on_headlines,
        'headlines_twss': headlines_twss_on_twss,
        'igg_twss': igg_twss_on_igg}

for k,v in accs.items():
    print(f'{k}: {"%.4f" % np.mean(v)} +- {"%.3f" % np.std(v)}')

amazon_healines: 0.8299 +- 0.038
amazon_igg: 0.8386 +- 0.019
amazon_twss: 0.8500 +- 0.004
headlines_igg: 0.5812 +- 0.002
headlines_twss: 0.9841 +- 0.002
igg_twss: 0.9075 +- 0.003


# compute Bert models on pairs mean & std

In [27]:
amazon_headlines_on_amazon = [0.8328571428571429, 0.83, 0.8294285714285714, 0.8257142857142857]
amazon_igg_on_amazon = [0.8345714285714285, 0.8274285714285714, 0.8214285714285714, 0.8202857142857143]
amazon_twss_on_amazon = [0.8105714285714286, 0.812, 0.7977142857142857, 0.8068571428571428]
headlines_igg_on_igg = [0.8728323699421965, 0.8766859344894027, 0.8786127167630058, 0.8863198458574181]
headlines_twss_on_twss = [0.9796954314720813, 0.9873096446700508, 0.9885786802030457, 0.9860406091370558]
igg_twss_on_igg = [0.861271676300578, 0.8535645472061657, 0.7976878612716763, 0.8554913294797688]
accs = {'amazon_healines': amazon_headlines_on_amazon,
        'amazon_igg': amazon_igg_on_amazon,
        'amazon_twss': amazon_twss_on_amazon,
        'headlines_igg': headlines_igg_on_igg,
        'headlines_twss': headlines_twss_on_twss,
        'igg_twss': igg_twss_on_igg}

for k,v in accs.items():
    print(f'{k}: {"%.4f" % np.mean(v)} +- {"%.3f" % np.std(v)}')

amazon_healines: 0.8295 +- 0.003
amazon_igg: 0.8259 +- 0.006
amazon_twss: 0.8068 +- 0.006
headlines_igg: 0.8786 +- 0.005
headlines_twss: 0.9854 +- 0.003
igg_twss: 0.8420 +- 0.026


# check headlines dataset meanGrade

In [24]:
processed_headlines_path = './Data/humor_datasets/headlines/with_val_fixed_train/{split}.csv'
processed_train_df = pd.read_csv(processed_headlines_path.format(split='train'))
processed_test_df = pd.read_csv(processed_headlines_path.format(split='test'))
processed_val_df = pd.read_csv(processed_headlines_path.format(split='val'))

In [25]:
original_headlines_path = './Data/original_datasets/headlines/{split}.csv'
original_train_df = pd.read_csv(original_headlines_path.format(split='train'))
original_test_df = pd.read_csv(original_headlines_path.format(split='test'))
original_all = original_train_df.append(original_test_df, ignore_index=True)

In [26]:
def add_column_data(column_name):
    def add_col_to_row(row):
        origin_row = original_all[original_all['id'] == row['id']].squeeze()
        return origin_row[column_name]
    return add_col_to_row

In [27]:
cols = ['meanGrade', 'original', 'edit']
for col in cols:
    processed_train_df[col] = processed_train_df.apply(add_column_data(col), axis=1)
    processed_test_df[col] = processed_test_df.apply(add_column_data(col), axis=1)
    processed_val_df[col] = processed_val_df.apply(add_column_data(col), axis=1)

In [28]:
max_val_size = 3500
processed_test_df = processed_test_df.iloc[:max_val_size]
processed_val_df = processed_val_df.iloc[:max_val_size]

In [29]:
print('Mean Grade stats:')
print(f'-- train --')
print(f'mean = {processed_train_df.meanGrade.mean()}, std = {processed_train_df.meanGrade.std()}')
print(f'-- test --')
print(f'mean = {processed_test_df.meanGrade.mean()}, std = {processed_test_df.meanGrade.std()}')
print(f'-- val --')
print(f'mean = {processed_val_df.meanGrade.mean()}, std = {processed_val_df.meanGrade.std()}')

Mean Grade stats:
-- train --
mean = 0.9420454545454544, std = 0.5772500621369475
-- test --
mean = 0.9382, std = 0.5886325917698874
-- val --
mean = 0.9175238095238093, std = 0.5766939173305733


In [30]:
sample_size = 100
sample_train = processed_train_df.iloc[:sample_size]
sample_test = processed_test_df.iloc[:sample_size]

In [32]:
import os

output_path = './Data/output/headlines_data_samples/'
os.makedirs(output_path, exist_ok=True)
sample_train.to_csv(output_path + 'train.csv', index=False)
sample_test.to_csv(output_path + 'test.csv', index=False)

## check headlines dataset

In [1]:
import pandas as pd
headlines_path = './Data/humor_datasets/headlines/{sanity}with_val_fixed_train'
real_train = pd.read_csv(headlines_path.format(sanity='') + '/train.csv')
real_test = pd.read_csv(headlines_path.format(sanity='') + '/test.csv')
sanity_train = pd.read_csv(headlines_path.format(sanity='sanity-check/') + '/train.csv')
sanity_test = pd.read_csv(headlines_path.format(sanity='sanity-check/') + '/test.csv')

In [3]:
original_headlines_path = './Data/original_datasets/headlines/{split}.csv'
original_train = pd.read_csv(original_headlines_path.format(split='train'))
original_test = pd.read_csv(original_headlines_path.format(split='test'))
all_data = original_test.append(original_train, ignore_index=True)

In [5]:
def edit_headline(row):
    headline = row['original']
    edit_word = row['edit']
    res = headline[:headline.index('<')] + edit_word + headline[headline.index('>') + 1:]
    return res

In [6]:
all_data['edited_sentence'] = all_data.apply(edit_headline, axis=1)

In [9]:
def add_mean_grade(row):
    origin_row = all_data[all_data['id'] == row['id']].squeeze()
    return origin_row['meanGrade']

In [11]:
real_train['meanGrade'] = real_train.apply(add_mean_grade, axis=1)

# T5 on all headlines acccuracy

In [13]:
import numpy as np

headlines_accuracies = [0.6127912741695587, 0.6326227069905801, 0.6351016360932077, 0.6271690629647992,
0.6256817055032227, 0.6256817055032227, 0.624194348041646, 0.623202776400595]
print(f'headlines accuracy: mean = {"%.4f" % np.mean(headlines_accuracies)}, std = {"%.4f" % np.std(headlines_accuracies)}')

headlines accuracy: mean = 0.6258, std = 0.0063


## Trying kfold

In [31]:
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import os

data_path = './Data/humor_datasets/headlines/kfold_cv/'
data = pd.read_csv(data_path + 'data.csv')
os.makedirs(data_path + 'folds/', exist_ok=True)
kfold = StratifiedKFold(4)

In [32]:
for i, indices in enumerate(kfold.split(data, data['label'])):
    train = indices[0]
    test = indices[1]
    # print('train: %s, test: %s' % (train, test))
    print(f'train size = {len(train)}, test size = {len(test)}')
    data_train = data.iloc[train]
    print(f'label 1 % = {100 * len(data_train[data_train.label == 1]) / len(data_train)}')
    data_test = data.iloc[test]
    data_test, data_val = train_test_split(data_test, test_size=0.5, shuffle=True, random_state=0)
    output_path = data_path + f'folds/fold_{i}/with_val/'
    os.makedirs(output_path, exist_ok=True)
    data_train.to_csv(output_path + 'train.csv', index=False)
    data_test.to_csv(output_path + 'test.csv', index=False)
    data_val.to_csv(output_path + 'val.csv', index=False)

train size = 9295, test size = 3099
label 1 % = 49.994620763851536
train size = 9295, test size = 3099
label 1 % = 50.005379236148464
train size = 9296, test size = 3098
label 1 % = 50.0
train size = 9296, test size = 3098
label 1 % = 50.0


In [35]:
datasets = ['amazon', 'headlines', 'igg', 'twss']
output_path = './Data/humor_datasets/{dataset}/kfold_cv/'
input_path = './Data/humor_datasets/{dataset}/data.csv'
kfold = StratifiedKFold(n_splits=4)

# compute fixed train size by igg train size
igg_df = pd.read_csv(output_path.format(dataset='igg') + 'balanced_data.csv')
splits = kfold.split(igg_df, igg_df['label'])
for train, test in splits:
    # print('train: %s, test: %s' % (train, test))
    print(f'train size = {len(train)}, test size = {len(test)}')
    data_train = data.iloc[train]
    print(f'label 1 % = {100 * len(data_train[data_train.label == 1]) / len(data_train)}')

train size = 2560, test size = 854
label 1 % = 49.609375
train size = 2560, test size = 854
label 1 % = 50.1953125
train size = 2561, test size = 853
label 1 % = 51.34713002733307
train size = 2561, test size = 853
label 1 % = 50.488090589613435


In [36]:
import pandas as pd

# Sample DataFrames
df1 = pd.DataFrame({'A': [1, 2, 3, 4],
                    'B': ['X', 'Y', 'Z', 'W']})

df2 = pd.DataFrame({'A': [3, 2, 5, 6],
                    'B': ['Z', 'Y', 'P', 'Q']})

# Check for common rows
common_rows = df1[df1.isin(df2.to_dict(orient='list')).all(axis=1)]

# Check if there are common rows
if not common_rows.empty:
    print("There are common rows.")
else:
    print("There are no common rows.")

There are common rows.
