In [1]:
import pandas as pd
import torch
from tqdm import tqdm, trange
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from codalab_utils.get_names import Names

In [2]:
# Project helper from codalab_utils; later cells read `names.train_path` and
# call `names.select_file('train')` to locate the dataset CSVs.
names = Names()

In [3]:
# Checkpoint of the T5-based paraphraser; named once so the tokenizer and the
# model are always loaded from the same place.
PARAPHRASER_CHECKPOINT = "humarin/chatgpt_paraphraser_on_T5_base"

# Prefer the GPU, but fall back to CPU so this cell does not crash on machines
# without CUDA (generation will be much slower there).
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(PARAPHRASER_CHECKPOINT)

model = AutoModelForSeq2SeqLM.from_pretrained(PARAPHRASER_CHECKPOINT).to(device)

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of ``question`` with the notebook-level model.

    The text is prefixed with ``paraphrase: ``, tokenized (truncated to
    ``max_length`` tokens), moved to ``device`` and passed to
    ``model.generate``. The generated ids are decoded with special tokens
    stripped.

    Args:
        question: Text to paraphrase.
        num_beams: Beam count forwarded to ``generate``. Raised automatically
            when it is smaller than ``num_return_sequences``.
        num_beam_groups: Number of beam groups forwarded to ``generate``.
        num_return_sequences: How many paraphrases to return. Values below 1
            yield an empty list without calling the model.
        repetition_penalty: Forwarded to ``generate``.
        diversity_penalty: Forwarded to ``generate``.
        no_repeat_ngram_size: Forwarded to ``generate``.
        temperature: Forwarded to ``generate``.
            NOTE(review): sampling is not enabled here, so this value
            presumably has no effect on the output - confirm.
        max_length: Used both as the tokenizer truncation length and as the
            generation ``max_length``.

    Returns:
        list[str]: The decoded sequences produced by ``generate``.
    """
    # Callers compute the requested count from class sizes, so it can be 0;
    # there is nothing to generate in that case.
    if num_return_sequences < 1:
        return []

    # generate() rejects num_return_sequences > num_beams, and group beam
    # search needs num_beams to be a multiple of num_beam_groups. Grow the
    # beam count to the smallest multiple that satisfies both.
    if num_return_sequences > num_beams:
        group_count = max(num_beam_groups, 1)
        num_beams = -(-num_return_sequences // group_count) * group_count

    input_ids = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids.to(device)

    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


In [4]:
# Build the path of the interactively selected training CSV, then derive the
# validation and test paths by substituting 'train' in that path.
train_file = names.train_path + names.select_file('train')
val_file, test_file = (
    train_file.replace('train', split_name) for split_name in ('val', 'test')
)

	0 : hopeedi_train.csv
	1 : train_polyhope_spanish_cleaned.csv
	2 : train_polyhope_spanish_cleaned_noemoji.csv
	3 : train_polyhope_english_cleaned.csv
	4 : train_polyhope_spanish.csv
	5 : train_polyhope_english.csv
	6 : train_polyhope_english_cleaned_noemoji.csv


Select train file :  6


In [5]:
# Load the three splits in train / val / test order.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_file, val_file, test_file)
)

In [6]:
# One sub-frame of the training data per value of the 'multiclass' column.
train_gh = df_train.loc[df_train['multiclass'].eq('Generalized Hope')]
train_rh = df_train.loc[df_train['multiclass'].eq('Realistic Hope')]
train_uh = df_train.loc[df_train['multiclass'].eq('Unrealistic Hope')]
train_nh = df_train.loc[df_train['multiclass'].eq('Not Hope')]

In [7]:
# Target class size for oversampling: the number of 'Not Hope' training rows.
max_l = len(train_nh)

# Paraphrases to request per original example: how many whole times each
# class fits into the target size.
num_seq_gh = max_l // len(train_gh)
num_seq_rh = max_l // len(train_rh)
num_seq_uh = max_l // len(train_uh)

In [8]:
# Oversample 'Generalized Hope' training rows with paraphrases, stopping
# before the class would grow beyond max_l.
train_gh_texts = train_gh['text'].tolist()
train_gh_multi = train_gh['multiclass'].tolist()
added_gh_texts = []
added_gh_multi = []

for text, label in tqdm(zip(train_gh_texts, train_gh_multi), total=len(train_gh_texts)):
    # Include the batch about to be generated in the cap check; checking only
    # what was already added lets the class overshoot max_l.
    if len(train_gh_multi) + len(added_gh_multi) + num_seq_gh > max_l:
        break
    paraphrases = paraphrase(text, num_return_sequences=num_seq_gh)
    added_gh_texts.extend(paraphrases)
    # One label per returned paraphrase keeps texts and labels aligned.
    added_gh_multi.extend([label] * len(paraphrases))
    

 79%|███████████████████████████████████████████████▍            | 1363/1726 [22:59<06:07,  1.01s/it]


In [9]:
# Oversample 'Realistic Hope' training rows with paraphrases, stopping before
# the class would grow beyond max_l.
train_rh_texts = train_rh['text'].tolist()
train_rh_multi = train_rh['multiclass'].tolist()
added_rh_texts = []
added_rh_multi = []

for text, label in tqdm(zip(train_rh_texts, train_rh_multi), total=len(train_rh_texts)):
    # Include the batch about to be generated in the cap check; checking only
    # what was already added lets the class overshoot max_l.
    if len(train_rh_multi) + len(added_rh_multi) + num_seq_rh > max_l:
        break
    paraphrases = paraphrase(text, num_return_sequences=num_seq_rh)
    added_rh_texts.extend(paraphrases)
    # One label per returned paraphrase keeps texts and labels aligned.
    added_rh_multi.extend([label] * len(paraphrases))

 81%|██████████████████████████████████████████████████            | 590/730 [10:58<02:36,  1.12s/it]


In [11]:
# Oversample 'Unrealistic Hope' training rows with paraphrases, stopping
# before the class would grow beyond max_l.
train_uh_texts = train_uh['text'].tolist()
train_uh_multi = train_uh['multiclass'].tolist()
added_uh_texts = []
added_uh_multi = []

for text, label in tqdm(zip(train_uh_texts, train_uh_multi), total=len(train_uh_texts)):
    # Include the batch about to be generated in the cap check; checking only
    # what was already added lets the class overshoot max_l.
    if len(train_uh_multi) + len(added_uh_multi) + num_seq_uh > max_l:
        break
    paraphrases = paraphrase(text, num_return_sequences=num_seq_uh)
    added_uh_texts.extend(paraphrases)
    # One label per returned paraphrase keeps texts and labels aligned.
    added_uh_multi.extend([label] * len(paraphrases))

 94%|██████████████████████████████████████████████████████████▍   | 611/648 [09:09<00:33,  1.11it/s]


In [12]:
# Split the test frame by 'multiclass' value.
# NOTE(review): these values drive paraphrase oversampling of the *test*
# split in the following cells - confirm that evaluating on augmented data
# is intended.
test_gh = df_test.loc[df_test['multiclass'].eq('Generalized Hope')]
test_rh = df_test.loc[df_test['multiclass'].eq('Realistic Hope')]
test_uh = df_test.loc[df_test['multiclass'].eq('Unrealistic Hope')]
test_nh = df_test.loc[df_test['multiclass'].eq('Not Hope')]

# Target class size: the number of 'Not Hope' test rows.
max_l_test = len(test_nh)

# Whole number of times each class fits into the target size.
num_seq_gh_test = max_l_test // len(test_gh)
num_seq_rh_test = max_l_test // len(test_rh)
num_seq_uh_test = max_l_test // len(test_uh)

In [13]:
# Oversample 'Generalized Hope' test rows with paraphrases, stopping before
# the class would grow beyond max_l_test.
test_gh_texts = test_gh['text'].tolist()
test_gh_multi = test_gh['multiclass'].tolist()
added_gh_texts_test = []
added_gh_multi_test = []

for text, label in tqdm(zip(test_gh_texts, test_gh_multi), total=len(test_gh_texts)):
    # Include the batch about to be generated in the cap check; checking only
    # what was already added lets the class overshoot max_l_test.
    if len(test_gh_multi) + len(added_gh_multi_test) + num_seq_gh_test > max_l_test:
        break
    paraphrases = paraphrase(text, num_return_sequences=num_seq_gh_test)
    added_gh_texts_test.extend(paraphrases)
    # One label per returned paraphrase keeps texts and labels aligned.
    added_gh_multi_test.extend([label] * len(paraphrases))
    

 59%|████████████████████████████████████▋                         | 183/309 [02:58<02:02,  1.03it/s]


In [14]:
# Oversample 'Realistic Hope' test rows with paraphrases, stopping before the
# class would grow beyond max_l_test.
test_rh_texts = test_rh['text'].tolist()
test_rh_multi = test_rh['multiclass'].tolist()
added_rh_texts_test = []
added_rh_multi_test = []

for text, label in tqdm(zip(test_rh_texts, test_rh_multi), total=len(test_rh_texts)):
    # Include the batch about to be generated in the cap check; checking only
    # what was already added lets the class overshoot max_l_test.
    if len(test_rh_multi) + len(added_rh_multi_test) + num_seq_rh_test > max_l_test:
        break
    paraphrases = paraphrase(text, num_return_sequences=num_seq_rh_test)
    added_rh_texts_test.extend(paraphrases)
    # One label per returned paraphrase keeps texts and labels aligned.
    added_rh_multi_test.extend([label] * len(paraphrases))

 99%|█████████████████████████████████████████████████████████████▌| 123/124 [02:03<00:01,  1.01s/it]


In [15]:
# Oversample 'Unrealistic Hope' test rows with paraphrases, stopping before
# the class would grow beyond max_l_test.
test_uh_texts = test_uh['text'].tolist()
test_uh_multi = test_uh['multiclass'].tolist()
added_uh_texts_test = []
added_uh_multi_test = []

for text, label in tqdm(zip(test_uh_texts, test_uh_multi), total=len(test_uh_texts)):
    # Include the batch about to be generated in the cap check; checking only
    # what was already added lets the class overshoot max_l_test.
    if len(test_uh_multi) + len(added_uh_multi_test) + num_seq_uh_test > max_l_test:
        break
    paraphrases = paraphrase(text, num_return_sequences=num_seq_uh_test)
    added_uh_texts_test.extend(paraphrases)
    # One label per returned paraphrase keeps texts and labels aligned.
    added_uh_multi_test.extend([label] * len(paraphrases))

 89%|███████████████████████████████████████████████████████████▌       | 96/108 [01:22<00:10,  1.16it/s]


In [16]:
# Split the validation frame by 'multiclass' value.
val_gh = df_val.loc[df_val['multiclass'].eq('Generalized Hope')]
val_rh = df_val.loc[df_val['multiclass'].eq('Realistic Hope')]
val_uh = df_val.loc[df_val['multiclass'].eq('Unrealistic Hope')]
val_nh = df_val.loc[df_val['multiclass'].eq('Not Hope')]

# Target class size: the number of 'Not Hope' validation rows.
max_l_val = len(val_nh)

# Whole number of times each class fits into the target size.
num_seq_gh_val = max_l_val // len(val_gh)
num_seq_rh_val = max_l_val // len(val_rh)
num_seq_uh_val = max_l_val // len(val_uh)

In [17]:
# Oversample 'Generalized Hope' validation rows with paraphrases, stopping
# before the class would grow beyond max_l_val.
val_gh_texts = val_gh['text'].tolist()
val_gh_multi = val_gh['multiclass'].tolist()
added_gh_texts_val = []
added_gh_multi_val = []

for text, label in tqdm(zip(val_gh_texts, val_gh_multi), total=len(val_gh_texts)):
    # Include the batch about to be generated in the cap check; checking only
    # what was already added lets the class overshoot max_l_val.
    if len(val_gh_multi) + len(added_gh_multi_val) + num_seq_gh_val > max_l_val:
        break
    paraphrases = paraphrase(text, num_return_sequences=num_seq_gh_val)
    added_gh_texts_val.extend(paraphrases)
    # One label per returned paraphrase keeps texts and labels aligned.
    added_gh_multi_val.extend([label] * len(paraphrases))

 68%|████████████████████████████████████████████▋                     | 203/300 [03:06<01:29,  1.09it/s]


In [18]:
# Oversample 'Realistic Hope' validation rows with paraphrases, stopping
# before the class would grow beyond max_l_val.
val_rh_texts = val_rh['text'].tolist()
val_rh_multi = val_rh['multiclass'].tolist()
added_rh_texts_val = []
added_rh_multi_val = []

for text, label in tqdm(zip(val_rh_texts, val_rh_multi), total=len(val_rh_texts)):
    # Include the batch about to be generated in the cap check; checking only
    # what was already added lets the class overshoot max_l_val.
    if len(val_rh_multi) + len(added_rh_multi_val) + num_seq_rh_val > max_l_val:
        break
    paraphrases = paraphrase(text, num_return_sequences=num_seq_rh_val)
    added_rh_texts_val.extend(paraphrases)
    # One label per returned paraphrase keeps texts and labels aligned.
    added_rh_multi_val.extend([label] * len(paraphrases))

 98%|████████████████████████████████████████████████████████████▌ | 125/128 [02:07<00:03,  1.02s/it]


In [19]:
# Oversample 'Unrealistic Hope' validation rows with paraphrases, stopping
# before the class would grow beyond max_l_val.
val_uh_texts = val_uh['text'].tolist()
val_uh_multi = val_uh['multiclass'].tolist()
added_uh_texts_val = []
added_uh_multi_val = []

for text, label in tqdm(zip(val_uh_texts, val_uh_multi), total=len(val_uh_texts)):
    # Include the batch about to be generated in the cap check; checking only
    # what was already added lets the class overshoot max_l_val.
    if len(val_uh_multi) + len(added_uh_multi_val) + num_seq_uh_val > max_l_val:
        break
    paraphrases = paraphrase(text, num_return_sequences=num_seq_uh_val)
    added_uh_texts_val.extend(paraphrases)
    # One label per returned paraphrase keeps texts and labels aligned.
    added_uh_multi_val.extend([label] * len(paraphrases))

 99%|█████████████████████████████████████████████████████████████▍| 101/102 [01:31<00:00,  1.11it/s]


In [None]:
# Augmented training set: every original row followed by the generated
# paraphrases. The per-class `train_*_texts` lists are the original texts and
# are already contained in df_train, so only the `added_*` lists are appended.
new_train_texts = (
    df_train['text'].tolist()
    + added_gh_texts + added_rh_texts + added_uh_texts
)
# Labels are concatenated in the same order as the texts above.
new_train_labels = (
    df_train['multiclass'].tolist()
    + added_gh_multi + added_rh_multi + added_uh_multi
)

assert len(new_train_texts) == len(new_train_labels), "texts and labels are misaligned"