In [1]:
# Pinned to the release this notebook was run against so that a fresh kernel
# reproduces the same environment; %pip installs into the running kernel's
# environment rather than whatever `pip` happens to be first on PATH.
%pip install simpletransformers==0.61.13

Collecting simpletransformers
  Downloading simpletransformers-0.61.13-py3-none-any.whl (221 kB)
[K     |████████████████████████████████| 221 kB 8.4 MB/s 
[?25hCollecting streamlit
  Downloading streamlit-0.86.0-py2.py3-none-any.whl (8.0 MB)
[K     |████████████████████████████████| 8.0 MB 48.5 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 51.5 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 46.8 MB/s 
Collecting tensorboardx
  Downloading tensorboardX-2.4-py2.py3-none-any.whl (124 kB)
[K     |████████████████████████████████| 124 kB 69.2 MB/s 
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.7 MB/s 
[?25h

In [2]:
## code data_download.sh
# Fetch the labeled PAWS-Wiki archive into ./data and unpack it in place
# (the archive extracts to data/final/{train,dev,test}.tsv).
# -nc (no-clobber) makes the cell safe to re-run: without it a second run saves
# a duplicate "paws_wiki_labeled_final.tar.gz.1" next to the existing archive.
!wget -nc https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz -P data
!tar -xvf data/paws_wiki_labeled_final.tar.gz -C data

--2021-08-13 17:44:44--  https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.197.128, 74.125.142.128, 74.125.195.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.197.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4687157 (4.5M) [application/gzip]
Saving to: ‘data/paws_wiki_labeled_final.tar.gz’


2021-08-13 17:44:44 (234 MB/s) - ‘data/paws_wiki_labeled_final.tar.gz’ saved [4687157/4687157]

final/test.tsv
final/
final/train.tsv
final/dev.tsv


In [3]:
## code utils.py

import warnings
import pandas as pd


def load_data(
    file_path, input_text_column, target_text_column, label_column, keep_label=1
):
    """Load a tab-separated pair dataset and keep only rows with a given label.

    Args:
        file_path: Path (or buffer) of the TSV file, passed to ``pd.read_csv``.
        input_text_column: Name of the column to expose as ``input_text``.
        target_text_column: Name of the column to expose as ``target_text``.
        label_column: Name of the column holding the row label.
        keep_label: Rows whose ``label_column`` equals this value are kept.

    Returns:
        A new DataFrame with exactly three columns, in this order:
        ``input_text``, ``target_text`` and a constant ``prefix`` column set
        to ``"paraphrase"``.

    Malformed lines (wrong number of fields) are skipped with a warning
    instead of aborting the load.
    """
    try:
        # pandas >= 1.3: `on_bad_lines="warn"` is the replacement for
        # `error_bad_lines=False` (skip the line, emit a warning).
        df = pd.read_csv(file_path, sep="\t", on_bad_lines="warn")
    except TypeError as exc:
        # Only fall back when the failure is the unknown keyword itself;
        # any other TypeError is a genuine error and must propagate.
        if "on_bad_lines" not in str(exc):
            raise
        # pandas < 1.3 only knows the legacy spelling.
        df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)

    df = df.loc[df[label_column] == keep_label]
    df = df.rename(
        columns={input_text_column: "input_text", target_text_column: "target_text"}
    )
    # `.assign` builds a new frame, so no column is ever set on a slice of the
    # filtered data (avoids SettingWithCopyWarning).
    return df[["input_text", "target_text"]].assign(prefix="paraphrase")


def clean_unnecessary_spaces(out_string):
    """Remove the space before common punctuation and contraction suffixes.

    A non-string argument triggers a warning and is converted with ``str()``
    before cleaning. The replacements are applied one after another in the
    order listed below, each on the result of the previous one.

    Returns:
        The cleaned string.
    """
    text = out_string
    if not isinstance(text, str):
        warnings.warn(f">>> {out_string} <<< is not a string.")
        text = str(text)

    # (spaced form, tightened form) -- order is significant.
    replacements = (
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ,", ","),
        (" ' ", "'"),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        (" 've", "'ve"),
        (" 're", "'re"),
    )
    for spaced, tight in replacements:
        text = text.replace(spaced, tight)
    return text

In [4]:
## train.py

import os
from datetime import datetime
import logging

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

# Keep the "transformers" logger down to errors only ...
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)
# ... while everything else logs at INFO through the root logger.
logging.basicConfig(level=logging.INFO)

In [5]:
## train.py

# Google Data (PAWS-Wiki, unpacked by the download cell into ./data/final)
def _load_paws_pairs(tsv_path):
    """Read one PAWS split and return its label-"1" sentence pairs.

    The file is read as tab-separated text and every value is cast to ``str``
    (so the label is compared against the string ``"1"``). ``sentence1`` /
    ``sentence2`` are renamed to ``input_text`` / ``target_text`` and a
    constant ``prefix`` column set to ``"paraphrase"`` is appended.

    Returns:
        A new DataFrame with columns ``input_text``, ``target_text``, ``prefix``.
    """
    pairs = pd.read_csv(tsv_path, sep="\t").astype(str)
    pairs = pairs.loc[pairs["label"] == "1"]
    pairs = pairs.rename(
        columns={"sentence1": "input_text", "sentence2": "target_text"}
    )
    # `.assign` returns a fresh frame instead of setting a column on a slice.
    return pairs[["input_text", "target_text"]].assign(prefix="paraphrase")


# Paths are relative to the working directory, matching the `-P data` /
# `-C data` targets used when the archive was downloaded and extracted.
train_df = _load_paws_pairs("data/final/train.tsv")
eval_df = _load_paws_pairs("data/final/dev.tsv")

# Optional subsampling, left disabled:
# train_df = train_df.head(5000)
# eval_df = eval_df.head(500)
print(train_df,eval_df)

                                              input_text  ...      prefix
1      The NBA season of 1975 -- 76 was the 30th seas...  ...  paraphrase
3      When comparable rates of flow can be maintaine...  ...  paraphrase
4      It is the seat of Zerendi District in Akmola R...  ...  paraphrase
5      William Henry Henry Harman was born on 17 Febr...  ...  paraphrase
7      With a discrete amount of probabilities Formul...  ...  paraphrase
...                                                  ...  ...         ...
49384  The Romanesque language , Galician ( Galego ) ...  ...  paraphrase
49390  Note that k is a vector consisting of three in...  ...  paraphrase
49393  Tim Henman won in the final 6 -- 2 , 7 -- 6 , ...  ...  paraphrase
49395  He was considered an active member of the coun...  ...  paraphrase
49397  She was in Cork on June 24 and arrived on 8 Ju...  ...  paraphrase

[21829 rows x 3 columns]                                              input_text  ...      prefix
1     They we

In [6]:
# A bare last expression uses the notebook's rich DataFrame display
# instead of a plain-text print.
train_df

                                              input_text  ...      prefix
1      The NBA season of 1975 -- 76 was the 30th seas...  ...  paraphrase
3      When comparable rates of flow can be maintaine...  ...  paraphrase
4      It is the seat of Zerendi District in Akmola R...  ...  paraphrase
5      William Henry Henry Harman was born on 17 Febr...  ...  paraphrase
7      With a discrete amount of probabilities Formul...  ...  paraphrase
...                                                  ...  ...         ...
49384  The Romanesque language , Galician ( Galego ) ...  ...  paraphrase
49390  Note that k is a vector consisting of three in...  ...  paraphrase
49393  Tim Henman won in the final 6 -- 2 , 7 -- 6 , ...  ...  paraphrase
49395  He was considered an active member of the coun...  ...  paraphrase
49397  She was in Cork on June 24 and arrived on 8 Ju...  ...  paraphrase

[21829 rows x 3 columns]


In [None]:
## train.py

# Training and generation settings for the seq2seq model.
model_args = Seq2SeqArgs()
model_args.do_sample = True
model_args.eval_batch_size = 16
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 2500
model_args.evaluate_during_training_verbose = True
model_args.fp16 = False
model_args.learning_rate = 5e-5
model_args.max_length = 128
model_args.max_seq_length = 128
model_args.num_beams = None
model_args.num_return_sequences = 3
model_args.num_train_epochs = 1
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_steps = -1
model_args.top_k = 50
model_args.top_p = 0.95
model_args.train_batch_size = 8
model_args.use_multiprocessing = False
# NOTE: with a W&B project set, training prompted for a wandb API key when
# this notebook was run (see the recorded cell output).
model_args.wandb_project = "Paraphrasing with BART"


model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
)

model.train_model(train_df, eval_data=eval_df)

# Model inputs are "<prefix>: <input_text>" strings built from the dev split.
to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(eval_df["prefix"].tolist(), eval_df["input_text"].tolist())
]
truth = eval_df["target_text"].tolist()

# `preds[i]` is iterated below as the collection of outputs for `to_predict[i]`.
preds = model.predict(to_predict)

# Saving the predictions if needed
os.makedirs("predictions", exist_ok=True)

# Use a filesystem-safe timestamp: str(datetime.now()) contains a space and
# colons, which are invalid in Windows file names and awkward in shells.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
predictions_path = os.path.join("predictions", f"predictions_{timestamp}.txt")

# Explicit UTF-8 so non-ASCII text does not depend on the platform default
# encoding.
with open(predictions_path, "w", encoding="utf-8") as f:
    for text, reference, candidates in zip(
        eval_df["input_text"].tolist(), truth, preds
    ):
        f.write(str(text) + "\n\n")

        f.write("Truth:\n")
        f.write(str(reference) + "\n\n")

        f.write("Prediction:\n")
        for pred in candidates:
            f.write(str(pred) + "\n")
        f.write(
            "________________________________________________________________________________\n"
        )

INFO:filelock:Lock 140029390796432 acquired on /root/.cache/huggingface/transformers/3f12fb71b844fcb7d591fdd4e55027da90d7b5dd6aa5430ad00ec6d76585f26c.58d5dda9f4e9f44e980adb867b66d9e0cbe3e0c05360cefe3cd86f5db4fff042.lock


Downloading:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

INFO:filelock:Lock 140029390796432 released on /root/.cache/huggingface/transformers/3f12fb71b844fcb7d591fdd4e55027da90d7b5dd6aa5430ad00ec6d76585f26c.58d5dda9f4e9f44e980adb867b66d9e0cbe3e0c05360cefe3cd86f5db4fff042.lock
INFO:filelock:Lock 140029278969424 acquired on /root/.cache/huggingface/transformers/d065edfe6954baf0b989a2063b26eb07e8c4d0b19354b5c74af9a51f5518df6e.6ca4df1a6ec59aa763989ceec10dff41dde19f0f0824b9f5d3fcd35a8abffdb2.lock


Downloading:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

INFO:filelock:Lock 140029278969424 released on /root/.cache/huggingface/transformers/d065edfe6954baf0b989a2063b26eb07e8c4d0b19354b5c74af9a51f5518df6e.6ca4df1a6ec59aa763989ceec10dff41dde19f0f0824b9f5d3fcd35a8abffdb2.lock
INFO:filelock:Lock 140026355392464 acquired on /root/.cache/huggingface/transformers/0d6fc8b2ef1860c1f8f0baff4b021e3426cc7d11b153f98e563b799603ee2f25.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05.lock


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

INFO:filelock:Lock 140026355392464 released on /root/.cache/huggingface/transformers/0d6fc8b2ef1860c1f8f0baff4b021e3426cc7d11b153f98e563b799603ee2f25.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05.lock
INFO:filelock:Lock 140026355391888 acquired on /root/.cache/huggingface/transformers/6e75e35f0bdd15870c98387e13b93a8e100237eb33ad99c36277a0562bd6d850.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

INFO:filelock:Lock 140026355391888 released on /root/.cache/huggingface/transformers/6e75e35f0bdd15870c98387e13b93a8e100237eb33ad99c36277a0562bd6d850.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO:filelock:Lock 140026355390736 acquired on /root/.cache/huggingface/transformers/d94f53c8851dcda40774f97280e634b94b721a58e71bcc152b5f51d0d49a046a.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock


Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

INFO:filelock:Lock 140026355390736 released on /root/.cache/huggingface/transformers/d94f53c8851dcda40774f97280e634b94b721a58e71bcc152b5f51d0d49a046a.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
INFO:filelock:Lock 140026356565712 acquired on /root/.cache/huggingface/transformers/1abf196c889c24daca2909359ca2090e5fcbfa21a9ea36d763f70adbafb500d7.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a1c5acae01a91a8.lock


Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

INFO:filelock:Lock 140026356565712 released on /root/.cache/huggingface/transformers/1abf196c889c24daca2909359ca2090e5fcbfa21a9ea36d763f70adbafb500d7.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a1c5acae01a91a8.lock
INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/21829 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model: Training started


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: W&B syncing is set to `offline` in this directory.  Run `wandb online` or set WANDB_MODE=online to enable cloud syncing.


Running Epoch 0 of 1:   0%|          | 0/2729 [00:00<?, ?it/s]

In [None]:
# Shows the current value of TOKENIZERS_PARALLELISM in the kernel's environment
# (with no `=value`, %env only reads the variable).
# NOTE(review): if the intent was to set it, the form is
# `%env TOKENIZERS_PARALLELISM=false` -- confirm.
%env TOKENIZERS_PARALLELISM

In [None]:
import random

# Side-by-side preview of the first 128 inputs: the source sentence (with the
# "<prefix>:" part stripped), the first generated candidate, and the reference.
preview_inputs = to_predict[:128]
for idx in range(len(preview_inputs)):
    source_sentence = str(preview_inputs[idx]).split(":", 1)[1].strip()
    print(f'Text  > {source_sentence}')
    first_candidate = preds[idx][0]
    print(f'Pred  < {first_candidate}')
    reference = truth[idx]
    print(f'Truth = {reference}')
    print()