# Installing/Importing Modules

In [None]:
!pip install -qq simpletransformers

[?25l[K     |█▌                              | 10 kB 31.7 MB/s eta 0:00:01[K     |███                             | 20 kB 23.1 MB/s eta 0:00:01[K     |████▍                           | 30 kB 18.1 MB/s eta 0:00:01[K     |██████                          | 40 kB 16.1 MB/s eta 0:00:01[K     |███████▍                        | 51 kB 7.0 MB/s eta 0:00:01[K     |████████▉                       | 61 kB 8.2 MB/s eta 0:00:01[K     |██████████▍                     | 71 kB 7.9 MB/s eta 0:00:01[K     |███████████▉                    | 81 kB 8.8 MB/s eta 0:00:01[K     |█████████████▎                  | 92 kB 9.4 MB/s eta 0:00:01[K     |██████████████▉                 | 102 kB 7.0 MB/s eta 0:00:01[K     |████████████████▎               | 112 kB 7.0 MB/s eta 0:00:01[K     |█████████████████▊              | 122 kB 7.0 MB/s eta 0:00:01[K     |███████████████████▎            | 133 kB 7.0 MB/s eta 0:00:01[K     |████████████████████▊           | 143 kB 7.0 MB/s eta 0:00:01[K 

In [None]:
import os
from datetime import datetime
import logging

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

# Helper Functions

In [None]:
def load_data(
    file_path, input_text_column, target_text_column, label_column, keep_label=1
):
    """Load a tab-separated paraphrase file and keep only rows with a given label.

    Args:
        file_path: Path of the tab-separated file to read.
        input_text_column: Name of the column that becomes "input_text".
        target_text_column: Name of the column that becomes "target_text".
        label_column: Name of the column compared against ``keep_label``.
        keep_label: Rows whose label equals this value are kept (default 1).

    Returns:
        A new DataFrame with the columns "input_text", "target_text" and a
        constant "prefix" column set to "paraphrase". Malformed lines in the
        file are skipped.
    """
    try:
        # pandas >= 1.3: `on_bad_lines` replaces the removed `error_bad_lines`.
        # "warn" keeps the old behaviour of reporting each skipped line.
        df = pd.read_csv(file_path, sep="\t", on_bad_lines="warn")
    except TypeError:
        # Older pandas does not know `on_bad_lines`; use the legacy spelling.
        df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)

    df = df.loc[df[label_column] == keep_label]
    df = df.rename(
        columns={input_text_column: "input_text", target_text_column: "target_text"}
    )
    # Copy so that adding "prefix" writes to an independent frame instead of a
    # slice of the filtered data (avoids SettingWithCopyWarning).
    df = df[["input_text", "target_text"]].copy()
    df["prefix"] = "paraphrase"

    return df


def clean_unnecessary_spaces(out_string):
    """Remove the spaces that tokenised text leaves around punctuation and contractions.

    Args:
        out_string: The text to clean. A non-string value triggers a warning
            and is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, e.g. "Hello , world !" becomes "Hello, world!".
    """
    # Imported here because the notebook's import cell does not import
    # `warnings`; without it the non-string branch raised NameError.
    import warnings

    if not isinstance(out_string, str):
        warnings.warn(f">>> {out_string} <<< is not a string.")
        out_string = str(out_string)
    out_string = (
        out_string.replace(" .", ".")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ,", ",")
        .replace(" ' ", "'")
        .replace(" n't", "n't")
        .replace(" 'm", "'m")
        .replace(" 's", "'s")
        .replace(" 've", "'ve")
        .replace(" 're", "'re")
    )
    return out_string

# Loading Data

In [None]:
!gdown --id 1mdK_PgJ0aFwb46AIfY6hv0u1DZT2LU86 -q -O data.zip
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/dev.tsv            
  inflating: data/msr_paraphrase_test.txt  
  inflating: data/msr_paraphrase_train.txt  
  inflating: data/quora_duplicate_questions.tsv  
  inflating: data/train.tsv          


## Google Data

In [None]:
# Read the train/dev TSV files, keep only the pairs labelled "1", and reduce
# each split to the columns input_text, target_text and prefix.
sentence_columns = {"sentence1": "input_text", "sentence2": "target_text"}

google_splits = []
for tsv_path in ("data/train.tsv", "data/dev.tsv"):
    # Every column is cast to str, so the label is compared as the string "1".
    split = pd.read_csv(tsv_path, sep="\t").astype(str)
    split = split[split["label"] == "1"].rename(columns=sentence_columns)
    split = split[["input_text", "target_text"]]
    split["prefix"] = "paraphrase"
    google_splits.append(split)

train_df, eval_df = google_splits

## MSRP Data

In [None]:
# Append the MSRP pairs whose "Quality" equals load_data's default keep_label (1)
# to the existing train and eval frames.
msrp_columns = ("#1 String", "#2 String", "Quality")

msrp_train = load_data("data/msr_paraphrase_train.txt", *msrp_columns)
train_df = pd.concat([train_df, msrp_train])

msrp_eval = load_data("data/msr_paraphrase_test.txt", *msrp_columns)
eval_df = pd.concat([eval_df, msrp_eval])

b'Skipping line 102: expected 5 fields, saw 6\nSkipping line 656: expected 5 fields, saw 6\nSkipping line 867: expected 5 fields, saw 6\nSkipping line 880: expected 5 fields, saw 6\nSkipping line 980: expected 5 fields, saw 6\nSkipping line 1439: expected 5 fields, saw 6\nSkipping line 1473: expected 5 fields, saw 6\nSkipping line 1822: expected 5 fields, saw 6\nSkipping line 1952: expected 5 fields, saw 6\nSkipping line 2009: expected 5 fields, saw 6\nSkipping line 2230: expected 5 fields, saw 6\nSkipping line 2506: expected 5 fields, saw 6\nSkipping line 2523: expected 5 fields, saw 6\nSkipping line 2809: expected 5 fields, saw 6\nSkipping line 2887: expected 5 fields, saw 6\nSkipping line 2920: expected 5 fields, saw 6\nSkipping line 2944: expected 5 fields, saw 6\nSkipping line 3241: expected 5 fields, saw 6\nSkipping line 3358: expected 5 fields, saw 6\nSkipping line 3459: expected 5 fields, saw 6\nSkipping line 3491: expected 5 fields, saw 6\nSkipping line 3643: expected 5 fields

## Quora Data

In [None]:
# The Quora dataset is not separated into train/test, so we split it manually the first time.
df = load_data(
    "data/quora_duplicate_questions.tsv", "question1", "question2", "is_duplicate"
)
# A fixed seed makes the split identical on every run.
q_train, q_test = train_test_split(df, random_state=42)

# index=False keeps the row index out of the files, so reading them back
# does not produce an extra "Unnamed: 0" column.
q_train.to_csv("data/quora_train.tsv", sep="\t", index=False)
q_test.to_csv("data/quora_test.tsv", sep="\t", index=False)

# The code block above only needs to be run once.
# After that, the two lines below are sufficient to load the Quora dataset.

# q_train = pd.read_csv("data/quora_train.tsv", sep="\t")
# q_test = pd.read_csv("data/quora_test.tsv", sep="\t")

# Add the Quora pairs to the frames built so far.
train_df = pd.concat([train_df, q_train])
eval_df = pd.concat([eval_df, q_test])

# Fix the column order and drop rows with missing values.
train_df = train_df[["prefix", "input_text", "target_text"]].dropna()
eval_df = eval_df[["prefix", "input_text", "target_text"]].dropna()

# Normalise spacing around punctuation in both text columns of both splits.
for text_column in ("input_text", "target_text"):
    train_df[text_column] = train_df[text_column].apply(clean_unnecessary_spaces)
    eval_df[text_column] = eval_df[text_column].apply(clean_unnecessary_spaces)

# Reducing Dataset

In [None]:
# Row counts of the combined train and eval frames.
train_df.shape[0], eval_df.shape[0]

(136422, 41937)

In [None]:
# Reduce the dataset to at most 80,000 training and 20,000 evaluation pairs.
# The fixed seed makes the subset reproducible; capping n at the frame length
# avoids the ValueError that sample() raises when n exceeds the number of rows.
train_df = train_df.sample(n=min(80000, len(train_df)), random_state=42)
eval_df = eval_df.sample(n=min(20000, len(eval_df)), random_state=42)

In [None]:
# Replace the existing row labels with a fresh 0..n-1 index; drop=True
# discards the old index instead of keeping it as a column.
train_df = train_df.reset_index(drop=True)
eval_df = eval_df.reset_index(drop=True)

In [None]:
# Row counts of the train and eval frames after subsampling.
train_df.shape[0], eval_df.shape[0]

(80000, 20000)

# Dataset Preview

In [None]:
# Show the first rows of the training frame.
train_df.head()

Unnamed: 0,prefix,input_text,target_text
0,paraphrase,What is the right age to start-up?,What is the right age to start business?
1,paraphrase,"In March 1999, Chris Anderson took over the su...",Chris Anderson took over from Wayne Bennett as...
2,paraphrase,What are the best sites for learning python?,What are best site for learning python?
3,paraphrase,What is the common denominator in a Quora user?,What are some common characteristics Quora use...
4,paraphrase,What's the most embarrassing moment you've eve...,What was the most embarrassing moment of yours?


In [None]:
# Show the first rows of the evaluation frame.
eval_df.head()

Unnamed: 0,prefix,input_text,target_text
0,paraphrase,What should one refer to (books or online cour...,What are some video tutorials that I can use t...
1,paraphrase,Are there any substantial way to quit meth?,What's the best way to quit meth?
2,paraphrase,"As a small composer in the French school, he m...","A minor composer in the French school, as a co..."
3,paraphrase,How many states are there in the world?,How many states are there are in the world?
4,paraphrase,What is the normal size of the penis?,What is the normal penis size?


# Model Parameters

In [None]:
model_args = Seq2SeqArgs()

# Training and evaluation settings.
training_options = {
    "eval_batch_size": 64,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 2500,
    "evaluate_during_training_verbose": True,
    "fp16": False,
    "learning_rate": 5e-5,
    "max_seq_length": 128,
    "num_train_epochs": 2,
    "overwrite_output_dir": True,
    "reprocess_input_data": True,
    "save_eval_checkpoints": False,
    "save_steps": -1,
    "train_batch_size": 32,
    "use_multiprocessing": False,
}

# Text-generation settings: sampling with top-k / top-p, three sequences per input.
generation_options = {
    "do_sample": True,
    "num_beams": None,
    "num_return_sequences": 3,
    "max_length": 128,
    "top_k": 50,
    "top_p": 0.95,
}

# Name of the wandb project the run is logged to.
logging_options = {"wandb_project": "Paraphrasing with BART"}

# Apply every option as an attribute on model_args, in the order listed above.
for option_group in (training_options, generation_options, logging_options):
    for option_name, option_value in option_group.items():
        setattr(model_args, option_name, option_value)

# Defining Model

In [None]:
# Build a BART encoder-decoder model from the "facebook/bart-base" checkpoint,
# configured with the arguments collected in model_args.
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-base",
    args=model_args,
)

Downloading:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

# Training Model

In [None]:
# Release cached, unused GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [None]:
# Print the GPU status report (device, driver, memory usage) for this session.
!nvidia-smi

Fri Aug  6 09:37:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Train on train_df, passing eval_df as the evaluation data.
model.train_model(train_df, eval_data=eval_df)

  0%|          | 0/80000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running Epoch 0 of 2:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

(5000,
 {'eval_loss': [1.2309666961526717,
   1.2309666961526717,
   1.1696379426569223,
   1.1696379426569223],
  'global_step': [2500, 2500, 5000, 5000],
  'train_loss': [1.1618602275848389,
   1.1618602275848389,
   1.1654571294784546,
   1.1654571294784546]})

In [None]:
# Mount Google Drive at /content/gdrive. The google.colab import is kept in
# this cell because the package is only available when running on Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Copy the outputs/best_model directory to the mounted Google Drive.
!cp -R /content/outputs/best_model/ /content/gdrive/MyDrive/best_model_e14p3/

# Making Predictions

In [None]:
# Build one "<prefix>: <input_text>" string per evaluation row and run the
# model on all of them; `truth` holds the target texts in the same row order.
prompt_parts = eval_df[["prefix", "input_text"]].itertuples(index=False, name=None)
to_predict = [prefix + ": " + str(input_text) for prefix, input_text in prompt_parts]
truth = eval_df["target_text"].tolist()

preds = model.predict(to_predict)

In [None]:
# model = Seq2SeqModel(encoder_decoder_type="bart", encoder_decoder_name="outputs")

# Interactive demo: ask for a sentence five times and print every sequence
# the model returns for it.
separator = "---------------------------------------------------------"

for attempt in range(5):
    original = input("Enter text to paraphrase: ")
    to_predict = [original]
    preds = model.predict(to_predict)

    # Echo the input between separator lines, followed by the predictions.
    print(separator)
    print(original)
    print()
    print("Predictions >>>")
    # preds holds one entry per input; iterate over the entry for our single input.
    for pred in preds[0]:
        print(pred)
    print(separator)
    print()

Enter text to paraphrase: A recording of folk songs done for the Columbia society in 1942 was largely arranged by Pjetër Dungu.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


---------------------------------------------------------
A recording of folk songs done for the Columbia society in 1942 was largely arranged by Pjetër Dungu.

Predictions >>>
A recording of folk songs done for Columbia society in 1942 was largely arranged by Pjetër Dungu.
A recording of folk songs done for Columbia society in 1942 was largely arranged by Pjetër Dungu.
A recording of folk songs done for Columbia society in 1942 was largely arranged by Pjetër Dungu.
---------------------------------------------------------

Enter text to paraphrase: In mathematical astronomy, his fame is due to the introduction of the astronomical globe, and his early contributions to understanding the movement of the planets.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

---------------------------------------------------------
In mathematical astronomy, his fame is due to the introduction of the astronomical globe, and his early contributions to understanding the movement of the planets.

Predictions >>>
His fame is due to the introduction of the astronomical globe and his early contributions to understanding the movement of the planets.
His fame is due to the introduction of the astronomical globe and his early contributions to understanding the movement of the planets.
His fame is due to the introduction of the astronomical globe and his early contributions to understanding the movement of the planets.
---------------------------------------------------------

Enter text to paraphrase: Why are people obsessed with Cara Delevingne?


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

---------------------------------------------------------
Why are people obsessed with Cara Delevingne?

Predictions >>>
Why are people so obsessed with Cara Delevingne?
Why are people so obsessed with Cara Delevingne?
Why are people so obsessed with Cara Delevingne?
---------------------------------------------------------

Enter text to paraphrase: Earl St Vincent was a British ship that was captured in 1803 and became a French trade man.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

---------------------------------------------------------
Earl St Vincent was a British ship that was captured in 1803 and became a French trade man.

Predictions >>>
Earl St Vincent was a British ship captured in 1803 and became a French trade man.
Earl St Vincent was a British ship captured in 1803 and became a French trade man.
Earl St Vincent was a British ship captured in 1803 and became a French trade man.
---------------------------------------------------------

Enter text to paraphrase: Worcester is a town and county city of Worcestershire in England.


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

---------------------------------------------------------
Worcester is a town and county city of Worcestershire in England.

Predictions >>>
Worcester is a town and county city of Worcestershire in England.
Worcester is a town and county city of Worcestershire in England.
Worcester is a town and county city of Worcestershire in England.
---------------------------------------------------------

