In [1]:
"""Based on... https://github.com/GT-SALT/MixText/
requirements:
    1. fairseq
    2. hydra-core
    3. omegaconf
    4. fastbpe
    5. cython
"""
import pandas as pd
import numpy as np
import torch
import pickle
import random
from copy import deepcopy
from tqdm import tqdm
# from tqdm import tqdm_notebook as tqdm
import os
import re
import pickle

#################################################################################################################
# Reproducible
#################################################################################################################
SEED = 42  # single source of truth for every RNG seeded below
torch.manual_seed(SEED)
# Seed every visible GPU rather than only the current one: the target GPU is selected further
# down, after this call. Silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(SEED)
random.seed(SEED)
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it here does not change
# hashing in the running kernel; it is only inherited by processes spawned afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)

GPU_INDEX = 1  # preferred GPU; clamped below when the host has fewer GPUs
if torch.cuda.is_available():
    # Fall back to the last existing GPU instead of failing with an invalid device ordinal.
    device = torch.device(f"cuda:{min(GPU_INDEX, torch.cuda.device_count() - 1)}")
    torch.cuda.set_device(device) # change allocation of current GPU
    print(f'training device: {device, torch.cuda.get_device_name()}')
else:
    # torch.cuda.set_device / get_device_name reject a CPU device, so skip them entirely.
    device = torch.device("cpu")
    print(f'training device: {device}')

training device: (device(type='cuda', index=1), 'GeForce GTX 1080 Ti')


In [2]:
# Load the de-duplicated training set and discard the stray 'Unnamed: 0' column
# (presumably an index that was written into the CSV -- verify against the file).
# The full 11.5k-sentence training set is './Dataset/train_final.csv' (not used here).
dataset_df = pd.read_csv("./train_drop_duplicates.csv").drop(columns=['Unnamed: 0'])
# One independent deep copy per pivot language; `dataset_df` keeps the original sentences.
ru_dataset_df = deepcopy(dataset_df)
de_dataset_df = deepcopy(dataset_df)

In [3]:
# Preview the loaded frame (the notebook renders a truncated view of long frames).
dataset_df

Unnamed: 0,Id,Category,Sentence
0,0,3,-LRB- The film -RRB- tackles the topic of rela...
1,1,2,"Lavishly , exhilaratingly tasteless ."
2,2,4,It is also beautifully acted .
3,3,1,"But , like Silence , it 's a movie that gets u..."
4,4,2,It 's been made with an innocent yet fervid co...
...,...,...,...
8558,11531,2,If you ignore the cliches and concentrate on C...
8559,11532,3,A macabre and very stylized Swedish fillm abou...
8560,11533,4,"... an eerily suspenseful , deeply absorbing p..."
8561,11537,1,This is n't a movie ; it 's a symptom .


In [4]:
# Pre-trained fairseq WMT19 single models for the English -> Russian -> English round trip,
# fetched through torch.hub with Moses tokenisation and fastBPE sub-word encoding.
en2ru = torch.hub.load(
    'pytorch/fairseq',
    'transformer.wmt19.en-ru.single_model',
    tokenizer='moses',
    bpe='fastbpe',
)
ru2en = torch.hub.load(
    'pytorch/fairseq',
    'transformer.wmt19.ru-en.single_model',
    tokenizer='moses',
    bpe='fastbpe',
)

Using cache found in /home/nowhyun/.cache/torch/hub/pytorch_fairseq_master
Using cache found in /home/nowhyun/.cache/torch/hub/pytorch_fairseq_master


In [5]:
# Move both models to the `device` selected in the first cell instead of calling .cuda()
# unconditionally, so this cell also runs when the CPU fallback was chosen.
en2ru.to(device)
ru2en.to(device)

GeneratorHubInterface(
  (models): ModuleList(
    (0): TransformerModel(
      (encoder): TransformerEncoder(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(31232, 1024, padding_idx=1)
        (embed_positions): SinusoidalPositionalEmbedding()
        (layers): ModuleList(
          (0): TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (dropout_module): FairseqDropout()
            (activation_dropout_module): FairseqDropout()
            (fc1): Linear(in_feat

In [6]:
def back_translate_ru(sent_str, temperature=0.9):
    """Back translate using Russian as middle language.

    The input is translated with ``en2ru`` and the result is translated back
    with ``ru2en``; both calls use sampling, so repeated calls may return
    different paraphrases.

    Args:
        sent_str: English input, forwarded as-is to ``en2ru.translate``.
        temperature: Sampling temperature passed to both translation calls.
            Defaults to 0.9, the value that was previously hard-coded.

    Returns:
        The output of ``ru2en.translate`` for the Russian translation. An empty
        or whitespace-only string is returned unchanged without calling the
        models.
    """
    # Nothing to paraphrase in a blank string; skip both model calls.
    if isinstance(sent_str, str) and not sent_str.strip():
        return sent_str
    # Same decoding settings in both directions.
    sampling_kwargs = {"sampling": True, "temperature": temperature}
    ru_sent_str = en2ru.translate(sent_str, **sampling_kwargs)
    bt_ru_sent_str = ru2en.translate(ru_sent_str, **sampling_kwargs)
    return bt_ru_sent_str

In [7]:
# Register tqdm's `progress_map` on pandas objects so the long-running pass shows a progress bar
# (the recorded run took close to three hours).
tqdm.pandas()
# Replace every sentence with its Russian round-trip paraphrase; the function is passed directly.
ru_dataset_df["Sentence"] = ru_dataset_df["Sentence"].progress_map(back_translate_ru)
ru_dataset_df.head()

100%|██████████| 8563/8563 [2:56:59<00:00,  1.24s/it]  


Unnamed: 0,Id,Category,Sentence
0,0,3,- The film RRB (Givers) deals with relation pr...
1,1,2,"Greedy, funny, tasteless."
2,2,4,<unk> is also expected to return to action.
3,3,1,"But, like Silence, it's a film that gets under..."
4,4,2,It was done with an innocent but hard-line con...


In [8]:
# Persist the Russian-pivot augmented set; the row index is not written to the file.
ru_dataset_df.to_csv(path_or_buf="./train_dd_aug_ru.csv", index=False)

In [9]:
# Pre-trained fairseq WMT19 single models for the English -> German -> English round trip,
# fetched through torch.hub with Moses tokenisation and fastBPE sub-word encoding.
en2de = torch.hub.load(
    'pytorch/fairseq',
    'transformer.wmt19.en-de.single_model',
    tokenizer='moses',
    bpe='fastbpe',
)
de2en = torch.hub.load(
    'pytorch/fairseq',
    'transformer.wmt19.de-en.single_model',
    tokenizer='moses',
    bpe='fastbpe',
)

Using cache found in /home/nowhyun/.cache/torch/hub/pytorch_fairseq_master
Using cache found in /home/nowhyun/.cache/torch/hub/pytorch_fairseq_master


In [10]:
# Move both models to the `device` selected in the first cell instead of calling .cuda()
# unconditionally, so this cell also runs when the CPU fallback was chosen.
en2de.to(device)
de2en.to(device)

GeneratorHubInterface(
  (models): ModuleList(
    (0): TransformerModel(
      (encoder): TransformerEncoder(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(42024, 1024, padding_idx=1)
        (embed_positions): SinusoidalPositionalEmbedding()
        (layers): ModuleList(
          (0): TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (dropout_module): FairseqDropout()
            (activation_dropout_module): FairseqDropout()
            (fc1): Linear(in_feat

In [11]:
def back_translate_de(sent_str, temperature=0.9):
    """Back translate using German as middle language.

    The input is translated with ``en2de`` and the result is translated back
    with ``de2en``; both calls use sampling, so repeated calls may return
    different paraphrases.

    Args:
        sent_str: English input, forwarded as-is to ``en2de.translate``.
        temperature: Sampling temperature passed to both translation calls.
            Defaults to 0.9, the value that was previously hard-coded.

    Returns:
        The output of ``de2en.translate`` for the German translation. An empty
        or whitespace-only string is returned unchanged without calling the
        models.
    """
    # Nothing to paraphrase in a blank string; skip both model calls.
    if isinstance(sent_str, str) and not sent_str.strip():
        return sent_str
    # Same decoding settings in both directions.
    sampling_kwargs = {"sampling": True, "temperature": temperature}
    de_sent_str = en2de.translate(sent_str, **sampling_kwargs)
    bt_de_sent_str = de2en.translate(de_sent_str, **sampling_kwargs)
    return bt_de_sent_str

In [12]:
# Register tqdm's `progress_map` on pandas objects so the long-running pass shows a progress bar
# (the recorded run took about 70 minutes).
tqdm.pandas()
# Replace every sentence with its German round-trip paraphrase; the function is passed directly.
de_dataset_df["Sentence"] = de_dataset_df["Sentence"].progress_map(back_translate_de)
de_dataset_df.head()

100%|██████████| 8563/8563 [1:09:50<00:00,  2.04it/s]


Unnamed: 0,Id,Category,Sentence
0,0,3,-LRB- The film -RRB- negotiates the topic of r...
1,1,2,"In a lavish, intoxicatingly distasteful fashion."
2,2,4,It's also wonderfully played.
3,3,1,But like in silence the film gets under your s...
4,4,2,They did it with an innocent but burning convi...


In [16]:
# Persist the German-pivot augmented set; the row index is not written to the file.
de_dataset_df.to_csv(path_or_buf="./train_dd_aug_de.csv", index=False)