# Translating English comments into the languages we want to analyze

In this notebook we translate the English comments into each target language. Thanks to heavy preprocessing, majority-vote labelling and data augmentation, these comments form a good training set for our models, so their translations should be a good training set as well.

In [1]:
import pandas as pd
from google.colab import files
from google.colab import drive
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch

In [2]:
# Show Colab's file upload dialog and select every file that is needed.
# The expected input is the original English comments, labelled by
# majority vote.
# If the files were uploaded earlier, just press 'Cancel Upload'.
uploaded = files.upload()

Saving combined_original_under_90_augmented.csv to combined_original_under_90_augmented.csv


In [13]:
# Directory that holds the data files.
#   Google Colab:  '/content/'
#   Local machine: a relative path, for example
#                  'NLP labelled data preview/english set/'
path = '/content/'

In [25]:
# Read the CSV with the English comments into a DataFrame
file = 'combined_original_under_90_augmented.csv'
all_comments = pd.read_csv(f"{path}{file}")


In [16]:
# Load the translation model and its tokenizer from the same checkpoint
checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = MBart50TokenizerFast.from_pretrained(checkpoint)

In [17]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_

In [18]:
def translate_text(text, source_lang, target_lang):
    """Translate a single comment with the loaded translation model.

    Args:
        text: The comment to translate. Values that are not strings (for
            example the NaN pandas uses for an empty CSV cell) and empty or
            whitespace-only strings are returned unchanged.
        source_lang: Language code of `text`, e.g. "en_XX".
        target_lang: Language code to translate into, e.g. "fr_XX".

    Returns:
        The translated text as a string, or `text` itself when there is
        nothing to translate.

    Raises:
        KeyError: If `target_lang` is not a key of `tokenizer.lang_code_to_id`.
    """
    # Nothing to translate: the tokenizer only accepts string input, and an
    # empty string would make the model generate text without any source.
    if not isinstance(text, str) or not text.strip():
        return text

    tokenizer.src_lang = source_lang
    # Truncate to the tokenizer's maximum length so that a very long comment
    # cannot exceed the number of positions the model supports.
    encoded_text = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    generated_tokens = model.generate(
        **encoded_text,
        # Force the first generated token to be the target language code
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang]
    )
    # A single text was encoded, so the decoded batch has exactly one entry
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]


In [19]:
# Column of the dataset that holds the comment text
comments_column = "Comment"

# Language codes handed to the translation function
source_lang = "en_XX"    # source language (always English)
french_lang = "fr_XX"
german_lang = "de_DE"
spanish_lang = "es_XX"
italian_lang = "it_IT"

In [None]:
# Target language for this run; it is also used in the name of the saved file
language = "french"

In [27]:
# Translate the comments into the language selected in `language`.
# The target is looked up from `language` instead of commenting lines in and
# out, so the translated column always matches the language used in the
# output file name.
target_lang_codes = {
    'french': french_lang,
    'german': german_lang,
    'spanish': spanish_lang,
    'italian': italian_lang,
}
target_lang = target_lang_codes[language]  # KeyError for an unsupported language

# The result is stored in 'translated_<language>', e.g. 'translated_french'
all_comments["translated_{}".format(language)] = all_comments[comments_column].apply(
    lambda comment: translate_text(comment, source_lang, target_lang)
)


In [39]:
# Name of the translated column for the selected language,
# e.g. 'translated_french'
translated_column = "translated_{}".format(language)

# Fail loudly if the translation for this language has not been produced;
# otherwise the rename below would silently do nothing and the saved file
# would have no 'Comment' column.
if translated_column not in all_comments.columns:
    raise KeyError(
        "Column '{}' not found - run the translation for '{}' first".format(
            translated_column, language
        )
    )

# Build the cleaned frame in one chain, without in-place mutation:
#  - drop the original English comments and the column 'Unnamed: 0'
#  - rename 'Unnamed: 0.1' to 'index'
#  - rename the translated column to 'Comment'
all_comments = (
    all_comments
    .drop(columns=[comments_column, 'Unnamed: 0'])
    .rename(columns={'Unnamed: 0.1': 'index', translated_column: "Comment"})
)

In [40]:
# Write the translated comments to a CSV file named after the target
# language; the DataFrame index is not written
output_name = 'combined_original_under_90_{}.csv'.format(language)
all_comments.to_csv(path + output_name, index=False)

In [41]:
# Google Colab only: download the saved CSV to the local machine
download_name = 'combined_original_under_90_{}.csv'.format(language)
files.download(path + download_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>