In [1]:
# Imports
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm_notebook as tqdm
import torch

In [2]:
# Load the data 2deep_transcript.txt
data = pd.read_csv('data/2deep_transcript.csv', delimiter=';')

In [3]:
# Initialize the model and tokenizer
# Wrap DataFrame with tqdm for a progress bar
tqdm().pandas()
model_name = 'Helsinki-NLP/opus-mt-de-en'
tokenizer = MarianTokenizer.from_pretrained(model_name, cache_dir='../models')
model = MarianMTModel.from_pretrained(model_name, cache_dir='../models')

# Check if a GPU is available and if not, use a CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the GPU
model = model.to(device)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm().pandas()


0it [00:00, ?it/s]

  return self.fget.__get__(instance, owner)()


In [4]:
# Define a function to translate text
def translate(text):
    # If the text is 'REST', return the appropriate string
    if text == 'REST':
        return "The interviewee and interviewer were silent here."

    # Determine the verb based on the punctuation at the end of the text
    verb = 'said'
    if text.strip().endswith('?'):
        verb = 'asked'
    elif text.strip().endswith('!'):
        verb = 'exclaimed'

    # If the text starts with '[Avatar]', translate what the interviewer said
    if text.startswith('[Avatar]'):
        # Remove '[Avatar]' from the text
        text = text.replace('[Avatar]', '').strip()
        
        # Tokenize the text
        tokenized_text = tokenizer.prepare_seq2seq_batch([text], return_tensors='pt')
        
        # Move the tokenized text to the GPU
        tokenized_text = {k: v.to(device) for k, v in tokenized_text.items()}

        # Translate the text
        translated_text = model.generate(**tokenized_text)

        # Decode the translated text
        decoded_text = tokenizer.batch_decode(translated_text, skip_special_tokens=True)
        
        return f"The interviewer {verb}: {decoded_text[0]}"

    # Otherwise, translate what the interviewee said
    else:
        # Tokenize the text
        tokenized_text = tokenizer.prepare_seq2seq_batch([text], return_tensors='pt')
        
        # Move the tokenized text to the GPU
        tokenized_text = {k: v.to(device) for k, v in tokenized_text.items()}

        # Translate the text
        translated_text = model.generate(**tokenized_text)

        # Decode the translated text
        decoded_text = tokenizer.batch_decode(translated_text, skip_special_tokens=True)
        
        return f"The interviewee {verb}: {decoded_text[0]}"

# Apply the function to the transcript column
data['transcript'] = data['transcript'].progress_apply(translate)

  0%|          | 0/41389 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



In [5]:
# save the translated data to a new csv file in ../data and data directory
data.to_csv('data/2deep_eng.csv', index=False)
data.to_csv('../data/2deep_eng.csv', index=False)