In [1]:
# Imports
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from tqdm.notebook import tqdm
import torch

In [2]:
# Read the semicolon-separated transcript file data/2deep_transcript.csv into a DataFrame.
data = pd.read_csv('data/2deep_transcript.csv', sep=';')

In [3]:
# Register `progress_apply` on pandas objects so a long-running apply shows a progress bar.
tqdm.pandas()

# Choose the compute device first: CUDA when it is available, otherwise the CPU.
device = torch.device('cpu' if not torch.cuda.is_available() else 'cuda')

# Load the mBART-50 many-to-many checkpoint (tokenizer + weights), cached under ../models,
# and place the model on the selected device.
model_name = 'facebook/mbart-large-50-many-to-many-mmt'
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, cache_dir='../models')
model = MBartForConditionalGeneration.from_pretrained(model_name, cache_dir='../models').to(device)

# Alternative kept for reference: the MarianMT German->English checkpoint.
# model_name = 'Helsinki-NLP/opus-mt-de-en'
# tokenizer = MarianTokenizer.from_pretrained(model_name, cache_dir='../models')
# model = MarianMTModel.from_pretrained(model_name, cache_dir='../models')

In [4]:
# def translate(text):
#     if text == 'REST':
#         print('[The interviewee and interviewer are silent.]\n')
#         return "[The interviewee and interviewer are silent.]"
#     if text.startswith('[Avatar]'):
#         text = text.replace('[Avatar]', '').strip()
#         # remove if there is any : at the beginning of the text
#         if text.startswith(':'):
#             text = text[1:]
        
#         # Prepare the inputs
#         inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=1024)
#         inputs = {k: v.to(device) for k, v in inputs.items()}

#         # Generate the translated text
#         translated_text = model.generate(**inputs)

#         # Decode the translated text
#         decoded_text = tokenizer.batch_decode(translated_text, skip_special_tokens=True)
        
        
#         print(f"{text} -> [Avatar] {decoded_text[0]}\n")
#         return f"[Avatar] {decoded_text[0]}"
#     else:
#         # Prepare the inputs
#         inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=1024)
#         inputs = {k: v.to(device) for k, v in inputs.items()}

#         # Generate the translated text
#         translated_text = model.generate(**inputs)

#         # Decode the translated text
#         decoded_text = tokenizer.batch_decode(translated_text, skip_special_tokens=True)
        
#         print(f"{text} -> [Interviewee] {decoded_text[0]}\n")
        
#         return f"[Interviewee] {decoded_text[0]}"


# Translation using Facebook's mBART-50 model (replaces the commented-out MarianMT version above)

def _translate_text(text, src_lang, tgt_lang):
    """Translate one string with the notebook-level `tokenizer`/`model` and return the decoded text."""
    # Set the source language on the tokenizer before encoding.
    tokenizer.src_lang = src_lang

    # Encode the text and move every input tensor to the device the model lives on.
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Force the first generated token to be the target-language code.
    generated = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang]
    )

    # A single input yields a single decoded sequence.
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]


def translate(text, src_lang='de_DE', tgt_lang='en_XX', verbose=True):
    """Translate one transcript entry and prefix it with its speaker tag.

    Args:
        text: The transcript entry. The literal 'REST' marks silence; an entry
            starting with '[Avatar]' is attributed to the avatar; anything else
            is attributed to the interviewee.
        src_lang: Source language code assigned to ``tokenizer.src_lang``.
        tgt_lang: Target language code looked up in ``tokenizer.lang_code_to_id``.
        verbose: When True (the default, matching the previous behaviour) print
            each result; pass False to avoid flooding the output on large frames.

    Returns:
        '[The interviewee and interviewer are silent.]' for 'REST', otherwise
        '[Avatar] <translation>' or '[Interviewee] <translation>'. Non-string
        values (e.g. NaN/None for missing cells) are returned unchanged instead
        of raising AttributeError on ``.startswith``.
    """
    # Missing values are not strings; pass them through untouched.
    if not isinstance(text, str):
        return text

    if text == 'REST':
        silent = "[The interviewee and interviewer are silent.]"
        if verbose:
            print('[The interviewee and interviewer are silent.]\n')
        return silent

    if text.startswith('[Avatar]'):
        speaker = '[Avatar]'
        # Drop only the leading marker; a later '[Avatar]' inside the text is kept.
        text = text[len('[Avatar]'):].strip()
        # Remove a ':' separator directly after the marker, plus the space after it.
        if text.startswith(':'):
            text = text[1:].strip()
    else:
        speaker = '[Interviewee]'

    decoded = _translate_text(text, src_lang, tgt_lang)

    if verbose:
        print(f"{text} -> {speaker} {decoded}\n")
    return f"{speaker} {decoded}"

In [5]:
# Run `translate` over every transcript entry (with a tqdm progress bar), then
# overwrite the column with the translated text.
# NOTE: the source text is replaced in place, so re-running this cell without
# reloading `data` feeds already-translated text back into `translate`.
translated_transcript = data['transcript'].progress_apply(translate)
data['transcript'] = translated_transcript

  0%|          | 0/41389 [00:00<?, ?it/s]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The interviewee and interviewer are silent.]

[The intervie

In [6]:
# Write the translated data twice, once under data/ and once under ../data/,
# both times without the DataFrame index column.
for csv_path in ('data/2deep_eng.csv', '../data/2deep_eng.csv'):
    data.to_csv(csv_path, index=False)