# Imports

In [None]:
import shutil
from typing import Any, Callable, Optional, Union

In [None]:
from pathlib import Path
import requests
import gzip
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
import matplotlib.pyplot as plt

# Global

In [None]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Remote location of the gzipped German-English TMX file that is downloaded
# and parsed further down in this notebook.
CORPUS_URL: str = 'https://object.pouta.csc.fi/OPUS-TED2020/v1/tmx/de-en.tmx.gz'

In [None]:
# Local working directory and the files this notebook reads from / writes to it.
DATA_PATH: Path = Path('data')
CORPUS_PATH: Path = DATA_PATH / 'de-en.tmx'    # decompressed TMX corpus
CORPUS_DE_PATH: Path = DATA_PATH / 'de.json'   # German output file
CORPUS_EN_PATH: Path = DATA_PATH / 'en.json'   # English output file

# Utils

In [None]:
def download_file(
    url: str,
    *,
    force: bool = False
) -> Path:
    """Download ``url`` into ``DATA_PATH`` and return the local file path.

    The local file name is the last path segment of ``url``. An already
    existing file is reused unless ``force`` is true.

    Args:
        url: Address of the file to fetch.
        force: Re-download even when the target file already exists.

    Returns:
        Path of the downloaded (or already cached) file.

    Raises:
        ValueError: If ``url`` has no file name, the server answers with an
            HTTP error status, or the transfer / local write fails.
    """
    file_name: str = url.split('/')[-1]
    if not file_name:
        raise ValueError(f'Cannot derive a file name from URL {url!r}')
    file_path: Path = DATA_PATH / file_name

    if file_path.exists() and not force:
        return file_path

    # Download into a sibling ".part" file first: a failed or interrupted
    # transfer must never leave a truncated file under the final name, because
    # the cache check above would happily reuse it on the next run.
    partial_path: Path = file_path.with_name(file_path.name + '.part')
    try:
        # (connect timeout, read timeout) in seconds, so a dead server cannot
        # hang the notebook forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            # Without this an HTTP error page would be stored as the corpus.
            response.raise_for_status()
            with partial_path.open('wb') as file:
                # Stream in 1 MiB chunks instead of buffering the whole body.
                for chunk in response.iter_content(chunk_size=1 << 20):
                    file.write(chunk)
        partial_path.replace(file_path)
    except (requests.RequestException, OSError) as e:
        partial_path.unlink(missing_ok=True)
        raise ValueError(f'An error occurred: {e}') from e

    return file_path

In [None]:
def unzip_file(
    file_path_in: Path,
    *,
    file_path_out: Optional[Path] = None,
    remove_gz: bool = False
) -> Path:
    """Decompress a ``.gz`` file and return the path of the decompressed file.

    Args:
        file_path_in: Path of the gzip archive; its suffix must be ``.gz``.
        file_path_out: Destination path. Defaults to ``file_path_in`` with the
            ``.gz`` suffix stripped.
        remove_gz: Delete the archive after it has been decompressed.

    Returns:
        Path of the decompressed file.

    Raises:
        ValueError: If ``file_path_in`` does not end in ``.gz``.
    """
    if file_path_in.suffix != '.gz':
        raise ValueError(f'File Path {file_path_in} is not a ".gz" file')

    if file_path_out is None:
        file_path_out = file_path_in.with_suffix('')  # Removes the .gz suffix

    # Copy in bounded chunks so the decompressed payload never has to fit in
    # memory all at once.
    with gzip.open(file_path_in, 'rb') as file_in, open(file_path_out, 'wb') as file_out:
        shutil.copyfileobj(file_in, file_out)

    if remove_gz:
        file_path_in.unlink()

    return file_path_out

In [None]:
def hgf_sentiment_analyser(
    inputs: list[str],
    *,
    device: Union[str, torch.device],
    tokenizer: Callable[..., Any],
    model: torch.nn.Module,
) -> torch.Tensor:
    """Return per-class probabilities for a batch of texts.

    Args:
        inputs: Texts to classify; the whole list is processed as one batch.
        device: Device the model and the encoded batch are moved to.
        tokenizer: Callable that is invoked with ``inputs`` plus
            ``return_tensors="pt"``, ``padding``, ``truncation`` and
            ``max_length`` and returns a mapping of tensors.
        model: Module that accepts the encoded tensors as keyword arguments
            and returns an object exposing ``.logits``.

    Returns:
        The softmax of the logits along dimension 1, as a tensor on
        ``device``. Callers that need plain Python lists call ``.tolist()``.
    """
    model.to(device)
    # Inference only: switch off dropout & co. so repeated calls on the same
    # text give the same probabilities even if the model was left in train mode.
    model.eval()

    encoded = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # The model and its inputs have to live on the same device.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits

    return torch.nn.functional.softmax(logits, dim=1)

# Download Data

In [None]:
# Create the data directory if needed. exist_ok avoids the check-then-create
# race, parents=True keeps this working if DATA_PATH ever becomes nested.
DATA_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# Fetch the gzipped corpus (download_file reuses an existing local copy) and
# decompress it. The output path is passed explicitly so the decompressed file
# is guaranteed to be the one the parsing cell reads from CORPUS_PATH.
CORPUS_ARCHIVE_PATH: Path = download_file(CORPUS_URL)
CORPUS_FILE_PATH: Path = unzip_file(CORPUS_ARCHIVE_PATH, file_path_out=CORPUS_PATH)

# Data

In [None]:
# Load every <tuv> element that carries an xml:lang attribute and expose its
# 'seg' column under the name 'text'.
CORPUS_DF: pd.DataFrame = pd.read_xml(
    CORPUS_PATH,
    xpath="//tuv[@xml:lang]",
).rename(columns={'seg': 'text'})

# Split the texts by language; a row is selected when its 'lang' value
# contains the language code.
CORPUS_DE_S: pd.Series = CORPUS_DF.loc[CORPUS_DF['lang'].str.contains('de'), 'text']
CORPUS_EN_S: pd.Series = CORPUS_DF.loc[CORPUS_DF['lang'].str.contains('en'), 'text']

# Sentiment Analysis (Inference)

https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment

In [None]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint so
# their vocabularies match.
_SENTIMENT_CHECKPOINT = "nlptown/bert-base-multilingual-uncased-sentiment"
TOKENIZER = AutoTokenizer.from_pretrained(_SENTIMENT_CHECKPOINT)  # TODO correct typing
MODEL = AutoModelForSequenceClassification.from_pretrained(_SENTIMENT_CHECKPOINT)  # TODO correct typing

In [None]:
# Column labels for the per-class probabilities, from '-2' up to '2'.
# NOTE(review): assumes the model emits exactly five classes - confirm against
# the model card.
SENTIMENT_COLUMNS = [str(score) for score in range(-2, 3)]

In [None]:
tqdm.pandas(desc='Progress German Texts:')
# Score one sentence per call. The sentence has to be wrapped in a one-element
# list: list(text) would split it into single characters, and the [0] below
# would then keep the sentiment of the first character only.
de_sentiments_df: pd.DataFrame = CORPUS_DE_S.progress_apply(
    lambda text: hgf_sentiment_analyser(
        [text], tokenizer=TOKENIZER, model=MODEL, device=DEVICE
    ).tolist()[0]
).apply(pd.Series)
de_sentiments_df.columns = SENTIMENT_COLUMNS
# Both frames share the index of CORPUS_DE_S, so the concat lines texts up
# with their probabilities.
CORPUS_DE_DF: pd.DataFrame = pd.concat([CORPUS_DE_S, de_sentiments_df], axis=1)

# NOTE(review): only the 'text' column is persisted; the sentiment columns
# computed above are not written - confirm this is intended.
with CORPUS_DE_PATH.open('w') as de_file:
    de_file.write(json.dumps(CORPUS_DE_DF['text'].tolist()))

In [None]:
tqdm.pandas(desc='Progress English Texts:')
# Score one sentence per call. The sentence has to be wrapped in a one-element
# list: list(text) would split it into single characters, and the [0] below
# would then keep the sentiment of the first character only.
en_sentiments_df: pd.DataFrame = CORPUS_EN_S.progress_apply(
    lambda text: hgf_sentiment_analyser(
        [text], tokenizer=TOKENIZER, model=MODEL, device=DEVICE
    ).tolist()[0]
).apply(pd.Series)
en_sentiments_df.columns = SENTIMENT_COLUMNS
# Both frames share the index of CORPUS_EN_S, so the concat lines texts up
# with their probabilities.
CORPUS_EN_DF: pd.DataFrame = pd.concat([CORPUS_EN_S, en_sentiments_df], axis=1)

# Write the English texts to the English file. Previously this cell opened
# en.json but wrote the German texts to de.json again, leaving en.json empty.
# NOTE(review): only the 'text' column is persisted; the sentiment columns
# computed above are not written - confirm this is intended.
with CORPUS_EN_PATH.open('w') as en_file:
    en_file.write(json.dumps(CORPUS_EN_DF['text'].tolist()))