In [4]:
import os

root_dir = '/mnt/d/Libraries/University/bachelorarbeit'
gender_bias_dir = root_dir + '/mt_gender_german'

os.chdir(root_dir)
# Auto install the required git repositories:
# fast_align module required in the evaluations
!git clone https://github.com/clab/fast_align.git

fast_align_dir = root_dir + '/fast_align';

Cloning into 'fast_align'...
remote: Enumerating objects: 213, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 213 (delta 2), reused 4 (delta 2), pack-reused 204[K
Receiving objects: 100% (213/213), 70.68 KiB | 565.00 KiB/s, done.
Resolving deltas: 100% (110/110), done.


In [None]:
# List the contents of both projects to see if it worked.
print(gender_bias_dir, fast_align_dir)
!ls {gender_bias_dir} -al
!ls {fast_align_dir} -al

In [5]:

def install_fast_align():
  """ Install fast_align from a directory in your Google Drive.
  """

  dir = fast_align_dir;

  if os.path.isfile(dir + '/build/fast_align'):
    return dir;

  %cd {dir}
  !echo "Current dir: $(pwd)"

  # Install required libs.
  !apt-get install libgoogle-perftools-dev libsparsehash-dev

  # Build instructions from https://github.com/clab/fast_align
  %mkdir -p build
  %cd build
  !cmake -S ..
  !make

install_fast_align()

/mnt/d/Libraries/University/bachelorarbeit/fast_align
Current dir: /mnt/d/Libraries/University/bachelorarbeit/fast_align
E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?
/mnt/d/Libraries/University/bachelorarbeit/fast_align/build
  cmake_minimum_required() should be called prior to this top-level project()
  call.  Please see the cmake-commands(7) manual for usage documentation of
  both commands.
[0m
-- The C compiler identification is GNU 9.3.0
-- The CXX compiler identification is GNU 9.3.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- De

# Functions

In [6]:
import os
import time
from IPython import get_ipython
ipython = get_ipython()

def evaluate_language(source, stereotype, destination, translator):
  """ Run a bash evaluation script from the repo to determine gender bias in the source file compared to the destination.

  For example evaluating 'en to 'es' with translator 'google' looks like:
  1. It translates the input file first if it does not exist yet, which creates /translations/google/en-es.txt.
  2. It then tries to align the translated file using fast_align, which creates /src/forward/en-es.align. I believe the alignment process
  is about mapping words from one language to another.
  3. The alignments are evaluated, and the output is stored in /output/en-es.txt.

  :param str source: The input, which is a language file from the /data/aggregates folder, e.g. use "en" for the "en.txt" file.
  :param str destination: A language file from the /data/aggregates folder, e.g. use "en" for the "en.txt" file.
  :param str translator: One of "google", "bing", "aws", "sota", "systran".
  """

  fast_align_path = install_fast_align()
  %env FAST_ALIGN_BASE={fast_align_path}

  timestamp = time.strftime("%Y%m%d-%H%M%S");

  # Define the source file to be a txt file in the aggregates folder.
  source_file = '/data/aggregates/' + source + '.txt';
  # Define the output file to be in the "output" directory in the root of the GenderBias project.
  os.chdir(gender_bias_dir)
  exec(ipython.transform_cell('!mkdir -p output'))
  os.chdir(gender_bias_dir + '/output')
  exec(ipython.transform_cell('!mkdir -p ' + translator))
  os.chdir(gender_bias_dir + '/output/' + translator)
  exec(ipython.transform_cell('!mkdir -p ' + destination))
  os.chdir(gender_bias_dir + '/output/' + translator + '/' + destination)
  exec(ipython.transform_cell('!mkdir -p ' + source))


  output_file = '../output/' + translator + '/' + destination + '/' +  source + '/' + timestamp + '.txt'

  os.chdir(gender_bias_dir + '/src')

  # Execute script from the repo.
  exec(ipython.transform_cell('!../scripts/evaluate_language.sh {gender_bias_dir}{source_file} {stereotype} {destination} {translator} > {output_file}'))


  print('Output file: ' + gender_bias_dir + '/src/' + output_file);
  pass

## Setup API keys
If you are going to evaluate a language pair that has no translation file yet (for example `/translations/google/en-es.txt`), you need to add an API key for the corresponding service.

After a translation task, the output is automatically saved in the project folder, so it does not need to be run again (unless you remove it).

In [None]:
# Set default region for AWS services (eu-central-1 is Frankfurt; London would be eu-west-2).
%env AWS_DEFAULT_REGION=eu-central-1
# Set AWS user (restricted to using AWS Translate only).
# Fill these in locally before running; never commit real keys with the notebook.
%env AWS_ACCESS_KEY_ID=
%env AWS_SECRET_ACCESS_KEY=

## Evaluation

In [10]:
# (source file, stereotype) pairs evaluated for every language/translator combination.
dataset_variants = [
  ('de', 'none'),
  ('de_anti', 'anti'),
  ('de_pro', 'pro'),
  ('de_anti_wmt', 'anti_wmt'),
  ('de_pro_wmt', 'pro_wmt'),
]

# Each entry pairs a group of target languages with the translators run for it;
# the second group is evaluated without 'deepl'.
evaluation_groups = [
  (['es', 'it', 'fr', 'uk', 'ru'], ['deepl', 'bing', 'google', 'systran', 'aws']),
  (['ar', 'he'], ['bing', 'google', 'systran', 'aws']),
]

for languages, translators in evaluation_groups:
  for lang in languages:
    for model in translators:
      for source_name, stereotype_name in dataset_variants:
        evaluate_language(source=source_name, stereotype=stereotype_name, destination=lang, translator=model)

# Single-pair run, handy while debugging:
# evaluate_language(source='de', stereotype='none', destination='ru', translator='systran')

env: FAST_ALIGN_BASE=/mnt/d/Libraries/University/bachelorarbeit/fast_align


ARG=i
ARG=d
ARG=o
ARG=v
INITIAL PASS 
expected target length = source length * 1.03797
ITERATION 1
  log_e likelihood: -88695.6
  log_2 likelihood: -127961
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.173144
       size counts: 78
ITERATION 2
  log_e likelihood: -20341.6
  log_2 likelihood: -29346.7
     cross entropy: 6.8567
        perplexity: 115.897
      posterior p0: 0.0477767
 posterior al-feat: -0.100233
       size counts: 78
  1  model al-feat: -0.166045 (tension=4)
  2  model al-feat: -0.138007 (tension=5.31624)
  3  model al-feat: -0.125188 (tension=6.07172)
  4  model al-feat: -0.117785 (tension=6.57084)
  5  model al-feat: -0.113021 (tension=6.92189)
  6  model al-feat: -0.109758 (tension=7.17765)
  7  model al-feat: -0.107435 (tension=7.36816)
  8  model al-feat: -0.105736 (tension=7.51221)
     final tension: 7.62227
ITERATION 3
  log_e likelihood: -11402.2
  log_2 likelihood: -16449.9
     cross entropy: 3.84343
