# First look at the project

In [90]:
import pandas as pd
import numpy as np
import tqdm
import subprocess
import torch

## Functions to read and write text/tokens

In [58]:
def get_stream_size(stream):
    """Count the lines left in ``stream``, then rewind it to the start.

    Args:
        stream: an iterable, seekable file-like object.
    Returns: the number of items produced by iterating ``stream`` to exhaustion.
    """
    line_count = 0
    for _line in stream:
        line_count += 1
    # Counting consumed the stream; go back to offset 0 so it can be read again.
    stream.seek(0)
    return line_count

def read_token_file(file_name: str):
    """Read a whitespace-tokenized text file.

    Args:
        file_name: path of the file to read.
    Returns: a list with one entry per line of the file; each entry is the list
        of whitespace-separated tokens on that line (an empty list for a blank line).
    """
    with open(file_name, 'r') as stream:
        # str.split() with no separator already ignores leading/trailing
        # whitespace, so no separate strip() is needed.
        return [line.split() for line in stream]

def read_text_file(file_name: str):
    """Read a text file as a list of lines.

    Args:
        file_name: path of the file to read.
    Returns: a list with one string per line of the file, each stripped of
        leading and trailing whitespace (including the newline).
    """
    with open(file_name, 'r') as stream:
        return [line.strip() for line in stream]

def write_text_from_tokens(tokens, output_file):
    """Write token lists to a text file, one space-joined line per list.

    Args:
        tokens: an iterable of token sequences (each a sequence of strings).
        output_file: path of the file to create or overwrite.
    """
    # Lazily render each token sequence as a newline-terminated line.
    rendered_lines = (' '.join(sentence) + '\n' for sentence in tokens)
    with open(output_file, 'w+') as out_stream:
        out_stream.writelines(rendered_lines)
    

### Tokenized aligned texts

In [66]:
# Load both training files as lists of token lists (one inner list per line).
# NOTE(review): the variable names imply lang1 is English and lang2 is French — confirm.
en_token_file = read_token_file('data/train.lang1')
fr_token_file = read_token_file('data/train.lang2')

### Untokenized, unaligned texts

In [38]:
# Load the unaligned files as lists of whitespace-stripped lines (no tokenization).
en_text_file = read_text_file('data/unaligned.en')
fr_text_file = read_text_file('data/unaligned.fr')

### Write tokens to text file

In [67]:
# Write the token lists read from data/train.lang2 back out as plain text,
# one space-joined line per token list.
write_text_from_tokens(fr_token_file, 'french.txt')

### Function to compute bleu

In [85]:
def compute_bleu(pred_file_path: str, target_file_path: str, print_all_scores: bool):
    """Compute sentence-level BLEU scores by invoking the ``sacrebleu`` command.

    Args:
        pred_file_path: the file path that contains the predictions.
        target_file_path: the file path that contains the targets (also called references).
        print_all_scores: accepted for backward compatibility; currently ignored
            (this function prints nothing).
    Returns: a tuple ``(scores, average)``, where ``scores`` is the list of scores
        emitted by sacrebleu (one float per output line) and ``average`` is their
        arithmetic mean.
    Raises:
        RuntimeError: if sacrebleu exits with a non-zero status.
        ValueError: if sacrebleu emits no scores, or a line that is not a number.
    """
    # stderr is captured separately so that warnings or diagnostics are never
    # mistaken for score lines when parsing stdout.
    out = subprocess.run(["sacrebleu", "--input", pred_file_path, target_file_path, '--tokenize',
                          'none', '--sentence-level', '--score-only'],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if out.returncode != 0:
        raise RuntimeError(
            'sacrebleu exited with status {}: {}'.format(out.returncode, out.stderr.strip()))

    # Skip blank lines instead of assuming exactly one trailing newline, so the
    # last score is never dropped.
    scores = [float(line) for line in out.stdout.splitlines() if line.strip()]
    if not scores:
        raise ValueError('sacrebleu produced no scores')

    return scores, sum(scores) / len(scores)

### Compute bleu French vs. French

In [106]:
# Sanity check: score data/train.lang2 against itself.
# The third argument (print_all_scores) does not cause anything to be printed by compute_bleu.
each_score, score = compute_bleu('data/train.lang2', 'data/train.lang2', True)

In [107]:
score

100.0

### Compute bleu French vs. English

In [104]:
# Score data/train.lang2 (as predictions) against data/train.lang1 (as references).
# The third argument (print_all_scores) does not cause anything to be printed by compute_bleu.
each_score, score = compute_bleu('data/train.lang2', 'data/train.lang1', True)

In [105]:
score

1.7819545454545425

In [110]:
from enum import Enum

In [112]:
class Lang(Enum):
    """Languages (French and English), each valued by its lowercase name.

    NOTE(review): the previous docstring described these as features "which the
    dataloader can load"; no dataloader is visible in this part of the file —
    confirm the intended use.
    """

    french = 'french'
    english = 'english'