# Machine Translation Project
How might a (target) language change over time if MT is used persistently? We automatically
compare the MT output with the original human translation of a (large) set of texts to check
whether any words and/or structures are used less or more frequently.

## Setup

In [None]:
# Mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#%cd 'drive/My Drive/masters/mt/final-project'
# Ramon's path

/content/drive/My Drive/masters/mt/final-project


In [None]:
#%cd 'drive/My Drive/final-project'
# Marina's path

/content/drive/.shortcut-targets-by-id/16iRmuantAnTl-P7C_imCT43a940cQrdI/final-project


In [None]:
#%cd 'drive/My Drive/Colab Notebooks/final-project/europarl'
# Char's path

/content/drive/.shortcut-targets-by-id/16iRmuantAnTl-P7C_imCT43a940cQrdI/final-project/europarl


In [None]:
# Install the packages used by the translation step; no versions are pinned here
!pip install tqdm
!pip install sentencepiece
!pip install transformers
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
import pandas as pd
from tqdm import tqdm
from transformers import pipeline

## Translations

In [None]:
# read write operations
def read_corpus(filepath):
    """Read a text file and return its lines as a list of stripped strings.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, in file order, with
        leading/trailing whitespace (including the newline) removed. Blank
        lines are kept as empty strings.
    """
    # Decode explicitly as UTF-8 (the same encoding write_translations uses) so
    # accented characters do not depend on the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]


def write_translations(filepath, translations):
    """Write every sentence in `translations` to `filepath` as UTF-8, one per line."""
    with open(filepath, "w", encoding="utf-8") as out:
        out.writelines(sentence + "\n" for sentence in translations)

In [None]:
# Locations of the English and Spanish sides of the Europarl v7 es-en corpus
path_to_english_file = 'corpora/europarl-v7.es-en.en'
path_to_spanish_file = 'corpora/europarl-v7.es-en.es'

# Load both sides with the same reader (English first, then Spanish)
en_parl, es_parl = (
    read_corpus(corpus_path)
    for corpus_path in (path_to_english_file, path_to_spanish_file)
)

In [None]:
# Don't rerun (30 minutes or +)
es_en_model = 'Helsinki-NLP/opus-mt-es-en'
# NOTE(review): device=0 presumably selects the first GPU - confirm the runtime has one
translator = pipeline(task="translation", model=es_en_model, device=0)

# Translate the first 1000 Spanish sentences one call at a time; whatever the
# pipeline returns for a sentence is stored unchanged as one entry of tr_en_texts
tr_en_texts = [translator(sentence) for sentence in tqdm(es_parl[0:1000])]

100%|██████████| 1000/1000 [29:55<00:00,  1.80s/it]


In [None]:
# Pull the translated string out of each stored pipeline result
translations = [result[0]["translation_text"] for result in tr_en_texts]

# Quick sanity check on the first sentence, then persist everything to disk
print(translations[0])
write_translations("translations_europarl_es_en.txt", translations)

Resumption of the session


## Creation of a comparison file

In [None]:
# Spanish source side: the first 1000 entries of es_parl
spanish = es_parl[:1000]

In [None]:
# Human English side: the first 1000 entries of en_parl
english = en_parl[:1000]

In [None]:
# Load the machine translations back from the text file (one sentence per line)
translations = read_corpus("translations_europarl_es_en.txt")

In [None]:
# One row per sentence: Spanish source, human English, machine English
comparison_columns = {
    'spanish': spanish,
    'english_human': english,
    'english_mt': translations,
}
comparison_df = pd.DataFrame(comparison_columns,
                             columns=['spanish', 'english_human', 'english_mt'])


In [None]:
# Display the comparison table (the rendered output elides the middle rows)
comparison_df

Unnamed: 0,spanish,english_human,english_mt
0,Reanudación del período de sesiones,Resumption of the session,Resumption of the session
1,Declaro reanudado el período de sesiones del P...,I declare resumed the session of the European ...,I declare resumed the session of the European ...
2,"Como todos han podido comprobar, el gran ""efec...","Although, as you will have seen, the dreaded '...","As everyone has seen, the great ""effect of the..."
3,Sus Señorías han solicitado un debate sobre el...,You have requested a debate on this subject in...,You have asked for a debate on the subject in ...
4,"A la espera de que se produzca, de acuerdo con...","In the meantime, I should like to observe a mi...","Waiting for this to happen, according to many ..."
...,...,...,...
995,"Quiero felicitarlo, pese a que, lamentablement...","I want to congratulate him, sadly in his absen...","I would like to congratulate him, although he ..."
996,Antes ha dicho que el informe se ha aprobado c...,He mentioned earlier that the report was carri...,He said earlier that the report had been adopt...
997,"De modo que, aunque puedo no compartir sus con...","So although I may not share his conclusions, I...","So, although I may not share your conclusions,..."
998,La primera es las posibilidades de renacionali...,The first is the potential for renationalisati...,The first is the possibility of renationalisat...


In [None]:
# index=True also writes the row index as an extra, unnamed first column of the CSV
comparison_df.to_csv('comparison.csv', index=True)

## Automatic analysis

### Comparison of unigrams (words)

In [None]:
from collections import Counter
import nltk
from nltk import word_tokenize

nltk.download('punkt')


# Lowercasing tokenizer
def tokenize(text):
    """Lowercase `text` and split it into tokens with NLTK's word_tokenize."""
    lowered = text.lower()
    return word_tokenize(lowered)

# Corpus-level token counts
def count_word_frequencies(texts):
    """Return a Counter of how often each token occurs across all of `texts`."""
    frequencies = Counter()
    for text in texts:
        # Counter.update adds one count per token yielded by the tokenizer
        frequencies.update(tokenize(text))
    return frequencies

# Calculate word frequencies for human translations
human_word_freq = count_word_frequencies(comparison_df['english_human'])

# Calculate word frequencies for machine translations
mt_word_freq = count_word_frequencies(comparison_df['english_mt'])

# Words that occur in both corpora (name kept for any later cell that uses it)
common_words = set(human_word_freq.keys()) & set(mt_word_freq.keys())

# Compare over the union of both vocabularies: a word that only one side uses
# is the most extreme shift and must not be dropped. A Counter returns 0 for a
# missing key, so the lookups below are safe for one-sided words.
compared_words = set(human_word_freq.keys()) | set(mt_word_freq.keys())

# Collect every word whose raw count differs between the two corpora
differences = {}
for word in compared_words:
    human_freq = human_word_freq[word]
    mt_freq = mt_word_freq[word]
    if human_freq != mt_freq:
        differences[word] = (human_freq, mt_freq)

# Minimum absolute difference in raw counts for a word to be reported
threshold = 10

# Output the significant differences, largest gap first (ties alphabetical) so
# the listing is the same on every run instead of following set iteration order
print("Significant differences in word frequencies:")
ranked_differences = sorted(
    differences.items(),
    key=lambda item: (-abs(item[1][0] - item[1][1]), item[0]),
)
for word, (human_freq, mt_freq) in ranked_differences:
    if abs(human_freq - mt_freq) >= threshold:
        print(f"Word: {word}, Human Frequency: {human_freq}, MT Frequency: {mt_freq}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Significant differences in word frequencies:
Word: (, Human Frequency: 36, MT Frequency: 49
Word: be, Human Frequency: 232, MT Frequency: 201
Word: something, Human Frequency: 16, MT Frequency: 4
Word: ladies, Human Frequency: 6, MT Frequency: 28
Word: is, Human Frequency: 444, MT Frequency: 424
Word: on, Human Frequency: 300, MT Frequency: 311
Word: safety, Human Frequency: 46, MT Frequency: 35
Word: ', Human Frequency: 98, MT Frequency: 5
Word: you, Human Frequency: 84, MT Frequency: 70
Word: gentlemen, Human Frequency: 6, MT Frequency: 28
Word: regional, Human Frequency: 59, MT Frequency: 47
Word: when, Human Frequency: 42, MT Frequency: 30
Word: particularly, Human Frequency: 35, MT Frequency: 19
Word: from, Human Frequency: 55, MT Frequency: 65
Word: that, Human Frequency: 423, MT Frequency: 509
Word: way, Human Frequency: 37, MT Frequency: 19
Word: this, Human Frequency: 358, MT Frequency: 325
Word: to, Human Frequency: 941, MT Frequency: 887
Word: first, Human Frequency: 28, MT 

### Estimating lexical richness: Perplexity & Type-Token Ratio

In [None]:
import numpy as np
from collections import Counter

def calculate_unigram_probabilities(texts):
    """Return a dict mapping each token to its relative frequency in `texts`.

    Each value is the token's count divided by the total number of tokens
    counted by count_word_frequencies.
    """
    counts = count_word_frequencies(texts)
    corpus_size = sum(counts.values())  # total number of tokens in the corpus

    probabilities = {}
    for word, count in counts.items():
        probabilities[word] = count / corpus_size
    return probabilities

def perplexity(probs):
    """Perplexity (2 ** Shannon entropy, in bits) of a probability distribution.

    Args:
        probs: Sequence of probabilities, e.g. the values of a unigram model.

    Returns:
        2 ** H as a float, where H = -sum(p * log2(p)). For a uniform
        distribution over V outcomes this equals V. Returns NaN for an empty
        sequence.
    """
    distribution = np.asarray(probs, dtype=float)
    if distribution.size == 0:
        return float('nan')

    # Zero-probability outcomes contribute nothing (0 * log 0 is taken as 0);
    # dropping them also avoids log2(0) = -inf.
    distribution = distribution[distribution > 0]

    # Each log-probability is weighted by its own probability. An unweighted
    # mean over word types gives rare words the same weight as frequent ones
    # and inflates the result beyond the vocabulary size.
    entropy = -np.sum(distribution * np.log2(distribution))
    return float(2.0 ** entropy)

def ttr(texts):  # type-token ratio
    """Return the type-token ratio of `texts`: distinct tokens / total tokens."""
    # Flatten the whole corpus into one token list
    tokens = [token for text in texts for token in tokenize(text)]

    # Types are the distinct tokens
    types = set(tokens)

    return len(types) / len(tokens)

# Human translations: unigram probabilities, perplexity and type-token ratio
human_probs = calculate_unigram_probabilities(comparison_df['english_human'])
human_perp = perplexity(list(human_probs.values()))
human_ttr = ttr(comparison_df['english_human'])

# Machine translations: the same three measurements
mt_probs = calculate_unigram_probabilities(comparison_df['english_mt'])
mt_perp = perplexity(list(mt_probs.values()))
mt_ttr = ttr(comparison_df['english_mt'])

# Comparison time!
# TODO: does this make any sense? what is our baseline?
print(f"Human texts PERPLEXITY: {human_perp} | MT texts PERPLEXITY: {mt_perp}")
# TODO: 11.7% vs 11.4% - significant?
print(f"Human texts TTR: {human_ttr} | MT texts TTR: {mt_ttr}")

Human texts PERPLEXITY: 13009.892610054694 | MT texts PERPLEXITY: 12575.198151663177
Human texts TTR: 0.11719503675859086 | MT texts TTR: 0.11438174781014464


### Comparison of bigrams

In [None]:
from nltk import bigrams
from string import punctuation

from nltk.corpus import stopwords

# Download NLTK's stopword corpus and keep its English word list
nltk.download('stopwords')
stop_words = stopwords.words('english')


# Function to count bigram frequencies, excluding stopwords and punctuation
def count_bigram_frequencies_clean(texts):
    """Count bigrams over `texts` after removing stopwords and punctuation tokens.

    Args:
        texts: Iterable of strings; each one is tokenized with `tokenize`.

    Returns:
        A Counter mapping (token, token) tuples to their number of occurrences.
        Bigrams are formed within each text after filtering, so two tokens that
        were separated only by removed tokens become adjacent.
    """
    stop_set = set(stop_words)  # set membership instead of scanning the list per token
    punct_chars = set(punctuation)

    bigram_counts = Counter()
    for text in texts:
        # A token is punctuation when every character is a punctuation mark.
        # `token not in punctuation` was a substring test against the string,
        # so multi-character tokens such as `` '' -- ... were not removed.
        tokens = [
            token for token in tokenize(text)
            if token not in stop_set
            and not all(char in punct_chars for char in token)
        ]
        bigram_counts.update(bigrams(tokens))
    return bigram_counts

# Bigram counts per corpus (stopwords and punctuation excluded)
human_bigram_freq_clean = count_bigram_frequencies_clean(comparison_df['english_human'])
mt_bigram_freq_clean = count_bigram_frequencies_clean(comparison_df['english_mt'])

# The 50 most frequent bigrams of each corpus
top_50_human_bigrams_clean = human_bigram_freq_clean.most_common(50)
top_50_mt_bigrams_clean = mt_bigram_freq_clean.most_common(50)

# Print both rankings, human first, each under its own heading
for heading, ranking in (
    ("Top 50 meaningful bigrams for human translations:", top_50_human_bigrams_clean),
    ("\nTop 50 meaningful bigrams for machine translations:", top_50_mt_bigrams_clean),
):
    print(heading)
    for bigram, freq in ranking:
        print(f"Bigram: {bigram}, Frequency: {freq}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Top 50 meaningful bigrams for human translations:
Bigram: ('would', 'like'), Frequency: 61
Bigram: ('mr', 'president'), Frequency: 60
Bigram: ('member', 'states'), Frequency: 48
Bigram: ('state', 'aid'), Frequency: 47
Bigram: ('european', 'union'), Frequency: 33
Bigram: ('madam', 'president'), Frequency: 29
Bigram: ('competition', 'policy'), Frequency: 29
Bigram: ('regional', 'policy'), Frequency: 25
Bigram: ('structural', 'funds'), Frequency: 24
Bigram: ('dangerous', 'goods'), Frequency: 22
Bigram: ('president', 'commissioner'), Frequency: 22
Bigram: ('president', 'would'), Frequency: 17
Bigram: ('transport', 'dangerous'), Frequency: 15
Bigram: ('white', 'paper'), Frequency: 15
Bigram: ('internal', 'market'), Frequency: 14
Bigram: ('economic', 'social'), Frequency: 14
Bigram: ('european', 'parliament'), Frequency: 13
Bigram: ('mr', 'koch'), Frequency: 13
Bigram: ('like', 'thank'), Frequency: 11
Bigram: ('rural', 'areas'), Frequency: 11
Bigram: ('periodic', 'report'), Frequency: 11
Big

### Comparison of trigrams

In [None]:
from nltk import trigrams

# Function to count trigram frequencies, excluding stopwords and punctuation
def count_trigram_frequencies_clean(texts):
    """Count trigrams over `texts` after removing stopwords and punctuation tokens.

    Args:
        texts: Iterable of strings; each one is tokenized with `tokenize`.

    Returns:
        A Counter mapping (token, token, token) tuples to their number of
        occurrences. Trigrams are formed within each text after filtering, so
        tokens separated only by removed tokens become adjacent.
    """
    stop_set = set(stop_words)  # set membership instead of scanning the list per token
    punct_chars = set(punctuation)

    trigram_counts = Counter()
    for text in texts:
        # A token is punctuation when every character is a punctuation mark.
        # `token not in punctuation` was a substring test against the string,
        # so multi-character tokens such as `` '' -- ... were not removed.
        tokens = [
            token for token in tokenize(text)
            if token not in stop_set
            and not all(char in punct_chars for char in token)
        ]
        trigram_counts.update(trigrams(tokens))
    return trigram_counts

# Trigram counts per corpus (stopwords and punctuation excluded)
human_trigram_freq_clean = count_trigram_frequencies_clean(comparison_df['english_human'])
mt_trigram_freq_clean = count_trigram_frequencies_clean(comparison_df['english_mt'])

# The 50 most frequent trigrams of each corpus
top_50_human_trigrams_clean = human_trigram_freq_clean.most_common(50)
top_50_mt_trigrams_clean = mt_trigram_freq_clean.most_common(50)

# Print both rankings, human first, each under its own heading
for heading, ranking in (
    ("Top 50 meaningful trigrams for human translations:", top_50_human_trigrams_clean),
    ("\nTop 50 meaningful trigrams for machine translations:", top_50_mt_trigrams_clean),
):
    print(heading)
    for trigram, freq in ranking:
        print(f"Trigram: {trigram}, Frequency: {freq}")


Top 50 meaningful trigrams for human translations:
Trigram: ('mr', 'president', 'commissioner'), Frequency: 22
Trigram: ('transport', 'dangerous', 'goods'), Frequency: 15
Trigram: ('president', 'would', 'like'), Frequency: 12
Trigram: ('dangerous', 'goods', 'road'), Frequency: 10
Trigram: ('mr', 'president', 'would'), Frequency: 9
Trigram: ('sixth', 'periodic', 'report'), Frequency: 9
Trigram: ('madam', 'president', 'would'), Frequency: 8
Trigram: ('would', 'also', 'like'), Frequency: 8
Trigram: ('committee', 'regional', 'policy'), Frequency: 8
Trigram: ('economic', 'social', 'cohesion'), Frequency: 8
Trigram: ('committee', 'economic', 'monetary'), Frequency: 8
Trigram: ('economic', 'monetary', 'affairs'), Frequency: 8
Trigram: ('group', 'party', 'european'), Frequency: 7
Trigram: ('party', 'european', 'socialists'), Frequency: 7
Trigram: ('regional', 'policy', 'transport'), Frequency: 7
Trigram: ('would', 'like', 'say'), Frequency: 6
Trigram: ('policy', 'transport', 'tourism'), Freque

## Comparison file for manual linguistic analysis

In [None]:
import pandas as pd

# Reload the comparison table from disk
comparison_df = pd.read_csv('comparison.csv')

# Human translations: rename the English column to 'text' and tag the rows
human_rows = (
    comparison_df[['spanish', 'english_human']]
    .rename(columns={'english_human': 'text'})
    .assign(category='human-translation')
)

# Machine translations: same shape, tagged with the other category
mt_rows = (
    comparison_df[['spanish', 'english_mt']]
    .rename(columns={'english_mt': 'text'})
    .assign(category='machine-translation')
)

# Stack the two corpora (human rows first) with a fresh 0..n-1 index
corpus_df = pd.concat([human_rows, mt_rows], ignore_index=True)

# Save the combined corpus without the index column
corpus_df.to_csv('corpus_with_category-europarl.csv', index=False)

## Scattertext

In [None]:
# Install Scattertext; no version is pinned here
!pip install Scattertext

Collecting Scattertext
  Downloading scattertext-0.1.19-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting flashtext (from Scattertext)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flashtext
  Building wheel for flashtext (setup.py) ... [?25l[?25hdone
  Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9296 sha256=6d1cf420ba1854831ab9653bf71837caca67efef8f0ba3018b03a856ad773b5c
  Stored in directory: /root/.cache/pip/wheels/bc/be/39/c37ad168eb2ff644c9685f52554440372129450f0b8ed203dd
Successfully built flashtext
Installing collected packages: flashtext, Scattertext
Successfully installed Scattertext-0.1.19 flashtext-2.7


In [None]:
import pandas as pd
import scattertext as st

# Load the combined corpus (columns used here: 'text' and 'category')
corpus_df = pd.read_csv('corpus_with_category-europarl.csv')

# creating a Scattertext Corpus; documents are grouped by the 'category' column
# and parsed with Scattertext's whitespace-based tokenizer
corpus = st.CorpusFromPandas(corpus_df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences
                             ).build()

# generate the HTML visualization, contrasting 'human-translation' against
# every other category in the corpus
html = st.produce_scattertext_explorer(corpus,
                                       category='human-translation',
                                       category_name='Human Translation',
                                       not_category_name='Machine Translation',
                                       width_in_pixels=1000)

# Save the HTML file. The encoding is set explicitly so the output does not
# depend on the platform default, which can fail on non-ASCII characters.
with open('mt-scattertext_visualization-2.html', 'w', encoding='utf-8') as f:
    f.write(html)