# Set Up

In [None]:
from urllib import request
import os
import re
import pandas as pd
import csv
import numpy as np
import time

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Set your working directory to a folder in your Google Drive. This way, if your notebook times out,
# your files will be saved in your Google Drive!

# the base Google Drive directory
root_dir = "/content/drive/Shared drives/"

# choose where you want your project files to be saved
project_folder = "MS - Translation/Gemini/Testing for CS 224N"

def create_and_set_working_directory(project_folder, base_dir=None):
  """Make sure the project folder exists and switch the working directory to it.

  Args:
    project_folder: Folder path relative to the base directory. It may
      contain several levels (e.g. "a/b/c").
    base_dir: Directory the project folder lives under. When None (the
      default) the notebook-level `root_dir` is used, so existing
      one-argument calls behave as before.

  Side effects:
    Creates the folder (and any missing parent folders) if it does not exist,
    prints a notice when it had to be created, and changes the process
    working directory to it.
  """
  # Resolve the base lazily so `root_dir` is only required when no explicit
  # base directory is supplied.
  if base_dir is None:
    base_dir = root_dir
  target_dir = os.path.join(base_dir, project_folder)

  # check if your project folder exists. if not, it will be created.
  if not os.path.isdir(target_dir):
    # os.mkdir fails when an intermediate folder is missing; makedirs builds
    # the whole chain, and exist_ok avoids an error if the folder appears
    # between the check and the creation.
    os.makedirs(target_dir, exist_ok=True)
    print(target_dir + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(target_dir)

create_and_set_working_directory(project_folder)

In [None]:
!pip install -q -U google-generativeai

In [None]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Wrap `text` in a Markdown display object rendered as a block quote.

  Every '•' character is replaced by '  *', then each line (blank lines
  included, because the predicate always returns True) is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

In [None]:
# Used to securely store your API key
from google.colab import userdata

In [None]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
# Print the name of every listed model that supports 'generateContent'.
for listed_model in genai.list_models():
  supports_generation = 'generateContent' in listed_model.supported_generation_methods
  if not supports_generation:
    continue
  print(listed_model.name)

In [None]:
model = genai.GenerativeModel('gemini-1.0-pro-latest')

In [None]:
%%time
response = model.generate_content("Translate this sentence into English: Allâhu ‘azîmü’ş-şân hazretleri kendü kullarını yokdan var edüb her kesin ‘ömrünü ve her ahvâlini kendü kudreti ile takdîr kılub gerek hayr u şer insanın başına ne gelürse Allâhu zü’l-celâlden bilinmelüdür.")

CPU times: user 50.7 ms, sys: 9.29 ms, total: 60 ms
Wall time: 2.36 s


In [None]:
to_markdown(response.text)

> Allah, may His glory be glorified, created His servants from nothingness, decided the life and fate of every being with His power, whatever good and evil befalls a person must be known to be from Allah, the Possessor of Majesty.

In [None]:
!pip install sacrebleu
from sacrebleu import corpus_bleu, corpus_chrf

# Baseline 1: With Safety Settings

In [None]:
def translate_and_evaluate(input_csv_path, output_csv_path, model, safety_settings=None):
    """Translate each sentence of a CSV with `model` and score the results.

    Input rows are read as: column 0 = source sentence, column 1 = reference
    English translation. For every usable row, one row is written to
    `output_csv_path`: [source sentence, model translation], where the
    translation is the literal text 'Translation not available' if none was
    obtained.

    Rows are skipped (no request is sent, nothing is written) when they are
    blank, when the source sentence is empty after stripping, or when the
    reference column is missing (a notice is printed for the latter).

    Args:
        input_csv_path: Path of the UTF-8 CSV to read.
        output_csv_path: Path of the UTF-8 CSV to write (overwritten).
        model: Object exposing `generate_content(prompt, ...)` whose result
            has a `.text` attribute.
        safety_settings: Optional value forwarded as
            `generate_content(..., safety_settings=...)`. When None (the
            default) the keyword is not passed at all, so the call is the
            same as before this parameter existed.

    Returns:
        ((bleu_all, chrf_all), (bleu_filtered, chrf_filtered)). "all" scores
        every translated row, counting a missing translation as an empty
        string; "filtered" scores only rows with a non-empty translation.
        A pair is (0, 0) when there is nothing to score.
    """
    all_references = []
    all_hypotheses = []
    valid_references = []
    valid_hypotheses = []

    # Only forward safety_settings when the caller supplied them.
    generation_kwargs = {} if safety_settings is None else {'safety_settings': safety_settings}

    # newline='' is what the csv module expects for file objects it reads.
    with open(input_csv_path, 'r', newline='', encoding='utf-8') as read_file, \
            open(output_csv_path, 'w', newline='', encoding='utf-8') as write_file:
        csv_reader = csv.reader(read_file)
        csv_writer = csv.writer(write_file)

        for counter, line in enumerate(csv_reader):
            print(counter)

            # csv.reader yields an empty list for a blank line.
            input_sentence = line[0].strip() if line else ''
            if not input_sentence:
                continue

            # Check for the reference before calling the model, so a malformed
            # row does not cost a request and then abort the whole run.
            if len(line) < 2:
                print(f"Row {counter} has no reference translation column; skipped.")
                continue
            reference = line[1].strip()  # second column is the reference translation

            full_prompt = "Translate this sentence into English: " + input_sentence
            try:
                response = model.generate_content(full_prompt, **generation_kwargs)
                response_text = response.text
                output_text = response_text if response_text.strip() else ''
            except (ValueError, AttributeError):
                # No usable text could be read from the response.
                output_text = ''

            csv_writer.writerow([input_sentence, output_text if output_text else 'Translation not available'])

            all_references.append(reference)
            all_hypotheses.append(output_text)

            if output_text:
                valid_references.append(reference)
                valid_hypotheses.append(output_text)

    # Calculate BLEU and chrF scores for all translations, treating 'no translation' as an empty string.
    # Guarded like the filtered scores so an input with no usable rows yields 0 instead of scoring an empty corpus.
    if all_hypotheses:
        bleu_score_all = corpus_bleu(all_hypotheses, [all_references]).score
        chrf_score_all = corpus_chrf(all_hypotheses, [all_references]).score
    else:
        bleu_score_all = chrf_score_all = 0

    # Calculate BLEU and chrF scores for translations not labeled as 'Translation not available' (non-empty translations)
    if valid_hypotheses:
        bleu_score_filtered = corpus_bleu(valid_hypotheses, [valid_references]).score
        chrf_score_filtered = corpus_chrf(valid_hypotheses, [valid_references]).score
    else:
        bleu_score_filtered = chrf_score_filtered = 0

    print(f"All translations - BLEU score: {bleu_score_all}, chrF score: {chrf_score_all}")
    print(f"Filtered (valid translations only) - BLEU score: {bleu_score_filtered}, chrF score: {chrf_score_filtered}")

    return (bleu_score_all, chrf_score_all), (bleu_score_filtered, chrf_score_filtered)




In [None]:
translate_and_evaluate('test_manuscript.csv', 'gemini_manuscript.csv', model)

All translations - BLEU score: 8.86924897664771, chrF score: 36.546398551524575
Filtered (valid translations only) - BLEU score: 9.057007633205238, chrF score: 39.55344621266212

This took 16 mins

((8.86924897664771, 36.546398551524575),
 (9.057007633205238, 39.55344621266212))

In [None]:
translate_and_evaluate('test_novel.csv', 'gemini_novel.csv', model)

All translations - BLEU score: 10.831756727112259, chrF score: 34.69801347643916
Filtered (valid translations only) - BLEU score: 10.967800690810428, chrF score: 37.25066779332506

((10.831756727112259, 34.69801347643916),
 (10.967800690810428, 37.25066779332506))

This took 1h 24m 30s


In [None]:
translate_and_evaluate('osmanaga_test.csv', 'gemini_osmanaga.csv', model)

All translations - BLEU score: 7.014749796735597, chrF score: 32.07414759239556
Filtered (valid translations only) - BLEU score: 7.848205967855889, chrF score: 36.6151500479395

((7.014749796735597, 32.07414759239556), (7.848205967855889, 36.6151500479395))

This took 24 mins

# Baseline 2: Without Safety Settings

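Motivation: for the same prompt, Gemini Pro sometimes returns a response and sometimes does not. See this community thread describing the problem: https://www.googlecloudcommunity.com/gc/AI-ML/Gemini-Pro-for-the-same-prompt-sometimes-it-returns-a-response/td-p/703155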

In [None]:
# One entry per harm category, each requesting the 'block_none' threshold
# (used for the "without safety settings" baseline).
safety_settings = [
    {"category": harm_category, "threshold": 'block_none'}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

In [None]:
def translate_and_evaluate_no_safety(input_csv_path, output_csv_path, model):
    """Run `translate_and_evaluate` with the notebook's `safety_settings` applied.

    This function used to be a line-for-line copy of `translate_and_evaluate`
    whose only difference was passing `safety_settings=safety_settings` to
    `model.generate_content`. It now reuses that function by handing it a thin
    adapter around `model` that adds the keyword to every request, so reading,
    writing, scoring and printing live in one place.

    Requires the cells defining `translate_and_evaluate` and `safety_settings`
    to have been run.

    Args:
        input_csv_path: Path of the CSV to read (column 0 = source sentence,
            column 1 = reference translation).
        output_csv_path: Path of the CSV to write.
        model: Object exposing `generate_content(prompt, safety_settings=...)`.

    Returns:
        Whatever `translate_and_evaluate` returns:
        ((bleu_all, chrf_all), (bleu_filtered, chrf_filtered)).
    """

    class _SafetySettingsModel:
        """Forwards `generate_content` to `model`, adding the safety settings."""

        def generate_content(self, prompt):
            # `safety_settings` is the notebook-level list, looked up at call
            # time exactly as the original implementation did.
            return model.generate_content(prompt, safety_settings=safety_settings)

    return translate_and_evaluate(input_csv_path, output_csv_path, _SafetySettingsModel())



In [None]:
translate_and_evaluate_no_safety('test_manuscript.csv', 'gemini_manuscript_no_safety.csv', model)

All translations - BLEU score: 9.040167708707436, chrF score: 38.9620815453946
Filtered (valid translations only) - BLEU score: 9.040167708707436, chrF score: 39.044613686393944

((9.040167708707436, 38.9620815453946),
 (9.040167708707436, 39.044613686393944))

In [None]:
translate_and_evaluate_no_safety('test_novel.csv', 'gemini_novel_no_safety.csv', model)

All translations - BLEU score: 11.10913027990284, chrF score: 37.32569185362411
Filtered (valid translations only) - BLEU score: 11.10913027990284, chrF score: 37.37718483075166

((11.10913027990284, 37.32569185362411),
 (11.10913027990284, 37.37718483075166))

 This took 1h 50m 40s

In [None]:
translate_and_evaluate_no_safety('osmanaga_test.csv', 'gemini_osmanaga_no_safety.csv', model)

This run was done in two parts because of a timeout: 56 + 7 mins.

All translations - BLEU score: 7.8421801129838675, chrF score: 36.40565274349136
Filtered (non-empty and valid translations) - BLEU score: 7.8421801129838675, chrF score: 36.59685268728821

((7.8421801129838675, 36.40565274349136),
 (7.8421801129838675, 36.59685268728821))

The cells below score previously saved translations separately, without re-running the translation step.

In [None]:
def evaluate_translations(input_csv_path, output_file_path):
    """Score previously saved translations with BLEU and chrF.

    Input rows are read as: column 1 = reference translation, column 2 =
    model translation (column 0 is not used). A model translation that is
    empty, missing, or equal to 'Translation not available' counts as an
    empty string in the "all" scores and is left out of the "filtered" scores.
    Blank rows and rows without a reference column are skipped.

    Args:
        input_csv_path: Path of the UTF-8 CSV to read.
        output_file_path: Path of the text file the two score lines are
            written to (overwritten).

    Returns:
        ((bleu_all, chrf_all), (bleu_filtered, chrf_filtered)). A pair is
        (0, 0) when there is nothing to score.
    """
    unavailable_marker = 'Translation not available'

    all_references = []
    all_hypotheses = []
    valid_references = []
    valid_hypotheses = []

    # newline='' is what the csv module expects for file objects it reads.
    with open(input_csv_path, 'r', newline='', encoding='utf-8') as read_file:
        for line in csv.reader(read_file):
            # csv.reader yields [] for blank lines; a row without a reference
            # column cannot be scored either.
            if len(line) < 2:
                continue

            reference_translation = line[1].strip()
            # A row that stops after the reference is treated like an
            # unavailable translation rather than raising IndexError.
            model_translation = line[2].strip() if len(line) > 2 else ''
            if model_translation == unavailable_marker:
                model_translation = ''

            # Add to lists for all translations (treating 'Translation not available' as empty string)
            all_references.append(reference_translation)
            all_hypotheses.append(model_translation)

            # Add to lists for non-empty and not 'Translation not available' translations
            if model_translation:
                valid_references.append(reference_translation)
                valid_hypotheses.append(model_translation)

    # Calculate BLEU and chrF scores for all translations.
    # Guarded like the filtered scores so an empty input yields 0 instead of scoring an empty corpus.
    if all_hypotheses:
        bleu_score_all = corpus_bleu(all_hypotheses, [all_references]).score
        chrf_score_all = corpus_chrf(all_hypotheses, [all_references]).score
    else:
        bleu_score_all = chrf_score_all = 0

    # Calculate BLEU and chrF scores for non-empty and not 'Translation not available' translations
    if valid_hypotheses:
        bleu_score_filtered = corpus_bleu(valid_hypotheses, [valid_references]).score
        chrf_score_filtered = corpus_chrf(valid_hypotheses, [valid_references]).score
    else:
        bleu_score_filtered = chrf_score_filtered = 0

    # The same two lines go to the report file and to the cell output.
    summary_all = f"All translations - BLEU score: {bleu_score_all}, chrF score: {chrf_score_all}"
    summary_filtered = f"Filtered (non-empty and valid translations) - BLEU score: {bleu_score_filtered}, chrF score: {chrf_score_filtered}"

    # Write scores to a single text file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(summary_all + "\n")
        output_file.write(summary_filtered + "\n")

    print(summary_all)
    print(summary_filtered)

    return (bleu_score_all, chrf_score_all), (bleu_score_filtered, chrf_score_filtered)



In [None]:
evaluate_translations('gemini_osmanaga_eval.csv', 'gemini_osmanaga_results.txt')

All translations - BLEU score: 7.014876103281617, chrF score: 32.07380828704673
Filtered (non-empty and valid translations) - BLEU score: 7.848347281490299, chrF score: 36.61478502594365


((7.014876103281617, 32.07380828704673),
 (7.848347281490299, 36.61478502594365))

In [None]:
evaluate_translations('gemini_osmanaga_no_safety_eval.csv', 'gemini_osmanaga_no_safety_results.txt')

All translations - BLEU score: 7.8421801129838675, chrF score: 36.40565274349136
Filtered (non-empty and valid translations) - BLEU score: 7.8421801129838675, chrF score: 36.59685268728821


((7.8421801129838675, 36.40565274349136),
 (7.8421801129838675, 36.59685268728821))