## Importing packages

In [2]:
#!pip install replicate

In [2]:
import replicate
import os

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random
import time
import datetime
import pandas as pd
import numpy as np
import tqdm
from tqdm.notebook import tqdm
tqdm.pandas(leave=True)

## Partition test data into three different subsets for three team members to run seperately
This part of the notebook is ran only once!!!

In [11]:
## This valid _cleaned.csv is created in part1 lstm. Run "Load data" and "Clean data" part in the submit_phase1_LSTM notebook to get this file
if os.path.isfile('./data/valid_cleaned_data.csv'):
    # cleaned, filtered by length dataset
    df = pd.read_csv('./data/valid_cleaned_data.csv', index_col=False)

df

Unnamed: 0,en,fr
0,changing lives changing society how it works t...,il a transforme notre vie il a transforme la s...
1,site map,plan du site
2,feedback,retroaction
3,credits,credits
4,francais,english
...,...,...
14699301,stock assessment of the european lobster homar...,l evaluation des stocks de homards d europe ho...
14699302,which assumes that recruitment to the fishery ...,selon laquelle le recrutement pour la peche es...
14699303,yield curves show a clear maximum with a marke...,les courbes de la production montrent un net m...
14699304,error file not found sorry but the file lvtsmp...,erreur fichier introuvable nous sommes desoles...


In [15]:
# Get the last 1% as test data
df_test = df.iloc[-int(0.01*len(df)):]
df_test

Unnamed: 0,en,fr
14552313,she had an electronic signature account in the...,elle possedait un compte de signature electron...
14552314,she exercised acting unit manager line authori...,elle exercait sur les employes en poste l auto...
14552315,for an appeal board to have jurisdiction to he...,pour qu un comite d appel ait competence pour ...
14552316,years ago the public service commission delega...,il y a des annees la commission de la fonction...
14552317,the appellant has confirmed that ms beaudoin i...,l appelant a confirme que madame beaudoin n ex...
...,...,...
14699301,stock assessment of the european lobster homar...,l evaluation des stocks de homards d europe ho...
14699302,which assumes that recruitment to the fishery ...,selon laquelle le recrutement pour la peche es...
14699303,yield curves show a clear maximum with a marke...,les courbes de la production montrent un net m...
14699304,error file not found sorry but the file lvtsmp...,erreur fichier introuvable nous sommes desoles...


In [16]:
third = int(len(df_test) / 3)
df_mxl = df_test[:third]
df_ytx = df_test[third:2*third]
df_byk = df_test[2*third:]

display(df_mxl)
display(df_ytx)
display(df_byk)

Unnamed: 0,en,fr
14552313,she had an electronic signature account in the...,elle possedait un compte de signature electron...
14552314,she exercised acting unit manager line authori...,elle exercait sur les employes en poste l auto...
14552315,for an appeal board to have jurisdiction to he...,pour qu un comite d appel ait competence pour ...
14552316,years ago the public service commission delega...,il y a des annees la commission de la fonction...
14552317,the appellant has confirmed that ms beaudoin i...,l appelant a confirme que madame beaudoin n ex...
...,...,...
14601305,providing provincial sales tax numbers to card...,fournir les numeros de la taxe de vente provin...
14601306,the accounting operations section is responsib...,la section des operations comptables est charg...
14601307,the agency s internal audit division shall be ...,la direction de la verification interne sera c...
14601308,all questions regarding clarifications interpr...,toutes les questions concernant l interpretati...


Unnamed: 0,en,fr
14601310,procedures in order to properly use the acquis...,pour utiliser correctement la carte d achat le...
14601311,the cardholder must keep all related documents...,le detenteur de la carte doit conserver tous l...
14601312,advising the vendor when placing an order that...,avertir le vendeur lorsqu il passe une command...
14601313,in order to verify the monthly acquisition car...,pour verifier le releve mensuel de la carte d ...
14601314,recording and assigning a control number for e...,le detenteur de la carte doit enregistrer chaq...
...,...,...
14650302,validity of licence upon deletion of registrat...,validite de la licence apres radiation de l en...
14650303,definition of layout design of integrated circuit,objet de la protection juridique des schemas d...
14650304,for the purposes of this act reproduction mean...,une partie d un schema de configuration de cir...
14650305,where the layout design has been made through ...,le droit patrimonial du createur du schema de ...


Unnamed: 0,en,fr
14650307,authorship is inalienable and indefinite in time,droit de demander l enregistrement d un schema...
14650308,the owner of layout design shall be the person...,le titulaire du schema de configuration est la...
14650309,the owner enjoys full legal power exclusive ri...,le titulaire jouit de la totalite des droits d...
14650310,the rights of the owner of layout design may b...,les droits du titulaire du schema de configura...
14650311,application for registration of layout design ...,chapitre demande d enregistrement d un schema ...
...,...,...
14699301,stock assessment of the european lobster homar...,l evaluation des stocks de homards d europe ho...
14699302,which assumes that recruitment to the fishery ...,selon laquelle le recrutement pour la peche es...
14699303,yield curves show a clear maximum with a marke...,les courbes de la production montrent un net m...
14699304,error file not found sorry but the file lvtsmp...,erreur fichier introuvable nous sommes desoles...


In [None]:
df_mxl.to_csv('./data/mxl_llama_test.csv', index=False)
df_ytx.to_csv('./data/ytx_llama_test.csv', index=False)
df_byk.to_csv('./data/byk_llama_test.csv', index=False)

## Setting up to replicate api to test llama 2

In [2]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

In [None]:
# Go to https://replicate.com/meta/llama-2-70b-chat/api?tab=nodejs and get an api token

# To run this cell, either create a secret.py file and paste your api token in there and name the variable REPLICATE_API_TOKEN
# Or just put in your api token in  the second to last line and uncomment

from secret import *

#REPLICATE_API_TOKEN="YOUR_REPLICATE_API_TOKEN"
REPLICATE_API_TOKEN


In [4]:
# change this to your own api_token from replicate website!!!

replicate = replicate.Client(api_token=REPLICATE_API_TOKEN)

## Test multiple prompts

In [5]:
# change to your local address!!!

df = pd.read_csv('./data/mxl_llama_test.csv')
# df = df.iloc[:10]

In [6]:
df

Unnamed: 0,en,fr
0,she had an electronic signature account in the...,elle possedait un compte de signature electron...
1,she exercised acting unit manager line authori...,elle exercait sur les employes en poste l auto...
2,for an appeal board to have jurisdiction to he...,pour qu un comite d appel ait competence pour ...
3,years ago the public service commission delega...,il y a des annees la commission de la fonction...
4,the appellant has confirmed that ms beaudoin i...,l appelant a confirme que madame beaudoin n ex...
...,...,...
48992,providing provincial sales tax numbers to card...,fournir les numeros de la taxe de vente provin...
48993,the accounting operations section is responsib...,la section des operations comptables est charg...
48994,the agency s internal audit division shall be ...,la direction de la verification interne sera c...
48995,all questions regarding clarifications interpr...,toutes les questions concernant l interpretati...


In [7]:
def translate_row(s):
    output = replicate.run(
    "meta/llama-2-7b-chat:13c3cdee13ee059ab779f0291d29054dab00a47dad8261375654de5540165fb0",
    input={
        "debug": False,
        "top_k": -1,
        "top_p": 1,
        "prompt": s,
        "temperature": 0.2,
        "system_prompt": "Format the response like this: (English sentence):(French sentence)",
        "max_new_tokens": 300,
        "min_new_tokens": -1,
        "repetition_penalty": 1
        }
    )
    li = list(output)
    raw_str = ''.join(li)
    trans = raw_str.split('French sentence)')[-1]
    norm = normalizeString(trans)
    return norm

In [8]:
# Since running the api on the entire dataset is risky, further partition each member's dataset into three 
third = int(len(df) / 3)
df_1 = df[:third]
df_2 = df[third:2*third]
df_3 = df[2*third:]

In [10]:
start = time.time()
df_1['trans'] = df_1['en'].progress_apply(lambda s:translate_row(s))

end = time.time()
print('translating ', len(df_1) ,' lines take ', str(datetime.timedelta(seconds = end-start)))
df_1.to_csv('./data/trans_1.csv', index=False)

  0%|          | 0/16332 [00:00<?, ?it/s]

translating  16332  lines take  6:29:49.849928


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['trans'] = df_1['en'].progress_apply(lambda s:translate_row(s))


AttributeError: module 'pandas' has no attribute 'to_csv'

In [13]:
start = time.time()
df_2['trans'] = df_2['en'].progress_apply(lambda s:translate_row(s))

end = time.time()
print('translating ', len(df_2) ,' lines take ', str(datetime.timedelta(seconds = end-start)))
df_2.to_csv('./data/trans_2.csv', index=False)

  0%|          | 0/16332 [00:00<?, ?it/s]

translating  16332  lines take  6:24:33.879207


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['trans'] = df_2['en'].progress_apply(lambda s:translate_row(s))


In [18]:
start = time.time()
df_3['trans'] = df_3['en'].progress_apply(lambda s:translate_row(s))

end = time.time()
print('translating ', len(df_3) ,' lines take ', str(datetime.timedelta(seconds = end-start)))
df_3.to_csv('./data/trans_3.csv', index=False)

  0%|          | 0/16333 [00:00<?, ?it/s]

translating  16333  lines take  6:33:25.207581


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3['trans'] = df_3['en'].progress_apply(lambda s:translate_row(s))


In [19]:
df = pd.concat([df_1, df_2, df_3])

In [20]:
df.head()

Unnamed: 0,en,fr,trans
0,she had an electronic signature account in the...,elle possedait un compte de signature electron...,elle avait un compte de signature electronique...
1,she exercised acting unit manager line authori...,elle exercait sur les employes en poste l auto...,elle exercait des competences d agent de gesti...
2,for an appeal board to have jurisdiction to he...,pour qu un comite d appel ait competence pour ...,pour que le tribunal d appel ait competence po...
3,years ago the public service commission delega...,il y a des annees la commission de la fonction...,il y a des annees la commission des services p...
4,the appellant has confirmed that ms beaudoin i...,l appelant a confirme que madame beaudoin n ex...,l appellant a confirme que mme beaudoin n est ...


## manual check

In [21]:
df['en'][7]

'the provisions relevant to the proceedings are the following'

In [22]:
df['fr'][7]

'les dispositions pertinentes au debat sont les suivantes'

In [23]:
df['trans'][7]

'les dispositions applicables aux procedures sont les suivantes'

## Evaluate with nltk bleu score for one member

In [24]:
from nltk.translate.bleu_score import sentence_bleu

In [26]:
def bleu(truth,pred):
    return sentence_bleu([truth], pred) #, weights=(1, 0, 0, 0)
# Applying it to two columns
df["bleu"] = df.progress_apply(lambda x: bleu(x["fr"], x["trans"]), axis=1)

  0%|          | 0/48997 [00:00<?, ?it/s]

In [27]:
df

Unnamed: 0,en,fr,trans,bleu
0,she had an electronic signature account in the...,elle possedait un compte de signature electron...,elle avait un compte de signature electronique...,0.772857
1,she exercised acting unit manager line authori...,elle exercait sur les employes en poste l auto...,elle exercait des competences d agent de gesti...,0.529824
2,for an appeal board to have jurisdiction to he...,pour qu un comite d appel ait competence pour ...,pour que le tribunal d appel ait competence po...,0.785111
3,years ago the public service commission delega...,il y a des annees la commission de la fonction...,il y a des annees la commission des services p...,0.672235
4,the appellant has confirmed that ms beaudoin i...,l appelant a confirme que madame beaudoin n ex...,l appellant a confirme que mme beaudoin n est ...,0.623357
...,...,...,...,...
48992,providing provincial sales tax numbers to card...,fournir les numeros de la taxe de vente provin...,fournir les numeros de taxe de vente provincia...,0.806354
48993,the accounting operations section is responsib...,la section des operations comptables est charg...,l section des operations comptables est respon...,0.632861
48994,the agency s internal audit division shall be ...,la direction de la verification interne sera c...,la division d audit internes de l agence sera ...,0.733510
48995,all questions regarding clarifications interpr...,toutes les questions concernant l interpretati...,english sentence all questions regarding clari...,0.406074


In [28]:
# score = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))
# assign all weight to 1 gram

for i in range(7):
      print(i, ' ', sentence_bleu([df['fr'][i]], df['trans'][i]))

0   0.7728574965778328
1   0.5298243507232901
2   0.7851109264437648
3   0.6722352454155893
4   0.6233572635195772
5   0.600292143650524
6   0.6994755655129972


In [31]:
# save it to your own path!

df.to_csv('./data/mxl_pretrained_llama_results.csv', index=False)

In [30]:
df['bleu'].mean()

0.5034942328940446

## Concatenate results from the three parts
This part of the cell is ran only once!!! This is concatenating all three members phase2 test results into one.

In [3]:
df_mxl = pd.read_csv('./data/mxl_pretrained_llama_results.csv', index_col=False)
df_mxl

Unnamed: 0,en,fr,trans,bleu
0,she had an electronic signature account in the...,elle possedait un compte de signature electron...,elle avait un compte de signature electronique...,0.772857
1,she exercised acting unit manager line authori...,elle exercait sur les employes en poste l auto...,elle exercait des competences d agent de gesti...,0.529824
2,for an appeal board to have jurisdiction to he...,pour qu un comite d appel ait competence pour ...,pour que le tribunal d appel ait competence po...,0.785111
3,years ago the public service commission delega...,il y a des annees la commission de la fonction...,il y a des annees la commission des services p...,0.672235
4,the appellant has confirmed that ms beaudoin i...,l appelant a confirme que madame beaudoin n ex...,l appellant a confirme que mme beaudoin n est ...,0.623357
...,...,...,...,...
48992,providing provincial sales tax numbers to card...,fournir les numeros de la taxe de vente provin...,fournir les numeros de taxe de vente provincia...,0.806354
48993,the accounting operations section is responsib...,la section des operations comptables est charg...,l section des operations comptables est respon...,0.632861
48994,the agency s internal audit division shall be ...,la direction de la verification interne sera c...,la division d audit internes de l agence sera ...,0.733510
48995,all questions regarding clarifications interpr...,toutes les questions concernant l interpretati...,english sentence all questions regarding clari...,0.406074


In [4]:
df_ytx = pd.read_csv('./data/ytx_llama2_translated.csv', index_col=False)
df_ytx

Unnamed: 0,en,fr,trans,bleu
0,procedures in order to properly use the acquis...,pour utiliser correctement la carte d achat le...,pour utiliser correctement la carte d acquisit...,0.617802
1,the cardholder must keep all related documents...,le detenteur de la carte doit conserver tous l...,le titulaire de la carte doit conserver tous l...,0.596520
2,advising the vendor when placing an order that...,avertir le vendeur lorsqu il passe une command...,le fournisseur doit etre informe de ne charger...,0.325965
3,in order to verify the monthly acquisition car...,pour verifier le releve mensuel de la carte d ...,pour verifier le statement mensuel de la carte...,0.505737
4,recording and assigning a control number for e...,le detenteur de la carte doit enregistrer chaq...,enregistrez et attribuez un numero de controle...,0.554317
...,...,...,...,...
48992,validity of licence upon deletion of registrat...,validite de la licence apres radiation de l en...,la validite de la licence a l effet de suppres...,0.521245
48993,definition of layout design of integrated circuit,objet de la protection juridique des schemas d...,definition de conception de layout d integrate...,0.060112
48994,for the purposes of this act reproduction mean...,une partie d un schema de configuration de cir...,pour les fins de cette loi la reproduction sig...,0.343523
48995,where the layout design has been made through ...,le droit patrimonial du createur du schema de ...,lorsque le design de la layout a ete elabore a...,0.267986


In [6]:
df_btk = pd.read_csv('./data/btk_llama2_translated.csv', index_col=False)
df_btk = df_btk[['en', 'fr', 'trans', 'bleu']]
df_btk

Unnamed: 0,en,fr,trans,bleu
0,authorship is inalienable and indefinite in time,droit de demander l enregistrement d un schema...,l auteur est inalienable et indefiniment temps,0.131400
1,the owner of layout design shall be the person...,le titulaire du schema de configuration est la...,le proprietaire de la conception de layout ser...,0.382321
2,the owner enjoys full legal power exclusive ri...,le titulaire jouit de la totalite des droits d...,le proprietaire a plein pouvoir legal et des d...,0.533971
3,the rights of the owner of layout design may b...,les droits du titulaire du schema de configura...,les droits de l exploitant d un design de layo...,0.611139
4,application for registration of layout design ...,chapitre demande d enregistrement d un schema ...,demande de inscription d un design de layout t...,0.208958
...,...,...,...,...
48994,stock assessment of the european lobster homar...,l evaluation des stocks de homards d europe ho...,l evaluation de la stock de homarus gammarus e...,0.726244
48995,which assumes that recruitment to the fishery ...,selon laquelle le recrutement pour la peche es...,l assumption selon laquelle la recrue dans la ...,0.321226
48996,yield curves show a clear maximum with a marke...,les courbes de la production montrent un net m...,les courbes de rendement montrent une maximum ...,0.751903
48997,error file not found sorry but the file lvtsmp...,erreur fichier introuvable nous sommes desoles...,erreur le fichier lvtsmp html n a pas ete trouve,0.319866


In [7]:
df_phase2 = pd.concat([df_mxl, df_ytx, df_btk])
df_phase2.reset_index(drop=True, inplace=True)
df_phase2



Unnamed: 0,en,fr,trans,bleu
0,she had an electronic signature account in the...,elle possedait un compte de signature electron...,elle avait un compte de signature electronique...,0.772857
1,she exercised acting unit manager line authori...,elle exercait sur les employes en poste l auto...,elle exercait des competences d agent de gesti...,0.529824
2,for an appeal board to have jurisdiction to he...,pour qu un comite d appel ait competence pour ...,pour que le tribunal d appel ait competence po...,0.785111
3,years ago the public service commission delega...,il y a des annees la commission de la fonction...,il y a des annees la commission des services p...,0.672235
4,the appellant has confirmed that ms beaudoin i...,l appelant a confirme que madame beaudoin n ex...,l appellant a confirme que mme beaudoin n est ...,0.623357
...,...,...,...,...
146988,stock assessment of the european lobster homar...,l evaluation des stocks de homards d europe ho...,l evaluation de la stock de homarus gammarus e...,0.726244
146989,which assumes that recruitment to the fishery ...,selon laquelle le recrutement pour la peche es...,l assumption selon laquelle la recrue dans la ...,0.321226
146990,yield curves show a clear maximum with a marke...,les courbes de la production montrent un net m...,les courbes de rendement montrent une maximum ...,0.751903
146991,error file not found sorry but the file lvtsmp...,erreur fichier introuvable nous sommes desoles...,erreur le fichier lvtsmp html n a pas ete trouve,0.319866


In [8]:
df_phase2['bleu'].mean()

0.49353953601963874

In [9]:
df_phase2.to_csv('./data/phase2_results.csv', index=False)