In [None]:
!perl /kaggle/input/cleaning/clean-corpus-n.perl -max-word-length 50 /kaggle/input/newdataset-en-to-ar/train ar en train.clean 0 150

In [None]:
!wc -l train.clean.en train.clean.ar

In [None]:
!git clone https://github.com/kpu/preprocess.git
%cd preprocess
!mkdir build
%cd build
!cmake ..
!make -j4

In [None]:
!preprocess/build/bin/simple_cleaning -p train.clean.ar train.clean.en newdata/train.clean.pp.ar newdata/train.clean.pp.en


In [None]:
!preprocess/build/bin/simple_cleaning -p test.clean.ar test.clean.en newdata/test.clean.pp.ar newdata/test.clean.pp.en

In [None]:
!preprocess/build/bin/dedupe -p newdata/test.clean.pp.ar newdata/test.clean.pp.en newdata/test.clean.pp.dedup.ar newdata/test.clean.pp.dedup.en

In [None]:
!pip install sacremoses

In [None]:
!sacremoses normalize < newdata/valid.clean.pp.dedup.ar > normalize/valid.norm.ar

In [None]:
!sacremoses normalize < newdata/valid.clean.pp.dedup.en > normalize/valid.norm.en

In [None]:
!pip install sentencepiece

In [None]:
!cat normalize/train.norm.ar normalize/train.norm.en > normalize/train.norm.ar-en

In [None]:
import sentencepiece as spm

# Path to the input text file
input_file = 'normalize/train.norm.ar-en'

# Model prefix for saving the trained model
model_prefix = 'ar-en.32kspm'

# Vocabulary size for the SentencePiece model
vocab_size = 32000

# Train the SentencePiece model
spm.SentencePieceTrainer.train(
    input=input_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size
)


In [None]:
import wandb
wandb.login()


In [None]:
import sys
import os
import math
from tqdm import tqdm

import torch
import torch.optim as optim
import torch.nn as nn

from torchtext.data import Field, BucketIterator,TabularDataset

sys.path.append('/kaggle/input/modelfile')

from model import Encoder, Decoder, Seq2Seq
import random
random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)

import sentencepiece as spm


sp = spm.SentencePieceProcessor()
sp.Load('ar-en.32kspm.model')

def tokenize_ar(x):
    x = str(x).lower()
    return sp.EncodeAsPieces(x)

def tokenize_en(x):
    x = str(x).lower()
    x = x.translate({ord(c): None for c in '!.?,'})
    return x.split()


SRC = Field(tokenize=tokenize_ar, init_token='<sos>', eos_token='<eos>')
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')

dataset = TabularDataset(path='dataset.csv', format='csv', fields=[('ar', SRC), ('en', TRG)], skip_header=True)
train_dt, valid_dt, test_dt = dataset.split(split_ratio=[0.7, 0.1, 0.2], random_state=random.getstate())

SRC.build_vocab(train_dt, min_freq=2)
TRG.build_vocab(train_dt, min_freq=2)

bsize = 32
gpu = True
device = torch.device('cuda' if gpu and torch.cuda.is_available() else 'cpu')
train_it, valid_it, test_it = BucketIterator.splits((train_dt, valid_dt, test_dt), batch_size=bsize, sort_key=lambda x: len(x.ar), sort_within_batch=False, device=device)

'''
for b in train_it:
    print (b.ar, b.en)
    sys.exit()
'''




In [None]:
!pip install Torchtext==0.6.0

In [None]:
num_examples = len(train_it.dataset)
batch_size = train_it.batch_size
num_iterations = num_examples // batch_size
print(num_iterations)

In [None]:
wandb.init(project='translationmodel_lstm2')
def train(model, train_it, optimizer, criterion, clip, accumulation_steps):
    model.train()
    epoch_loss = 0
    accumulation_steps_counter = 0

    for i, batch in tqdm(enumerate(train_it)):
        src = batch.ar
        trg = batch.en
        optimizer.zero_grad()
        output = model(src, trg)
        loss = criterion(output[1:].view(-1, output.shape[-1]), trg[1:].view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        accumulation_steps_counter += 1
        if accumulation_steps_counter == accumulation_steps:
            optimizer.step()
            accumulation_steps_counter = 0

        epoch_loss += loss.item()
        wandb.log({'iteration': i + 1, 'loss': loss.item()})

    if accumulation_steps_counter != 0:
        optimizer.step()  # Perform the remaining update

    return epoch_loss / len(train_it)

def evaluate(model, data_it, criterion):
    model.eval()
    epoch_loss = 0
    for i, batch in tqdm(enumerate(data_it)):
        src = batch.ar
        trg = batch.en
        output = model(src, trg, 0)
        loss = criterion(output[1:].view(-1, output.shape[-1]), trg[1:].view(-1))
        epoch_loss += loss.item()
    return epoch_loss/ len(data_it)

input_dim = len(SRC.vocab)
out_dim = len(TRG.vocab)
enc_emb_dim = 128
dec_emb_dim = 128
hidden_dim = 256
nlayers = 2
enc_dropout = 0.3
dec_dropout = 0.3
enc = Encoder(input_dim, enc_emb_dim, hidden_dim, nlayers, enc_dropout)
dec = Decoder(out_dim, dec_emb_dim, hidden_dim, nlayers, dec_dropout)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters())
pad_idx = TRG.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

epoch = 10
clip = 1
savedir = 'models'
model_save_path = os.path.join(savedir, 's2smodel.pt')
best_valid_loss = float('inf')

if not os.path.isdir(f'{savedir}'):
    os.makedirs(f'{savedir}')
for ep in range(epoch):
    train_loss = train(model, train_it, optimizer, criterion, clip, accumulation_steps=4)
    valid_loss = evaluate(model, valid_it, criterion)
    wandb.log({'epoch': ep+1, 'train_loss': train_loss, 'valid_loss': valid_loss})
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_save_path)
    
    print (f'epoch: {ep+1:03} | train loss: {train_loss: .3f} | train_ppl: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} |')
wandb.finish()

In [None]:
from torch.nn.parallel import DataParallel
wandb.init(project='translationmodel_lstm')
def train(model, train_it, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in tqdm(enumerate(train_it)):
        src = batch.ar
        trg = batch.en
        optimizer.zero_grad()
        output = model(src, trg)
        loss = criterion(output[1:].view(-1, output.shape[-1]), trg[1:].view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        wandb.log({'iteration': i+1, 'loss': loss.item()})
    return epoch_loss/ len(train_it)

def evaluate(model, data_it, criterion):
    model.eval()
    epoch_loss = 0
    for i, batch in tqdm(enumerate(data_it)):
        src = batch.ar
        trg = batch.en
        output = model(src, trg, 0)
        loss = criterion(output[1:].view(-1, output.shape[-1]), trg[1:].view(-1))
        epoch_loss += loss.item()
    return epoch_loss/ len(data_it)

input_dim = len(SRC.vocab)
out_dim = len(TRG.vocab)
enc_emb_dim = 128
dec_emb_dim = 128
hidden_dim = 256
nlayers = 2
enc_dropout = 0.3
dec_dropout = 0.3
enc = Encoder(input_dim, enc_emb_dim, hidden_dim, nlayers, enc_dropout)
dec = Decoder(out_dim, dec_emb_dim, hidden_dim, nlayers, dec_dropout)
model = Seq2Seq(enc, dec, device)
model = DataParallel(model)
model = model.to(device)


optimizer = optim.Adam(model.parameters())
pad_idx = TRG.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

epoch = 10
clip = 1
savedir = 'models'
model_save_path = os.path.join(savedir, 's2smodel.pt')
best_valid_loss = float('inf')

if not os.path.isdir(f'{savedir}'):
    os.makedirs(f'{savedir}')
for ep in range(epoch):
    train_loss = train(model, train_it, optimizer, criterion, clip)
    valid_loss = evaluate(model, valid_it, criterion)
    wandb.log({'epoch': ep+1, 'train_loss': train_loss, 'valid_loss': valid_loss})
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_save_path)
    
    print (f'epoch: {ep+1:03} | train loss: {train_loss: .3f} | train_ppl: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} |')
wandb.finish()

In [None]:
! git clone https://github.com/pytorch/fairseq
%cd fairseq
! pip install --editable ./
%cd ..

In [None]:
import subprocess

SRCS = ["ar"]  # Replace with your desired source languages
TGT = "en"  # Replace with your target language

SPM_ENCODE = "fairseq/scripts/spm_encode.py"  # Replace with the path to your sentencepiece_encode.py script
DATA = "normalize"  # Replace with the path to your data directory
TRAIN_MINLEN = 1  # Replace with your desired minimum length
TRAIN_MAXLEN = 1000  # Replace with your desired maximum length

# Encoding train and valid data
print("Encoding train/valid with learned BPE...")
for SRC in SRCS:
    for LANG in [SRC, TGT]:
        train_input = f"{DATA}/train.norm.{LANG}"
        train_output = f"{DATA}/train.bpe.{SRC}-{TGT}.{LANG}"
        valid_input = f"{DATA}/valid.norm.{LANG}"
        valid_output = f"{DATA}/valid.bpe.{SRC}-{TGT}.{LANG}"
        
        # Encoding train data
        subprocess.run([
            "python", SPM_ENCODE,
            "--model", f"ar-en.32kspm.model",
            "--output_format=piece",
            "--inputs", train_input,
            "--outputs", train_output,
            "--min-len", str(TRAIN_MINLEN),
            "--max-len", str(TRAIN_MAXLEN)
        ])
        
        # Encoding valid data
        subprocess.run([
            "python", SPM_ENCODE,
            "--model", f"ar-en.32kspm.model",
            "--output_format=piece",
            "--inputs", valid_input,
            "--outputs", valid_output
        ])


In [None]:
! fairseq-preprocess \
 --source-lang ar \
 --target-lang en \
 --trainpref normalize/train.bpe.ar-en\
 --validpref normalize/valid.bpe.ar-en \
 --joined-dictionary \
 --destdir ar-en-lstm \
 --workers 10


In [None]:
! fairseq-preprocess \
 --source-lang ar \
 --target-lang en \
 --trainpref normalize/train.bpe.ar-en\
 --validpref normalize/valid.bpe.ar-en \
 --destdir en-ar-lstm \
 --workers 10

In [None]:
! fairseq-train en-ar-lstm \
  --arch lstm \
  --encoder-layers 2 \
  --decoder-layers 2 \
  --dropout 0.3 \
  --optimizer adam \
  --lr 5e-4 \
  --criterion label_smoothed_cross_entropy \
  --encoder-bidirectional \
  --label-smoothing 0.1 \
  --save-dir checkpoints/lstm \
  --save-interval-updates 30000 \
  --max-update 100000 \
  --batch-size 100 \
  --update-freq 1 \
  --wandb-project "multilangual lstm en to ar"



In [None]:
! fairseq-train en-ar-lstm \
  --arch transformer \
  --dropout 0.1 \
  --max-tokens 2000 \
  --attention-dropout 0.1 \
  --activation-dropout 0.1 \
  --encoder-embed-dim 256 \
  --encoder-ffn-embed-dim 512 \
  --encoder-layers 3 \
  --encoder-attention-heads 8 \
  --encoder-learned-pos \
  --decoder-embed-dim 256 \
  --decoder-ffn-embed-dim 512 \
  --decoder-layers 3 \
  --decoder-attention-heads 8 \
  --decoder-learned-pos \
  --max-epoch 10 \
  --optimizer adam \
  --adam-betas "[0.9, 0.98]" \
  --lr 5e-4 \
  --batch-size 128 \
  --seed 1 \
  --save-interval 2 \
  --memory-efficient-fp16 \
  --update-freq 1 \
  --save-dir checkpoints \
  --wandb-project "Translation-senp_transformer"

In [None]:
import torch
import sentencepiece as spm
from fairseq.models.lstm import LSTMModel

# Load the SentencePiece model
sp_model_path = "ar-en.32kspm.model"
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# Load the Fairseq trained model
model_path = "checkpoints/lstm"
model = LSTMModel.from_pretrained(
    model_path,
    checkpoint_file="checkpoint_best.pt",
    data_name_or_path="en-ar-lstm",
    source_lang="ar",  # Specify the source language code
    target_lang="en",  # Specify the target language code
)

# Set the model to evaluation mode
model.eval()

# Translate interactively
while True:
    # Take user input
    input_sentence = input("Enter a sentence in English (or 'q' to quit): ")

    if input_sentence.lower() == "q":
        break

    # Tokenize the input sentence using SentencePiece
    tokens = sp.encode_as_ids(input_sentence)

    # Convert the tokens to PyTorch tensor
    input_tensor = torch.LongTensor(tokens).unsqueeze(0)  # Add batch dimension

    # Generate translation using the model
    with torch.no_grad():
        translation = model.generate(input_tensor, beam=5)

    # Get the translated sentence without special tokens
    translation_sentence = sp.decode_ids(translation[0][0]["tokens"].tolist())

    print("Translated Sentence:", translation_sentence)


In [None]:
import torch
import sentencepiece as spm
from fairseq.models.lstm import LSTMModel

# Load the SentencePiece model
sp_model_path = "ar-en.32kspm.model"
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# Load the Fairseq trained model
model_path = "checkpoints/lstm"
model = LSTMModel.from_pretrained(
    model_path,
    checkpoint_file="checkpoint_best.pt",
    data_name_or_path="en-ar-lstm",
    source_lang="ar",  # Specify the source language code
    target_lang="en",  # Specify the target language code
)

# Set the model to evaluation mode
model.eval()

# Translate interactively
while True:
    # Take user input
    input_sentence = input("Enter a sentence in English (or 'q' to quit): ")

    if input_sentence.lower() == "q":
        break

    # Encode the input sentence using SentencePiece
    encoded_sentence = sp.encode_as_pieces(input_sentence)

    # Remove BPE encoding
    encoded_sentence = [piece.replace("@@ ", "") for piece in encoded_sentence]

    # Convert the tokens to PyTorch tensor
    input_tensor = torch.LongTensor([sp.piece_to_id(piece) for piece in encoded_sentence]).unsqueeze(0)  # Add batch dimension

    # Generate translation using the model
    with torch.no_grad():
        translation = model.generate(input_tensor, beam=1)

    # Get the translated sentence without special tokens
    translation_ids = translation[0][0]["tokens"].tolist()
    translation_sentence = sp.decode_ids(translation_ids)

    print("Translated Sentence:", translation_sentence)
    print(input_tensor)
    print(translation)



In [2]:
import torch
from fairseq.models.transformer import TransformerModel
from fairseq.data.encoders import register_bpe

class SentencePieceBPE(object):
    def __init__(self, args):
        import sentencepiece as spm
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(args.spm_model_path)

    def encode(self, x: str) -> str:
        return self.sp.EncodeAsPieces(x)

    def decode(self, x: str) -> str:
        return self.sp.DecodePieces(x)


# Register SentencePieceBPE as BPE for Fairseq
register_bpe("sentencepiece", SentencePieceBPE)

# Load the Fairseq trained model
model_path = "transformer"
model = TransformerModel.from_pretrained(
    model_path,
    checkpoint_file="snp-transf.pt",
    data_name_or_path="en-ar-lstm",
    source_lang="ar",  # Specify the source language code
    target_lang="en",  # Specify the target language code
    bpe="sentencepiece",  # Use SentencePiece BPE
    sentencepiece_vocab="ar-en.32kspm.vocab",  # Path to SentencePiece vocabulary
    spm_model_path="ar-en.32kspm.model",  # Path to SentencePiece model
    sentencepiece_model="ar-en.32kspm.model",  # Path to SentencePiece model
)


# Set the model to evaluation mode
model.eval()

# Translate interactively
while True:
    # Take user input
    input_sentence = input("Enter a sentence in English (or 'q' to quit): ")

    if input_sentence.lower() == "q":
        break

    # Tokenize the input sentence
    tokens = model.encode(input_sentence)

    # Convert the tokens to PyTorch tensor
    input_tensor = torch.LongTensor(tokens).unsqueeze(0)  # Add batch dimension

    # Generate translation using the model
    with torch.no_grad():
        translation = model.generate(input_tensor, beam=5)

    # Get the translated sentence without special tokens
    translation_sentence = model.decode(translation[0][0]["tokens"])

    print("Translated Sentence:", translation_sentence)

    

2023-06-25 01:46:38 | INFO | fairseq.file_utils | loading archive file transformer
2023-06-25 01:46:38 | INFO | fairseq.file_utils | loading archive file en-ar-lstm
2023-06-25 01:46:39 | INFO | fairseq.tasks.translation | [ar] dictionary: 32488 types
2023-06-25 01:46:39 | INFO | fairseq.tasks.translation | [en] dictionary: 19704 types
2023-06-25 01:46:39 | INFO | fairseq.models.fairseq_model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'Translation-senp_transformer', 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None,

Enter a sentence in English (or 'q' to quit):  sameh


Translated Sentence: Sameh Sameh


Enter a sentence in English (or 'q' to quit):  سامح


Translated Sentence: Samhah


Enter a sentence in English (or 'q' to quit):  نرجس


Translated Sentence: Virgins Nession


Enter a sentence in English (or 'q' to quit):  نرجس


Translated Sentence: Virgins Nession


Enter a sentence in English (or 'q' to quit):  مروان


Translated Sentence: Marwan


Enter a sentence in English (or 'q' to quit):  سماح


Translated Sentence: Samar Smam


Enter a sentence in English (or 'q' to quit):  لا


Translated Sentence: No No No


Enter a sentence in English (or 'q' to quit):  لا لا


Translated Sentence: No No No No No


Enter a sentence in English (or 'q' to quit):  لا ياض


Translated Sentence: Do not persuade


Enter a sentence in English (or 'q' to quit):  لا ياصديقي


Translated Sentence: No friend or friendy


Enter a sentence in English (or 'q' to quit):  لا noel


Translated Sentence: No noel


Enter a sentence in English (or 'q' to quit):  ذهبت مع jonathan إلى السوق.


Translated Sentence: I went with jonathan to the market.


KeyboardInterrupt: Interrupted by user

In [None]:
!tar -czvf archive.tar.gz en-ar-lstm

In [None]:
!pip install fairseq 

In [None]:
!rm -r checkpoints