## Download the GitHub repository

In [None]:
#Clone the Ditto fork used by this notebook into the current working directory.
#Later cells change into /kaggle/working/ditto, so this is expected to be run
#from /kaggle/working.
!git clone https://github.com/nhvd3500111/ditto

In [None]:
#Optional: use this when the GitHub repo has changed and the local copy must be
#refreshed. It deletes the existing checkout and clones the repo again.
#The commands are wrapped in a string literal so they do not run as-is; remove
#the surrounding triple quotes to execute them.
'''
import shutil
shutil.rmtree("/kaggle/working/ditto")
%cd /kaggle/working/
!git clone https://github.com/nhvd3500111/ditto
%cd /kaggle/working/ditto
'''


## Install packages and fp16 optimization

In [None]:
# Install the Python requirements listed by the cloned ditto repo
%cd /kaggle/working/ditto
!pip install -r requirements.txt

# Imports used by the rest of the notebook (they must come after the install
# above, since nltk is expected to be provided by it)
import nltk
import os
import time
from IPython.display import FileLink


# Download the NLTK stopword list
nltk.download('stopwords')
# Clone NVIDIA apex next to the ditto sources and install it from the checkout
# (presumably the "fp16 optimization" named in this section's title)
!git clone https://github.com/NVIDIA/apex
%cd apex
!pip install -v --no-cache-dir ./
%cd ..
# Pin urllib3 to 1.25.4 and upgrade awscli; per the original note this works
# around an issue that was first seen on Colab
!pip install --upgrade "urllib3==1.25.4" awscli
# Return to the ditto checkout and install the remaining packages one by one
%cd /kaggle/working/ditto
!pip install transformers
!pip install tensorboardX
!pip install jsonlines
!pip install openpyxl


## Loading the 2 pre-trained models that our Ditto models exploit

In [None]:
#Load both pre-trained language models once, before any training cell runs.
#Per the authors' note, this is done up front so that the time measured around
#each training session reflects model training only.

#AutoModel is imported here because transformers is installed by the cell above
from transformers import AutoModel

#Loading inside a loop (instead of as bare expressions) keeps the very long
#model repr out of the cell output and prevents IPython from holding the last
#model alive in its output history (Out / _) for the rest of the session.
for pretrained_name in ('distilbert-base-uncased', 'roberta-base'):
    AutoModel.from_pretrained(pretrained_name)


## Preprocess - define the specific details of each run

In [None]:
#Experiment grid shared by the training cells below

#WDC product categories; the training cells select one by index
datasets = [
    "cameras",
    "shoes",
    "watches",
    "computers",
]

#Available training-set sizes
sizes = [
    "small",
    "medium",
    "large",
]

#Values passed to the scripts' --neural option
neurals = [
    "gru",
    "linear",
    "cls_sep",
    "lstm",
    "cls_sep_gru",
]

#Value exported as CUDA_VISIBLE_DEVICES for every training / matching command
gpu_id = 0

#Every customized Ditto model is run twice (run ids 1 and 2). Per the original
#note, the run id also fixes each run's cuda.manual_seed for reproducibility.
run_ids = range(1, 3)


## Train DITTO - Run Matcher


In [None]:
#wdc cameras small

d=datasets[0]
size=sizes[0]

for dk in [False,True]:
    for run_id in run_ids:
        for neural in neurals:
            if dk:
                domain='product'
            else:
                domain = None
            dataset = '_'.join(['wdc', d, size])
            
            print("-----------------------------------------------------------------------------------------")
            print ("\n\nTask: "+dataset+"\nNeural: "+neural+"\nrun_id: "+str(run_id)+"\nDomain Knowledge: "+str(domain)+"\n\n")
            
            time_start=time.time()
            
            !CUDA_VISIBLE_DEVICES=$gpu_id python train_ditto.py \
              --task $dataset \
              --batch_size 32 \
              --max_len 128 \
              --n_epochs 20 \
              --finetuning \
              --save_model \
              --run_id $run_id \
              --da entry_swap \
              --dk $domain \
              --neural $neural  
            
            training_time=round(time.time()- time_start,2)

            #Running the matcher to obtain the results. Remember to provide the same args as above

            !CUDA_VISIBLE_DEVICES=$gpu_id python matcher.py \
              --task $dataset \
              --input_path data/wdc/$d/test.txt \
              --output_path output/output_small_1.jsonl \
              --max_len 128 \
              --use_gpu \
              --da entry_swap \
              --dk $domain \
              --checkpoint_path checkpoints/ \
              --neural $neural \
              --file_excel F1_SCORES.xlsx \
              --run_id $run_id \
              --time_trained $training_time
            
#After every completed for loop, we will download F1_SCORES.xlsx for security reasons
FileLink(r'F1_SCORES.xlsx')

In [None]:
#wdc shoes small

d=datasets[1]
size=sizes[0]

for dk in [False,True]:
    for run_id in run_ids:
        for neural in neurals:
            if dk:
                domain='product'
            else:
                domain = None
            dataset = '_'.join(['wdc', d, size])

            print("-----------------------------------------------------------------------------------------")
            print ("\n\nTask: "+dataset+"\nNeural: "+neural+"\nrun_id: "+str(run_id)+"\nDomain Knowledge: "+str(domain)+"\n\n")
            
            time_start=time.time()
            
            !CUDA_VISIBLE_DEVICES=$gpu_id python train_ditto.py \
              --task $dataset \
              --batch_size 32 \
              --max_len 128 \
              --n_epochs 20 \
              --finetuning \
              --save_model \
              --run_id $run_id \
              --da entry_swap \
              --dk $domain \
              --neural $neural  

            training_time=round(time.time()- time_start,2)
            
            #Running the matcher to obtain the results. Remember to provide the same args as above
            
            !CUDA_VISIBLE_DEVICES=$gpu_id python matcher.py \
              --task $dataset \
              --input_path data/wdc/$d/test.txt \
              --output_path output/output_small_1.jsonl \
              --max_len 128 \
              --use_gpu \
              --da entry_swap \
              --dk $domain \
              --checkpoint_path checkpoints/ \
              --neural $neural \
              --file_excel F1_SCORES.xlsx \
              --run_id $run_id \
              --time_trained $training_time
            
            
#After every completed for loop, we will download F1_SCORES.xlsx for security reasons
FileLink(r'F1_SCORES.xlsx')

In [None]:
#wdc watches small

d=datasets[2]
size=sizes[0]

for dk in [False,True]:
    for run_id in run_ids:
        for neural in neurals:
            if dk:
                domain='product'
            else:
                domain = None
            dataset = '_'.join(['wdc', d, size])

            print("-----------------------------------------------------------------------------------------")
            print ("\n\nTask: "+dataset+"\nNeural: "+neural+"\nrun_id: "+str(run_id)+"\nDomain Knowledge: "+str(domain)+"\n\n")
            
            time_start=time.time()
            
            !CUDA_VISIBLE_DEVICES=$gpu_id python train_ditto.py \
              --task $dataset \
              --batch_size 32 \
              --max_len 128 \
              --n_epochs 20 \
              --finetuning \
              --save_model \
              --run_id $run_id \
              --da entry_swap \
              --dk $domain \
              --neural $neural  

            training_time=round(time.time()- time_start,2)
            
            #Running the matcher to obtain the results. Remember to provide the same args as above
            
            !CUDA_VISIBLE_DEVICES=$gpu_id python matcher.py \
              --task $dataset \
              --input_path data/wdc/$d/test.txt \
              --output_path output/output_small_1.jsonl \
              --max_len 128 \
              --use_gpu \
              --da entry_swap \
              --dk $domain \
              --checkpoint_path checkpoints/ \
              --neural $neural \
              --file_excel F1_SCORES.xlsx \
              --run_id $run_id \
              --time_trained $training_time
            
            
#After every completed for loop, we will download F1_SCORES.xlsx for security reasons
FileLink(r'F1_SCORES.xlsx')

In [None]:
#wdc computers small

d=datasets[3]
size=sizes[0]

for dk in [False,True]:
    for run_id in run_ids:
        for neural in neurals:
            if dk:
                domain='product'
            else:
                domain = None
            dataset = '_'.join(['wdc', d, size])

            print("-----------------------------------------------------------------------------------------")
            print ("\n\nTask: "+dataset+"\nNeural: "+neural+"\nrun_id: "+str(run_id)+"\nDomain Knowledge: "+str(domain)+"\n\n")
            
            time_start=time.time()
            
            !CUDA_VISIBLE_DEVICES=$gpu_id python train_ditto.py \
              --task $dataset \
              --batch_size 32 \
              --max_len 128 \
              --n_epochs 20 \
              --finetuning \
              --save_model \
              --run_id $run_id \
              --da entry_swap \
              --dk $domain \
              --neural $neural  

            training_time=round(time.time()- time_start,2)
            
            #Running the matcher to obtain the results. Remember to provide the same args as above
            
            !CUDA_VISIBLE_DEVICES=$gpu_id python matcher.py \
              --task $dataset \
              --input_path data/wdc/$d/test.txt \
              --output_path output/output_small_1.jsonl \
              --max_len 128 \
              --use_gpu \
              --da entry_swap \
              --dk $domain \
              --checkpoint_path checkpoints/ \
              --neural $neural \
              --file_excel F1_SCORES.xlsx \
              --run_id $run_id \
              --time_trained $training_time
            
            
#After every completed for loop, we will download F1_SCORES.xlsx for security reasons
FileLink(r'F1_SCORES.xlsx')

In [None]:
#wdc all medium 

# We will compare  the original model's performance to our most
# prominent solution: gru, since wdc_all_medium is a relatively big dataset

d='all'
size=sizes[1]

for run_id in range(2,4): 
    for neural in ["gru","linear"]:
        dataset = '_'.join(['wdc', d, size])

        print("-----------------------------------------------------------------------------------------")
        print ("\n\nTask: "+dataset+"\nNeural: "+neural+"\nrun_id: "+str(run_id)+"\nDomain Knowledge: None\n\n")

        time_start=time.time()

        !CUDA_VISIBLE_DEVICES=$gpu_id python train_ditto.py \
          --task $dataset \
          --batch_size 32 \
          --max_len 128 \
          --n_epochs 20 \
          --finetuning \
          --save_model \
          --run_id $run_id \
          --da del \
          --neural $neural  

        training_time=round(time.time()- time_start,2)

        #Running the matcher to obtain the results. Remember to provide the same args as above

        !CUDA_VISIBLE_DEVICES=$gpu_id python matcher.py \
          --task $dataset \
          --input_path data/wdc/$d/test.txt \
          --output_path output/output_small_1.jsonl \
          --max_len 128 \
          --use_gpu \
          --da del \
          --checkpoint_path checkpoints/ \
          --neural $neural \
          --file_excel F1_SCORES.xlsx \
          --run_id $run_id \
          --time_trained $training_time


#After every completed for loop, we will download F1_SCORES.xlsx for security reasons
FileLink(r'F1_SCORES.xlsx')

In [None]:
#Workaround for downloading the results: zip the whole /kaggle/working/ditto
#folder, download the archive and take F1_SCORES.xlsx out of it. Per the
#original note, downloading only the xlsx file from Kaggle's working directory
#is affected by a bug.
#The commands are wrapped in a string literal so they do not run as-is; remove
#the surrounding triple quotes to execute them.

'''
%cd /kaggle/working
import shutil
shutil.make_archive('all_folder', 'zip', '/kaggle/working/ditto')
FileLink(r'all_folder.zip')
'''