## Run the whole pipeline

In [13]:
import string
import subprocess
import threading
import os
from pathlib import Path
import random

from shared import DATADIR, MODELDIR, dataset_path

In [14]:
def save_results(notebook_name):
    """Copy the data dir, model dir and a notebook into the current run folder.

    The destination is ``persistent_location / id``; both names are
    notebook-level globals set up in a later cell. ``MODELDIR`` is copied
    only if it exists. When ``use_drive`` is set, the Drive mount is flushed
    and remounted after copying.

    Args:
        notebook_name: Path of the notebook file to copy next to the data.

    Raises:
        subprocess.CalledProcessError: If any of the ``cp`` invocations fails.
    """
    # Trailing slash keeps the original "copy *into* the run folder" semantics.
    destination = f"{persistent_location / id}/"

    def _copy(*cp_args):
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through to cp unchanged.
        subprocess.run(["cp", *cp_args, destination],
                       check=True, capture_output=True, text=True)

    try:
        _copy("-r", str(DATADIR))
        if os.path.exists(MODELDIR):
            _copy("-r", str(MODELDIR))
        _copy(str(notebook_name))
    except subprocess.CalledProcessError as e:
        # Output is captured above, so the reason for the failure is visible
        # in the notebook instead of a bare "None".
        print(f"Command {e.cmd} failed with exit code {e.returncode}")
        if e.stderr:
            print(e.stderr)
        raise
    if use_drive:
        # Flush pending writes to Drive, then remount so later steps can keep
        # using the mount point.
        drive.flush_and_unmount()
        drive.mount('/content/drive', force_remount=True)

In [15]:
def run_notebook(path):
    """Execute a notebook in place with ``jupyter nbconvert``.

    On failure the captured stdout/stderr of nbconvert are printed, the
    current results are persisted via ``save_results`` and the error is
    re-raised.

    Args:
        path: Path of the notebook to execute.

    Raises:
        subprocess.CalledProcessError: If nbconvert exits with a non-zero code.
    """
    # Argument list (no shell) so notebook paths with spaces work.
    command = [
        "jupyter", "nbconvert",
        "--to", "notebook",
        "--execute",
        "--inplace", str(path),
        "--log-level", "WARN",
    ]
    try:
        # Without capturing, e.stdout / e.stderr are always None and the
        # diagnostics below could never print anything.
        completed = subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print("Error running notebook, exit code:", e.returncode)
        if e.stdout:
            print("Standard Output:")
            print(e.stdout)
        if e.stderr:
            print("Standard Error:")
            print(e.stderr)
        # Persist whatever was produced so far before propagating the error.
        save_results(path)
        raise
    # Replay warnings from a successful run, since they are now captured.
    if completed.stderr:
        print(completed.stderr)

Clone the rest of the repository when running in Colab

In [16]:
# The presence of the COLAB_GPU environment variable is used as the
# "running in Colab" check; outside Colab this cell does nothing.
if 'COLAB_GPU' in os.environ:
    # Fetch the project repository into a temporary sub-directory ...
    !git clone https://github.com/martin3398/nlp-project-2023.git
    # ... move its contents into the current working directory ...
    # NOTE(review): the `*` glob presumably skips hidden files — confirm none are needed.
    %mv nlp-project-2023/* ./
    # ... and remove the now-unneeded clone directory.
    %rm -rf nlp-project-2023/
    

When in Colab, save results to Drive

In [18]:
# Decide where run artifacts are persisted: Google Drive when running in
# Colab (detected via the COLAB_GPU environment variable), ./runs otherwise.
if 'COLAB_GPU' in os.environ:
    # noinspection PyUnresolvedReferences
    from google.colab import drive
    use_drive = True
    drive.mount('/content/drive')
    persistent_location = Path("/content/drive/MyDrive/NLP-Project/runs/")
    # Fetch the dataset from Drive if it is not available locally yet.
    if not os.path.exists(dataset_path):
        copy_dataset_command = f"cp /content/drive/MyDrive/NLP-Project/WELFake_Dataset.csv {dataset_path}"
        subprocess.run(copy_dataset_command, shell=True, check=True)
else:
    use_drive = False
    persistent_location = Path("./runs/")

# Pick a fresh 4-letter run id. The id is drawn inside the loop so that a
# collision with an existing run folder triggers a new draw instead of
# spinning forever on the same value. os.makedirs doubles as the existence
# check, so there is no gap between checking and creating.
# NOTE: the name `id` shadows the builtin, but save_results reads this global.
while True:
    id = ''.join(random.choices(string.ascii_uppercase, k=4))
    try:
        os.makedirs(persistent_location / id)
    except FileExistsError:
        continue
    print('run id:', id)
    break

run id: JSLX


Split dataset

In [19]:
# Execute the dataset-split notebook in place, then copy the results
# into the run folder.
split_notebook = "01-Clean_and_Split.ipynb"
run_notebook(split_notebook)
save_results(split_notebook)

Start style transfer (can run in the background before the evaluation step)

In [20]:
# Each style-transfer notebook runs in its own background thread so the
# following cells can proceed while they execute.
fakenews_style_transfer_thread = threading.Thread(
    target=run_notebook,
    args=("02a-Styletransfer_fakenews.ipynb",),
)
realnews_style_transfer_thread = threading.Thread(
    target=run_notebook,
    args=("02b-Styletransfer_realnews.ipynb",),
)

# Start order matches creation order: fake news first, then real news.
for style_transfer_worker in (fakenews_style_transfer_thread, realnews_style_transfer_thread):
    style_transfer_worker.start()

Tokenize

In [21]:
# Execute the tokenization notebook in place, then copy the results
# into the run folder.
tokenize_notebook = "02b-Tokenize_Original.ipynb"
run_notebook(tokenize_notebook)
save_results(tokenize_notebook)

Train

In [None]:
# Execute the training notebook in place.
run_notebook("03_Train.ipynb")
# Persist the outputs right after training, like every other pipeline step.
# Otherwise the executed training notebook is never copied on success, and
# the model directory is only saved if a later step gets that far.
save_results("03_Train.ipynb")

Evaluate

In [23]:
# Wait for style transfer.
# join() returns immediately for a thread that has already finished, so no
# is_alive() check is needed. Gating on it skipped save_results whenever a
# thread completed before this cell ran.
fakenews_style_transfer_thread.join()
save_results("02a-Styletransfer_fakenews.ipynb")

realnews_style_transfer_thread.join()
save_results("02b-Styletransfer_realnews.ipynb")

In [24]:
# Final step: execute the evaluation notebook in place, then copy the
# results into the run folder.
evaluate_notebook = "04_Evaluate.ipynb"
run_notebook(evaluate_notebook)
save_results(evaluate_notebook)