# T5-small Fine-tuned for Question Generation

## Install Requirements

In [None]:
%pip install -r requirements.txt

## Gather dataset

In [None]:

import math
import tarfile
import zipfile
import pandas as pd
import urllib.request
import os
import ijson


def download_dataset(url, file_name):
    """Download ``url`` to ``dataset/<file_name>`` while printing progress.

    Args:
        url: Address of the file to fetch.
        file_name: Name the file is saved under inside ``dataset/``.

    Raises:
        urllib.error.URLError: Propagated from ``urlretrieve`` when the
            download fails.
    """
    # urlretrieve opens the destination directly and does not create
    # missing directories, so make sure the target folder is there.
    os.makedirs("dataset", exist_ok=True)

    def report_progress(count, block, total):
        # urlretrieve calls this with the number of blocks transferred so
        # far, the block size in bytes and the total size of the file.
        received = count * block
        if total > 0:
            # The last block is normally only partly filled, so
            # count * block overshoots the real size; cap the value at 100.
            status = f"{min(100, math.floor(received / total * 100))}%"
        else:
            # total is 0 for an empty file and -1 when the server reports
            # no size; a percentage is meaningless (or a division by zero).
            status = f"{received} bytes"
        print(f"Downloading {file_name}: {status}", end="\r")

    urllib.request.urlretrieve(
        url,
        os.path.join("dataset/", file_name),
        reporthook=report_progress,
    )
    print(f"Downloaded {file_name} from {url}")


# Fetch and unpack each dataset unless its extracted directory is already
# present. Checking per dataset (rather than only for "dataset/") lets a
# re-run recover when an earlier run was interrupted after the directory had
# been created but before everything was downloaded and extracted.
os.makedirs("dataset", exist_ok=True)

squad_dir = os.path.join("dataset/", "squad/")
if not os.path.exists(squad_dir):
    download_dataset("https://data.deepai.org/squad1.1.zip", "squad.zip")

    with zipfile.ZipFile(os.path.join("dataset/", "squad.zip"), mode="r") as squad_zip:
        squad_zip.extractall(squad_dir)

race_dir = os.path.join("dataset/", "race/")
if not os.path.exists(race_dir):
    download_dataset(
        "http://www.cs.cmu.edu/~glai1/data/race/RACE.tar.gz", "race.tar.gz"
    )

    # NOTE(review): extractall() writes whatever member paths the archive
    # contains, and this archive comes over plain http. On Python versions
    # that support extraction filters, consider passing filter="data".
    with tarfile.open(os.path.join("dataset/", "race.tar.gz"), mode="r") as race_tar:
        race_tar.extractall(race_dir)

# Parallel lists: source_list holds the prompts fed to the model and
# target_list the numbered question/answer text expected for each prompt.
source_list = []
target_list = []

# Stream the SQuAD training file one article at a time. The file is opened in
# binary mode because ijson's documentation asks for binary file objects, and
# the context manager closes the handle once the lazy iterator is exhausted.
with open(os.path.join("dataset/", "squad/train-v1.1.json"), "rb") as squad:
    objects = ijson.items(squad, "data.item")

    for obj in objects:
        for p in obj["paragraphs"]:
            context = p["context"]
            # Ignore empty question/answer entries.
            qas = [qa for qa in p["qas"] if len(qa) > 0]

            # One training example per paragraph: the prompt states how many
            # questions to generate, the target lists them with the first
            # recorded answer of each question.
            source_text = f"generate {len(qas)} questions: {context}"
            target_text = "".join(
                f"{number}. {qa['question']}\nA: {qa['answers'][0]['text']}\n"
                for number, qa in enumerate(qas, start=1)
            )

            source_list.append(source_text)
            target_list.append(target_text)

# Pair every prompt with its target text in a two-column frame.
columns = {"source_text": source_list, "target_text": target_list}
dataframe = pd.DataFrame(columns)

# Draw 80% of the rows for training (fixed seed, so the split is repeatable)
# and keep the rows that were not drawn for evaluation.
train_df = dataframe.sample(frac=0.8, random_state=20)
test_df = dataframe.drop(index=train_df.index)

## Start training

In [None]:
from simplet5 import SimpleT5

# Create the SimpleT5 object and load the pretrained "t5-small" weights.
model = SimpleT5()
model.from_pretrained(model_type="t5", model_name="t5-small")

# Training settings, gathered in one place so they are easy to tweak.
training_options = dict(
    train_df=train_df,
    eval_df=test_df,
    source_max_token_len=1024,
    target_max_token_len=1024,
    batch_size=10,
    max_epochs=3,
    use_gpu=True,
)

model.train(**training_options)

## Load trained model

In [None]:
# Load the fine-tuned checkpoint from its directory for inference (no GPU).
checkpoint_dir = "simplet5-epoch-0-train"
model.load_model("t5", checkpoint_dir, use_gpu=False)

# Prompt in the same "generate N questions: <context>" form used for training.
text_to_summarize="""generate 5 questions: The Saline Valley salt tram is located in Inyo County, California. The electric aerial tramway was constructed from 1911 to 1913 to carry salt from the Saline Valley, over the Inyo Mountains, and into the Owens Valley. Covering a distance of 13.4 miles (21.6 km), it operated sporadically from 1913 to 1935 for four different companies. The tram was built for the Saline Valley Salt Company (SVSC) by the Trenton Iron Company, but the costs of its construction and operation were ruinously expensive for the SVSC. The salt mining operation and tram were leased in 1915 to the Owens Valley Salt Company until the latter went bankrupt in 1918. In 1920, the tram was taken over by the Trenton Iron Company, which sold it to the Sierra Salt Company in 1928. The Sierra Salt Company put it back into service until the company went bankrupt in 1935. During its operation, it was the steepest tram in the United States. The tram was included in the National Register of Historic Places in 1974. 
"""

# predict() output is indexed, so show only its first element.
predictions = model.predict(text_to_summarize, max_length=1024)
print(predictions[0])

# Misc Scripts

Utility scripts that are useful throughout the duration of this project.

## Create zip of outputs

In [8]:
import zipfile
import pathlib
import os

source_root = pathlib.Path("outputs/")
zip_name = "outputs.zip"

# Remove any archive left over from an earlier run before writing a new one.
if os.path.exists(zip_name):
    os.remove(zip_name)

with zipfile.ZipFile(zip_name, mode="w") as bundle:
    # Add everything found below outputs/, stored under its path relative to
    # that folder so the archive has no leading "outputs/" component.
    for entry in source_root.rglob("*"):
        bundle.write(entry, arcname=entry.relative_to(source_root))

# Re-open the finished archive and print its table of contents.
with zipfile.ZipFile(zip_name, mode="r") as bundle:
    bundle.printdir()

File Name                                             Modified             Size
simplet5-epoch-0-train/                        2022-05-16 05:47:10            0
.ipynb_checkpoints/                            2022-05-16 05:49:20            0
simplet5-epoch-0-train/config.json             2022-05-16 05:47:10         1380
simplet5-epoch-0-train/spiece.model            2022-05-16 05:47:10       791656
simplet5-epoch-0-train/special_tokens_map.json 2022-05-16 05:47:10         1786
simplet5-epoch-0-train/tokenizer_config.json   2022-05-16 05:47:10         1924
simplet5-epoch-0-train/tokenizer.json          2022-05-16 05:47:10      2422095
simplet5-epoch-0-train/pytorch_model.bin       2022-05-16 05:47:12    242085627


## Unzip outputs

In [4]:
import zipfile
import os

drive_archive = "/content/drive/MyDrive/outputs.zip"

# Extract into the current working directory; when no archive exists at that
# path the cell does nothing.
if os.path.exists(drive_archive):
    with zipfile.ZipFile(drive_archive, mode="r") as bundle:
        bundle.extractall()

## Mount google drive

In [3]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# The "Unzip outputs" cell reads its archive from /content/drive/MyDrive/,
# so this cell has to run before it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Clear pytorch cache

In [5]:
# Ask PyTorch to empty its CUDA memory cache.
# NOTE(review): presumably this only returns memory that no live tensor still
# references — confirm against the torch.cuda.empty_cache documentation.
import torch
torch.cuda.empty_cache()