In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os, psutil  

def cpu_stats():
    pid = os.getpid()
    py = psutil.Process(pid)
    memory_use = py.memory_info()[0] / 2. ** 30
    return 'memory GB:' + str(np.round(memory_use, 2))

In [3]:
!nvidia-smi

Sun Oct  4 06:04:02 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

# Download the preprocessed dataset

In [4]:
cpu_stats()

'memory GB:0.14'

In [9]:
# !wget https://www.dropbox.com/s/o3g9y88qgi8eapt/papers_eval_small.csv
# !wget https://www.dropbox.com/s/pmw1jlmv8sy4gp5/papers_train_small.csv
# # # !wget https://www.dropbox.com/s/slaa8uk2jlkq3wl/pytorch_model.bin

In [10]:
# !pip install simpletransformers wandb pytorch-lightning

In [11]:
import logging
import wandb
import random
import pandas as pd
import torch
from simpletransformers.t5 import T5Model
# from pytorch_lightning.metrics.nlp import BLEUScore




In [12]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)


### Without domain info
**Simpletransformers implementation of T5 model expects a data to be a dataframe with 3 columns:**
`<prefix>, <input_text>, <target_text>`
* `<prefix>`: A string indicating the task to perform. (E.g. "question", "stsb")
* `<input_text>`: The input text sequence (we will use Paper's abstract as `input_text`  )
* `<target_text`: The target sequence (we will use Paper's title as `output_text` )
    
    
 You can read about the data format:  https://github.com/ThilinaRajapakse/simpletransformers#t5-transformer

In [13]:
def load_dataset(include_domain=False):
    
    train_df = pd.read_csv("./papers_train_small.csv")
    eval_df = pd.read_csv("./papers_eval_small.csv")
    
    train_df.dropna()
    eval_df.dropna()
  
    # add domain tokens
    if include_domain:
        train_df.abstract = train_df.abstract + " @domain: " + train_df.categories
        eval_df.abstract = eval_df.abstract + " @domain: " + eval_df.categories
        
    train_df = train_df[['title','abstract']]
    eval_df = eval_df[['title','abstract']]
    
    train_df.columns = ['target_text', 'input_text']
    eval_df.columns = ['target_text', 'input_text']
    
    
    # task tokens
    train_df['prefix'] = "summarize"
    eval_df['prefix'] = "summarize"
    
    return train_df, eval_df

    

In [40]:
%%time
train_df, eval_df = load_dataset(include_domain=False)

CPU times: user 1.02 s, sys: 83.7 ms, total: 1.1 s
Wall time: 1.1 s


In [41]:
train_df.input_text.iloc[0]

'  A rather non-standard quantum representation of the canonical commutation\nrelations of quantum mechanics systems, known as the polymer representation has\ngained some attention in recent years, due to its possible relation with Planck\nscale physics. In particular, this approach has been followed in a symmetric\nsector of loop quantum gravity known as loop quantum cosmology. Here we explore\ndifferent aspects of the relation between the ordinary Schroedinger theory and\nthe polymer description. The paper has two parts. In the first one, we derive\nthe polymer quantum mechanics starting from the ordinary Schroedinger theory\nand show that the polymer description arises as an appropriate limit. In the\nsecond part we consider the continuum limit of this theory, namely, the reverse\nprocess in which one starts from the discrete theory and tries to recover back\nthe ordinary Schroedinger quantum mechanics. We consider several examples of\ninterest, including the harmonic oscillator, th

In [42]:
print(train_df.shape, eval_df.shape)

(100213, 3) (10520, 3)


### Train without domain information

In [43]:
model_args = {
    "max_seq_length": 512,
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 5,
    "evaluate_during_training": False,
    "evaluate_during_training_steps": 1000,
    "evaluate_during_training_verbose": True,
    
    "use_multiprocessing": False,
    "fp16": False,

    "save_steps": -1,
    "save_eval_checkpoints": True,
    "save_model_every_epoch": True,

    "reprocess_input_data": True,
    "overwrite_output_dir": True,

    "wandb_project": "title-generation",
    
}


In [44]:
# Create T5 Model
model = T5Model("../input/trainedmodelwithout-domain/outputs/", args=model_args, use_cuda=True)

In [None]:
# Train T5 Model on new task
model.train_model(train_data=train_df, eval_data=eval_df)

In [None]:
torch.save("t5-general.pth", model)

In [None]:
model = torch.load("t5-general.pth")

In [None]:
model.save_model("t5-general.pth")

In [None]:
print("hello")

In [23]:
model = torch.load("./pytorch_model.bin")

In [25]:
type(model)

collections.OrderedDict

In [45]:
# Evaluate T5 Model on new task
results = model.eval_model(eval_df)

# Predict with trained T5 model
#print(model.predict(["convert: four"]))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10520.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=1315.0), HTML(value='')))




In [46]:
print(results)

{'eval_loss': 1.8634847951026017}


## And We're Done ! 
**Let's see how our model performs in generating paper's titles**

In [47]:
random_num = 351
actual_title = eval_df.iloc[random_num]['target_text']
actual_abstract = ["summarize: "+eval_df.iloc[random_num]['input_text']]
predicted_title = model.predict(actual_abstract)

print(f'Actual Title: {actual_title}')
print(f'Predicted Title: {predicted_title}')
print(f'Actual Abstract: {actual_abstract}')


HBox(children=(HTML(value='Generating outputs'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Decoding outputs'), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Actual Title: Hydrodynamics and beyond in the strongly coupled N=4 plasma
Predicted Title: ['Hydrodynamic and higher quasinormal modes in AdS black hole background']
Actual Abstract: ['summarize:   We continue our investigations on the relation between hydrodynamic and\nhigher quasinormal modes in the AdS black hole background started in\narXiv:0710.4458 [hep-th]. As is well known, the quasinormal modes can be\ninterpreted as the poles of the retarded Green functions of the dual N=4 gauge\ntheory at finite temperature. The response to a generic perturbation is\ndetermined by the residues of the poles. We compute these residues numerically\nfor energy-momentum and R-charge correlators. We find that the diffusion modes\nbehave in a similar way: at small wavelengths the residues go over into a form\nof a damped oscillation and therefore these modes decouple at short distances.\nThe sound mode behaves differently: its residue does not decay and at short\nwavelengths this mode behaves as t

In [48]:
random_num = 777
actual_title = eval_df.iloc[random_num]['target_text']
actual_abstract = ["summarize: "+eval_df.iloc[random_num]['input_text']]
predicted_title = model.predict(actual_abstract)
print(f'Actual Title: {actual_title}')
print(f'Predicted Title: {predicted_title}')
print(f'Actual Abstract: {actual_abstract}')

HBox(children=(HTML(value='Generating outputs'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Decoding outputs'), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Actual Title: Towards a homotopy theory of process algebra
Predicted Title: ['labelled flows: a homotopical approach to higher dimensional automata']
Actual Abstract: ['summarize:   This paper proves that labelled flows are expressive enough to contain all\nprocess algebras which are a standard model for concurrency. More precisely, we\nconstruct the space of execution paths and of higher dimensional homotopies\nbetween them for every process name of every process algebra with any\nsynchronization algebra using a notion of labelled flow. This interpretation of\nprocess algebra satisfies the paradigm of higher dimensional automata (HDA):\none non-degenerate full $n$-dimensional cube (no more no less) in the\nunderlying space of the time flow corresponding to the concurrent execution of\n$n$ actions. This result will enable us in future papers to develop a\nhomotopical approach of process algebras. Indeed, several homological\nconstructions related to the causal structure of time flow a

In [49]:
random_num = 187
actual_title = eval_df.iloc[random_num]['target_text']
actual_abstract = ["summarize: "+eval_df.iloc[random_num]['input_text']]
predicted_title = model.predict(actual_abstract)

print(f'Actual Title: {actual_title}')
print(f'Predicted Title: {predicted_title}')
print(f'Actual Abstract: {actual_abstract}')

HBox(children=(HTML(value='Generating outputs'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Decoding outputs'), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Actual Title: D=5 M-theory radion supermultiplet dynamics
Predicted Title: ['Radion Supermultiplet and the Cosmological Model']
Actual Abstract: ['summarize:   We show how the bosonic sector of the radion supermultiplet plus d=4, N=1\nsupergravity emerge from a consistent braneworld Kaluza-Klein reduction of D=5\nM--theory. The radion and its associated pseudoscalar form an SL(2,R)/U(1)\nnonlinear sigma model. This braneworld system admits its own brane solution in\nthe form of a 2-supercharge supersymmetric string. Requiring this to be free of\nsingularities leads to an SL(2,Z) identification of the sigma model target\nspace. The resulting radion mode has a minimum length; we suggest that this\ncould be used to avoid the occurrence of singularities in brane-brane\ncollisions. We discuss possible supersymmetric potentials for the radion\nsupermultiplet and their relation to cosmological models such as the cyclic\nuniverse or hybrid inflation.\n']


#### Predict the abstract for whole test data

In [50]:
predicted_titles  =  model.predict("summarize: "+eval_df.input_text)

HBox(children=(HTML(value='Generating outputs'), FloatProgress(value=0.0, max=1315.0), HTML(value='')))




HBox(children=(HTML(value='Decoding outputs'), FloatProgress(value=0.0, max=10520.0), HTML(value='')))




In [53]:
print(type(predicted_titles), len(predicted_titles), eval_df.shape)

<class 'list'> 10520 (10520, 3)


In [54]:
eval_df["predicted_titles"] = predicted_titles

In [55]:
eval_df.to_csv("eval_df_preds_without_domain.csv", index=False)

## Domain Controlled

In [15]:
%%time
train_df, eval_df = load_dataset(include_domain=True)

CPU times: user 950 ms, sys: 181 ms, total: 1.13 s
Wall time: 1.13 s


In [16]:
model_args = {
    "max_seq_length": 512,
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 5,
    "evaluate_during_training": False,
    "evaluate_during_training_steps": 1000,
    "evaluate_during_training_verbose": True,
    
    "use_multiprocessing": False,
    "fp16": False,

    "save_steps": -1,
    "save_eval_checkpoints": True,
    "save_model_every_epoch": True,

    "reprocess_input_data": True,
    "overwrite_output_dir": True,

    "wandb_project": "title-generation",
    
}


In [17]:
# Create T5 Model
model = T5Model("t5-small", args=model_args, use_cuda=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242065649.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




In [18]:
# Train T5 Model on new task
model.train_model(train_data=train_df, eval_data=eval_df)

HBox(children=(FloatProgress(value=0.0, max=100213.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmacab[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.10.2
[34m[1mwandb[0m: Run data is saved locally in wandb/run-20201004_061335-38h1907u
[34m[1mwandb[0m: Syncing run [33mfresh-vortex-14[0m





HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=12527.0, style=ProgressStyle(d…








HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=12527.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=12527.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=12527.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=12527.0, style=ProgressStyle(d…





In [19]:
print("continue")

continue


In [20]:
# Evaluate T5 Model on new task
results = model.eval_model(eval_df)

# Predict with trained T5 model
#print(model.predict(["convert: four"]))

HBox(children=(FloatProgress(value=0.0, max=10520.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=1315.0, style=ProgressStyle(desc…




In [21]:
print(results)

{'eval_loss': 1.8594654024327208}


In [22]:
random_num = 351
actual_title = eval_df.iloc[random_num]['target_text']
actual_abstract = ["summarize: "+eval_df.iloc[random_num]['input_text']]
predicted_title = model.predict(actual_abstract)

print(f'Actual Title: {actual_title}')
print(f'Predicted Title: {predicted_title}')
print(f'Actual Abstract: {actual_abstract}')
8594654024327208}

HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=1.0, style=ProgressStyle(descrip…






HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=1.0, style=ProgressStyle(descripti…






Actual Title: Hydrodynamics and beyond in the strongly coupled N=4 plasma
Predicted Title: ['Hydrodynamic and higher quasinormal modes in AdS black hole background']
Actual Abstract: ['summarize:   We continue our investigations on the relation between hydrodynamic and\nhigher quasinormal modes in the AdS black hole background started in\narXiv:0710.4458 [hep-th]. As is well known, the quasinormal modes can be\ninterpreted as the poles of the retarded Green functions of the dual N=4 gauge\ntheory at finite temperature. The response to a generic perturbation is\ndetermined by the residues of the poles. We compute these residues numerically\nfor energy-momentum and R-charge correlators. We find that the diffusion modes\nbehave in a similar way: at small wavelengths the residues go over into a form\nof a damped oscillation and therefore these modes decouple at short distances.\nThe sound mode behaves differently: its residue does not decay and at short\nwavelengths this mode behaves as t

In [23]:
random_num = 777
actual_title = eval_df.iloc[random_num]['target_text']
actual_abstract = ["summarize: "+eval_df.iloc[random_num]['input_text']]
predicted_title = model.predict(actual_abstract)
print(f'Actual Title: {actual_title}')
print(f'Predicted Title: {predicted_title}')
print(f'Actual Abstract: {actual_abstract}')

HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=1.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=1.0, style=ProgressStyle(descripti…


Actual Title: Towards a homotopy theory of process algebra
Predicted Title: ['labelled flows and homotopical approach to synchronization algebras']
Actual Abstract: ['summarize:   This paper proves that labelled flows are expressive enough to contain all\nprocess algebras which are a standard model for concurrency. More precisely, we\nconstruct the space of execution paths and of higher dimensional homotopies\nbetween them for every process name of every process algebra with any\nsynchronization algebra using a notion of labelled flow. This interpretation of\nprocess algebra satisfies the paradigm of higher dimensional automata (HDA):\none non-degenerate full $n$-dimensional cube (no more no less) in the\nunderlying space of the time flow corresponding to the concurrent execution of\n$n$ actions. This result will enable us in future papers to develop a\nhomotopical approach of process algebras. Indeed, several homological\nconstructions related to the causal structure of time flow are

In [24]:
random_num = 187
actual_title = eval_df.iloc[random_num]['target_text']
actual_abstract = ["summarize: "+eval_df.iloc[random_num]['input_text']]
predicted_title = model.predict(actual_abstract)

print(f'Actual Title: {actual_title}')
print(f'Predicted Title: {predicted_title}')
print(f'Actual Abstract: {actual_abstract}')

HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=1.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=1.0, style=ProgressStyle(descripti…


Actual Title: D=5 M-theory radion supermultiplet dynamics
Predicted Title: ['Radion Supermultiplet and the Cosmological Model']
Actual Abstract: ['summarize:   We show how the bosonic sector of the radion supermultiplet plus d=4, N=1\nsupergravity emerge from a consistent braneworld Kaluza-Klein reduction of D=5\nM--theory. The radion and its associated pseudoscalar form an SL(2,R)/U(1)\nnonlinear sigma model. This braneworld system admits its own brane solution in\nthe form of a 2-supercharge supersymmetric string. Requiring this to be free of\nsingularities leads to an SL(2,Z) identification of the sigma model target\nspace. The resulting radion mode has a minimum length; we suggest that this\ncould be used to avoid the occurrence of singularities in brane-brane\ncollisions. We discuss possible supersymmetric potentials for the radion\nsupermultiplet and their relation to cosmological models such as the cyclic\nuniverse or hybrid inflation.\n @domain: hep-th astro-ph']


In [25]:
predicted_titles  =  model.predict("summarize: "+eval_df.input_text)

HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=1315.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=10520.0, style=ProgressStyle(descr…




In [26]:
predicted_titles[0]

'Semi-structured interviews with smart home owners'

In [None]:
eval_df.to_csv("eval_df_preds_with_domain.csv", index=False)