# Table-to-text Natural Language Generation Model

This notebook was created to explore possibilities on creating a machine learning model generating natural language desriptions of tabular data. Proof of concept for Collabothon 2021 hackathon, Natural Language Generation challenge by Sages.

## Dataset

Using ToTTo dataset by Google (https://github.com/google-research-datasets/ToTTo)

In [1]:
import numpy as np
import json

In [2]:
with open('totto_data/totto_train_data.jsonl') as f:
    data = np.array([json.loads(line) for line in f])

In [3]:
# print(data[0])

<img src="favorita.png">

Wiki page: List of 8/9 PM telenovelas of Rede Globo. Focus on 13th index and 2nd column (counted from 0). output sentence: A Favorita is the telenovela aired in the 9 pm timeslot.

In [4]:
# with open('totto_data/unlabeled_totto_test_data.jsonl') as f:
#     test = np.array([json.loads(line) for line in f])

## Data processing

In [5]:
data.shape

(120761,)

In [6]:
import pandas as pd

df = pd.DataFrame.from_records(data)
df.head(5)

Unnamed: 0,table,table_webpage_url,table_page_title,table_section_title,table_section_text,highlighted_cells,example_id,sentence_annotations
0,"[[{'value': '#', 'is_header': True, 'column_sp...",http://en.wikipedia.org/wiki/List_of_8/9_PM_te...,List of 8/9 PM telenovelas of Rede Globo,2000s,,"[[13, 2]]",1762238357686640028,[{'original_sentence': 'It is also the first t...
1,"[[{'value': 'Year', 'is_header': True, 'column...",http://en.wikipedia.org/wiki/List_of_Chicago_B...,List of Chicago Bears first-round draft picks,Player selections,,"[[100, 0], [100, 2], [100, 3], [100, 4]]",7906730525723842956,[{'original_sentence': 'The team's most recent...
2,"[[{'value': 'Res.', 'is_header': True, 'column...",http://en.wikipedia.org/wiki/Brian_Ebersole,Brian Ebersole,Mixed martial arts record,,"[[9, 2], [9, 4]]",6196487034766762341,[{'original_sentence': 'Ebersole made his UFC ...
3,"[[{'value': 'State (class)', 'is_header': True...",http://en.wikipedia.org/wiki/78th_United_State...,78th United States Congress,Senate,,"[[1, 1], [1, 2], [1, 3], [1, 4], [8, 1], [8, 3...",5254211070576123135,[{'original_sentence': 'William Warren Barbour...
4,[[{'value': 'Elagabalus Severan dynastyBorn: c...,http://en.wikipedia.org/wiki/Elagabalus,Elagabalus,External links,Media related to Elagabalus at Wikimedia Commo...,"[[0, 0], [2, 1]]",-5206051586137920777,[{'original_sentence': 'Elagabalus (/ˌɛləˈɡæbə...


In [7]:
df = df.drop(['table_webpage_url', 'example_id', 'table_section_text'], axis=1)

In [8]:
df['sentence_annotations'] = df['sentence_annotations'].apply(lambda row: row[0]['final_sentence'])

In [9]:
df.sample(10)

Unnamed: 0,table,table_page_title,table_section_title,highlighted_cells,sentence_annotations
41131,[[{'value': 'United States presidential electi...,1996 United States presidential election in Iowa,Results,"[[4, 1], [4, 2], [4, 5]]",The Reform Party candidate Ross Perot finished...
102660,"[[{'value': 'Team', 'is_header': True, 'column...",John Kaye (footballer),Management,"[[2, 7]]",John Kaye had a win percentage of 31.2%.
63916,"[[{'value': 'Year', 'is_header': True, 'column...",Brittany Daniel,Television,"[[8, 0], [8, 1], [8, 2]]","In 2002, Brittany Daniel played the role of Pe..."
103657,"[[{'value': 'Year', 'is_header': True, 'column...",Joffrey Lauvergne,EuroLeague,"[[2, 2], [2, 8], [2, 12]]","Over 24 games in the EuroLeague, Joffrey Lauve..."
95424,"[[{'value': 'Nom de Guerre', 'is_header': True...",List of commanders of the LTTE,,"[[54, 0], [54, 5]]","Gangai Amaran, was an LTTE leader killed by th..."
23081,"[[{'value': 'State (class)', 'is_header': True...",95th United States Congress,Senate,"[[8, 1], [8, 3], [8, 4]]",Nancy Kassebaum (R) succeeded James B. Pearson...
39664,"[[{'value': 'Year', 'is_header': True, 'column...",Kosovo in the Bala Turkvision Song Contest,Participation,"[[1, 0], [1, 1], [1, 2]]","Ela Kazaz, was the first participant to repres..."
95178,"[[{'value': 'Climate data for Obigarm', 'is_he...",Obigarm,Climate,"[[2, 7], [3, 1]]","In Obigarm, in July the average temperature is..."
88682,[[{'value': 'Alabama Crimson Tide Team NCAA Na...,Alabama Crimson Tide women's gymnastics,NCAA titles,"[[4, 0], [4, 1], [4, 2]]",Alabama finished the 1996 season with their th...
54615,"[[{'value': 'Name', 'is_header': True, 'column...",World record progression 1000 m speed skating ...,The world record progression as recognised by ...,"[[9, 0], [9, 1], [9, 2], [9, 3]]",Laila Schou Nilsen made a new record in the 10...


In [10]:
data_sentences = df['sentence_annotations']
df = df.drop('sentence_annotations', axis=1)
df.sample(3)

Unnamed: 0,table,table_page_title,table_section_title,highlighted_cells
106944,"[[{'value': 'Season', 'is_header': True, 'colu...",Doosan Bears,Season-by-season records,"[[3, 0], [3, 14], [23, 0], [23, 12], [30, 0], ..."
29608,"[[{'value': 'Rank', 'is_header': True, 'column...",Swimming at the 2000 Summer Olympics – Women's...,Final,"[[1, 2], [1, 4]]"
19897,"[[{'value': 'Event', 'is_header': True, 'colum...",César Cielo,Short course (25 meter pool),"[[7, 0], [7, 1], [7, 4]]"


In [11]:
 def focus_words(row):
    words = []
    table = row['table']
    for cell in row['highlighted_cells']:
        words.append(table[cell[0]][cell[1]]['value'])
    return words

df['focus_words'] = df.apply(focus_words, axis=1)
df = df.drop(['table', 'highlighted_cells'], axis=1)

df.sample(3)

Unnamed: 0,table_page_title,table_section_title,focus_words
39612,List of Commanders of the Turkish Land Forces,,"[General Semih Sancar, 28 August 1972, 7 March..."
66654,List of awards and nominations received by Kel...,Grammy Awards,"[2013, Stronger, Best Pop Vocal Album, Won]"
119073,National Register of Historic Places listings ...,Current listings,[147]


## Train and validation sets

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train_tables, val_tables, train_sentences, val_sentences = train_test_split(df, data_sentences, test_size=0.25)

In [14]:
print(train_tables.shape, val_tables.shape)

(90570, 3) (30191, 3)


## Model training with PyTorch

Using mainly Beginners guide to text generation (https://www.kaggle.com/ab971631/beginners-guide-to-text-generation-pytorch) tutorial.

In [15]:
import re
import pickle
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

In [16]:
# Check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

No GPU available, training on CPU; consider making n_epochs very small.


In [17]:
from torch.autograd import Variable

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        
        self.encoder = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size*2, hidden_size, n_layers,batch_first=True,
                          bidirectional=False)
        self.decoder = nn.Linear(hidden_size, output_size)
    
    def forward(self, input, hidden):
        input = self.encoder(input.view(1, -1))
        output, hidden = self.gru(input.view(1, 1, -1), hidden)
        output = self.decoder(output.view(1, -1))
        return output, hidden

    def init_hidden(self):
        return Variable(torch.zeros(self.n_layers, 1, self.hidden_size))

In [18]:
def train(inp, target):
    hidden = decoder.init_hidden()
    decoder.zero_grad()
    loss = 0
    
    for c in range(input_size):
        output, hidden = decoder(inp[c], hidden)
        loss += criterion(output, target[c])

    loss.backward()
    decoder_optimizer.step()

    return loss.data.item() / input_size

In [19]:
import time, math

def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [20]:
def yield_data_pair():
    for i in range(train_tables):
        yield train_tables[i], train_sentences[i]

In [21]:
n_epochs = 300
print_every = 100
plot_every = 10
hidden_size = 100
n_layers = 1
lr = 0.015

input_size = len(set(train_tables))
output_size = len(set(train_sentences))

decoder = RNN(input_size, hidden_size, output_size, n_layers)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

start = time.time()
all_losses = []
loss_avg = 0

# if(train_on_gpu):
#     decoder.cuda()

for epoch in range(1, n_epochs + 1):
#     inp, tar = yield_data_pair()
    loss = train(train_tables, train_sentences)       
    loss_avg += loss

    if epoch % print_every == 0:
        print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / n_epochs * 50, loss))
#         print(evaluate('ge', 200), '\n')

    if epoch % plot_every == 0:
        all_losses.append(loss_avg / plot_every)
        loss_avg = 0

KeyError: 0