# Transformer Model

In [2]:
import pandas as pd
import warnings; warnings.simplefilter('ignore')
import numpy as np
import warnings
import os
import sys
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

# Make the repository root (the parent of the notebook directory) importable
# so that the project-local `src` package can be loaded below.
module_path = os.path.abspath(os.pardir)
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)
from src.data.read_parallel import read_parallel_local

In [7]:
import trax
from trax import layers as tl
from trax.supervised import training

In [16]:
# Load the text data from individual files and store

REDUCE_BY_FACTOR = 20.0 # Make the dataset smaller for development purposes
SAMPLE_SEED = 0  # Fixed seed so the development subsample is identical on every run

# Root of the data volume: use the DATA_VOL environment variable when it is
# set, otherwise fall back to the manually configured path.
DATA_VOL = os.environ.get('DATA_VOL', '/home/luke/tmp_vol')

# Pre-wrangled metadata
df = pd.read_csv("../references/derived/ml_data.csv", encoding="latin1", parse_dates=True)
df['id'] = df['id'].astype(int)
print(f"Original number of examples: {len(df)}")

# Keep a 1/REDUCE_BY_FACTOR random subset. The sample is seeded: without a
# seed every run would pick different rows, and the seeded train/test split
# further down would not be reproducible either.
df = df.sample(n=int(len(df) / REDUCE_BY_FACTOR), random_state=SAMPLE_SEED)
print(f"Reduced number of examples:  {len(df)}")

# Read the cleaned text for each sampled id from the data volume
df['text'] = read_parallel_local(df['id'], DATA_VOL + "/clean/")

# Sampling leaves the original row labels in place; renumber from 0
df = df.reset_index(drop=True)

Original number of examples: 199451
Reduced number of examples:  9972
Took 0.5911680499712626 min to open 9972 files with 20 processes.


In [12]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,bill_id,version_number,id,partisan_lean,sc_id,signed,text
0,1115149,1,2148609,0.230769,567-2,0,1 state of oklahoma 1st session of the 57th le...
1,1197491,2,2275685,0.708522,592-2,0,state of new york 6080-a regular sessions in ...
2,1321268,1,2575706,0.294118,627-2,0,"section a. chapter rsmo, is amended by adding..."
3,1254840,1,2458098,0.230769,624-2,0,1 state of oklahoma 1st session of the 57th le...
4,1130443,1,2168794,0.294118,572-2,0,"section a. chapter rsmo, is amended by adding..."


In [15]:
from sklearn.model_selection import train_test_split

# Columns used as model inputs; the `signed` column is the target.
feature_columns = ['version_number', 'partisan_lean', 'sc_id', 'text']

# Hold out 8% of the rows for testing, with a fixed seed for the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[feature_columns],
    df['signed'],
    test_size=0.08,
    random_state=0,
)
print(f"X_train.shape: {X_train.shape}")

X_train.shape: (22, 4)


In [None]:

def tokenize(input_str, vocab_file=None, vocab_dir=None):
    """Turn a single string into a batched array of token ids.

    The string is tokenized with `trax.data.tokenize`, an end-of-sentence id
    is appended, and the result is reshaped so it has a leading batch axis
    of size 1.

    Args:
        input_str (str): human-readable string to encode
        vocab_file (str): filename of the vocabulary text file
        vocab_dir (str): path to the vocabulary file

    Returns:
        numpy.ndarray: 2-D array with a single row holding the token ids
            followed by the end-of-sentence id
    """
    # Integer id used to mark the end of the sentence
    eos_id = 1

    # trax.data.tokenize is stream-in / stream-out, so wrap the string in a
    # one-item iterator and pull the single result back out.
    token_stream = trax.data.tokenize(
        iter([input_str]), vocab_file=vocab_file, vocab_dir=vocab_dir
    )
    token_ids = next(token_stream)

    # Append the end-of-sentence marker, then put the batch axis in front
    with_eos = np.array([*token_ids, eos_id])
    return with_eos.reshape(1, -1)
