In [4]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import datasets
import re
import time
import math
import ast

In [5]:
# Read in raw text data
df = pd.read_csv('../../modified_data/text_and_summaries_filtered.csv')
# Make sure split_cleaned_text column is in the form of true lists not strings of lists
df['split_cleaned_text'] = df['split_cleaned_text'].apply(lambda x: ast.literal_eval(x))
# Print the first few rows
df.head()

Unnamed: 0,state_name,state,bill_id,bill_name,keep,original_text,cleaned_text,split_cleaned_text,summary,summary_source,category,status,link
0,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,100 AS 14 18 040 is amended by adding a new su...,"[is amended by adding a new sub, read, d in th...",The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...
1,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...
2,Alaska,AK,AK SB96,SB96,1,SB0096A 1 SB 96 New Text Underlined DEL...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...
3,Arizona,AZ,AZ SB1028,SB1028,1,i Senate Engrossed adult cabaret pe...,100 Title 13 chapter 14 Arizona Revised Statut...,"[chapter, arizona revised statutes is, amended...",This bill prohibits a person or business from ...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...
4,Arizona,AZ,AZ SB1026,SB1026,1,i Senate Engrossed state monies dr...,100 Title 35 chapter 1 article 5 Arizona Revis...,"[chapter, article, arizona revised, statutes i...",This bill prohibits the use of state monies an...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...


In [6]:
## PREPROCESSING
from transformers import AutoTokenizer
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [7]:
def doc_splitting(l, max_token_len = 512):
  '''Function to split the documents dynamically to abide by token limit''' 
  # Initialize sum and list
  total_sum = 0
  idx_list = []
  # Iterate over each item in list
  for i,l in enumerate(l):
    # Check if adding element to sum causes sum to be over limit
    if total_sum + l + 1 > max_token_len: # add 1 because the T-5 model only does an end token
      idx_list.append(i) # add split location to list of indices for splitting
      total_sum = 0 # reset to 0
    total_sum += l
  return idx_list

In [8]:
# Tokenize
df['token_list'] = df['split_cleaned_text'].apply(lambda x: [tokenizer(i)["input_ids"] for i in x])
# Calculate the length of the token list & split
df['token_list_lens'] = df['token_list'].apply(lambda x: [len(i)-1 for i in x])
df['split_idx_list'] = df['token_list_lens'].apply(lambda x: doc_splitting(x))
df.head()

Token indices sequence length is longer than the specified maximum sequence length for this model (665 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,state_name,state,bill_id,bill_name,keep,original_text,cleaned_text,split_cleaned_text,summary,summary_source,category,status,link,token_list,token_list_lens,split_idx_list
0,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,100 AS 14 18 040 is amended by adding a new su...,"[is amended by adding a new sub, read, d in th...",The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[19, 21012, 57, 2651, 3, 9, 126, 769, 1], [60...","[8, 1, 4, 5, 1, 6, 16, 11, 13, 15, 14, 6, 13, ...",[]
1,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[3, 9, 19, 21012, 12, 608, 1], [3, 9, 3, 9, 4...","[6, 14, 12, 9, 17, 2, 12, 8, 1, 13, 15, 1, 11,...",[49]
2,Alaska,AK,AK SB96,SB96,1,SB0096A 1 SB 96 New Text Underlined DEL...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[3, 9, 19, 21012, 12, 608, 1], [3, 9, 3, 9, 4...","[6, 14, 12, 9, 17, 2, 12, 8, 1, 13, 15, 1, 11,...",[49]
3,Arizona,AZ,AZ SB1028,SB1028,1,i Senate Engrossed adult cabaret pe...,100 Title 13 chapter 14 Arizona Revised Statut...,"[chapter, arizona revised statutes is, amended...",This bill prohibits a person or business from ...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...,"[[5800, 1], [3, 1665, 8892, 9, 15760, 18692, 7...","[1, 8, 3, 2, 3, 12, 7, 14, 3, 8, 18, 17, 3, 6,...",[]
4,Arizona,AZ,AZ SB1026,SB1026,1,i Senate Engrossed state monies dr...,100 Title 35 chapter 1 article 5 Arizona Revis...,"[chapter, article, arizona revised, statutes i...",This bill prohibits the use of state monies an...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...,"[[5800, 1], [1108, 1], [3, 1665, 8892, 9, 1576...","[1, 1, 5, 6, 2, 4, 15, 18, 12, 11, 15, 18, 12,...",[]


In [9]:
# Check whether it worked
print(df.loc[3, 'split_idx_list'])
print(df.loc[3, 'token_list_lens'])
print(sum(df.loc[3, 'token_list_lens'])) # should be under 512

[]
[1, 8, 3, 2, 3, 12, 7, 14, 3, 8, 18, 17, 3, 6, 11, 2, 7, 11, 14, 8]
158


In [10]:
# Check whether it worked
print(df.loc[3, 'split_idx_list'])
print(sum(df.loc[3, 'token_list_lens'][:80])) # should be under 512
print(sum(df.loc[3, 'token_list_lens'][:80])) # should be under 512

[]
158
158


In [11]:
def text_split(l, idx):
  '''Function to split a list of text using a list of indices'''
  return [list(i) for i in np.split(np.array(l), idx)]

In [12]:
# Split text using function
df['split_text_512'] = df.apply(lambda x: text_split(x['split_cleaned_text'], x['split_idx_list']), axis=1)

In [13]:
# Explode the split text 512 column so each row is one chunk
df = df.explode('split_text_512', ignore_index=True)
df.head()

Unnamed: 0,state_name,state,bill_id,bill_name,keep,original_text,cleaned_text,split_cleaned_text,summary,summary_source,category,status,link,token_list,token_list_lens,split_idx_list,split_text_512
0,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,100 AS 14 18 040 is amended by adding a new su...,"[is amended by adding a new sub, read, d in th...",The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[19, 21012, 57, 2651, 3, 9, 126, 769, 1], [60...","[8, 1, 4, 5, 1, 6, 16, 11, 13, 15, 14, 6, 13, ...",[],"[is amended by adding a new sub, read, d in th..."
1,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[3, 9, 19, 21012, 12, 608, 1], [3, 9, 3, 9, 4...","[6, 14, 12, 9, 17, 2, 12, 8, 1, 13, 15, 1, 11,...",[49],"[a is amended to read, a a local school boards..."
2,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[3, 9, 19, 21012, 12, 608, 1], [3, 9, 3, 9, 4...","[6, 14, 12, 9, 17, 2, 12, 8, 1, 13, 15, 1, 11,...",[49],[health from a parent foster parent or guardia...
3,Alaska,AK,AK SB96,SB96,1,SB0096A 1 SB 96 New Text Underlined DEL...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[3, 9, 19, 21012, 12, 608, 1], [3, 9, 3, 9, 4...","[6, 14, 12, 9, 17, 2, 12, 8, 1, 13, 15, 1, 11,...",[49],"[a is amended to read, a a local school boards..."
4,Alaska,AK,AK SB96,SB96,1,SB0096A 1 SB 96 New Text Underlined DEL...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[3, 9, 19, 21012, 12, 608, 1], [3, 9, 3, 9, 4...","[6, 14, 12, 9, 17, 2, 12, 8, 1, 13, 15, 1, 11,...",[49],[health from a parent foster parent or guardia...


In [14]:
# Create a group by index number to keep track of chunks
df['doc_number'] = df.groupby('bill_id')['split_text_512'].cumcount().add(1)
# Concatenate string list items into single string
df['split_text'] = df['split_text_512'].apply(lambda x: ' '.join(x))
df.head()

Unnamed: 0,state_name,state,bill_id,bill_name,keep,original_text,cleaned_text,split_cleaned_text,summary,summary_source,category,status,link,token_list,token_list_lens,split_idx_list,split_text_512,doc_number,split_text
0,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,100 AS 14 18 040 is amended by adding a new su...,"[is amended by adding a new sub, read, d in th...",The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[19, 21012, 57, 2651, 3, 9, 126, 769, 1], [60...","[8, 1, 4, 5, 1, 6, 16, 11, 13, 15, 14, 6, 13, ...",[],"[is amended by adding a new sub, read, d in th...",1,is amended by adding a new sub read d in this ...
1,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[3, 9, 19, 21012, 12, 608, 1], [3, 9, 3, 9, 4...","[6, 14, 12, 9, 17, 2, 12, 8, 1, 13, 15, 1, 11,...",[49],"[a is amended to read, a a local school boards...",1,a is amended to read a a local school boards h...
2,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[3, 9, 19, 21012, 12, 608, 1], [3, 9, 3, 9, 4...","[6, 14, 12, 9, 17, 2, 12, 8, 1, 13, 15, 1, 11,...",[49],[health from a parent foster parent or guardia...,2,health from a parent foster parent or guardian...
3,Alaska,AK,AK SB96,SB96,1,SB0096A 1 SB 96 New Text Underlined DEL...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[3, 9, 19, 21012, 12, 608, 1], [3, 9, 3, 9, 4...","[6, 14, 12, 9, 17, 2, 12, 8, 1, 13, 15, 1, 11,...",[49],"[a is amended to read, a a local school boards...",1,a is amended to read a a local school boards h...
4,Alaska,AK,AK SB96,SB96,1,SB0096A 1 SB 96 New Text Underlined DEL...,100 AS 14 03 016 a is amended to read 6 a A lo...,"[a is amended to read, a a local school boards...",This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[[3, 9, 19, 21012, 12, 608, 1], [3, 9, 3, 9, 4...","[6, 14, 12, 9, 17, 2, 12, 8, 1, 13, 15, 1, 11,...",[49],[health from a parent foster parent or guardia...,2,health from a parent foster parent or guardian...


In [16]:
df.columns

Index(['state_name', 'state', 'bill_id', 'bill_name', 'keep', 'original_text',
       'cleaned_text', 'split_cleaned_text', 'summary', 'summary_source',
       'category', 'status', 'link', 'token_list', 'token_list_lens',
       'split_idx_list', 'split_text_512', 'doc_number', 'split_text'],
      dtype='object')

In [19]:
filtered_df = df[["state_name", "state", "bill_id", "bill_name", "doc_number", "split_text", "summary", "summary_source", "category", "status", "link"]]

In [20]:
filtered_df.head()

Unnamed: 0,state_name,state,bill_id,bill_name,doc_number,split_text,summary,summary_source,category,status,link
0,Alaska,AK,AK HB27,HB27,1,is amended by adding a new sub read d in this ...,The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...
1,Alaska,AK,AK HB105,HB105,1,a is amended to read a a local school boards h...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...
2,Alaska,AK,AK HB105,HB105,2,health from a parent foster parent or guardian...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...
3,Alaska,AK,AK SB96,SB96,1,a is amended to read a a local school boards h...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...
4,Alaska,AK,AK SB96,SB96,2,health from a parent foster parent or guardian...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...


In [21]:
# Save dataframe
filtered_df.to_csv('../../modified_data/text_and_summaries_filtered_split.csv', index = False)