In [1]:
!pip install datasets
!pip install transformers 
!pip install evaluate
!pip install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/20

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import datasets
import re
import time
import math

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Read in final data

In [4]:
# Read in raw text data
df = pd.read_csv('/content/gdrive/MyDrive/ANLY521_Data/text_and_summaries_filtered.csv')
df.head()

Unnamed: 0,state_name,state,bill_id,bill_name,keep,original_text,cleaned_text,summary,summary_source,category,status,link
0,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,Section AS 18 is amended by adding a new subse...,The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...
1,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...
2,Alaska,AK,AK SB96,SB96,1,SB0096A 1 SB 96 New Text Underlined DEL...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...
3,Arizona,AZ,AZ SB1028,SB1028,1,i Senate Engrossed adult cabaret pe...,Section Title chapter Arizona Revised Statutes...,This bill prohibits a person or business from ...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...
4,Arizona,AZ,AZ SB1026,SB1026,1,i Senate Engrossed state monies dr...,Section Title chapter article Arizona Revised ...,This bill prohibits the use of state monies an...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...


## Test Splitting Text on Numerical + Whitespace areas

In [5]:
df.loc[50, 'original_text']

'  Introduced Version HOUSE BILL No  1118       DIGEST OF  INTRODUCED B ILL Citations Affected   IC 16 18 2  IC 16 35 12  IC 34 30 2 1 218 5  IC 35 52 16 23 7  Synopsis   Prohibited services relating to care of minors  Prohibits specified health care professionals from   1  performing  or causing to be performed  certain medical procedures on a minor  or  2  subjecting a minor to certain activities that purposely attempt to change  reinforce  or affirm a minor s perception of the minor s own sexual attraction or sexual behavior  or attempt to change  reinforce  or affirm a minor s gender identity when the identity is inconsistent with the minor s biological sex   Effective   July 1  2023  Sweet January 10  2023  read first time and referred to Commi ttee on Public Health  2023 IN 1118LS 7160 DI 147 Introduced First Reg ular Sessio n of the 123rd General A ssem bly  2023  PRINTING CODE  Amendments  Whenever an existing statute  or a section of the Indiana Constitution  is being amended 

In [6]:
split_text = re.split(r'\s+\d+\s+', df.loc[50, 'original_text']) # Split text where there are numbers surrounded by whitespace
split_text = [x for x in split_text if not any(c.isdigit() for c in x)] # Remove list items that are just numbers
split_text = [x for x in split_text if x] # remove whitespace/empty string list items
split_text[0:5]

['  Introduced Version HOUSE BILL No',
 'DIGEST OF  INTRODUCED B ILL Citations Affected   IC',
 'IC',
 'IC',
 'Synopsis   Prohibited services relating to care of minors  Prohibits specified health care professionals from']

## Apply text splitting

In [7]:
# Split the text on numerical text surrounded by whitespace
df['split_original_text'] = df['original_text'].apply(lambda x: re.split(r'\s+\d+\s+', x))
# Remove singular numbers from list of text
df['split_original_text'] = df['split_original_text'].apply(lambda x: [m for m in x if not any(c.isdigit() for c in m)])
# Remove empty whitespace text
df['split_original_text'] = df['split_original_text'].apply(lambda x: [m for m in x if m] )

In [8]:
# Compare results
df.loc[0, 'original_text']

'  HB0027a  1  HB 27   New Text Underlined  DELETED TEXT BRACKETED      33 LS0270 A        HOUSE BILL NO  27  IN THE LEGISLATURE OF THE STATE OF ALASKA  THIRTY THIRD LEGISLATURE   FIRST SESSION   BY REPRESENTATIVE MCKAY  Introduced   1 19 23 Referred   Education  Health and Social Services     A BILL  FOR AN ACT ENTITLED   An Act relating to school athletics  recreation  athletic teams  and sports   1 BE IT ENACTED BY THE LEGISLATURE OF THE STATE OF ALASKA  2      Section 1  AS 14 18 040 is amended by adding a new subsection to read  3  d   In this section   sex  means biological sex  4      Sec  2  AS 14 18 is amended by adding new sections to read  5 Article 2  Designation of Athletic Teams and Sports  6 Sec  14 18 150  Athletic team and sport designation   a  A public school  or 7 a private school whose students or teams compete against a public school  must 8 designate each school sponsored athletic team or sport a 9  1   male  men  or boys team or sport  10  2   female  women  or 

In [9]:
' '.join(df.loc[0, 'split_original_text'])

'HB New Text Underlined  DELETED TEXT BRACKETED IN THE LEGISLATURE OF THE STATE OF ALASKA  THIRTY THIRD LEGISLATURE   FIRST SESSION   BY REPRESENTATIVE MCKAY  Introduced Referred   Education  Health and Social Services     A BILL  FOR AN ACT ENTITLED   An Act relating to school athletics  recreation  athletic teams  and sports BE IT ENACTED BY THE LEGISLATURE OF THE STATE OF ALASKA Section AS is amended by adding a new subsection to read d   In this section   sex  means biological sex Sec AS Article Designation of Athletic Teams and Sports Sec Athletic team and sport designation   a  A public school  or a private school whose students or teams compete against a public school  must designate each school sponsored athletic team or sport a b   A student who participates in an athletic team or sport designated female women  or girls must be female  based on the participant s biological sex Sec Compliance protected  A governmental entity  licensing or complaint  open an investigation  or ta

In [10]:
df.loc[20, 'original_text']

'AMENDED IN ASSEMBL Y MARCH 9  2023  calif ornia le gislature20 232 4 regular sessio n  ASSEMBL Y BILL  No  1314  Introduced by Assembly  Member  Members  Essayli  and Gallagher February 16  2023  An act to amend Section  33502  51101  of the Education Code  relating  to state educational entities   pupils  legislative counse ls dig est  AB 1314  as amended  Essayli   State educational entities  Educational  Innovation and Planning Commission  Gender identity  par ental  notification  Existing law pr ovides that par ents and guar dians of c hildr en enr olled  in public sc hools have the right  and should have the opportunity   as  mutually supportive and r espectful partner s in the education of their  childr en within the public sc hools  to be informed by the sc hool  and  to participate in the education of their c hildr en  as specified to include    among other things  having access to the sc hool r ecords of their c hild   Existing law authorizes a minor who is 12 year s of a ge 

In [11]:
' '.join(df.loc[20, 'split_original_text'])

'AMENDED IN ASSEMBL Y MARCH Introduced by Assembly  Member  Members  Essayli  and Gallagher February as amended  Essayli   State educational entities  Educational  Innovation and Planning Commission  Gender identity  par ental  notification  Existing law pr ovides that par ents and guar dians of c hildr en enr olled  in public sc hools have the right  and should have the opportunity   as  mutually supportive and r espectful partner s in the education of their  childr en within the public sc hools  to be informed by the sc hool  and  to participate in the education of their c hildr en  as specified to include    among other things  having access to the sc hool r ecords of their c hild   Existing law authorizes a minor who is year s of a ge or older to  consent to mental health tr eatment or counseling services   notwithstanding any pr ovision of law to the contr ary  if  in the opinion  of the attending pr ofessional per son  the minor is matur e enough to  participate intellig ently in

In [12]:
df.shape

(345, 13)

## Dynamically Split Text

Since the token limit truncates the text if the bill is long, we will be dynamically splitting the text into chunks that abide by the token limitation.

In [13]:
## PREPROCESSING
from transformers import AutoTokenizer
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [14]:
text = ["This is line of sentence to check how long it is.", "Please see document for a reference of values.", "Can only be used as an example for planning purposes.", "The end"]
token_list = [tokenizer(i)['input_ids'] for i in text]
token_list_len = [len(i)-1 for i in token_list]
print(token_list)
token_list_len

[[100, 19, 689, 13, 7142, 12, 691, 149, 307, 34, 19, 5, 1], [863, 217, 1708, 21, 3, 9, 2848, 13, 2620, 5, 1], [1072, 163, 36, 261, 38, 46, 677, 21, 1459, 3659, 5, 1], [37, 414, 1]]


[12, 10, 11, 2]

In [15]:
max = 14 
total_sum = 0
idx_list = []
for i,l in enumerate(token_list_len):
  if total_sum + l + 1 > max: 
    idx_list.append(i) # add split location to list of indices for splitting
    total_sum = 0 # reset to 0
  total_sum += l

In [16]:
idx_list

[1, 2]

In [17]:
text

['This is line of sentence to check how long it is.',
 'Please see document for a reference of values.',
 'Can only be used as an example for planning purposes.',
 'The end']

In [18]:
np.split(np.array(token_list_len), idx_list) # we want 12, 10, and 11+2

[array([12]), array([10]), array([11,  2])]

In [19]:
def doc_splitting(l, max_token_len = 512):
  '''Function to split the documents dynamically to abide by token limit''' 
  # Initialize sum and list
  total_sum = 0
  idx_list = []
  # Iterate over each item in list
  for i,l in enumerate(l):
    # Check if adding element to sum causes sum to be over limit
    if total_sum + l + 1 > max_token_len: # add 1 because the T-5 model only does an end token
      idx_list.append(i) # add split location to list of indices for splitting
      total_sum = 0 # reset to 0
    total_sum += l
  return idx_list

In [20]:
text = ["This is line of sentence to check how long it is.", "Please see document for a reference of values.", "Can only be used as an example for planning purposes.", "The end"]
token_list = [tokenizer(i)['input_ids'] for i in text]
token_list_len = [len(i)-1 for i in token_list]
print(token_list)
token_list_len

[[100, 19, 689, 13, 7142, 12, 691, 149, 307, 34, 19, 5, 1], [863, 217, 1708, 21, 3, 9, 2848, 13, 2620, 5, 1], [1072, 163, 36, 261, 38, 46, 677, 21, 1459, 3659, 5, 1], [37, 414, 1]]


[12, 10, 11, 2]

In [21]:
# Create word tokens for each list
df['token_list'] = df['split_original_text'].apply(lambda x: [tokenizer(i)['input_ids'] for i in x])
# Calculate the length of each token list
df['token_list_lens'] = df['token_list'].apply(lambda x: [len(i)-1 for i in x])
df.head()

Token indices sequence length is longer than the specified maximum sequence length for this model (686 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,state_name,state,bill_id,bill_name,keep,original_text,cleaned_text,summary,summary_source,category,status,link,split_original_text,token_list,token_list_lens
0,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,Section AS 18 is amended by adding a new subse...,The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 43, 32, 21, 1, 1, 10, 13, 1, 1, 1, 7, ..."
1,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12..."
2,Alaska,AK,AK SB96,SB96,1,SB0096A 1 SB 96 New Text Underlined DEL...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[SB, New Text Underlined DELETED TEXT BRACKET...","[[16757, 1], [368, 5027, 3526, 747, 26, 309, 1...","[1, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12..."
3,Arizona,AZ,AZ SB1028,SB1028,1,i Senate Engrossed adult cabaret pe...,Section Title chapter Arizona Revised Statutes...,This bill prohibits a person or business from ...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...,[ i Senate Engrossed adult cabaret p...,"[[3, 23, 7819, 19650, 1859, 3843, 3165, 212, 5...","[26, 5, 10, 3, 20, 30, 1, 1, 1, 7, 4, 2, 3, 22..."
4,Arizona,AZ,AZ SB1026,SB1026,1,i Senate Engrossed state monies dr...,Section Title chapter article Arizona Revised ...,This bill prohibits the use of state monies an...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...,[ i Senate Engrossed state monies d...,"[[3, 23, 7819, 19650, 1859, 3843, 538, 3, 2641...","[26, 5, 10, 3, 4, 20, 6, 1, 1, 1, 1, 3, 8, 2, ..."


In [22]:
# Apply the document splitting on the token length lists to see where we have to split the documents on
df['split_idx_list'] = df['token_list_lens'].apply(lambda x: doc_splitting(x))
df.head()

Unnamed: 0,state_name,state,bill_id,bill_name,keep,original_text,cleaned_text,summary,summary_source,category,status,link,split_original_text,token_list,token_list_lens,split_idx_list
0,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,Section AS 18 is amended by adding a new subse...,The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 43, 32, 21, 1, 1, 10, 13, 1, 1, 1, 7, ...",[37]
1,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12...","[36, 84]"
2,Alaska,AK,AK SB96,SB96,1,SB0096A 1 SB 96 New Text Underlined DEL...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[SB, New Text Underlined DELETED TEXT BRACKET...","[[16757, 1], [368, 5027, 3526, 747, 26, 309, 1...","[1, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12...","[36, 84]"
3,Arizona,AZ,AZ SB1028,SB1028,1,i Senate Engrossed adult cabaret pe...,Section Title chapter Arizona Revised Statutes...,This bill prohibits a person or business from ...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...,[ i Senate Engrossed adult cabaret p...,"[[3, 23, 7819, 19650, 1859, 3843, 3165, 212, 5...","[26, 5, 10, 3, 20, 30, 1, 1, 1, 7, 4, 2, 3, 22...",[]
4,Arizona,AZ,AZ SB1026,SB1026,1,i Senate Engrossed state monies dr...,Section Title chapter article Arizona Revised ...,This bill prohibits the use of state monies an...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...,[ i Senate Engrossed state monies d...,"[[3, 23, 7819, 19650, 1859, 3843, 538, 3, 2641...","[26, 5, 10, 3, 4, 20, 6, 1, 1, 1, 1, 3, 8, 2, ...",[31]


In [23]:
# Check whether it worked
print(df.loc[3, 'split_idx_list'])
print(sum(df.loc[3, 'token_list_lens'])) # should be under 512

[]
393


In [24]:
# Check whether it worked
print(df.loc[2, 'split_idx_list'])
print(sum(df.loc[2, 'token_list_lens'][:36])) # should be under 512
print(sum(df.loc[2, 'token_list_lens'][36:84])) # should be under 512
print(sum(df.loc[2, 'token_list_lens'][84:])) # should be under 512

[36, 84]
502
509
261


## Split text lists at the indices

In [25]:
# Test how to use split function
[list(i) for i in np.split(np.array(text), idx_list)]

[['This is line of sentence to check how long it is.'],
 ['Please see document for a reference of values.'],
 ['Can only be used as an example for planning purposes.', 'The end']]

In [26]:
def text_split(l, idx):
  '''Function to split a list of text using a list of indices'''
  return [list(i) for i in np.split(np.array(l), idx)]

# Split text using function
df['split_text_512'] = df.apply(lambda x: text_split(x['split_original_text'], x['split_idx_list']), axis=1)

In [27]:
df.head()

Unnamed: 0,state_name,state,bill_id,bill_name,keep,original_text,cleaned_text,summary,summary_source,category,status,link,split_original_text,token_list,token_list_lens,split_idx_list,split_text_512
0,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,Section AS 18 is amended by adding a new subse...,The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 43, 32, 21, 1, 1, 10, 13, 1, 1, 1, 7, ...",[37],"[[HB, New Text Underlined DELETED TEXT BRACKE..."
1,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12...","[36, 84]","[[HB, New Text Underlined DELETED TEXT BRACKE..."
2,Alaska,AK,AK SB96,SB96,1,SB0096A 1 SB 96 New Text Underlined DEL...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[SB, New Text Underlined DELETED TEXT BRACKET...","[[16757, 1], [368, 5027, 3526, 747, 26, 309, 1...","[1, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12...","[36, 84]","[[SB, New Text Underlined DELETED TEXT BRACKE..."
3,Arizona,AZ,AZ SB1028,SB1028,1,i Senate Engrossed adult cabaret pe...,Section Title chapter Arizona Revised Statutes...,This bill prohibits a person or business from ...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...,[ i Senate Engrossed adult cabaret p...,"[[3, 23, 7819, 19650, 1859, 3843, 3165, 212, 5...","[26, 5, 10, 3, 20, 30, 1, 1, 1, 7, 4, 2, 3, 22...",[],[[ i Senate Engrossed adult cabaret ...
4,Arizona,AZ,AZ SB1026,SB1026,1,i Senate Engrossed state monies dr...,Section Title chapter article Arizona Revised ...,This bill prohibits the use of state monies an...,fast democracy,Free Speech & Expression,Passed Senate; House Committee of the Whole pa...,https://apps.azleg.gov/BillStatus/BillOverview...,[ i Senate Engrossed state monies d...,"[[3, 23, 7819, 19650, 1859, 3843, 538, 3, 2641...","[26, 5, 10, 3, 4, 20, 6, 1, 1, 1, 1, 3, 8, 2, ...",[31],[[ i Senate Engrossed state monies ...


In [28]:
# Explode the split text 512 column so each row is one chunk
df = df.explode('split_text_512', ignore_index=True)

In [29]:
df.head()

Unnamed: 0,state_name,state,bill_id,bill_name,keep,original_text,cleaned_text,summary,summary_source,category,status,link,split_original_text,token_list,token_list_lens,split_idx_list,split_text_512
0,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,Section AS 18 is amended by adding a new subse...,The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 43, 32, 21, 1, 1, 10, 13, 1, 1, 1, 7, ...",[37],"[HB, New Text Underlined DELETED TEXT BRACKET..."
1,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,Section AS 18 is amended by adding a new subse...,The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 43, 32, 21, 1, 1, 10, 13, 1, 1, 1, 7, ...",[37],[the school or school district may br ing a pr...
2,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12...","[36, 84]","[HB, New Text Underlined DELETED TEXT BRACKET..."
3,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12...","[36, 84]",[pursue legal action against a school district...
4,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12...","[36, 84]",[with the collaboration of members of each sch...


In [30]:
df.loc[1, 'split_text_512']

['the school or school district may br ing a private cause of',
 'action against the violating entity for injunctive relief  dama ges  and any other relief',
 'available under law',
 'd   An action brought under this section must be commenced wit hin two years',
 'of the event giving rise to the complaint',
 'Sec',
 'Definitions  In AS',
 'postsecondary school',
 'district  a regional educational attendance area  a state boarding school  and the state',
 'centralized correspondence study program']

In [31]:
# Create a group by index number to keep track of chunks
df['doc_number'] = df.groupby('bill_id')['split_text_512'].cumcount().add(1)

In [36]:
# Concatenate string list items into single string
df['split_text'] = df['split_text_512'].apply(lambda x: ' '.join(x))

In [37]:
df.head()

Unnamed: 0,state_name,state,bill_id,bill_name,keep,original_text,cleaned_text,summary,summary_source,category,status,link,split_original_text,token_list,token_list_lens,split_idx_list,split_text_512,doc_number,split_text
0,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,Section AS 18 is amended by adding a new subse...,The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 43, 32, 21, 1, 1, 10, 13, 1, 1, 1, 7, ...",[37],"[HB, New Text Underlined DELETED TEXT BRACKET...",1,HB New Text Underlined DELETED TEXT BRACKETED...
1,Alaska,AK,AK HB27,HB27,1,HB0027a 1 HB 27 New Text Underlined DEL...,Section AS 18 is amended by adding a new subse...,The bill amends an existing regulation of scho...,handwritten,Schools & Education,"Referred to committee, 01/19/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 43, 32, 21, 1, 1, 10, 13, 1, 1, 1, 7, ...",[37],[the school or school district may br ing a pr...,2,the school or school district may br ing a pri...
2,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12...","[36, 84]","[HB, New Text Underlined DELETED TEXT BRACKET...",1,HB New Text Underlined DELETED TEXT BRACKETED...
3,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12...","[36, 84]",[pursue legal action against a school district...,2,pursue legal action against a school district ...
4,Alaska,AK,AK HB105,HB105,1,HB0105a 1 HB 105 New Text Underlined DE...,Section AS 03 a is amended to read a A local s...,This bill adds to the list of parental rights ...,fast democracy,Schools & Education,"First read and referred to committee, 03/08/2023",https://www.akleg.gov/basis/Bill/Detail/33?Roo...,"[HB, New Text Underlined DELETED TEXT BRACKET...","[[3, 19990, 1], [368, 5027, 3526, 747, 26, 309...","[2, 16, 57, 40, 20, 14, 2, 21, 1, 1, 6, 13, 12...","[36, 84]",[with the collaboration of members of each sch...,3,with the collaboration of members of each scho...


In [49]:
# Save dataframe
df.to_csv('/content/gdrive/MyDrive/ANLY521_Data/text_and_summaries_filtered_split.csv', index = False)

In [48]:
df.shape

(1790, 19)

## Summary Statistics

In [52]:
df['total_tokens'] = df['token_list_lens'].apply(lambda x: sum(x))
print(f'Total number of documents: {df["bill_id"].nunique()}')
print(f'Total average token length across documents: {df["total_tokens"].mean()}')

Total number of documents: 344
Total average token length across documents: 18004.345251396648
