# ESG report text processing

This code converts PDFs (from both online links and downloaded files) into plain text using the PyMuPDF library. The extracted text is saved locally for each report as a .txt file. 

Two dataframes are created for future analysis:
- **Report details**: Saves report-level details (e.g. number of pages, number of paragraphs, number of supply chain paragraphs)
- **Paragraph**: Dataframe where each row is a paragraph; includes meta-data (filename, year, company ID), the paragraph text, and whether the paragraph is supply-chain-related

## Import & set-up

In [None]:
import pandas as pd
import numpy as np
import requests
import os
import os.path
import io
import fitz # install using: pip install PyMuPDF
from tqdm import tqdm
import matplotlib.pyplot as plt
from operator import itemgetter
import sqlite3
import spacy
from tqdm import tqdm
import string
import re
from re import search
import urllib
import pickle
from unidecode import unidecode

## Upload list of links and file locations

The links to the PDFs were manually collected in an Excel file where each row is a company and there are columns for each year. There could be multiple reports in a given year. When this was the case, I included all links separated by ' and '. For a few companies, I manually downloaded the PDFs while collecting the link (either because they automatically downloaded or because they weren't stored at a .pdf link, for example if the company used ISSUU to host the files). In these cases, the links would refer to filenames within a local folder.

In [None]:
# Upload file: the manually collected report links (one row per company, one column per year)
esg_urls_pd = pd.read_excel('../Data/report_links.xlsx')
# Preview the first three rows
esg_urls_pd.head(3)

In [None]:
# Clean dataframe: use '-' instead of ':' in the ticker column name and in its values
esg_urls_pd = esg_urls_pd.rename(columns={'Exchange:Ticker': 'Exchange-Ticker'})
esg_urls_pd['Exchange-Ticker'] = esg_urls_pd['Exchange-Ticker'].str.replace(':','-')

# Change from wide to long format: one row per (company, year)
id_columns = ['Company Name', 'Exchange-Ticker', 'Primary Sector']
year_columns = list(range(2012, 2022))
esg_urls_pd_m = esg_urls_pd.melt(
    id_vars=id_columns,
    value_vars=year_columns,
    var_name='Year',
    value_name='URL',
)

# A cell can hold several links separated by ' and '; give every link its own row
esg_urls_pd_m = (
    esg_urls_pd_m
    .assign(URL=esg_urls_pd_m['URL'].str.split(' and '))
    .explode('URL')
)
esg_urls_pd_m.head(3)

In [None]:
# Only reports whose URL starts with "http" need to be downloaded;
# missing values and local filenames are flagged False
esg_urls_pd_m['to_download'] = esg_urls_pd_m['URL'].str.startswith('http', na=False)

In [None]:
# Number the reports within each (ticker, year) group so that reports
# from the same year get unique filenames: <ticker>-<year>-<n>
group_keys = ['Exchange-Ticker', 'Year']
esg_urls_pd_m['dup_num'] = esg_urls_pd_m.groupby(group_keys).cumcount() + 1
esg_urls_pd_m['filename'] = [
    f"{ticker}-{year}-{dup_num}"
    for ticker, year, dup_num in zip(
        esg_urls_pd_m['Exchange-Ticker'],
        esg_urls_pd_m['Year'],
        esg_urls_pd_m['dup_num'],
    )
]

## Functions

This section defines a number of functions that are needed to process the PDFs and clean & save text:
- **Set-up**: Initializes key variables and the SpaCy model
- **Ordering functions**: Defines custom functions that determine the ordering of text blocks when extracting text with PyMuPDF; this is necessary because many ESG reports have pages with multiple columns or even multiple blocks of multi-column text.
- **Text cleaning & other helper functions**: Defines a variety of custom helper functions that are used when extracting & cleaning text
- **Main text extraction functions**: Defines functions that are called when extracting text from a PDF link (or local file location). 
- **Text to dict function**: Transforms a list of cleaned paragraphs into a dict (later used to create a dataframe) that records both the paragraph text and whether the paragraph is supply chain related

### Set-up

In [None]:
# Enable checking for non ASCII characters
printable = set(string.printable)

# Load spacy model (named-entity recognition is disabled).
# Download the model only when it is not installed yet, rather than on every run;
# spacy.load raises OSError when the model cannot be found.
try:
    nlp = spacy.load("en_core_web_sm", disable=['ner'])
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm", disable=['ner'])

# Get list of English words
# `import urllib` alone does not guarantee the `urllib.request` submodule is loaded,
# so import it explicitly before using it.
import urllib.request

with urllib.request.urlopen("https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt") as word_file:
    english_words = {word.decode("utf-8").strip().lower() for word in word_file}

### Ordering functions

In [None]:
def custom_sort(blocks, merge_margin=20):
    """Return the text blocks of a page in reading order.

    Image blocks (type != 0) are dropped. The remaining blocks are grouped
    into horizontal bands (see group_blocks_y); bands are visited from top
    to bottom, and inside a band blocks are ordered by their left edge
    rounded to the nearest 100, then by their top edge.

    blocks       -- block dicts with 'type' and 'bbox' (x0, y0, x1, y1) keys
    merge_margin -- vertical margin used when merging blocks into bands
    """
    text_blocks = [blk for blk in blocks if blk['type'] == 0]
    bands = group_blocks_y(text_blocks, merge_margin=merge_margin)

    def _column_then_top(blk):
        # coarse x0 so that slightly indented lines stay in the same column
        return (round_to_base(blk['bbox'][0], 100), blk['bbox'][1])

    ordered = []
    # bands are [y0, y1] pairs; visit them by their top coordinate
    for band_top, band_bottom in sorted(bands, key=lambda band: band[0]):
        # a block belongs to the band that contains its top edge
        members = [blk for blk in text_blocks
                   if blk['bbox'][1] >= band_top and blk['bbox'][1] <= band_bottom]
        members.sort(key=_column_then_top)
        ordered.extend(members)
    return ordered

def y_overlap(source, target):
    """Return True when two (top, bottom) ranges overlap vertically.

    Ranges that merely touch (one bottom equal to the other top) count
    as overlapping.
    """
    src_top, src_bottom = source
    tgt_top, tgt_bottom = target

    # the ranges are disjoint only if one starts below the end of the other
    disjoint = src_top > tgt_bottom or tgt_top > src_bottom
    return not disjoint

def get_all_y_overlaps(boxes, bounds, index):
    """Return the positions of all boxes that vertically overlap `bounds`.

    The box at position `index` is skipped (it is the box `bounds` was
    derived from).
    """
    return [
        position
        for position, candidate in enumerate(boxes)
        if position != index and y_overlap(bounds, candidate)
    ]

def group_blocks_y(filtered_blocks, merge_margin=20):
    """Merge the vertical extents of blocks into bands.

    Every block contributes its [y0, y1] range. Ranges that overlap once
    one of them is extended by `merge_margin` upwards and downwards are
    repeatedly replaced by their union, until a full scan finds nothing
    left to merge.

    filtered_blocks -- block dicts with a 'bbox' of (x0, y0, x1, y1)
    merge_margin    -- vertical tolerance used when testing for overlap

    Returns a list of [y0, y1] bands.
    """
    boxes = [] # saved as list of [y0,y1]
    for b in filtered_blocks:
        x0, y0, x1, y1 = b['bbox'] # PyMuPDF bbox ordering
        boxes.append([y0,y1])
    
    finished = False
    
    while not finished:
        # assume this scan is the last one; reset below as soon as a merge happens
        finished = True

        # scan from the last box down to the first
        index = len(boxes) - 1
        while index >= 0:
            curr = boxes[index]

            # add margin
            top = curr[0]
            bottom = curr[1]
            top -= merge_margin #extend upwards
            bottom += merge_margin #extend downwards

            # get matching boxes (positions of all other boxes overlapping the extended range)
            overlaps = get_all_y_overlaps(boxes, [top, bottom], index)

            # check if empty
            if len(overlaps) > 0:
                overlap_y0s = []
                overlap_y1s = []

                # combine y-coords (include the current box itself in the union)
                overlaps.append(index)

                for ind in overlaps:
                    top, bottom = boxes[ind]
                    overlap_y0s.append(top)
                    overlap_y1s.append(bottom)

                # union uses the original (un-extended) coordinates
                merged = [min(overlap_y0s), max(overlap_y1s)]

                # delete from the highest position down so earlier deletions
                # do not shift the positions still to be deleted
                overlaps.sort(reverse = True)
                for ind in overlaps:
                    del boxes[ind]
                boxes.append(merged)

                # set flag: the list changed, so restart the scan from scratch
                finished = False;
                break

            index -= 1
    return boxes

### Text cleaning & other helper functions

In [None]:
# Case-insensitive lookup of a word in the English word list
def is_english_word(word):
    lowered = word.lower()
    return lowered in english_words

# Decides whether two strings are really one word that was incorrectly split:
# at least one half must not be a word on its own, and the two halves
# glued together must be an English word
def is_split_word(first, second):
    both_halves_are_words = is_english_word(first.lower()) and is_english_word(second.lower())
    if both_halves_are_words:
        return False
    return is_english_word(first + second)

# Snap a value to the nearest multiple of `base`, so that values
# differing by small amounts end up in the same bucket
def round_to_base(x, base=100):
    multiples = round(x/base)
    return base * multiples

# Find the most common font size of a document
def get_frequency_font_sizes(doc):
    """Return the rounded font size that covers the most characters.

    Usage is counted per character of stripped span text rather than per
    span, because some reports have many figures with many spans.
    Returns None when the document contains no text spans.
    """
    chars_per_size = {}
    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:  # skip blocks that do not contain text
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    size = round(span['size'])
                    chars_per_size[size] = chars_per_size.get(size, 0) + len(span['text'].strip())

    if not chars_per_size:
        return None # No discriminating fonts were found

    # on a tie, the size that was encountered first wins
    most_used_size, _ = max(chars_per_size.items(), key=itemgetter(1))
    return most_used_size

# Check whether a space needs to be added when combining 2 lines
def need_to_add_space(prev_line, next_line):
    """Return True when a space must be inserted between prev_line and next_line.

    No space is added when prev_line is empty, ends with a hyphen, or
    already ends with a space, nor when next_line is blank. When prev_line
    ends and next_line starts with a lowercase letter, the two touching
    words are checked with is_split_word: a word broken across lines is
    joined without a space.
    """
    if prev_line == "" or prev_line.strip().endswith('-') or prev_line.endswith(' '):
        return False #don't add space if no text in prev or if prev ends w/hyphen

    prev_text = prev_line.strip()
    next_text = next_line.strip()
    if not next_text:
        return False #nothing to append (previously raised IndexError)
    if not prev_text:
        return True #whitespace-only prev: no word to check (previously could raise IndexError)

    prev_last = prev_text[-1]
    next_first = next_text[0]
    # only look for a word split across lines if the last character of prev
    # and the first character of next are both lowercase letters
    if next_first.isalpha() and next_first.islower() \
    and prev_last.isalpha() and prev_last.islower():
        last_word_prev = prev_text.split(' ')[-1]
        first_word_next = next_text.split(' ')[0]
        return not is_split_word(last_word_prev, first_word_next)
    return True #add space

# Basic heuristic for bullet points that have no period before them:
# True when the paragraph starts with an uppercase letter and its first
# word is "To" or is tagged as a verb by the spaCy model
def is_capitalized_verb(paragraph):
    starts_with_capital = paragraph.strip()[0].isupper()
    if not starts_with_capital:
        return False # not capitalized

    first_word = paragraph.split(' ')[0]
    first_token_pos = nlp(paragraph)[0].pos_
    return first_word == 'To' or first_token_pos == 'VERB'

# Remove the hyphen from a token when it merely splits one English word
def hyphenation_correction(word):
    parts = word.split('-')

    # only handle exactly two non-blank parts around a single hyphen
    if len(parts) != 2 or not all(part.strip() for part in parts):
        return word #leave hyphen
    left, right = parts

    # the characters touching the hyphen must both be letters
    if not (left[-1].isalpha() and right[0].isalpha()):
        return word #leave hyphen

    if is_split_word(left, right):
        return left + right #remove hyphen
    return word #leave hyphen

def combine_words_split_by_hyphen(paragraph):
    """Apply hyphenation_correction to every space-separated token containing '-'."""
    fixed_tokens = []
    for token in paragraph.split(' '):
        if '-' in token:
            token = hyphenation_correction(token)
        fixed_tokens.append(token)
    return ' '.join(fixed_tokens)

# A manual inspection of the data revealed a number of headers/indices that repeated across multiple paragraphs
# Remove these as they are not helpful for the analysis
# NOTE: the entries are joined with '|' and used as one regular expression (see remove_extra_headers),
# so the literal pipe character has to be written as an escaped pattern in a raw string.
headers = ["Leadership Message Our Perspective Products Planet Appendix GRI Index People",
           "Introduction Our ESG Strategy Creating Shared Value Harm Reduction Environment Social Governance Performance and Assurance ESG Governance Environment",
           "Corporate Governance Ethics and Compliance Respect for Human Rights Technology Employees Responsible Supply Chain Quality and Services Environment Community Engagement Management Message Approach to Sustainability Contents About the Sustainability Report",
           "CEO Message Portfolio Data Appendix Our Company People Packaging Frameworks Water Operations Climate Agriculture",
           "About This Report About Loblaw Message to Stakeholders Live Life Well Governance Environment Sourcing Community Targets",
           r"\|",
           "CEO Message Overview Team Members Customers Community Environment Supply Chain Governance SASB Index",
           "Contents Editorial Policy Overview of Honda Message from the President and CEO Special Feature Sustainability Management Performance Report GRI Content Index Assurance Financial Data",
           "Overview Editorial Policy Honda Philosophy Message from the President and CEO Sustainability Management GRI Content Index Assurance",
           "Introduction Our ESG Strategy Creating Shared Value Harm Reduction Environment Social Governance Performance and Assurance ESG Governance Performance and Assurance",
           "CEO Message Agriculture Portfolio Data Appendix Our Company People Packaging Frameworks Water Operations Climate"]

# Patterns used by regex_cleaning, compiled once. All are raw strings:
# sequences such as '\s' in a normal string literal are invalid escapes.
_LEADING_NUMBER_RE = re.compile(r'^\s?\d+(.*)$')
_SPACED_HYPHEN_RE = re.compile(r'\s?-\s?')
_SPACE_BEFORE_PUNCT_RE = re.compile(r'\s?([,:;\.])')
_URL_RE = re.compile(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*')
_WHITESPACE_RUN_RE = re.compile(r'\s+')

def regex_cleaning(paragraph):
    """Apply regex-based clean-up to one paragraph and return the result.

    Removes a leading number, surrounding whitespace, spaces around hyphens,
    a space in front of , : ; . and anything that looks like a URL, then
    collapses runs of whitespace into a single space.
    """
    # removing header number
    paragraph = _LEADING_NUMBER_RE.sub(r'\1', paragraph)
    # removing leading/trailing spaces
    paragraph = paragraph.strip()
    # words may be split between lines, ensure we link them back together
    paragraph = _SPACED_HYPHEN_RE.sub('-', paragraph)
    # remove space prior to punctuation
    paragraph = _SPACE_BEFORE_PUNCT_RE.sub(r'\1', paragraph)
    # replace numbers with '<NUM>' (intentionally disabled)
    # paragraph = re.sub('\d+[,.]?\d*[,.]?\d*[%]?', '<NUM>', paragraph)
    # remove mentions of URLs
    paragraph = _URL_RE.sub(r' ', paragraph)
    # remove multiple spaces
    paragraph = _WHITESPACE_RUN_RE.sub(' ', paragraph)
    return paragraph

def remove_extra_headers(text):
    """Delete every occurrence of the recurring header patterns from text."""
    header_pattern = '|'.join(headers)
    return re.sub(header_pattern, '', text)

# Checks whether a paragraph is made up mostly of capitalized words
def mostly_cap(paragraph):
    """Return True when more than 70% of the words start with a capital letter.

    Words containing a digit are ignored. A paragraph without any remaining
    word (e.g. only numbers) also returns True so that it gets deleted.
    """
    # raw string: '\d' in a normal string literal is an invalid escape sequence
    words = [word for word in paragraph.split(' ')
             if word.strip() and not re.search(r'\d', word)]
    if not words:
        return True # paragraphs with no words (incl. only numbers / words with digits) will be deleted

    capitalized = sum(1 for word in words if word.strip()[0].isupper())
    return capitalized / len(words) > .7

### Main text extraction functions

In [None]:
def extract_text(doc):
    """Extract raw and cleaned text from an opened PyMuPDF document.

    Returns a tuple (text_all, text_mcf, cleaned_text):
    text_all     -- one string per text block, using spans of every font size
    text_mcf     -- one string per text block, using only spans in the
                    document's most common font size
    cleaned_text -- cleaned paragraphs built from text_mcf, or from text_all
                    when the font-size filter left no usable text
    """
    text_all = []
    text_mcf = []
    p_size = get_frequency_font_sizes(doc)

    # Start from PyMuPDF's default flags for "dict" output, switch ligature
    # preservation off (so ligatures are split into letters) and switch
    # space inhibition on. Flags are bit masks and must be combined with
    # & / | / ~. The previous expression `~LIGATURES + INHIBIT_SPACES` was
    # integer arithmetic and, with the documented flag values, did not set
    # the INHIBIT_SPACES bit -- so spacing may differ from earlier runs.
    text_flags = (fitz.TEXTFLAGS_DICT & ~fitz.TEXT_PRESERVE_LIGATURES) | fitz.TEXT_INHIBIT_SPACES

    for page in doc:
        blocks = page.get_text("dict", sort=True, flags=text_flags)["blocks"]
        paragraphs_all, paragraphs_mcf = paragraphs_from_blocks(blocks, p_size)
        text_all.extend(paragraphs_all)
        text_mcf.extend(paragraphs_mcf)

    line_word_counts = [len(line.split(' ')) for line in text_mcf]

    # Don't filter by font size if lines is empty
    # or if there is no line with at least 5 words
    if not line_word_counts or max(line_word_counts) < 5:
        cleaned_text = clean_paragraphs_from_blocks(text_all)
    else:
        cleaned_text = clean_paragraphs_from_blocks(text_mcf)

    return (text_all, text_mcf, cleaned_text)

def paragraphs_from_blocks(blocks, p_size):
    """Build one string per text block, in reading order.

    Returns (paragraphs_all, paragraphs_mcf): the first list uses every
    non-blank span of a block, the second only the spans whose rounded font
    size equals p_size (all strings are empty when p_size is None).
    """
    all_font_paragraphs = []
    main_font_paragraphs = []
    # ESG reports have a lot of multi-column pages, so custom_sort orders
    # the blocks by horizontal band first and by column within each band
    for block in custom_sort(blocks):
        text_any_font = ""
        text_main_font = ""
        spans = (span for line in block["lines"] for span in line["spans"])
        for span in spans:
            piece = span['text']
            if not piece.strip():  # skip spans with just whitespace
                continue

            # every font size: add a separating space if needed, then the text
            if need_to_add_space(text_any_font, piece):
                text_any_font += " "
            text_any_font += piece

            # most common font size only
            if p_size is None or round(span['size']) != p_size:
                continue
            if need_to_add_space(text_main_font, piece):
                text_main_font += " "
            text_main_font += piece

        all_font_paragraphs.append(text_any_font)
        main_font_paragraphs.append(text_main_font)

    return (all_font_paragraphs, main_font_paragraphs)

def clean_paragraphs_from_blocks(paragraphs):
    """Clean raw block texts and stitch them into paragraphs.

    Steps: drop empty and number-only lines, replace non-printable and
    whitespace characters by spaces, mark header-like lines with '|',
    merge consecutive lines that belong to the same paragraph, then apply
    regex cleaning, drop mostly-capitalized paragraphs, repair words split
    by hyphens and strip recurring header texts.

    paragraphs -- list of strings (one per extracted text block)
    Returns the list of cleaned paragraph strings.
    """
    # Remove empty strings
    paragraphs = [line for line in paragraphs if line] 

    # Remove non-ASCII characters (& strings with just non-ASCII characters)
    paragraphs = [''.join(map(lambda x: x if x in string.printable and x not in string.whitespace else ' ', line)) for line in paragraphs]
    paragraphs = [line.rstrip() for line in paragraphs if line.strip()]

    # Remove number-only lines
    paragraphs = [line for line in paragraphs if not line.isdigit()]
    
    # Remove headers (mark with '|' for easier paragraph splitting later):
    #remove lines in all caps
    paragraphs = [line if not line.isupper() else '|' for line in paragraphs]
    #remove lines with fewer than 4 words where there is no period and the first letter is uppercase
    paragraphs = [line if ('.' in line) or (len(line.strip().split(' '))>=4) or (not line[0].strip().isupper()) else '|' for line in paragraphs]
    #remove table-of-contents style lines with dot leaders
    paragraphs = [line if not (('....' in line) or ('. . . .' in line)) else '|' for line in paragraphs]
    #remove GRI/other index lines ('###-#:','G4-','GRI ','F ','P ','EC#', 'EN#','LA#','HR#','SO#', 'PR#','DMA-','FC#','principle #','DMA ')
    # MAYBE ALSO ADD: #.# uppercase or #.## uppercase
    paragraphs = [line if not bool(re.match(r'[0-9]{3}-[0-9]:|G4-|GRI \d|A\d|B\d|F |P |EC\s?[0-9]|FC\s?[0-9]|EN\s?[0-9]|LA\s?[0-9]|HR\s?[0-9]|SO\s?[0-9]|PR\s?[0-9]|DMA-|principle [0-9]|DMA |[0-9].[0-9] [A-Z]|[0-9].[0-9]{2} [A-Z]',line)) else '|' for line in paragraphs]

    # Combine paragraphs if needed
    cleaned_paragraphs =[]
    prev = ""
    for paragraph in paragraphs:
        # paragraph will be appended to prev paragraph if:
        # (1) its first char is lowercase and/or the previous line doesn't end in a period
        # (2) paragraph and/or prev paragraph are not '|' (aka was a header)
        if (paragraph.strip()[0].islower() or not prev.strip().endswith('.')) and not (paragraph=='|' or prev=='|'):
            # check for words split across lines; combine if needed, otherwise add space & combine
            if need_to_add_space(prev, paragraph):
                # check if line is a bullet point (common when listing goals/risks, etc. in ESG reports
                # and usually seen if line starts with space or capitalized "To" or some verb);
                # if so, add a period to previous paragraph for better comprehension
                if not prev.strip().endswith(('.',';',',',':')) and (paragraph.startswith(' ') or is_capitalized_verb(paragraph)):
                    prev += '.'
                
                # add space
                prev = prev + ' ' + paragraph
            
            else: # don't add space
                prev = prev + paragraph
        else:
            # New paragraph
            cleaned_paragraphs.append(prev)
            prev = paragraph

    # & don't forget left-over paragraph
    cleaned_paragraphs.append(prev)

    # Now we can remove the header placeholders
    cleaned_paragraphs = [paragraph for paragraph in cleaned_paragraphs if not paragraph=='|']

    # Regex-based cleaning
    cleaned_paragraphs = [regex_cleaning(paragraph) for paragraph in cleaned_paragraphs]

    # Get rid of empty paragraphs
    cleaned_paragraphs = [paragraph for paragraph in cleaned_paragraphs if paragraph]

    # Get rid of paragraphs that are mostly upper case
    cleaned_paragraphs = [paragraph for paragraph in cleaned_paragraphs if not mostly_cap(paragraph)]

    # Remove hyphens that are splitting words
    cleaned_paragraphs = [combine_words_split_by_hyphen(paragraph) for paragraph in cleaned_paragraphs]

    # There are certain headers that seem to recur; remove these
    # (fixed: the original line had a syntax error and referenced an undefined name)
    cleaned_paragraphs = [remove_extra_headers(paragraph) for paragraph in cleaned_paragraphs]

    return cleaned_paragraphs

### Text to dict function

In [None]:
# Substrings that mark a paragraph as supply chain related
# (' sourcing' keeps its leading space so that it must start a word)
sc_keywords = ['supplier', 'supply chain', 'value chain', 'procurement', 'vendor', ' sourcing']

def cleaned_text_to_dict(paragraphs):
    """Map each paragraph position to its text and a supply-chain flag.

    Returns {position: {'Paragraph': text, 'Supply_Chain': 'Yes' | 'No'}};
    the flag is 'Yes' when the lowercased text contains any keyword.
    """
    def _supply_chain_label(text):
        lowered = text.lower()
        return 'Yes' if any(keyword in lowered for keyword in sc_keywords) else 'No'

    return {
        position: {'Paragraph': text, 'Supply_Chain': _supply_chain_label(text)}
        for position, text in enumerate(paragraphs)
    }

## Extract & clean text from PDFs

This section extracts and cleans the PDF text and saves the text to .txt files which are organized in folders by company.

In [None]:
# Make folders for all tickers
for company in esg_urls_pd_m['Exchange-Ticker'].unique():
    path = f"../Text files/{company}"
    # makedirs also creates the parent "Text files" folder when it is missing
    # and does nothing when the company folder already exists
    os.makedirs(path, exist_ok=True)

In [None]:
# Index the long table by the unique report filename
esg_urls_pd_m = esg_urls_pd_m.set_index("filename")

# Filenames of the reports that have to be fetched from a URL
needs_download = esg_urls_pd_m['to_download'].eq(True)
to_download_list = esg_urls_pd_m.index[needs_download].tolist()

# Report-level details, keyed by filename, for every report that has a link
has_link = esg_urls_pd_m['URL'].notna()
report_details_dict = esg_urls_pd_m.loc[has_link].to_dict('index')

In [None]:
# Access file and extract & save text without downloading full PDFs
# `exceptions` maps a filename to the exception raised while fetching/processing it
exceptions = {}

for filename in tqdm(to_download_list):    
    url = esg_urls_pd_m.loc[filename, 'URL']
    company = esg_urls_pd_m.loc[filename, 'Exchange-Ticker']
    try:
        r = requests.get(url, timeout=5)
        # fail early on HTTP errors instead of handing an error page to the PDF parser
        r.raise_for_status()
        f = io.BytesIO(r.content)
        txt_path_all = f"../Text files/{company}/{filename}.txt"
        txt_path_most_common_fontsize = f"../Text files/{company}/{filename}-mcf.txt"
        txt_path_cleaned = f"../Text files/{company}/{filename}-c.txt"
        with fitz.open(stream=f) as doc:
            report_details_dict[filename]['Num_pages'] = len(doc)
            text_all, text_most_common_fontsize, cleaned_text = extract_text(doc)

        outputs = ((txt_path_all, text_all),
                   (txt_path_most_common_fontsize, text_most_common_fontsize),
                   (txt_path_cleaned, cleaned_text))
        for txt_path, lines in outputs:
            # explicit encoding: the raw text can contain non-ASCII characters, which the
            # platform default encoding may not be able to write
            with open(txt_path, "w", encoding="utf-8") as file:
                file.writelines(element + "\n" for element in lines)
    except Exception as e:
        # best effort: remember the failure and continue with the next report
        exceptions[filename] = e

In [None]:
# See how many PDFs weren't successfully converted with above code
# (`exceptions` holds one entry per filename that raised an exception)
print(len(exceptions))

In [None]:
# Transform downloaded PDFs into text files
# The same root folder is used for listing and for opening the PDFs
# (previously files were listed in one folder but opened from another).
downloaded_root = "../Downloaded_reports"

for folder in os.listdir(downloaded_root):
    folder_path = f"{downloaded_root}/{folder}"
    # skip hidden entries and anything that is not a folder
    if folder.startswith(".") or not os.path.isdir(folder_path):
        continue
    for filename in os.listdir(folder_path):
        if not filename.endswith(('.pdf','.PDF')):
            continue
        report_name = filename[:-4]  # filename without the 4-character extension
        company = esg_urls_pd_m.loc[report_name, 'Exchange-Ticker']
        pdf_path = f"{folder_path}/{filename}"
        txt_path_all = f"../Text files/{company}/{report_name}.txt"
        txt_path_most_common_fontsize = f"../Text files/{company}/{report_name}-mcf.txt"
        txt_path_cleaned = f"../Text files/{company}/{report_name}-c.txt"
        with fitz.open(pdf_path) as doc:
            report_details_dict[report_name]['Num_pages'] = len(doc)
            text_all, text_most_common_fontsize, cleaned_text = extract_text(doc)

        outputs = ((txt_path_all, text_all),
                   (txt_path_most_common_fontsize, text_most_common_fontsize),
                   (txt_path_cleaned, cleaned_text))
        for txt_path, lines in outputs:
            # explicit encoding: the raw text can contain non-ASCII characters
            with open(txt_path, "w", encoding="utf-8") as file:
                file.writelines(element + "\n" for element in lines)

In [None]:
# Collect every report that has a link but no extracted text file yet
remaining_list = []
for filename in esg_urls_pd_m.index[esg_urls_pd_m['URL'].notna()]:
    company = esg_urls_pd_m.loc[filename]['Exchange-Ticker']
    txt_path = f"../Text files/{company}/{filename}.txt"
    if not os.path.exists(txt_path):
        remaining_list.append(filename)

# Make sure there aren't any missing text files
if remaining_list:
    print("The following files are missing:")
    print(*remaining_list, sep="\n")
else:
    print("No missing files!")

## Classify as SC and extract sentences

In [None]:
# Save additional report metadata to report details dict (report_details_dict)
# Save paragraphs to a dataframe (paragraph_df)
no_file_available = []
paragraph_columns = ['Paragraph_Order', 'Filename', 'Paragraph', 'Supply_Chain']
paragraph_frames = []  # one dataframe per report, concatenated once after the loop
for filename in tqdm(list(esg_urls_pd_m[esg_urls_pd_m['URL'].notna()].index.values)):
    company = esg_urls_pd_m.loc[filename, 'Exchange-Ticker']
    path_cleaned = f"../Text files/{company}/{filename}-c.txt"
    if not os.path.exists(path_cleaned):
        no_file_available.append(filename)
        continue

    with open(path_cleaned, encoding="utf-8") as f:
        # NOTE: readlines() keeps the trailing newline of every paragraph
        paragraphs = f.readlines()
    paragraph_dict = cleaned_text_to_dict(paragraphs)
    report_details_dict[filename]['Num_paras'] = len(paragraph_dict)
    report_details_dict[filename]['Num_SC_paras'] = sum(
        v['Supply_Chain'] == 'Yes' for v in paragraph_dict.values())

    # Add to DF (reports without any paragraph contribute no rows)
    if paragraph_dict:
        temp_df = pd.DataFrame.from_dict(paragraph_dict, orient='index')
        temp_df['Filename'] = filename
        temp_df = temp_df.reset_index().rename(columns={'index': 'Paragraph_Order'})
        paragraph_frames.append(temp_df)

# Concatenate once (concatenating inside the loop copies all previous rows every time);
# ignore_index creates the running Paragraph ID
if paragraph_frames:
    paragraph_df = pd.concat(paragraph_frames, ignore_index=True).reindex(columns=paragraph_columns)
else:
    paragraph_df = pd.DataFrame(columns=paragraph_columns)

# Create dataframe for report details
report_details_df = pd.DataFrame.from_dict(report_details_dict, orient='index')

In [None]:
# Total number of paragraphs, then the number flagged as supply-chain related
# (the trailing comments record the counts from a previous run)
print(len(paragraph_df)) #363,950 paragraphs
print(len(paragraph_df[paragraph_df['Supply_Chain']=='Yes'])) #52,495 paragraphs

In [None]:
# Preview the first rows of the paragraph dataframe
paragraph_df.head()

## Save data to pickles

In [None]:
# Persist the report-level and paragraph-level dataframes for future analysis
report_details_df.to_pickle('../Data/report_details_df.pkl')
paragraph_df.to_pickle('../Data/paragraph_df.pkl')