In [22]:
import fitz
import pandas as pd

from classification.keywords import keywords

In [18]:
def extract_paragraphs(pdf_path):
    """
    Extract paragraphs from a PDF file using PyMuPDF, with each paragraph tagged with its page and paragraph number.

    :param pdf_path: Path to the PDF file.
    :return: A list of dictionaries, each containing the page number, paragraph number, and paragraph text.
    """
    paragraphs = []
    paragraph_counter = 1

    doc = fitz.open(pdf_path)
    for page_num, page in enumerate(doc, start=1):  # start=1 to start counting pages from 1
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if "lines" in block:  # Ensuring it's a text block
                block_text = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        block_text += span["text"] + " "
                if block_text.strip():

                    if len(block_text.strip()) > 2:
                        paragraphs.append({
                            "page": page_num,
                            "paragraph": paragraph_counter,
                            "text": block_text.strip()
                        })
                    paragraph_counter += 1
    doc.close()
    return paragraphs

# Example usage
pdf_path = '1604404731.pdf'
paragraphs = extract_paragraphs(pdf_path)
for para in paragraphs:
    print(f"Page {para['page']}, Paragraph {para['paragraph']}: {para['text']}")

Page 1, Paragraph 1: _____ _____ 2019/20
Page 1, Paragraph 2: Sustainability Report
Page 1, Paragraph 3: Company registration (CVR) No. 69 74 99 17
Page 1, Paragraph 4: Statutory Report cf.  Danish Financial  Statements Act  sections 99 (a) and (b)
Page 1, Paragraph 5: Making life easier
Page 1, Paragraph 6: Stina, Ostomy device user
Page 2, Paragraph 8: Meeting our 2020  targets
Page 2, Paragraph 9: Coloplast has reached its 2020 targets  to increase share of renewable energy,  recycle more of our waste and increase  the diversity in our management. With  these achievements, Coloplast has a  solid foundation to do more.     New 2025 ambitions
Page 2, Paragraph 10: As part of the corporate strategy  process, Coloplast has reviewed existing  efforts on sustainability, and has set new  ambitions to support the UN Sustainable  Development Goals.             To serve as the basis for our new  sustainability strategy, Coloplast has  systematically assessed our impact on  the Sustainable Dev

In [28]:
df = pd.DataFrame(paragraphs)

for sdg, sdg_keywords in keywords.items():
    sdg_keywords_splitted = sdg_keywords.split(", ")

    df[sdg] = df["text"].map(lambda x: sum([1 if keyword in x.split() else 0 for keyword in sdg_keywords_splitted ]) )

df.sum(axis=0)
    

page                                                     13233
paragraph                                               203188
text         _____ _____ 2019/20Sustainability ReportCompan...
1                                                            0
2                                                            0
3                                                           11
4                                                            4
5                                                            0
6                                                           18
7                                                            7
8                                                            0
9                                                            0
10                                                           0
11                                                          28
12                                                           0
13                                                     

"Analyze the given text from a company report and provide key facts categorized under emissions, resources, energy, waste, employees, and audits. Format the response as a JSON object with each category as a key and the key facts as the value. For example: { 'emissions': '<fact>', 'resources': '<fact>', 'energy': '<fact>', 'waste': '<fact>', 'employees': '<fact>', 'audits': '<fact>' }."
"Given this text "" from a report of a company.  Give as the key facts towards the categories emissions, resources, energy, waste, employees and audits." 

{1: '2019 20 Sustainability Report Company registration CVR No.', 2: '2019 20 Sustainability Report Company registration CVR No.', 3: 'Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 .', 4: 'Plastics are the only relevant materials to use in our products due to product performance as well as hygiene and quality standards.', 5: '2019 20 Sustainability Report Company registration CVR No.', 6: 'Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 .', 7: 'Danish Financial Statements Act sections 99 a and b Making life easier Stina Ostomy device user 2 Meeting our 2020 targets Coloplast has reached its 2020 targets to increase share of renewable energy recycle more of our waste and increase the diversity in our management.', 8: 'We work to make life easier for people with intimate healthcare needs.', 9: 'As formulated by the TCFD the industry which Coloplast is in is not considered to have high exposure to climate change risks.', 10: '2019 20 Sustainability Report Company registration CVR No.', 11: 'Our products will continue to be made of plastics but we will identify and support the development of new sustainable technologies.', 12: 'Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 .', 13: 'Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 .', 14: 'We work to make life easier for people with intimate healthcare needs.', 15: 'We work to make life easier for people with intimate healthcare needs.', 16: 'Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 .', 17: 'We embrace that challenge and therefore we are setting a new ambition for sustainability to support the UN Sustainable Development Goals and the Paris Agreement s goal to limit global temperature increase to 1.5 degrees.'}


In [9]:
"Given this text "" from a report of a company.  Give as the key facts towards the categories emissions, resources, energy, waste, employees and audits." 


relevant_paragraphs = {1: '2019 20 Sustainability Report Company registration CVR No.', 2: '2019 20 Sustainability Report Company registration CVR No.', 3: 'Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 .', 4: 'Plastics are the only relevant materials to use in our products due to product performance as well as hygiene and quality standards.', 5: '2019 20 Sustainability Report Company registration CVR No.', 6: 'Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 .', 7: 'Danish Financial Statements Act sections 99 a and b Making life easier Stina Ostomy device user 2 Meeting our 2020 targets Coloplast has reached its 2020 targets to increase share of renewable energy recycle more of our waste and increase the diversity in our management.', 8: 'We work to make life easier for people with intimate healthcare needs.', 9: 'As formulated by the TCFD the industry which Coloplast is in is not considered to have high exposure to climate change risks.', 10: '2019 20 Sustainability Report Company registration CVR No.', 11: 'Our products will continue to be made of plastics but we will identify and support the development of new sustainable technologies.', 12: 'Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 .', 13: 'Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 .', 14: 'We work to make life easier for people with intimate healthcare needs.', 15: 'We work to make life easier for people with intimate healthcare needs.', 16: 'Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 .', 17: 'We embrace that challenge and therefore we are setting a new ambition for sustainability to support the UN Sustainable Development Goals and the Paris Agreement s goal to limit global temperature increase to 1.5 degrees.'}

" ".join(relevant_paragraphs.values())

'2019 20 Sustainability Report Company registration CVR No. 2019 20 Sustainability Report Company registration CVR No. Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 . Plastics are the only relevant materials to use in our products due to product performance as well as hygiene and quality standards. 2019 20 Sustainability Report Company registration CVR No. Our main contributions are on Good Health and Well Being Responsible consumption and production and Climate Action SDGs 3 12 and 13 . Danish Financial Statements Act sections 99 a and b Making life easier Stina Ostomy device user 2 Meeting our 2020 targets Coloplast has reached its 2020 targets to increase share of renewable energy recycle more of our waste and increase the diversity in our management. We work to make life easier for people with intimate healthcare needs. As formulated by the TCFD the industry which Coloplast is in is not considered

In [None]:
import openai as OpenAI

client = OpenAI(
    api_key="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",
)

In [14]:
import json

relevant_text = " ".join(relevant_paragraphs.values())

prompt = "Analyze the given text from a company report and provide key facts categorized under emissions, resources, energy, waste, employees, and audits. Format the response as a JSON object with each category as a key and the key facts as the value. For example: { 'emissions': '<fact>', 'resources': '<fact>', 'energy': '<fact>', 'waste': '<fact>', 'employees': '<fact>', 'audits': '<fact>' }. The text: '"+relevant_text+"'"


response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
    max_tokens= 200,
)

json.loads(response.choices[0].message.content)

'{\n  "emissions": "As formulated by the TCFD, the industry in which Coloplast operates is not considered to have high exposure to climate change risks.",\n  "resources": "Plastics are the only relevant materials used in Coloplast\'s products due to product performance, hygiene, and quality standards. However, they are committed to identifying and supporting the development of new sustainable technologies.",\n  "energy": "Coloplast has reached its 2020 targets to increase the share of renewable energy in their operations.",\n  "waste": "Coloplast has reached its 2020 targets to recycle more of their waste.",\n  "employees": "Coloplast has made efforts to increase diversity within their management.",\n  "audits": "No specific information about audits is provided in the text."\n}'

In [16]:
import json
res_str = response.choices[0].message.content

print(res_str)
json_object = json.loads(res_str)
json_object

{
  "emissions": "As formulated by the TCFD, the industry in which Coloplast operates is not considered to have high exposure to climate change risks.",
  "resources": "Plastics are the only relevant materials used in Coloplast's products due to product performance, hygiene, and quality standards. However, they are committed to identifying and supporting the development of new sustainable technologies.",
  "energy": "Coloplast has reached its 2020 targets to increase the share of renewable energy in their operations.",
  "waste": "Coloplast has reached its 2020 targets to recycle more of their waste.",
  "employees": "Coloplast has made efforts to increase diversity within their management.",
  "audits": "No specific information about audits is provided in the text."
}


{'emissions': 'As formulated by the TCFD, the industry in which Coloplast operates is not considered to have high exposure to climate change risks.',
 'resources': "Plastics are the only relevant materials used in Coloplast's products due to product performance, hygiene, and quality standards. However, they are committed to identifying and supporting the development of new sustainable technologies.",
 'energy': 'Coloplast has reached its 2020 targets to increase the share of renewable energy in their operations.',
 'waste': 'Coloplast has reached its 2020 targets to recycle more of their waste.',
 'employees': 'Coloplast has made efforts to increase diversity within their management.',
 'audits': 'No specific information about audits is provided in the text.'}