In [1]:
!pip install -r requirements.txt

Collecting azure-ai-textanalytics==5.3.0b2
  Downloading azure_ai_textanalytics-5.3.0b2-py3-none-any.whl (321 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.5/321.5 KB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyPDF2==3.0.1
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 KB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
Collecting azure-common~=1.1
  Downloading azure_common-1.1.28-py2.py3-none-any.whl (14 kB)
Collecting azure-core<2.0.0,>=1.24.0
  Downloading azure_core-1.28.0-py3-none-any.whl (185 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.4/185.4 KB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: azure-common, PyPDF2, azure-core, azure-ai-textanalytics
Successfully installed PyPDF2-3.0.1 azure-ai-textanalytics-5.3.0b2 azure-common-1.1.28 azure-core-1.28.0
You should consider upgrading via the '/root/venv/bin/pytho

In [2]:
import re
import os

from azure.ai.textanalytics import TextAnalyticsClient, ExtractSummaryAction
from azure.core.credentials import AzureKeyCredential
from PyPDF2 import PdfReader

## Extraction

In [3]:
# Pre-compiled patterns for the extracted PDF text (the intent noted per line is
# presumed from the pattern names - TODO confirm against the source PDF).
page_pattern, text_pattern, cap_pattern = map(
    re.compile,
    (
        r'Página\s\d+\sde\s\d+',  # "Página <n> de <m>", presumably the page footer
        r"1\.\d",                  # "1." immediately followed by one digit, e.g. "1.2"
        r"\d\.\s+[A-Z]+",          # digit, dot, whitespace, then uppercase ASCII letters
    ),
)

In [4]:
# The handle stays open on purpose: `reader` reads from it while the pages are
# extracted, and it is closed once extraction has finished.
db = open("pdf/Bases de datos.pdf", mode="rb")
reader = PdfReader(db)

In [5]:
# Collect the extracted text of every page, in page order.
# The handle opened as `db` is closed even when extraction fails part-way, so a
# broken page no longer leaves the PDF file open in the kernel.
# Note: `reader` reads from `db`, so re-running this cell requires re-running
# the cell that opens the file first.
try:
    corpus = [page.extract_text() for page in reader.pages]
finally:
    db.close()

## Analysis

Using the Language service from Azure Cognitive Services.

### Authentication:

In [8]:
# Credentials are read from the environment instead of being hard-coded in the
# notebook; a missing variable raises KeyError here.
key = os.environ["KEY"]  # wrapped in AzureKeyCredential when the client is built
endpoint = os.environ["ENDPOINT"]  # passed as the client's endpoint

In [9]:
# Notebook-wide Text Analytics client, authenticated with the key and endpoint
# read from the environment.
text_analytics_client = TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))

In [50]:
def sample_extractive_summarization(client, document:list[str]):
    """Summarise each input document with an extractive-summary action.

    Args:
        client: Object exposing ``begin_analyze_actions``; the notebook passes
            its ``TextAnalyticsClient``.
        document: The texts to summarise, forwarded unchanged to the client.

    Returns:
        list[str]: One string per successfully summarised result, made of the
        extracted sentences joined by single spaces. Results flagged as errors
        are reported on stdout and contribute no entry, so the list can be
        shorter than ``document``.
    """
    poller = client.begin_analyze_actions(
        document,
        actions=[
            # At most four sentences are kept per document.
            ExtractSummaryAction(max_sentence_count=4)
        ],
    )

    output = []
    # The poller result is consumed as an iterable of per-document groups, each
    # holding one result per requested action.
    for action_results in poller.result():
        for result in action_results:
            if result.is_error:
                # An error result has no sentences to join; report and skip it
                # instead of failing on the missing attribute.
                print("...Is an error with code '{}' and message '{}'".format(
                    result.error.code, result.error.message
                ))
                continue
            output.append(" ".join(sentence.text for sentence in result.sentences))
    return output

In [51]:
# Summarise the extracted PDF text; `corpus` holds one string per page.
doc_summ = sample_extractive_summarization(text_analytics_client, corpus)

### Extractive:

In [None]:
def extractive_summarization(client, doc):
    """Return one extractive summary string per successfully summarised document.

    Args:
        client: Object exposing ``begin_extract_summary``; the notebook passes
            its ``TextAnalyticsClient``.
        doc: The documents to summarise, forwarded unchanged to the client.

    Returns:
        list[str]: For every result whose ``kind`` is
        ``"ExtractiveSummarization"``, the text of its sentences joined by
        single spaces. Error results are reported on stdout and contribute
        nothing to the list.
    """
    output = []

    # Use the client handed in by the caller rather than a notebook-level
    # global, so the function works with whichever client it is given.
    # NOTE(review): `begin_extract_summary` is the method name used with the
    # beta SDK shown in the install log (5.3.0b2) - confirm before upgrading.
    poller = client.begin_extract_summary(doc)
    for result in poller.result():
        if result.kind == "ExtractiveSummarization":
            output.append(" ".join(sentence.text for sentence in result.sentences))
            print("Summaries extractive:")
        elif result.is_error is True:
            print("...Is an error with code '{}' and message '{}'".format(
                result.error.code, result.error.message
            ))

    return output

### Abstractive:

In [None]:
def abstractive_summarization(client,docs:list[str]):
    """Return the abstractive summaries generated for each document.

    Args:
        client: Object exposing ``begin_abstractive_summary``; the notebook
            passes its ``TextAnalyticsClient``.
        docs: The texts to summarise, forwarded unchanged to the client.

    Returns:
        list[list[str]]: For every result whose ``kind`` is
        ``"AbstractiveSummarization"``, the list of its summaries' text. Error
        results are reported on stdout and contribute nothing to the list.
    """
    output = []

    # Use the client handed in by the caller rather than a notebook-level
    # global, so the function works with whichever client it is given.
    # NOTE(review): `begin_abstractive_summary` is the method name used with the
    # beta SDK shown in the install log (5.3.0b2) - confirm before upgrading.
    poller = client.begin_abstractive_summary(docs)
    for result in poller.result():
        if result.kind == "AbstractiveSummarization":
            print("Summaries abstracted:")
            output.append([summary.text for summary in result.summaries])
        elif result.is_error is True:
            print("...Is an error with code '{}' and message '{}'".format(
                result.error.code, result.error.message
            ))

    return output

In [None]:
# db_summary = extractive_summarization(text_analytics_client, db)
# random_summary = abstractive_summarization(text_analytics_client, random_numbers)

In [None]:
# sum_sum = [page for array in random_summary for page in array]

### Persistence:

In [54]:
def persistence(datalist:list[str], name:str, end:str = "") -> None:
    """Append every string in ``datalist`` to the UTF-8 text file ``name``.

    Args:
        datalist: Strings to write, in order.
        name: Path of the output file. It is opened in append mode, so existing
            content is kept and repeated calls add to the end of the file.
        end: Text written after each item. The default ``""`` writes the items
            back-to-back; pass ``"\\n"`` to put each item on its own line.
    """
    with open(name, mode="a", encoding='utf8') as f:
        f.writelines(item + end for item in datalist)

In [55]:
# persistence opens the file in append mode, so re-running this cell adds the
# summaries to "jum.txt" again instead of replacing them.
persistence(doc_summ, "jum.txt")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=67cfeb36-b07e-45a6-acf2-83ce38aa6dfe' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>