In [None]:
## Install Apache Tika with Docker
# Pull the apache/tika image, then start it detached (-d) with the container's
# port 9998 published on every host interface (0.0.0.0) at the same port.
!docker pull apache/tika
!docker run -d -p 0.0.0.0:9998:9998 apache/tika


In [None]:
# Smoke test: request the /tika endpoint on the same host:port that the
# text-extraction code in this notebook uses as its Tika server.
!curl http://192.168.2.97:9998/tika

In [None]:
!pip install tika
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install nltk 
!pip install transformers

In [None]:
from tika import parser
import re
from transformers import pipeline
import os
import time

# Wall-clock timestamp taken when this cell runs; nothing in the visible cells
# reads `timer` back.
timer=time.time()

# Set the TOKENIZERS_PARALLELISM environment variable for this process.
# NOTE(review): '1' is presumably interpreted as "enabled" by the Hugging Face
# tokenizers library - confirm against its documentation.
os.environ['TOKENIZERS_PARALLELISM']='1' 

def extract_text_from_pdf(pdf_path, server_endpoint='http://192.168.2.97:9998/'):
    """Extract the text content of a file through an Apache Tika server.

    Args:
        pdf_path: Path of the document handed to ``parser.from_file``.
        server_endpoint: Base URL of the Tika server. The default is the
            address this notebook was written against; pass another URL
            (e.g. ``'http://localhost:9998/'``) when the server runs elsewhere.

    Returns:
        The value stored under ``'content'`` in the parser result, or ``''``
        when that entry is missing or empty, so callers always get a string.
    """
    raw = parser.from_file(pdf_path, serverEndpoint=server_endpoint)
    # A document without extractable text yields no usable 'content' entry;
    # normalise that to an empty string instead of leaking None downstream.
    return (raw or {}).get('content') or ''

def preprocess(text):
    """Normalise whitespace in text extracted from a document.

    NUL characters are dropped. Runs of two or more whitespace characters and
    non-breaking spaces are collapsed into a single space; deleting them
    outright would fuse neighbouring words and sentences
    (``"end.\\n\\nNext"`` -> ``"end.Next"``), which breaks sentence splitting.
    A lone ordinary whitespace character is left untouched.

    Args:
        text: Raw extracted text; ``None`` or ``''`` is accepted.

    Returns:
        The cleaned string (``''`` for empty or ``None`` input).
    """
    if not text:
        return ''
    # Remove NULs first so whitespace on both sides of one is collapsed together.
    text = text.replace('\x00', '')
    return re.sub(r'\s{2,}|\xa0', ' ', text)

def extract_links(text):
    """Return all http(s) URL matches found in ``text``, in order of appearance.

    A match starts at ``http://`` or ``https://`` and continues over word
    characters, ``-``, ``.`` and ``%XX`` escapes; it stops at the first other
    character (for instance a ``/``).
    """
    url_regex = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
    return [found.group(0) for found in url_regex.finditer(text)]
    
def tokenize(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    The function tokenizes the ``text`` argument itself rather than a
    notebook-level variable. The ``punkt`` resource is downloaded only when
    NLTK reports it missing, not on every call.

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings.
    """
    import nltk
    from nltk.tokenize import sent_tokenize
    try:
        return sent_tokenize(text)
    except LookupError:
        # Tokenizer data is not installed yet: fetch it once and retry.
        nltk.download('punkt')
        return sent_tokenize(text)
    
# Pipelines already built, keyed by (model name, device). Loading a model is by
# far the most expensive step, so it must not be repeated for every call.
_SUMMARIZER_CACHE = {}


def _get_summarizer(model, device):
    """Return the summarization pipeline for ``model``/``device``, building it once."""
    model_name = "sshleifer/distilbart-cnn-12-6" if model == 0 else "facebook/bart-large-cnn"
    key = (model_name, str(device))
    if key not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[key] = pipeline("summarization", model=model_name, device=device)
    return _SUMMARIZER_CACHE[key]


def summarize_text(text, model=0, max_length=200, with_cuda_device=-1):
    """Generate a summary of ``text`` with a Hugging Face summarization pipeline.

    Args:
        text: The text to summarise. Empty or whitespace-only input returns
            ``''`` without invoking the model.
        model: ``0`` selects ``sshleifer/distilbart-cnn-12-6``; any other
            value selects ``facebook/bart-large-cnn``.
        max_length: ``max_length`` passed to the pipeline call.
        with_cuda_device: ``device`` argument passed to ``pipeline``.

    Returns:
        The ``summary_text`` of the first pipeline result, or ``''`` when the
        pipeline call raises (a ``RuntimeWarning`` describing the error is
        emitted in that case).
    """
    import warnings

    if not text or not text.strip():
        return ''

    # Built outside the try block on purpose: a model that cannot be loaded
    # should fail loudly instead of turning every summary into ''.
    summarizer = _get_summarizer(model, with_cuda_device)

    # Keep the historical minimum of 100, but never ask for a minimum that
    # exceeds the requested maximum.
    min_length = 100 if max_length > 100 else max(1, max_length // 2)
    try:
        summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    except Exception as e:
        # Best effort: one failing chunk must not abort a whole run, but the
        # failure should be visible to the user.
        warnings.warn(f"summarize_text: summarization failed ({e!r}); returning ''", RuntimeWarning)
        return ''
    return summary[0]['summary_text']

def download_file(url):
    """Download ``url`` into ``./downloads`` and return the local path.

    The file name is the last segment of the URL *path*, so a query string or
    fragment does not end up in it. URLs whose path has no usable last
    segment (e.g. ending in ``/``) are stored as ``download``.

    Args:
        url: The http(s) URL to fetch.

    Returns:
        ``'./downloads/<filename>'``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    import requests
    from urllib.parse import urlparse

    filename = os.path.basename(urlparse(url).path)
    if filename in ('', '.', '..'):
        filename = 'download'

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('./downloads', exist_ok=True)
    target = './downloads/' + filename

    # Without a timeout a stalled server would block the notebook forever.
    with requests.get(url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(target, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    return target

# Summarize the text
def rolling_summary(extracted_text_tk: list, *, window: int = 3, **kwargs):
    """Summarise a list of text pieces with a sliding window.

    Consecutive pieces are joined with ``'.'`` in windows of ``window`` items
    (window 0 covers items 0..window-1, window 1 covers items 1..window, ...),
    so every input item contributes to at least one summary. Inputs shorter
    than ``window`` form a single window. A window is skipped when its joined
    text is 30 characters or shorter, or 1024 characters or longer.

    Args:
        extracted_text_tk: Sentences (or earlier summaries) to condense.
        window: Number of neighbouring items per summarised context.
        **kwargs: Forwarded to ``summarize_text``. ``model`` defaults to ``1``
            but can be overridden by the caller.

    Returns:
        The non-empty summaries, one per window that was kept.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be >= 1")

    print(kwargs)
    # Default to model 1 without colliding with a caller-supplied `model`.
    options = {'model': 1, **kwargs}

    if not extracted_text_tk:
        return []

    summary = []
    n_windows = max(1, len(extracted_text_tk) - window + 1)
    for start in range(n_windows):
        corpus_context = '.'.join(extracted_text_tk[start:start + window])
        # Too little text to summarise, or more than the summariser accepts.
        if len(corpus_context) <= 30 or len(corpus_context) >= 1024:
            continue
        result = summarize_text(corpus_context, **options)
        if result:  # summarize_text returns '' when it could not summarise
            summary.append(result)
    return summary

## Get data and process

def _drop_cnn_artifacts(summaries):
    """Drop summaries containing ' cnn' (case-insensitive).

    The summarization models sometimes inject CNN-related sentences that are
    not part of the source document.
    """
    return [s for s in summaries if ' cnn' not in s.lower()]


# Alternative input document:
# pdf_file_path=download_file('https://www.intel.com/content/dam/www/public/us/en/documents/technology-briefs/data-direct-i-o-technology-brief.pdf')
pdf_file_path = download_file('https://fast.dpdk.org/doc/perf/DPDK_20_11_Mellanox_NIC_performance_report.pdf')

# Or point at a PDF that is already on disk:
# pdf_file_path = './docs/data-direct-i-o-technology-brief.pdf'

# Extract text from the PDF; stop early when nothing could be extracted,
# because every later step needs a string to work on.
extracted_text = extract_text_from_pdf(pdf_file_path)
if not extracted_text:
    raise ValueError(f"No text could be extracted from {pdf_file_path}")

# Remove useless symbols
extracted_text = preprocess(extracted_text)

# Split the text into sentences, as summarize_text requires < 1024 symbols
extracted_text_tk = tokenize(extracted_text)

# First iteration: only sentences 50..59 of the document are summarised.
summary_1 = _drop_cnn_artifacts(rolling_summary(extracted_text_tk[50:60], with_cuda_device=-1))

# Second iteration: condense the first-pass summaries. The filter is applied
# to this pass's own output, so the second pass is not thrown away.
summary_2 = _drop_cnn_artifacts(rolling_summary(summary_1))

# Third iteration
summary_3 = _drop_cnn_artifacts(rolling_summary(summary_2))

print('\n'.join(summary_3))

# Write the same text that was printed: one summary per line.
with open(pdf_file_path + '_summary.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(summary_3))