# Summarization
We summarize a document and, for a single embedding, try to compute a representative sentence.

In [None]:
import logging
import pathlib

import requests
from haystack import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever, BM25Retriever, PromptTemplate, PromptNode, PreProcessor
from haystack.nodes import TransformersSummarizer

In [None]:
# Root logger stays at WARNING; only the "haystack" logger is raised to INFO.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

In [None]:
# Data is kept in a "data" directory that sits next to the current working directory
# (i.e. one level up from where the notebook is run), with books in a "books" subfolder.
DATA_DIR = pathlib.Path.cwd().parent.joinpath("data")
BOOKS_DIR = DATA_DIR.joinpath("books")

# Download some books from Project Gutenberg

In [None]:
# (file name, download URL) pairs of the books this notebook works with.
book_urls = [("anna_karenina.txt", "https://www.gutenberg.org/cache/epub/1399/pg1399.txt")]

# Line prefix that marks the end of the Project Gutenberg preamble.
GUTENBERG_START_MARKER = "*** START OF THE PROJECT GUTENBERG EBOOK"
# Seconds to wait for the server, so a stalled connection cannot hang the cell forever.
DOWNLOAD_TIMEOUT_SECONDS = 30


def strip_gutenberg_header(text):
    """Drop everything up to and including the first start-marker line.

    Args:
        text: Raw text of the downloaded book.

    Returns:
        The text following the first line that starts with
        ``GUTENBERG_START_MARKER``. If no such line exists, ``text`` is
        returned unchanged.
    """
    lines = text.split("\n")
    for index, line in enumerate(lines):
        if line.startswith(GUTENBERG_START_MARKER):
            return "\n".join(lines[index + 1:])
    return text


# exist_ok=True makes the cell safe to re-run and avoids a separate exists() check.
BOOKS_DIR.mkdir(parents=True, exist_ok=True)
for file_name, url in book_urls:
    file_path = BOOKS_DIR / file_name
    if file_path.exists():
        continue  # already downloaded on an earlier run
    print(f"Downloading {file_name} from {url}...")
    response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
    # Fail loudly on HTTP errors instead of saving an error page as the book.
    response.raise_for_status()
    book = strip_gutenberg_header(response.text)
    # Write only after the download succeeded: creating the file first would leave an
    # empty file behind on failure, and the exists() check above would then skip the
    # download on every later run.
    file_path.write_text(book, encoding="utf-8")


In [None]:
# Only the first 1000 characters of the book are kept: this speeds up processing, and
# the tokenizer cannot handle large texts anyway.
book_path = BOOKS_DIR / "anna_karenina.txt"
book_text = book_path.read_text(encoding='utf-8')
book_excerpt = book_text[:1000]
docs = [Document(content=book_excerpt, meta={"name": "Anna Karenina"})]

In [None]:
# Document store with BM25 enabled, filled with the shortened book.
bm25_ds = InMemoryDocumentStore(use_bm25=True)
bm25_ds.write_documents(docs)

# BM25 retriever over that store, configured with top_k=2.
bm25_retriever = BM25Retriever(
    top_k=2,
    document_store=bm25_ds,
)

# Try some different summarization models
"xsum" and "samsum" are widely used summarization datasets; the model names indicate which dataset each model was trained on.

In [None]:
# Summarize the documents with the "google/pegasus-xsum" model and show the summary
# stored in the first returned document's metadata.
summarizer = TransformersSummarizer(
    model_name_or_path="google/pegasus-xsum",
    use_gpu=True,
)
summary = summarizer.predict(documents=docs)
summary[0].meta["summary"]

Summary: `In our series of letters from African journalists, novelist and writer Anna Karenina reflects on life in her native Russia in the 19th Century.`

In [None]:
# Summarize the documents with the "facebook/bart-large-cnn" model and show the summary
# stored in the first returned document's metadata.
summarizer = TransformersSummarizer(
    model_name_or_path="facebook/bart-large-cnn",
    use_gpu=True,
)
summary = summarizer.predict(documents=docs)
summary[0].meta["summary"]

Summary: `Every unhappy family is unhappy in its own way, says Tolstoy.`

In [None]:
# Summarize the documents with the "facebook/bart-large-xsum" model and show the summary
# stored in the first returned document's metadata.
summarizer = TransformersSummarizer(
    model_name_or_path="facebook/bart-large-xsum",
    use_gpu=True,
)
summary = summarizer.predict(documents=docs)
summary[0].meta["summary"]

Summary: `A selection of extracts from Leo Tolstoy's novel Anna Karenina.`

In [None]:
# Summarize the documents with the "philschmid/bart-large-cnn-samsum" model and show the
# summary stored in the first returned document's metadata.
summarizer = TransformersSummarizer(
    model_name_or_path="philschmid/bart-large-cnn-samsum",
    use_gpu=True,
)
summary = summarizer.predict(documents=docs)
summary[0].meta["summary"]

Summary: `The Oblonskys’ house was in chaos because the husband was having an affair with a French girl. The French girl wanted to leave the house.`