# Summarization trials

## Setup

### Imports

In [None]:
from typing import List, Optional, Union
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import torch
import umap
import yaml
from langchain import HuggingFaceHub, LLMChain, PromptTemplate
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.schema.document import Document
from langchain.text_splitter import CharacterTextSplitter
from rich import print
from sklearn.cluster import DBSCAN
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

In [None]:
# Add the parent directory to the import search path so the project-local
# `src` package imported below can be resolved.
# NOTE(review): "../" is resolved against the current working directory —
# presumably the notebook is launched from a folder directly under the repo
# root; confirm.
sys.path.append("../")
from src.ai_news_digest.utils import check_gpu_availability, create_run_folder
from src.ai_news_digest.steps.benchmark import entropy

### GPU availability

In [None]:
# Device chosen by the project helper; it is stored under the "device" key of
# MODEL_KWARGS in the config cell.
# NOTE(review): presumably a torch-style device identifier (e.g. "cuda"/"cpu")
# — confirm against check_gpu_availability.
device = check_gpu_availability()

### Load secrets

In [None]:
# Read the "hugging_face" section of the local credentials file.
# yaml.safe_load parses with the SafeLoader, i.e. plain YAML types only.
with open("../conf/local/credentials.yml", "r") as credentials_file:
    hf_secrets = yaml.safe_load(credentials_file)["hugging_face"]

# Token taken from the "hf_hub_token" entry of that section.
hf_hub_token = hf_secrets["hf_hub_token"]

### Config

In [None]:
# Location of the JSON info dictionary used by this notebook.
# NOTE(review): unlike the credentials path, this one has no "../" prefix —
# confirm which working directory it is meant to be resolved against.
PATH_INFO_DICT = "data/03_primary/arxiv_dict_2023-11-06_00-22-42.json"

# Keyword arguments for model construction; only the compute device is set.
MODEL_KWARGS = dict(device=device)

# Keyword arguments for the embedding/encoding step.
# NOTE(review): key names look like sentence-transformers `encode` options —
# confirm against the call site.
ENCODE_KWARGS = dict(
    normalize_embeddings=True,
    batch_size=16,
    output_value="sentence_embedding",
    convert_to_numpy=True,
    show_progress_bar=True,
)

# Keyword arguments for the UMAP projection (2 components, cosine metric).
# NOTE(review): presumably unpacked into `umap.UMAP(...)` — confirm at the
# call site.
UMAP_KWARGS = dict(
    n_neighbors=4,
    min_dist=0.1,
    n_components=2,
    metric="cosine",
)