In [1]:
%load_ext autoreload
%autoreload 2

In [82]:
import time
import logging
from typing import List, Dict, Any, Union

from news_summarizer.utils import batch
from transformers import pipeline
from news_summarizer.domain.clean_documents import CleanedArticle
from transformers import AutoModelForCausalLM, AutoTokenizer
from news_summarizer.domain.documents import Article, Link
from news_summarizer.domain.prompt import GenerateDatasetSamplesPrompt
from news_summarizer.datasets.generation import SummarizationDatasetGenerator

logger = logging.getLogger()
logger.setLevel(logging.INFO)


In [83]:
model_name = "Qwen/Qwen2.5-1.5B-Instruct-AWQ"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


In [84]:
system_prompt_template = """Você é um assistente especializado em resumir notícias
"""


summarization_prompt_template = """Resuma essa noticia. Para isso você deve: Ler toda a notícia. \
Determinar o tema central da notícia e as informações mais relevantes que o autor deseja transmitir. \
Focar nos fatos e dados mais importantes, deixando de lado detalhes secundários ou exemplos específicos. \
Evitar adicionar opiniões pessoais ou interpretações. O resumo deve refletir fielmente o conteúdo original. \
Escrever de forma direta e simples, evitando jargões ou termos técnicos desnecessários. \
Manter a ordem lógica das informações apresentadas no artigo original, garantindo que o resumo seja coeso e fácil de entender. \ 
Certificar-se de que todas as informações incluídas no resumo estão corretas e correspondem ao conteúdo do artigo. \

Você deve responder utilizando a mesma lingua da notícia. \
Cada resposta deve conter apenas o resumo, sem palavras chave como 'Resumo:', não deve possuir multiplos espaços entre os parágrafos.
Noticia
{article}"""

In [85]:
def fetch_all_documents():
    offset = None
    my_documents = []

    while True:
        documents, offset = CleanedArticle.bulk_find(**{}, offset=offset)
        my_documents.extend(documents)

        if offset is None:
            break

    return my_documents



In [86]:
documents = fetch_all_documents()

INFO:httpx:HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "

In [96]:
generator = SummarizationDatasetGenerator()


We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.
Device set to use cuda:0


In [127]:
generator._model.config

Qwen2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct-AWQ",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 12,
  "num_hidden_layers": 28,
  "num_key_value_heads": 2,
  "quantization_config": {
    "backend": "autoawq",
    "bits": 4,
    "do_fuse": false,
    "exllama_config": null,
    "fuse_max_seq_len": null,
    "group_size": 128,
    "modules_to_fuse": null,
    "modules_to_not_convert": null,
    "quant_method": "awq",
    "version": "gemm",
    "zero_point": true
  },
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "float16",
  "transform