In [1]:
import os
import dotenv

# Read the key/value pairs stored in the local ``.env`` file (python-dotenv).
dotenv.load_dotenv('./.env')
# Model name handed to the chat client that writes the article summaries.
_SUMMARY_MODEL = "llama3.2:3b"

# Self-Learning with Llama

## ChromaDB store

In [2]:
import chromadb
import json
from pathlib import Path

from chromadb.utils import embedding_functions
from langchain_chroma import Chroma
from langchain.document_loaders import PyPDFLoader, JSONLoader

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.messages import HumanMessage

from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader

def metadata_func(record:dict, metadata:dict) -> dict:
    """Copy article fields from a JSON record into a document's metadata.

    Args:
        record: One parsed JSON record. The keys ``code``, ``title``,
            ``code_data``, ``code_number`` and ``link`` are copied as-is
            (``None`` when absent). ``authors`` is expected to be a list of
            names; a single string or a missing value is tolerated.
        metadata: Metadata dict to extend; it is updated in place.

    Returns:
        The same ``metadata`` dict, now holding the copied fields plus an
        ``authors`` entry with the names joined by commas (``''`` when the
        record has no authors).
    """
    for col_metadata in ['code','title','code_data','code_number','link']:
        metadata[col_metadata] = record.get(col_metadata)
    # A missing/None ``authors`` used to make ``','.join`` raise TypeError,
    # and a bare string would have been split into single characters.
    authors = record.get('authors') or []
    if isinstance(authors, str):
        authors = [authors]
    metadata['authors'] = ','.join(str(author) for author in authors)
    return metadata

def load_chunk_persist_pdf(pdf_folder_path = r"E:\codes\Artigo Forecast\pdfs",
                           collection_name = 'article_collection',
                           json_metadata_function = metadata_func,
                           chunk_size=2000, chunk_overlap=100,
                           persist_directory="../.chroma_store") -> Chroma:
    """Load every PDF in a folder, split it into chunks and store them in Chroma.

    Args:
        pdf_folder_path: Folder scanned (non-recursively) for ``.pdf`` files.
        collection_name: Name of the Chroma collection that receives the
            chunks. Any existing collection with this name inside
            ``persist_directory`` is dropped first, so re-running the
            function rebuilds the collection instead of appending duplicates.
        json_metadata_function: Currently unused; kept so existing callers
            that pass it keep working (JSON loading is not implemented).
        chunk_size: Maximum chunk size given to ``CharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks.
        persist_directory: Directory of the on-disk Chroma store.

    Returns:
        The ``Chroma`` vector store holding the embedded chunks.
    """
    documents = []
    # Sorted so that the document order does not depend on the file system.
    for file in sorted(os.listdir(pdf_folder_path)):
        if file.lower().endswith('.pdf'):
            file_path = os.path.join(pdf_folder_path, file)
            documents.extend(PyPDFLoader(file_path).load())

    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunked_documents = text_splitter.split_documents(documents)

    # The reset has to happen on the same on-disk store the vectors are written
    # to. The previous code reset an in-memory client and then wrote to the
    # persisted default collection, so `collection_name` had no effect and
    # every run appended the same chunks again.
    client = chromadb.PersistentClient(path=persist_directory)
    # Make sure the collection exists before deleting it; this avoids relying
    # on the "collection not found" exception, whose type differs between
    # chromadb releases.
    client.get_or_create_collection(collection_name)
    client.delete_collection(collection_name)

    vectordb = Chroma.from_documents(
        documents=chunked_documents,
        embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
        collection_name=collection_name,
        client=client,
    )
    return vectordb

def _result_print(result):
    print("-="*40)
    print(f"-- input: \n{result['input']}")
    print("-="*40)
    print(f"-- context: \n")
    for docs in result['context']:
        print("--"*40)
        print("-- article:")
        print(docs.metadata['source'])
        print('-- content:')
        print(docs.page_content)
    print("-="*40)
    print(f"-- answer: \n{result['answer']}")

# Articles

In [3]:
_articles = [
    'E:\\codes\\Artigo Forecast\\pdfs\\2309.13807v1.pdf',
    'E:\\codes\\Artigo Forecast\\pdfs\\2309.14518v3.pdf',
    'E:\\codes\\Artigo Forecast\\pdfs\\2310.11059v1.pdf',
    'E:\\codes\\Artigo Forecast\\pdfs\\2405.19729v1.pdf',
    'E:\\codes\\Artigo Forecast\\pdfs\\2406.04390v1.pdf',
    'E:\\codes\\Artigo Forecast\\pdfs\\2409.04542v1.pdf',
]



# Define tools
We will also define some tools that our agents will use in the future. The cell below first summarizes every article, page by page, with the local Llama model.

In [4]:
# Prompt shared by the per-page and the per-article summarization requests.
# The text (including its leading whitespace) is exactly what was sent before;
# only the text to summarize is substituted for ``{text}``.
_SUMMARY_PROMPT = """
                    resume the content of a text. you have to point the feature selection models used, the context where they are using it, the relationship with time series, and the results of the text, summarized.
                    never return authors or references, just the main concepts mentioned here.
                    this is the text:
                    <<{text}>>
                    """


def _summarize(chat_model, text: str) -> str:
    """Send ``text`` wrapped in the summary prompt to ``chat_model``; return the reply content."""
    message = HumanMessage(
        content=[
            {"type": "text", "text": _SUMMARY_PROMPT.format(text=text)}
        ],
    )
    return chat_model.invoke([message]).content


# The chat clients do not depend on the page being processed, so they are
# built once instead of once per page.
# `llm` is not used by the summarization below; it is still defined because
# it used to be a notebook-level name that other cells may rely on.
llm = ChatOpenAI(
    api_key="ollama",
    model="llama3.2:3b",
    base_url="http://localhost:11434/v1",
)
model = ChatOpenAI(
    api_key="ollama",
    model=_SUMMARY_MODEL,
    base_url="http://localhost:11434/v1",
)

# Maps each article path to its per-page summaries and one overall summary.
dict_articles = {}
for article in _articles:
    loader = PyPDFLoader(article)
    list_summaries = [_summarize(model, page.page_content) for page in loader.load()]
    if not list_summaries:
        # A PDF without pages never produced an entry; keep it that way.
        continue
    # The overall summary is requested once per article, after all pages are
    # done. It used to be recomputed (and overwritten) after every single
    # page, which doubled the number of model calls without changing the
    # final result.
    dict_articles[article] = {
        'list_summaries': list_summaries,
        'final_summary': _summarize(model, '|'.join(list_summaries)),
    }

In [9]:
import json
import io  # no longer needed by this cell; kept in case other cells rely on it

# Save the article summaries as pretty-printed UTF-8 JSON.
# The notebook is Python 3 only (it uses f-strings), so the former Python 2
# ``unicode`` fallback was dead code and the built-in ``open`` is sufficient.
# ``ensure_ascii=False`` keeps non-ASCII characters readable in the file.
with open('summaries.json', 'w', encoding='utf8') as outfile:
    json.dump(dict_articles, outfile,
              indent=4, sort_keys=True,
              separators=(',', ': '), ensure_ascii=False)