In [None]:
!pip install chromadb==0.4.20 langchain==0.0.342

### Summarization chain

In [2]:
import chromadb
from chromadb.utils import embedding_functions

# Instantiate the Chroma clients and the embedding function.
# `chroma_client` (default client, presumably in-memory/ephemeral — confirm) is the one the
# collection cells below operate on.
chroma_client = chromadb.Client()
# NOTE(review): `client` points at an on-disk store under chroma_db/ but is not referenced
# anywhere else in the visible notebook, so nothing added through `chroma_client` ends up
# there — confirm which of the two clients is actually intended.
client = chromadb.PersistentClient(path="chroma_db/")
# Embedding function handed to the collection when it is created.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

In [6]:
# Delete any existing collection so the cells below start from a clean slate.
# Catch `Exception` rather than using a bare `except:` so KeyboardInterrupt / SystemExit
# still propagate, and show what was actually raised instead of hiding it.
try:
    chroma_client.delete_collection('portuguese_mililm_l6_v2')
except Exception as exc:
    print("collection didn't exist yet")
    print(f"(delete_collection raised: {exc!r})")

In [7]:
# Fetch the collection if it is already there, otherwise create it; the
# sentence-transformer embedding function is attached either way.
collection = chroma_client.get_or_create_collection(
    embedding_function=sentence_transformer_ef,
    name="portuguese_mililm_l6_v2",
)

In [9]:
import json 

def flatten(xss):
    """Return one flat list holding the items of every inner iterable (one level deep)."""
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat
   
def get_livre_data():
    """Load the LIVRE 2024 programme chunks from the seven `livre-2024_<n>.json` files.

    The files `livre-2024_1.json` ... `livre-2024_7.json` are read from the current
    working directory. Each file is expected to contain a JSON list of objects that
    have at least a `content` and a `chapter` key.

    Returns:
        tuple: `(documents, metadatas)` — two lists of equal length. `documents` holds
        every entry's `content`; `metadatas` holds one dict per entry with the entry's
        `chapter` plus the fixed `party='livre'` and `year=2024`.

    Raises:
        FileNotFoundError: if one of the seven files is missing.
        json.JSONDecodeError: if a file is not valid JSON.
        KeyError: if an entry lacks `content` or `chapter`.
    """
    entries = []

    for part in range(1, 8):
        # The context manager closes each handle; the explicit encoding keeps the
        # accented Portuguese text intact regardless of the platform's default locale.
        with open(f'livre-2024_{part}.json', encoding='utf-8') as fh:
            # extend() concatenates the per-file lists directly into one flat list.
            entries.extend(json.load(fh))

    # Split the entries into the parallel lists Chroma expects.
    documents = [entry['content'] for entry in entries]
    metadatas = [{'chapter': entry['chapter'], 'party': 'livre', 'year': 2024} for entry in entries]

    return documents, metadatas

In [17]:
# Load the LIVRE documents and their per-document metadata dicts (parallel lists).
documents_livre, metadatas_livre = get_livre_data()

In [21]:
def get_il_data():
    """Load the IL 2022 programme chunks from `programs/json/il-2022.json`.

    The file is expected to contain a JSON list of objects with a `chapter` key and a
    `content` key whose value is either `None` or a list of strings. Entries whose
    `content` is `None` are skipped; for the rest the strings are joined with single
    spaces to form one document.

    Returns:
        tuple: `(documents, metadatas)` — two lists of equal length. `metadatas` holds
        one dict per kept entry with the entry's `chapter` plus the fixed `party='il'`
        and `year=2022`.

    Raises:
        FileNotFoundError: if the JSON file is missing.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if an entry lacks `content` or (for kept entries) `chapter`.
    """
    # The context manager closes the handle; the explicit encoding keeps the accented
    # Portuguese text intact regardless of the platform's default locale.
    with open('programs/json/il-2022.json', encoding='utf-8') as fh:
        data = json.load(fh)

    documents = []
    metadatas = []

    for entry in data:
        # Entries without any content carry nothing to embed, so they are dropped.
        if entry['content'] is None:
            continue

        documents.append(' '.join(entry['content']))
        metadatas.append({'chapter': entry['chapter'], 'party': 'il', 'year': 2022})

    return documents, metadatas

In [22]:
# Load the IL documents and their per-document metadata dicts (parallel lists).
documents_il, metadatas_il = get_il_data()

In [29]:
# Merge both parties' corpora, LIVRE first, keeping documents and metadata aligned.
full_docs = [*documents_livre, *documents_il]
full_metadata = [*metadatas_livre, *metadatas_il]
# One string id per document: its position in the merged list.
full_ids = list(map(str, range(len(full_docs))))

# Show the three lengths side by side (necessary for chroma).
len(full_docs), len(full_metadata), len(full_ids)

(430, 430, 430)

In [30]:
# Store the merged corpus in the Chroma collection: ids, documents and metadata
# are passed as parallel lists.
collection.add(ids=full_ids, documents=full_docs, metadatas=full_metadata)

In [31]:
# Free-text query (in Portuguese) used for the similarity search against the collection.
# Adjacent literals are concatenated by Python into one single string.
query = (
    'Deixemos as classes dominantes tremerem perante uma revolução comunista. '
    'Os proletários não possuem nada a perder a não ser suas correntes. '
    'Eles têm um mundo para ganhar. '
    'Trabalhadores de todos os países, uni-vos!'
)

In [33]:
# Query the collection and keep the 5 best-matching entries; these are what gets
# handed to the LLM further down.
results = collection.query(query_texts=[query], n_results=5)

results

{'ids': [['69', '25', '282', '20', '386']],
 'distances': [[0.6900758743286133,
   0.7773330211639404,
   0.7953920364379883,
   0.8023964166641235,
   0.8042513132095337]],
 'metadatas': [[{'chapter': 'Educação', 'party': 'livre', 'year': 2024},
   {'chapter': 'Igualdade, Justiça Social e Liberdade',
    'party': 'livre',
    'year': 2024},
   {'chapter': 'Estado e Instituições', 'party': 'livre', 'year': 2024},
   {'chapter': 'Igualdade, Justiça Social e Liberdade',
    'party': 'livre',
    'year': 2024},
   {'chapter': 'Educação', 'party': 'il', 'year': 2022}]],
 'embeddings': None,
 'documents': [['Transformar o 12º ano num ano zero de entrada na universidade e politécnicos, permitindo não só recuperar a geração Covid como torná-la numa das mais capacitadas a entrar no ensino superior — ou na vida profissional para aqueles que fizessem apenas o ano zero e não quisessem prosseguir. O ano zero da universidade deve ser para todos: gradual e tendencialmente lecionado em ambiente unive

In [35]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk the retrieved documents so they fit the LLM's context window:
# first wrap them (with their metadata) as documents, then run the splitter over those.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
texts = text_splitter.split_documents(
    text_splitter.create_documents(
        results["documents"][0],
        metadatas=results['metadatas'][0],
    )
)

In [69]:
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import GPT4All
from langchain.prompts import PromptTemplate

# Prompt for the summarization chain. The only placeholder in the template is `{text}`,
# so that is the variable that has to be declared in `input_variables`
# (the previously declared "page_content" does not occur in the template).
prompt_template = """You will receive documents in Portuguese. Summarize the documents in Portuguese in bullet points.
              ```{text}```
  """

prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

In [70]:
# Location of the local model file (see https://gpt4all.io/index.html for models);
# replace with your desired local file path.
local_path = "llm_model/gpt4all-falcon-q4_0.gguf"

In [71]:
# Streaming handler: callbacks support token-wise streaming to stdout.
callbacks = [StreamingStdOutCallbackHandler()]

# verbose=True is required so the callbacks are passed to the callback manager.
llm = GPT4All(
    model=local_path,
    callbacks=callbacks,
    verbose=True,
)

In [72]:
# Summarization chain of type "stuff", driven by the local LLM and the custom prompt.
stuff_chain = load_summarize_chain(
    llm,
    chain_type="stuff",
    prompt=prompt,
)

In [73]:
# Run the summarization chain over the retrieved chunks. The returned text is kept in
# `summary` (None if the run failed) so later cells can reuse it instead of relying
# only on what the streaming callback printed.
summary = None
try:
    summary = stuff_chain.run(texts)
except Exception as e:
    print(
        "The code failed since it won't be able to run inference on such a huge context and throws this exception: ",
        e,
    )


The Portuguese government plans to implement several measures to address the issue of missing children in Portugal. These include:

* Establishing a National Missing Persons Unit within the Ministry of Justice and Human Rights, which will coordinate all missing persons investigations and provide support to families of missing persons.
* Creating a database of missing persons, including information on their age, gender, location, and other relevant details. This database will be accessible to law enforcement agencies, NGOs, and other organizations working to prevent and address missing persons cases.
* Implementing a national hotline for reporting missing persons, which will be staffed by trained professionals who can provide support and guidance to families of missing persons.
* Establishing a National Missing Persons Advisory Committee, which will coordinate all efforts related to missing persons in Portugal and ensure that the government is taking a comprehensive approach to address

### Notes
- apparently it understands Portuguese, since both the content and the query were in PT - but it doesn't answer in PT yet
- haven't really figured out how to add the metadata tags to the prompt (besides actually giving it two lists)
    - think I can manually split the docs per party, summarize each with the above chain and then feed the individual summaries to a ```map_reduce``` chain to make a summary of summaries

- will first maybe have a look at a chatbot or question/answering option