----

# **Weaviate, LangChain, Cohere - RAG-based application to control biases in media articles**

Useful documentation:
* Hybrid Search: https://python.langchain.com/docs/integrations/retrievers/weaviate-hybrid
* To solve problems with Cohere and Hybrid Search: https://github.com/langchain-ai/langchain/issues/5300

----

In [1]:
import requests
import pandas as pd
from tqdm import tqdm
import time


# only if we want to import a prompt
# from langchain import hub
from langchain_community.vectorstores import Weaviate
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate

from langchain_community.llms import Cohere
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
from langchain.docstore.document import Document
import weaviate
from langchain_community.embeddings import CohereEmbeddings
from json import loads, dumps

from config_private import COHERE_KEY, NEWS_API_KEY, WV_API_KEY

## 0.Setup

In [2]:
# To adapt to Weaviate this one must be used even for English
EMBED_MODEL = "embed-multilingual-v3.0"

# the URL of the WV cluster
WEAVIATE_URL = "https://orals-cluster1-j7b4goli.weaviate.network"

In [3]:
# utility functions

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def preprocess_text(text):
    # eventually, remove ,
    text = text.replace(",", " ")

## **1. News API - Get articles**

In [6]:
#define subjects
list_of_subjects = ["Israel", "Immigration", "Elections", "NFL", "United Nations", "Gaza", "Biden", "Trump", "Republicans", "Democrats"]

columns = ['subject', 'source_id', 'name', 'author', 'title', 'url', 'urlToImage', 'publishedAt', 'content']

#create empty dataframe to fill
df_original = pd.DataFrame(columns=columns)
    
# loop over subjects and get 100 requests

for subject in tqdm(list_of_subjects):
    time.sleep(2)
    #define url
    url = (f"https://newsapi.org/v2/everything?"
               f"q={subject}&"
               f"language=en&"
               f"apiKey={NEWS_API_KEY}")

    response = requests.get(url)
    response_json = response.json()
    
    
    list_of_response = []

    for response in response_json['articles']:
        #add subject
        subject = subject
        
        #get individual columns from data    
        source_id = response['source']['id']
        name = response['source']['name']
        author = response['author']
        title = response['title']
        url = response['url']
        urlToImage = response['urlToImage']
        publishedAt = response['publishedAt']
        content = response['title'] + "\n" + response['content']

        #add to list
        list_of_response.append([subject, source_id, name, author, title, url, urlToImage, publishedAt, content])

    df_1 = pd.DataFrame(list_of_response, columns=columns)
        
    #add single df to multiple df
    df_original = pd.concat([df_original, df_1])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.80s/it]


In [7]:
df_original.head()

Unnamed: 0,subject,source_id,name,author,title,url,urlToImage,publishedAt,content
0,Israel,bbc-news,BBC News,https://www.facebook.com/bbcnews,"Hamas command in north Gaza destroyed, Israel ...",https://www.bbc.co.uk/news/world-middle-east-6...,https://ichef.bbci.co.uk/news/1024/branded_new...,2024-01-06T20:36:02Z,"Hamas command in north Gaza destroyed, Israel ..."
1,Israel,bbc-news,BBC News,https://www.facebook.com/bbcnews,South Africa's genocide case against Israel: B...,https://www.bbc.co.uk/news/world-middle-east-6...,https://ichef.bbci.co.uk/news/1024/branded_new...,2024-01-12T20:59:40Z,South Africa's genocide case against Israel: B...
2,Israel,bbc-news,BBC News,https://www.facebook.com/bbcnews,UN court to hear South Africa genocide case ag...,https://www.bbc.co.uk/news/world-middle-east-6...,https://ichef.bbci.co.uk/news/1024/branded_new...,2024-01-11T07:16:13Z,UN court to hear South Africa genocide case ag...
3,Israel,bbc-news,BBC News,https://www.facebook.com/bbcnews,Khan Younis: Israel says forces have encircled...,https://www.bbc.co.uk/news/world-middle-east-6...,https://ichef.bbci.co.uk/news/1024/branded_new...,2024-01-23T21:15:25Z,Khan Younis: Israel says forces have encircled...
4,Israel,bbc-news,BBC News,https://www.facebook.com/bbcnews,Israeli military says 21 soldiers killed in Gaza,https://www.bbc.co.uk/news/world-middle-east-6...,https://ichef.bbci.co.uk/news/1024/branded_new...,2024-01-23T07:00:02Z,Israeli military says 21 soldiers killed in Ga...


In [8]:
df_original.columns

Index(['subject', 'source_id', 'name', 'author', 'title', 'url', 'urlToImage',
       'publishedAt', 'content'],
      dtype='object')

In [9]:
df_original.shape

(1000, 9)

In [10]:
# save to excel
df_original.to_excel("news_2024_02_03.xlsx", index=False)

#### Reload from XLS

In [4]:
# reload from xls
df_original = pd.read_excel("news_2024_02_03.xlsx")

In [5]:
df_original.head()

Unnamed: 0,subject,source_id,name,author,title,url,urlToImage,publishedAt,content
0,Israel,bbc-news,BBC News,https://www.facebook.com/bbcnews,"Hamas command in north Gaza destroyed, Israel ...",https://www.bbc.co.uk/news/world-middle-east-6...,https://ichef.bbci.co.uk/news/1024/branded_new...,2024-01-06T20:36:02Z,"Hamas command in north Gaza destroyed, Israel ..."
1,Israel,bbc-news,BBC News,https://www.facebook.com/bbcnews,South Africa's genocide case against Israel: B...,https://www.bbc.co.uk/news/world-middle-east-6...,https://ichef.bbci.co.uk/news/1024/branded_new...,2024-01-12T20:59:40Z,South Africa's genocide case against Israel: B...
2,Israel,bbc-news,BBC News,https://www.facebook.com/bbcnews,UN court to hear South Africa genocide case ag...,https://www.bbc.co.uk/news/world-middle-east-6...,https://ichef.bbci.co.uk/news/1024/branded_new...,2024-01-11T07:16:13Z,UN court to hear South Africa genocide case ag...
3,Israel,bbc-news,BBC News,https://www.facebook.com/bbcnews,Khan Younis: Israel says forces have encircled...,https://www.bbc.co.uk/news/world-middle-east-6...,https://ichef.bbci.co.uk/news/1024/branded_new...,2024-01-23T21:15:25Z,Khan Younis: Israel says forces have encircled...
4,Israel,bbc-news,BBC News,https://www.facebook.com/bbcnews,Israeli military says 21 soldiers killed in Gaza,https://www.bbc.co.uk/news/world-middle-east-6...,https://ichef.bbci.co.uk/news/1024/branded_new...,2024-01-23T07:00:02Z,Israeli military says 21 soldiers killed in Ga...


In [6]:
# one problem we have is that we cannot load if some field are NaN
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subject      1000 non-null   object
 1   source_id    632 non-null    object
 2   name         1000 non-null   object
 3   author       971 non-null    object
 4   title        1000 non-null   object
 5   url          1000 non-null   object
 6   urlToImage   936 non-null    object
 7   publishedAt  1000 non-null   object
 8   content      1000 non-null   object
dtypes: object(9)
memory usage: 70.4+ KB


----------------------

-------

## **2. Weaviate, Cohere, LangChain application**

### **Create Weaviate client**

- Cohere embeddings models are used
- Weaviate as vector db
- LangChain

In [7]:
# get only text from df

txts = list(df_original['content'].values)
subjects = list(df_original['subject'].values)
publisheds = list(df_original['publishedAt'].values)
sources = list(df_original['source_id'].values)

# wrapping in LangChain list of Document
docs = [Document(page_content=txt, metadata={"subject":subject}) for txt, subject in zip(txts, subjects)] # metadata={"subject":subject, "published":published, "source_id":source}) for txt, subject, published, source in zip(txts, subjects, publisheds, sources)]

In [8]:
embed_model = CohereEmbeddings(model=EMBED_MODEL, cohere_api_key=COHERE_KEY)

In [9]:
# for now using V3
auth_client_secret = weaviate.AuthApiKey(api_key=WV_API_KEY)

client = weaviate.Client(
    url="https://orals-cluster1-j7b4goli.weaviate.network",
    auth_client_secret=auth_client_secret,
    additional_headers={"X-Cohere-Api-Key": COHERE_KEY}
)

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


#### if we want to clean up the DB (delete everything)

In [138]:
response = client.schema.get()

classes = response["classes"]

In [139]:
for obj in classes:
    print(obj["class"])

Document3


In [141]:
# with thos one you can clean the database
for obj in classes:
    print(obj["class"])
    # to delete uncomment
    # client.schema.delete_class(obj["class"])

Document3


In [50]:
# here we load documents + metadata in the Vector Store

db = Weaviate.from_documents(docs, embed_model, by_text=False, client=client)

In [51]:
retriever = db.as_retriever(search_kwargs={"k":10})

In [52]:
# add a reranker

compressor = CohereRerank(cohere_api_key=COHERE_KEY, top_n=6)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

In [53]:
llm = Cohere(model="command", max_tokens=1024, temperature=0.1, cohere_api_key=COHERE_KEY)

### Here we define the entire chain

In [54]:
prompt_template = """
You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
---------------------
Context: {context}
---------------------
Question: {question} 
Answer:
"""

prompt = PromptTemplate.from_template(prompt_template)

#
# a little more complex because we want metadata and references
#
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": compression_retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

#### Do a test

In [55]:
query = "List all the info you have regarding Israel. Give as many details as possible."

In [56]:
result = rag_chain_with_source.invoke(query)

In [57]:
print(result['answer'])

 I have retrieved some information on the following topics: 

1. Sexual violence and genocide against Palestinian women by Hamas.
2. The US aid to Israel and whether it is becoming less bipartisan. 
3. Violations of human rights by the Israeli military in Gaza's second city, Khan Younis, and other attacks on Gaza. 

I do not have specific details on the amount of US aid to Israel over the last 75 years or exact figures on the advancement of troops into Khan Younis. 

I do not have information on the Israeli soccer player who referenced Hamas' hostages in Turkey. 

Can I help you with anything else? 


#### List references

In [58]:
for doc in result['context']:
    print(doc.page_content)
    print(f"Subject: {doc.metadata['subject']}")
    print("-----------------------")
    print("")

Israelis tell British MPs of evidence of Hamas sexual violence
Israelis who dealt with the bodies of victims of Hamas's 7 October attacks on Israel have told British MPs and peers that there was "deliberate, systematic genital mutilation" of female victims and t… [+5747 chars]
Subject: Gaza
-----------------------

Israelis tell British MPs of evidence of Hamas sexual violence
Israelis who dealt with the bodies of victims of Hamas's 7 October attacks on Israel have told British MPs and peers that there was "deliberate, systematic genital mutilation" of female victims and t… [+5747 chars]
Subject: Israel
-----------------------

Unconditional aid to Israel is no longer the bipartisan certainty it once was as US senators begin to question Israel's campaign in Gaza and the West Bank
Pro-Palestinian demonstrators in Chicago demand a halt to US aid to Israel.Anadolu
<ul><li>The US has given about $130 billion in aid to Israel over the last 75 years.</li><li>Israel is the largest … [+4114 ch

#### Test hybrid Search

In [10]:
class_obj = {
    "class": "Document3",
    "vectorizer": "text2vec-cohere"
}
client.schema.create_class(class_obj)

UnexpectedStatusCodeError: Create class! Unexpected status code: 422, with response body: {'error': [{'message': 'class name "Document3" already exists'}]}.

In [11]:
retriever_hb = WeaviateHybridSearchRetriever(
    client=client, index_name="Document3", text_key="text"
)

In [131]:
# ids = retriever_hb.add_documents(docs)

[ERROR] Batch ConnectionError Exception occurred! Retrying in 2s. [1/3]


In [12]:
results = retriever_hb.get_relevant_documents("List everything about Gaza.")

In [13]:
results

[Document(page_content='Why Netanyahu Can’t Talk About Post-War Gaza\nWhen Israel launched its war in Gaza in the aftermath of the Oct. 7 massacre, in which some 1,200 people were killed by Palestinian militants, it did so with one stated goal: the total elimination of… [+6958 chars]'),
 Document(page_content='LA Times bans 38 of its journalists from reporting on Gaza after they sign open letter critical of coverage\nAt the Los Angeles Times, 38 journalists were banned from covering Gaza for a minimum of three months after signing an open letter about biased coverage of the Israel-Hamas war. The letter, "We conde… [+1449 chars]'),
 Document(page_content="More than 25,000 now killed in Gaza, Hamas-run health ministry says\nMore than 25,000 people have now been killed in Gaza since Israel's offensive began, according to the Hamas-run health ministry.\nIt said there had been 178 deaths in the last 24 hours, making it one… [+4268 chars]"),
 Document(page_content='Israeli military says 21 s

In [14]:
# Hybrid Search

# set how many docs to return
retriever_hb.k = 6

results = retriever_hb.get_relevant_documents(
    "List all the info you have regarding Israel. Give as many details as possible.",
    where_filter={
        "path": ["subject"],
        "operator": "Equal",
        "valueString": "Biden",
    },
)

In [15]:
results

[Document(page_content="Up First briefing: Strike kills U.S. troops in Jordan; Israel-Hamas talks continue\nGood morning. You're reading the Up First newsletter. Subscribe here to get it delivered to your inbox, and listen to the Up First podcast for all the news you need to start your day.\nToday's top st… [+5110 chars]"),
 Document(page_content="Biden: Three Americans killed, 'many' wounded in drone attack by Iran-backed militia in Jordan\nCOLUMBIA, S.C. (AP) Three American service members were killed and many were wounded in a drone strike in northeast Jordan near the Syrian border, President Joe Biden said in a statement Sunday. He a… [+3259 chars]"),
 Document(page_content="Biden considers halting some US military aid to force Israel to scale back its offensive in Gaza\nProtesters in Los Angeles denounce the Biden administration's support of Israel.David McNew/Getty Images\n<ul><li>Biden is considering curtailing some weapons support for Israel, NBC reported.</li><l… [+1831 chars]