In [6]:
import os
import glob
import json
import uuid
from pathlib import Path
import requests
import urllib.parse
import pandas as pd
from dotenv import load_dotenv

In [7]:
load_dotenv(override=True)

True

In [None]:

# Upload one hand-written "core" document to the `casey` corpus and show the raw reply.
url = "https://api.vectara.io/v2/corpora/casey/documents"

headers = {
    'Content-Type': 'application/json',
    'Accept': 'application/json',
    'x-api-key': os.environ.get("VECTARA_API_KEY"),
}

# The document gets a fresh random UUID as its id on every run of this cell.
document = {
    "id": str(uuid.uuid4()),
    "type": "core",
    "metadata": {"title": "A Nice Document", "lang": "eng"},
    "document_parts": [{"text": "I'm a nice document part."}],
}
payload = json.dumps(document)

response = requests.post(url, headers=headers, data=payload)

print(response.text)

{"id":"e9c78ff5-28ce-4b4d-bae7-7818d8ffd485","metadata":{"title":"A Nice Document","lang":"eng"},"storage_usage":{"bytes_used":25,"metadata_bytes_used":58}}


In [20]:
import requests

# Query the `casey` corpus with a GET request; the query text is percent-encoded into the URL.
encoded_query = urllib.parse.quote('my powerful document')
url = f"https://api.vectara.io/v2/corpora/casey/query?query={encoded_query}"

payload = {}
headers = {
    'Accept': 'application/json',
    'x-api-key': os.environ.get("VECTARA_API_KEY"),
}

response = requests.get(url, headers=headers, data=payload)

print(response.text)

{"search_results":[{"text":"I'm a nice document part.","score":0.6243952512741089,"part_metadata":{"nice_rank":9000},"document_metadata":{"title":"A Nice Document","lang":"eng"},"document_id":"e9c78ff5-28ce-4b4d-bae7-7818d8ffd485"}]}


In [None]:
from langdetect import detect

def detect_language(text):
    """Return the language that `detect` reports for `text`.

    Detection errors are not propagated: any exception raised by `detect`
    is converted into the string ``"Error: <message>"`` and returned in
    place of a language code, so callers must check for that prefix.
    """
    try:
        return detect(text)
    except Exception as err:
        return f"Error: {err}"

# Example usage: run detect_language on a short Spanish sentence and print
# whatever it returns (a language code, or an "Error: ..." string on failure).
text = "Hola, ¿cómo estás?"
language = detect_language(text)
print(f"The detected language is: {language}")

The detected language is: es


**dist**

In [None]:
import json
import uuid
import requests
import pandas as pd
from langdetect import detect
import urllib.parse as urlparse

class VectaraClient:
    """Small wrapper around the Vectara v2 REST endpoints used in this notebook.

    Every request is authenticated with the ``x-api-key`` header and sent
    with a timeout so a stalled connection cannot hang the kernel forever.

    Args:
        api_key: Value sent in the ``x-api-key`` header.
        timeout: Seconds passed to ``requests`` as the request timeout.
    """

    BASE_URL = "https://api.vectara.io/v2"

    def __init__(self, api_key: str, timeout: float = 30):
        self.api_key = api_key
        self.timeout = timeout

    def _headers(self, with_body: bool = False) -> dict:
        """Build the request headers; add Content-Type when a JSON body is sent."""
        headers = {
            'Accept': 'application/json',
            'x-api-key': self.api_key,
        }
        if with_body:
            headers['Content-Type'] = 'application/json'
        return headers

    def _corpus_url(self, corpus_key, action: str) -> str:
        """Return the URL of `action` on a corpus, escaping the key as one path segment."""
        return f"{self.BASE_URL}/corpora/{urlparse.quote(str(corpus_key), safe='')}/{action}"

    @staticmethod
    def _detect_lang(text):
        """Return the language detected for `text`, or None when detection fails."""
        # Imported locally so the class does not depend on an extra top-level import.
        from langdetect.lang_detect_exception import LangDetectException

        try:
            return detect(text)
        except LangDetectException:
            # Raised for text without usable features (e.g. an empty title);
            # the document is then uploaded without a "lang" entry.
            return None

    def add_document_to_corpus(self, corpus_key, title, metadata, document_parts):
        """Upload one "core" document and print the raw API response.

        Args:
            corpus_key: Key of the target corpus.
            title: Document title; also the text used for language detection.
            metadata: Extra metadata merged over ``title``/``lang`` (its values
                win on conflict). ``None`` is treated as an empty dict.
            document_parts: List of part dicts, sent as ``document_parts``.

        The document id is a freshly generated UUID4. Nothing is returned.
        """
        extra = dict(metadata or {})

        doc_metadata = {"title": title}
        # Only detect when the caller did not supply a language: a supplied
        # "lang" would overwrite the detected one anyway.
        if "lang" not in extra:
            lang = self._detect_lang(title)
            if lang is not None:
                doc_metadata["lang"] = lang
        doc_metadata.update(extra)

        data = {
            "id": str(uuid.uuid4()),
            "type": "core",
            "metadata": doc_metadata,
            "document_parts": document_parts,
        }

        response = requests.request(
            "POST",
            self._corpus_url(corpus_key, "documents"),
            headers=self._headers(with_body=True),
            data=json.dumps(data),
            timeout=self.timeout,
        )
        print(response.text)

    def simple_single_corpus_query(self, corpus_key, query, limit=5):
        """Run a GET query against one corpus and return the hits as a DataFrame.

        Args:
            corpus_key: Key of the corpus to search.
            query: Free-text query; encoded by ``requests`` via ``params``.
            limit: Value sent as the ``limit`` query parameter.

        Returns:
            A DataFrame built from the ``search_results`` list of the response.

        Raises:
            requests.HTTPError: If the API answers with an error status. The
                raw body is printed first so the error message stays visible.
        """
        response = requests.request(
            "GET",
            self._corpus_url(corpus_key, "query"),
            headers=self._headers(),
            params={"query": query, "limit": limit},
            timeout=self.timeout,
        )
        print(response.text)
        # Fail with the HTTP status instead of a KeyError on 'search_results'.
        response.raise_for_status()
        return pd.DataFrame(response.json()['search_results'])

    def remove_all_documents_and_data_in_a_corpus(self, corpus_key):
        """POST to the corpus ``reset`` endpoint and print the raw API response."""
        response = requests.request(
            "POST",
            self._corpus_url(corpus_key, "reset"),
            headers=self._headers(),
            timeout=self.timeout,
        )
        print(response.text)

    def list_rerankers(self):
        """GET the ``rerankers`` endpoint and return the response body as text."""
        response = requests.request(
            "GET",
            f"{self.BASE_URL}/rerankers",
            headers=self._headers(),
            timeout=self.timeout,
        )
        return response.text

In [22]:
import os
from dotenv import load_dotenv

# Load variables from the .env file into the environment; override=True is passed to load_dotenv.
load_dotenv(override=True)

# Index os.environ instead of using .get(): a missing VECTARA_API_KEY raises
# KeyError here rather than silently building a client whose api_key is None.
vectara_client = VectaraClient(os.environ["VECTARA_API_KEY"])

In [24]:
# list_rerankers() returns the raw JSON text; parse it and tabulate its 'rerankers' list.
rerankers_payload = json.loads(vectara_client.list_rerankers())
df = pd.DataFrame(rerankers_payload['rerankers'])
df

Unnamed: 0,id,name,description,enabled
0,rnk_272725717,vectara-rrk-v1.0.0,Base reranker(english-only).,False
1,rnk_272725718,Maximum Marginal Relevance Reranker,This reranker strives to reduce redundancy whi...,False
2,rnk_272725719,Rerank_Multilingual_v1,Multilingual reranker that reorders retrieved ...,False
3,rnk_272725722,User_Defined_Function_Reranker,Custom reranker that reorders retrieved search...,False


In [25]:
print(df.description.tolist())

['Base reranker(english-only).', 'This reranker strives to reduce redundancy while maintaining query relevance in re-ranking retrieved documents.', 'Multilingual reranker that reorders retrieved search results based on relevance to the query.', 'Custom reranker that reorders retrieved search results based on a user defined function.']


In [None]:
# Upload each selected news CSV as one document: the file stem is the title
# and every row of the 'fact' column becomes one document part.
corpus_key = 'casey'

# glob returns matches in arbitrary order; sort so `[:1]` picks the same file on every run.
news_files = sorted(glob.glob('../data/clean/news/en/*.csv'))

for news in news_files[:1]:
    new_df = pd.read_csv(news)
    title = Path(news).stem
    # Drop missing facts: a NaN value would be serialised as `NaN`, which is not valid JSON.
    fact_list = [{'text': fact} for fact in new_df['fact'].dropna()]

    vectara_client.add_document_to_corpus(corpus_key, title, {}, fact_list)

{"id":"bed91294-8bbb-4ad1-90f8-138084d834ba","metadata":{"title":"8_GameChanging_Manufacturing_Trends_That_Will_Define_2025","lang":"en"},"storage_usage":{"bytes_used":3751,"metadata_bytes_used":121}}


In [92]:
# Query the 'casey' corpus with a Spanish question. The call prints the raw
# response text and returns the 'search_results' entries as a DataFrame.
search_results = vectara_client.simple_single_corpus_query('casey', '¿Que ocurrirá con la IA a partir del 2025?')

{"search_results":[{"text":"La IA puede ser utilizada para resolver problemas y crear eficiencias en la industria manufacturera, como la planificación y la entrega de iniciativas para fomentar la comprensión de la IA en la fuerza laboral.","score":0.6321282982826233,"part_metadata":{},"document_metadata":{"title":"8_GameChanging_Manufacturing_Trends_That_Will_Define_2025","lang":"en"},"document_id":"bed91294-8bbb-4ad1-90f8-138084d834ba"},{"text":"La IA puede ser utilizada para mejorar la precisión de la predicción de la demanda y la toma de decisiones automatizada en la industria manufacturera.","score":0.6312321424484253,"part_metadata":{},"document_metadata":{"title":"8_GameChanging_Manufacturing_Trends_That_Will_Define_2025","lang":"en"},"document_id":"bed91294-8bbb-4ad1-90f8-138084d834ba"},{"text":"En 2025, las empresas manufactureras priorizarán la innovación y la eficiencia en la adopción de tecnologías como la robótica, el mantenimiento predictivo y la automatización.","score":0

In [93]:
search_results

Unnamed: 0,text,score,part_metadata,document_metadata,document_id
0,La IA puede ser utilizada para resolver proble...,0.632128,{},{'title': '8_GameChanging_Manufacturing_Trends...,bed91294-8bbb-4ad1-90f8-138084d834ba
1,La IA puede ser utilizada para mejorar la prec...,0.631232,{},{'title': '8_GameChanging_Manufacturing_Trends...,bed91294-8bbb-4ad1-90f8-138084d834ba
2,"En 2025, las empresas manufactureras priorizar...",0.618242,{},{'title': '8_GameChanging_Manufacturing_Trends...,bed91294-8bbb-4ad1-90f8-138084d834ba
3,Las empresas que adopten la IA y la eficiencia...,0.614611,{},{'title': '8_GameChanging_Manufacturing_Trends...,bed91294-8bbb-4ad1-90f8-138084d834ba
4,"En 2025, la industria manufacturera se enfrent...",0.608512,{},{'title': '8_GameChanging_Manufacturing_Trends...,bed91294-8bbb-4ad1-90f8-138084d834ba
5,La adopción de la inteligencia artificial (IA)...,0.594797,{},{'title': '8_GameChanging_Manufacturing_Trends...,bed91294-8bbb-4ad1-90f8-138084d834ba
6,La sostenibilidad se está convirtiendo en una ...,0.594134,{},{'title': '8_GameChanging_Manufacturing_Trends...,bed91294-8bbb-4ad1-90f8-138084d834ba
7,"En 2025, se espera que las empresas manufactur...",0.592526,{},{'title': '8_GameChanging_Manufacturing_Trends...,bed91294-8bbb-4ad1-90f8-138084d834ba
8,Las empresas manufactureras necesitarán replan...,0.58264,{},{'title': '8_GameChanging_Manufacturing_Trends...,bed91294-8bbb-4ad1-90f8-138084d834ba
9,A medida que los robots se alejan de la línea ...,0.559317,{},{'title': '8_GameChanging_Manufacturing_Trends...,bed91294-8bbb-4ad1-90f8-138084d834ba


In [None]:
I have a "search_results" variable that is a DataFrame with the columns "text" and "document_metadata". Each "document_metadata" value is a dictionary in the format "{'title': 'string', 'url': 'string'}". I need to process this DataFrame to end up with a list of dictionaries, each with the properties: title, url, text.

In [32]:
import re

def __escape_markdown_v2(text: str) -> str:
    """
    Escape `text` for Telegram's MarkdownV2 format while keeping bold headers.

    Single-line ``**bold**`` spans are converted to MarkdownV2 bold (``*bold*``).
    Every MarkdownV2 special character, plus the backslash itself, is prefixed
    with a backslash, both outside and inside the bold spans. Only the ``*``
    delimiters of a bold span are left unescaped.

    Args:
        text: Source text that may contain ``**bold**`` spans.

    Returns:
        The escaped text; an empty string for empty input.
    """
    # Backslash is included so a literal "\" in the input cannot act as an
    # escape for the character that follows it.
    special_chars = re.compile(r'([\\_*\[\]()~`>#+\-=|{}.!])')

    # With one capture group, re.split alternates plain text (even indexes)
    # with the captured bold content (odd indexes).
    segments = re.split(r'\*\*([^\n]+?)\*\*', text)

    parts = []
    for index, segment in enumerate(segments):
        escaped = special_chars.sub(r'\\\1', segment)
        is_bold = index % 2 == 1
        parts.append(f'*{escaped}*' if is_bold else escaped)

    return ''.join(parts)

In [33]:
text = """CAREER TRANSITION GUIDANCE FOR DEVELOPERS

As the demand for data engineering skills continues to rise, many developers are considering a career transition into this field. With the right guidance and training, developers can leverage their existing skills to succeed in data engineering. According to Gina Smith, research director of IT skills for digital business at IDC, "New gen AI-fueled tech training platforms won’t get rid of skills shortages, but they can help data centers 
provide custom learning for individuals and teams."

**Key Skills for Data Engineering**
Developers looking to transition into data engineering should focus on acquiring skills in machine learning, data analysis, and model training and engineering. Additionally, knowledge of cybersecurity, automation technologies, cloud development, IT service management (ITSM), DevSecOps, and IT ops is highly valued in the industry.

**Career Development Strategies**
> "With an uneven economic recovery, and enterprises unsure about future market prospects, we’re seeing organizations look to reduce costs and maintain a tight focus on initiatives that produce value," said Forrester’s Mark.
Developers can benefit from online skill-building modules, adaptive learning platforms, and career analytics dashboards to build their workforce readiness and stay ahead in the industry."""

In [35]:
print(__escape_markdown_v2(text))

CAREER TRANSITION GUIDANCE FOR DEVELOPERS

As the demand for data engineering skills continues to rise, many developers are considering a career transition into this field\. With the right guidance and training, developers can leverage their existing skills to succeed in data engineering\. According to Gina Smith, research director of IT skills for digital business at IDC, "New gen AI\-fueled tech training platforms won’t get rid of skills shortages, but they can help data centers 
provide custom learning for individuals and teams\."

*Key Skills for Data Engineering*
Developers looking to transition into data engineering should focus on acquiring skills in machine learning, data analysis, and model training and engineering\. Additionally, knowledge of cybersecurity, automation technologies, cloud development, IT service management \(ITSM\), DevSecOps, and IT ops is highly valued in the industry\.

*Career Development Strategies*
\> "With an uneven economic recovery, and enterprises un