In [2]:
%pip install langchain-elasticsearch langchain-community langchain tiktoken langchain_openai

Collecting langchain-elasticsearch
  Downloading langchain_elasticsearch-0.3.2-py3-none-any.whl.metadata (8.3 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.3.27-py3-none-any.whl.metadata (2.3 kB)
Collecting elasticsearch<9.0.0,>=8.13.1 (from elasticsearch[vectorstore-mmr]<9.0.0,>=8.13.1->langchain-elasticsearch)
  Downloading elasticsearch-8.18.1-py3-none-any.whl.metadata (9.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langcha

In [4]:
import os
# Load the OpenAI API key from a local file and expose it through the
# OPENAI_API_KEY environment variable.
# .strip() removes surrounding whitespace: a trailing newline left in the file
# would otherwise become part of the credential.
with open("/content/api_key_openai.txt") as archivo:
  apikey = archivo.read().strip()
os.environ["OPENAI_API_KEY"] = apikey

# Load the Elasticsearch password the same way; later cells pass `elastickey`
# as the password of the "elastic" user.
with open("/content/elasticstore.txt") as archivo:
  elastickey = archivo.read().strip()

In [9]:
import pandas as pd
from langchain.schema import Document
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import ElasticsearchStore
from elasticsearch import Elasticsearch

Este indexó correctamente con LangChain.

In [None]:
# Index the store's stock catalogue into Elasticsearch: one document per CSV row,
# embedded from the product name and carrying the remaining columns as metadata.

# 1. Read the CSV
df = pd.read_csv('/content/Stock_tienda.csv')

# 2. Build one Document per row.
#    page_content (the product name) is the text that gets embedded; price and
#    stock are converted to built-in float/int before being stored as metadata.
docs = [
    Document(
        page_content=row['Producto'],
        metadata={
            'marca': row['marca'],
            'categoria': row['categoria'],
            'preciounitario': float(row['preciounitario']),
            'stock_disponible': int(row['stock_disponible']),
        },
    )
    for _, row in df.iterrows()
]

# 3. Embedding model (reads OPENAI_API_KEY from the environment)
embeddings = OpenAIEmbeddings()

# 4. Index into Elasticsearch
# NOTE(review): the URL is plain http://, so the basic-auth credentials travel
# unencrypted — switch to https:// if the cluster supports TLS.
# NOTE(review): the original call passed batch_size=500; that name does not appear
# among the documented keyword arguments of from_documents, whereas bulk sizing is
# documented as going through bulk_kwargs to the Elasticsearch bulk helper.
# Confirm against the installed langchain version.
db = ElasticsearchStore.from_documents(
    docs,
    embeddings,
    es_url="http://34.9.158.34:9200",
    es_user="elastic",
    es_password=elastickey,
    index_name="lg-stockdata",
    bulk_kwargs={"chunk_size": 500},
)
print(f"Indexados {len(docs)} documentos en el índice 'lg-stockdata'.")

Indexados 100 documentos en el índice 'lg-stockdata'.


In [10]:
# Connect to the cluster as the "elastic" user and report how many documents
# the lg-stockdata index currently holds.
es = Elasticsearch(
    "http://34.9.158.34:9200",
    basic_auth=("elastic", elastickey),
)
count_response = es.count(index="lg-stockdata")
count = count_response["count"]
print(f"Total de documentos en lg-stockdata: {count}")

Total de documentos en lg-stockdata: 100


In [None]:
# Index the store's sales records into Elasticsearch: one document per sale,
# embedded from the product name and carrying the sale details as metadata.

# 1. Read the sales CSV
df = pd.read_csv('/content/Ventas_tienda.csv')

# Clean the price columns: drop "$" and "," and parse the rest as numbers.
# errors='coerce' turns any value that still cannot be parsed into NaN
# instead of raising.
price_cols = ['preciounitario', 'precioventafinalunitario', 'totalventa']
for col in price_cols:
    df[col] = df[col].astype(str).str.replace('[$,]', '', regex=True)
    df[col] = pd.to_numeric(df[col], errors='coerce')


def _float_or_none(value):
    """Return ``value`` as a built-in float, or None when it is missing (NaN).

    The coercion above can leave NaN in the price columns. JSON has no NaN
    literal, so missing prices are stored as None (JSON null) rather than
    being sent to Elasticsearch as NaN.
    """
    return None if pd.isna(value) else float(value)


# 2. Build one Document per sale.
#    page_content (the product name) is the text that gets embedded; ids and
#    quantities are converted to built-in int, prices to built-in float or None.
docs = [
    Document(
        page_content=row['producto'],
        metadata={
            'venta_id': int(row['venta_id']),
            'fecha': row['fecha'],
            'marca': row['marca'],
            'categoria': row['categoria'],
            'preciounitario': _float_or_none(row['preciounitario']),
            'precioventafinalunitario': _float_or_none(row['precioventafinalunitario']),
            'cantidadvendida': int(row['cantidadvendida']),
            'totalventa': _float_or_none(row['totalventa']),
        },
    )
    for _, row in df.iterrows()
]

# 3. Embedding model (reads OPENAI_API_KEY from the environment)
embeddings = OpenAIEmbeddings()

# 4. Index into Elasticsearch
# NOTE(review): the URL is plain http://, so the basic-auth credentials travel
# unencrypted — switch to https:// if the cluster supports TLS.
# NOTE(review): the original call passed batch_size=500; that name does not appear
# among the documented keyword arguments of from_documents, whereas bulk sizing is
# documented as going through bulk_kwargs to the Elasticsearch bulk helper.
# Confirm against the installed langchain version.
db = ElasticsearchStore.from_documents(
    docs,
    embeddings,
    es_url="http://34.9.158.34:9200",
    es_user="elastic",
    es_password=elastickey,
    index_name="lg-ventadata",
    bulk_kwargs={"chunk_size": 500},
)
print(f"Indexados {len(docs)} documentos en el índice 'lg-ventadata'.")

Indexados 1000 documentos en el índice 'lg-ventadata'.


In [11]:
# Connect to the cluster as the "elastic" user and report how many documents
# the lg-ventadata index currently holds.
es = Elasticsearch(
    "http://34.9.158.34:9200",
    basic_auth=("elastic", elastickey),
)
count_response = es.count(index="lg-ventadata")
count = count_response["count"]
print(f"Total de documentos en lg-ventadata: {count}")

Total de documentos en lg-ventadata: 1000
