```markdown
# Introduction

This notebook demonstrates how to store malicious-package data in a PostgreSQL database using PGVector and OpenAI embeddings. The workflow includes:

1. Loading environment variables.
2. Reading and parsing a JSON file containing malicious package data.
3. Converting JSON entries into LangChain documents.
4. Initializing OpenAI embeddings with error handling.
5. Configuring PostgreSQL PGVector connection parameters.
6. Storing the documents in PGVector in batches with rate limit handling.


```

In [1]:
import asyncio
import base64
import json
import os
from urllib.parse import quote

import psycopg2
import requests
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
from tqdm import tqdm
# Load the environment variables from the .env file into the process
# environment; OPENAI_API_KEY is read back later through os.getenv().
load_dotenv()

True

In [2]:
import json
import os
import time

from openai import OpenAIError, RateLimitError
# Load JSON file
# The path is assembled with os.path.join so it works on every platform and
# no longer relies on "\d", which is not a valid string escape sequence.
json_file_path = os.path.join("..", "data", "train_malicious_packages_final.json")
with open(json_file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert JSON entries into LangChain documents.
# Each entry's "setup.py" text becomes the page content; the package name and
# file list are carried along as metadata.
documents = []
for entry in data:
    metadata = {
        "package_name": entry["package_name"],
        "file_list": entry["file_list"],
    }
    doc = Document(page_content=entry["setup.py"], metadata=metadata)
    documents.append(doc)


# Initialize OpenAI Embeddings with error handling
def get_embeddings(retries=2):
    """Create the OpenAI embeddings client, retrying when rate-limited.

    Args:
        retries: Maximum number of construction attempts. Defaults to 2.

    Returns:
        The ``OpenAIEmbeddings`` instance, built with the value of the
        ``OPENAI_API_KEY`` environment variable as its API key.

    Raises:
        RuntimeError: If no attempt succeeded. When attempts failed with
            ``RateLimitError`` the last one is attached as the cause.
    """
    last_error = None
    for attempt in range(retries):
        try:
            return OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))
        except RateLimitError as exc:
            last_error = exc
            if attempt == retries - 1:
                # No attempt left: sleeping would only delay the failure.
                break
            wait_time = 2 ** attempt  # Exponential backoff
            print(f"Rate limit reached. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    raise RuntimeError(
        "Failed to initialize OpenAI embeddings after multiple attempts."
    ) from last_error

# Build the embeddings client once when the cell runs; store_in_pgvector()
# reads this module-level name when it creates the vector store.
embeddings = get_embeddings()

# PostgreSQL PGVector Config
# Every setting can be overridden through the environment (for example from
# the .env file); the fallbacks are the original local-development values.
# NOTE(review): the password fallback is still a credential kept in source —
# set MALWARE_DB_PASSWORD anywhere other than a throwaway local database.
DB_PARAMS = {
    "database": os.getenv("MALWARE_DB_NAME", "malware_kb"),
    "user": os.getenv("MALWARE_DB_USER", "malware_admin"),
    "password": os.getenv("MALWARE_DB_PASSWORD", "admin_secure_password"),
    "host": os.getenv("MALWARE_DB_HOST", "localhost"),
    "port": os.getenv("MALWARE_DB_PORT", "5432"),
}

# Percent-encode the credentials so characters such as "@", ":" or "/" in a
# user name or password cannot break the structure of the URL.
_db_user = quote(DB_PARAMS["user"], safe="")
_db_password = quote(DB_PARAMS["password"], safe="")

PGVECTOR_CONNECTION_STRING = (
    f"postgresql+psycopg://{_db_user}:{_db_password}"
    f"@{DB_PARAMS['host']}:{DB_PARAMS['port']}/{DB_PARAMS['database']}"
    "?options=-csearch_path=malware"
)

BATCH_SIZE = 2  # Define batch size for processing

def store_in_pgvector(docs, batch_size=None, max_retries=5, retry_wait=60):
    """Embed ``docs`` and store them in the ``malicious_setup_py`` collection.

    Documents are written in consecutive slices. A slice rejected with
    ``RateLimitError`` is retried after a pause instead of being skipped, so
    the success message is only printed when every slice was accepted.
    Errors raised while storing are reported with ``print`` rather than
    propagated.

    Args:
        docs: Sequence of documents to pass to ``PGVector.add_documents``.
        batch_size: Documents per ``add_documents`` call; ``None`` uses the
            module-level ``BATCH_SIZE``.
        max_retries: Extra attempts allowed per batch after a rate-limit error.
        retry_wait: Seconds to sleep before retrying a rate-limited batch.

    Raises:
        ValueError: If the effective batch size is smaller than 1.
    """
    size = BATCH_SIZE if batch_size is None else batch_size
    if size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {size!r}")

    vector_store = PGVector(
        connection=PGVECTOR_CONNECTION_STRING,
        embeddings=embeddings,
        collection_name="malicious_setup_py",
        use_jsonb=True,
    )

    # Ceiling division: an exact multiple of `size` must not count an extra batch.
    total_batches = -(-len(docs) // size)
    attempts_allowed = max(max_retries, 0) + 1

    try:
        # Store documents in batches
        for batch_number, start in enumerate(range(0, len(docs), size), start=1):
            batch = docs[start:start + size]
            for attempt in range(attempts_allowed):
                try:
                    vector_store.add_documents(batch)
                    break
                except RateLimitError as e:
                    if attempt == attempts_allowed - 1:
                        # Out of retries: let the outer handler report the
                        # failure instead of silently dropping this batch.
                        raise
                    print(f"Rate limit reached: {e}")
                    # NOTE(review): retrying assumes the rejected call stored
                    # nothing for this batch — confirm against PGVector.
                    time.sleep(retry_wait)
            print(f"Stored batch {batch_number} of {total_batches}")
        print("All documents successfully stored in PGVector.")

    except Exception as e:
        print(f"Error storing documents: {e}")

# Store documents in PGVector.
# Runs when the cell executes: every document built from the JSON file is
# embedded and written, with one progress line printed per stored batch.
store_in_pgvector(documents)

Stored batch 1 of 580
Stored batch 2 of 580
Stored batch 3 of 580
Stored batch 4 of 580
Stored batch 5 of 580
Stored batch 6 of 580
Stored batch 7 of 580
Stored batch 8 of 580
Stored batch 9 of 580
Stored batch 10 of 580
Stored batch 11 of 580
Stored batch 12 of 580
Stored batch 13 of 580
Stored batch 14 of 580
Stored batch 15 of 580
Stored batch 16 of 580
Stored batch 17 of 580
Stored batch 18 of 580
Stored batch 19 of 580
Stored batch 20 of 580
Stored batch 21 of 580
Stored batch 22 of 580
Stored batch 23 of 580
Stored batch 24 of 580
Stored batch 25 of 580
Stored batch 26 of 580
Stored batch 27 of 580
Stored batch 28 of 580
Stored batch 29 of 580
Stored batch 30 of 580
Stored batch 31 of 580
Stored batch 32 of 580
Stored batch 33 of 580
Stored batch 34 of 580
Stored batch 35 of 580
Stored batch 36 of 580
Stored batch 37 of 580
Stored batch 38 of 580
Stored batch 39 of 580
Stored batch 40 of 580
Stored batch 41 of 580
Stored batch 42 of 580
Stored batch 43 of 580
Stored batch 44 of 5