In [None]:
# !wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
# !tar -xf sift.tar.gz
# !mkdir data/sift1M -p
# !mv sift/* data/sift1M

In [None]:
import psutil

def get_memory_usage_mb():
    process = psutil.Process()
    memory_info = process.memory_info()

    return memory_info.rss / (1024 * 1024)

import time
import faiss
from faiss.contrib.datasets import DatasetSIFT1M

ds = DatasetSIFT1M()
xq = ds.get_queries() ## 쿼리 벡터, (10000, 128)
xb = ds.get_database() ## 문서 벡터 (1000000, 128)
gt = ds.get_groundtruth() ## 각 쿼리 벡터에 대해 상위 100개의 가장 가까운 이웃들의 인덱스 (10000, 100)

In [None]:
k = 1
d = xq.shape[1]
nq = 1000
xq = xq[:nq]

for i in range(1, 10, 2):
    start_memory = get_memory_usage_mb()
    start_indexing = time.time()

    ## Indexing
    index = faiss.IndexFlatL2(d)
    index.add(xb[:(i+1) * 100000])

    end_indexing = time.time()
    end_memory = get_memory_usage_mb()

    t0 = time.time()
    D, I = index.search(xq, k)
    t1 = time.time()
    print(f"데이터 {(i+1) * 100000}개 : ")
    print(f"색인: {(end_indexing - start_indexing) * 1000 :.3f} ms ({end_memory - start_memory:.3f} MB) 검색: {(t1 - t0) * 1000 / nq :.3f} ms\n")

In [None]:
import numpy as np
import time
import faiss

k = 1
d = xq.shape[1]
nq = 1000
xq = xq[:nq]

# IndexFlatL2는 기본적인 L2 거리 기반 KNN 검색을 수행합니다
index = faiss.IndexFlatL2(d)

start_memory = get_memory_usage_mb()
start_index = time.time()

index.add(xb)

end_memory = get_memory_usage_mb()
end_index = time.time()
print(f"색인 시간 : {end_index - start_index}s, 메모리 사용량 : {end_memory - start_memory}MB")

t0 = time.time()
D, I = index.search(xq, k)  # k개의 가장 가까운 이웃을 검색
t1 = time.time()

recall_at_1 = np.equal(I, gt[:nq, :1]).sum() / float(nq)
print(f"{(t1 - t0) * 1000.0 / nq:.3f}ms per query, R@1 {recall_at_1:.3f}\n")

In [None]:
import numpy as np

k = 1
d = xq.shape[1]
nq = 1000
xq = xq[:nq]

for m in [8, 16, 32, 64]:
    index = faiss.IndexHNSWFlat(d, m)
    
    time.sleep(3)
    start_memory = get_memory_usage_mb()
    start_index = time.time()

    index.add(xb)
    
    end_memory = get_memory_usage_mb()
    end_index = time.time()
    print(f"M : {m} - 색인 시간 : {end_index - start_index}s, 메모리 사용량 : {end_memory - start_memory}MB")

    t0 = time.time()
    D, I = index.search(xq, k)
    t1 = time.time()

    recall_at_1 = np.equal(I, gt[:nq, :1]).sum() / float(nq)
    print(f"{(t1 - t0) * 1000.0 / nq:.3f}ms per query, R@1 {recall_at_1:.3f}\n")

In [None]:
k = 1
d = xq.shape[1]
nq = 1000
xq = xq[:nq]

for ef_construction in [40, 80, 160, 320]:
    index = faiss.IndexHNSWFlat(d, 32)
    index.hnsw.efConstruction = ef_construction

    time.sleep(3)
    start_memory = get_memory_usage_mb()
    start_index = time.time()
    index.add(xb)
    end_memory = get_memory_usage_mb()
    end_index = time.time()

    print(f"efConstruction : {ef_construction} - 색인 시간 : {end_index - start_index}s, 메모리사용량 : {end_memory - start_memory}MB")

    t0 = time.time()
    D, I = index.search(xq, k)
    t1 = time.time()

    recall_at_1 = np.equal(I, gt[:nq, :1]).sum() / float(nq)
    print(f"{(t1-t0) * 1000.0 / nq:.3f}ms per query, R@1 {recall_at_1:.3f}")

In [None]:
for ef_search in [16, 32, 64, 128]:
    index.hnsw.efSearch = ef_search
    t0 = time.time()
    D, I = index.search(xq, k)
    t1 = time.time()

    recall_at_1 = np.equal(I, gt[:nq, :1]).sum() / float(nq)
    print(f"{(t1-t0) * 1000.0 / nq:.3f}ms per query, R@1 {recall_at_1:.3f}")

In [1]:
import os
from dotenv import load_dotenv
load_dotenv("../keys.env")

api_key = os.getenv('PINECONE_API_KEY')
os.environ['PINECONE_API_KEY'] = api_key

In [4]:
from pinecone import Pinecone, ServerlessSpec

## 인덱스 생성
pc = Pinecone(api_key=api_key)
pc.create_index('llm-book', spec=ServerlessSpec("aws", "us-east-1"), dimension=768)

## 인덱스 불러오기
index = pc.Index("llm-book")

In [5]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

## 임베딩 생성
sentence_model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')

## 데이터셋 불러오기
klue_dp_train = load_dataset('klue', 'dp', split='train[:100]')

## 임베딩
embeddings = sentence_model.encode(klue_dp_train['sentence'])

In [6]:
embeddings = embeddings.tolist()

insert_data = []
for idx, (embedding, text) in enumerate(zip(embeddings, klue_dp_train['sentence'])):
    insert_data.append({'id' : str(idx), 'values' : embedding, 'metadata' : {'text' : text}})

In [7]:
upsert_response = index.upsert(vectors=insert_data, namespace='llm-book-sub')

In [8]:
query_response = index.query(
    namespace='llm-book-sub', ## 검색할 네임스페이스
    top_k=10, ## 몇개의 결과를 반환할지
    include_values=True, ## 벡터 임베딩 반환 여부
    include_metadata=True, ## 메타 데이터 반환 여부
    vector=embeddings[0] ## 검색할 벡터 임베딩
)

print(query_response)

{'matches': [{'id': '0',
              'metadata': {'text': '해당 그림을 보면 디즈니 공주들이 브리트니 스피어스의 앨범이나 뮤직비디오, '
                                   '화보 속 모습을 똑같이 재연했다.'},
              'score': 1.00002015,
              'values': [-1.10073376,
                         0.22047776,
                         0.742353439,
                         0.40755263,
                         0.408453226,
                         -0.566834807,
                         0.120408386,
                         0.961136639,
                         0.122866727,
                         0.0803644,
                         -0.269531339,
                         1.06352246,
                         0.799612105,
                         -0.652636886,
                         0.280845195,
                         0.298936814,
                         0.349924058,
                         -0.300158411,
                         -0.22010456,
                         0.143384576,
                         0.499211878,
     

In [11]:
## 업데이트
new_text = '변경할 새로운 텍스트'
new_embedding = sentence_model.encode(new_text).tolist()

update_response = index.update(
    id='0', ## 기존 문서 아이디
    values = new_embedding,
    set_metadata={'text' : new_text},
    namespace='llm-book-sub'

)

## 삭제
delete_response = index.delete(ids=['0'], namespace='llm-book-sub')

In [None]:
# ## pinecone 설정.
# from pinecone import Pinecone

# pc = Pinecone(api_key=api_key)
# pc.create_index("quickstart", dimension=1536, metric="euclidean", spec=ServerlessSpec("aws", 'us-east-1'))
# pinecone_index = pc.Index("quickstart")

# ## 라마인덱스에 pinecone 연결
# from llama_index.core import VectorStoreIndex
# from llama_index.vector_stores.pinecone import PineconeVectorStore
# from llama_index.core import StorageContext

# vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

NameError: name 'api_key' is not defined

In [3]:
from datasets import load_dataset

dataset = load_dataset("poloclub/diffusiondb", "2m_first_1k", split="train")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/25.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/581M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/195M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
example_idx = 867
original_image = dataset[example_idx]['image']
original_prompt = dataset[example_idx]['prompt']
print(original_prompt)

cute fluffy baby cat rabbit lion hybrid mixed creature character concept, with long flowing mane blowing in the wind, long peacock feather tail, wearing headdress of tribal peacock feathers and flowers, detailed painting, renaissance, 4 k 


In [None]:
import requests
import base64
from io import BytesIO

def make_base64(image):
  buffered = BytesIO()
  image.save(buffered, format="JPEG")
  img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
  return img_str

def generate_description_from_image_gpt4(prompt, image64):
  headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {client.api_key}"
  }
  payload = {
      "model": "gpt-4o",
      "messages": [
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": prompt
            },
            {
              "type": "image_url",
              "image_url": {
                "url": f"data:image/jpeg;base64,{image64}"
              }
            }
          ]
        }
      ],
      "max_tokens": 300
  }
  response_oai = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
  result = response_oai.json()['choices'][0]['message']['content']
  return result

image_base64 = make_base64(original_image)
described_result = generate_description_from_image_gpt4("describe provided image", image_base64)

In [None]:
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

from dotenv import load_dotenv
load_dotenv("../keys.env")

pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv("OPENAI_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)
os.environ["OPENAI_API_KEY"] = openai_api_key
client = OpenAI()

In [None]:
print(pc.list_indexes())

index_name = "llm-multimodal"
try:
  pc.create_index(
    name=index_name,
    dimension=512,
    metric="cosine",
    spec=ServerlessSpec(
      "aws", "us-east-1"
    )
  )
  print(pc.list_indexes())
except:
  print("Index already exists")
index = pc.Index(index_name)

In [None]:
import torch
from tqdm.auto import trange
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, CLIPTextModelWithProjection

device = "cuda" if torch.cuda.is_available() else "cpu"

text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

tokens = tokenizer(dataset['prompt'], padding=True, return_tensors="pt", truncation=True)
batch_size = 16
text_embs = []
for start_idx in trange(0, len(dataset), batch_size):
    with torch.no_grad():
        outputs = text_model(input_ids = tokens['input_ids'][start_idx:start_idx+batch_size],
                        attention_mask = tokens['attention_mask'][start_idx:start_idx+batch_size])
        text_emb_tmp = outputs.text_embeds
    text_embs.append(text_emb_tmp)
text_embs = torch.cat(text_embs, dim=0)
text_embs.shape # (1000, 512)

In [None]:
input_data = []
for id_int, emb, prompt in zip(range(0, len(dataset)), text_embs.tolist(), dataset['prompt']):
  input_data.append(
      {
          "id": str(id_int),
          "values": emb,
          "metadata": {
              "prompt": prompt
          }
      }
  )

index.upsert(
  vectors=input_data
)

In [None]:
from PIL import Image

def generate_image_dalle3(prompt):
  response_oai = client.images.generate(
    model="dall-e-3",
    prompt=str(prompt),
    size="1024x1024",
    quality="standard",
    n=1,
  )
  result = response_oai.data[0].url
  return result

def get_generated_image(image_url):
  generated_image = requests.get(image_url).content
  image_filename = 'gen_img.png'
  with open(image_filename, "wb") as image_file:
      image_file.write(generated_image)
  return Image.open(image_filename)

In [None]:
# GPT-4o가 만든 프롬프트로 이미지 생성
gpt_described_image_url = generate_image_dalle3(described_result)
gpt4o_prompt_image = get_generated_image(gpt_described_image_url)
gpt4o_prompt_image

In [None]:
# 원본 프롬프트로 이미지 생성
original_prompt_image_url = generate_image_dalle3(original_prompt)
original_prompt_image = get_generated_image(original_prompt_image_url)
original_prompt_image

In [None]:
# 이미지 임베딩으로 검색한 유사 프롬프트로 이미지 생성
searched_prompt_image_url = generate_image_dalle3(dataset[searched_idx]['prompt'])
searched_prompt_image = get_generated_image(searched_prompt_image_url)
searched_prompt_image