In [2]:
#| default_exp import_to_pinecone

%load_ext autoreload
%autoreload 2

In [3]:
#| export

import yaml
from typing import Dict
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from tqdm.auto import tqdm
import binascii
from ibmcloudant.cloudant_v1 import CloudantV1
from ibm_cloud_sdk_core.authenticators import BasicAuthenticator
import nltk
import os
import sys
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
import torch
import numpy as np
import argparse
import time 

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

from tools.optimal_embeddings_model.mailio_ai_libs.collect_emails import list_emails
from tools.optimal_embeddings_model.data_types.email import Email, MessageType
from api.services.embedding_service import EmbeddingService

  from tqdm.autonotebook import tqdm
[nltk_data] Downloading package punkt to /Users/igor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/igor/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
#| export

def load_config(path:str) -> Dict:
    with open(path, 'r') as f:
        config = yaml.safe_load(f)
    return config

In [5]:
cfg = load_config('../config.yaml')

In [6]:
#| export

def connect_pinecone(cfg:Dict) -> Pinecone:
    pinecone_cfg = cfg.get("pinecone")
    pc = Pinecone(api_key=pinecone_cfg.get("api_key"))
    spec = ServerlessSpec(cloud=pinecone_cfg.get("cloud"), region=pinecone_cfg.get("region"))
    index = pc.Index(host=pinecone_cfg.get("index_name"))
    return index

In [7]:
#| export

def get_db_name(address:str) -> str:
    return "userdb-" + binascii.hexlify(address.encode()).decode() 

def connect_couchdb(cfg:Dict) -> CloudantV1:
    couch_cfg = cfg.get("couchdb")
    auth = BasicAuthenticator(couch_cfg.get("username"), couch_cfg.get("password"))
    client = CloudantV1(authenticator=auth)
    client.set_service_url(couch_cfg.get("host"))
    client.set_disable_ssl_verification(True)
    return client

In [8]:
#| export

# load transformers model
def load_embedding_service(cfg: Dict) -> EmbeddingService:
    embedding_service = EmbeddingService(cfg)
    return embedding_service

In [9]:
#| export

# default folders for import
DEFAULT_FOLDERS = ["inbox", "goodreads", "archive", "sent"]

def import_to_pinecode(client, index, embedding_service:EmbeddingService, user_db: str, address:str, folders: str, batch_size:int = 500):
    """
    Import emails from couchdb to pinecone index
    Args:
        client: CloudantV1 client
        embedding_service: EmbeddingService object
        user_db: user db name
        folders: list of folders to import
        batch_size: batch size for import
    Results:
        None
    """
    processed = 0

    all_vectors = []

    for folder in folders:
        bookmark = ""
        while True:
            for emails, new_bookmark in list_emails(client, user_db, folder, bookmark=bookmark, limit=batch_size):
                if len(emails) == 0:
                    bookmark = None
                    break

                # prepare data for import
                vectors = []
                for e in tqdm(emails, desc=f"Importing {folder}", unit="email"):
                    metadata = {
                        "created": e.created,
                        "from": e.sender_email,
                        "from_name": e.sender_name,
                        "folder": e.folder,
                    }
                    
                    embedding = embedding_service.create_embedding(e)

                    vector = {
                        "id": e.message_id,
                        "values": embedding[0].tolist(),
                        "metadata": metadata,
                    }
                    vectors.append(vector)
                    processed += 1
                
                # upsert to pinecone
                index.upsert(vectors=vectors, namespace=address)

                bookmark = new_bookmark
            if not bookmark:
                break 

    print(f"Processed {processed} emails")
    return all_vectors


In [13]:
#| export

import pickle

def main(address:str):
    cfg = load_config('../config.yaml')
    client = connect_couchdb(cfg)
    index = connect_pinecone(cfg)
    embedding_service = load_embedding_service(cfg)
    user_db = get_db_name(address)
    all_vectors = import_to_pinecode(client, index, embedding_service, user_db, address, DEFAULT_FOLDERS, batch_size=500)
    print(f"generated {len(all_vectors)} vectors")
    # store to file
    with open(f"vectors_{address}.pkl", "wb") as f:
        pickle.dump(all_vectors, f)


In [14]:
main(address="0x139d1fe7306dd2b22c95c8e8343e5163fcc8aa09")

EmbeddingService using device: cpu


Importing inbox: 100%|██████████| 497/497 [00:00<00:00, 336709.59email/s]
Importing inbox: 100%|██████████| 491/491 [00:00<00:00, 720071.07email/s]
Importing inbox: 100%|██████████| 500/500 [00:00<00:00, 643495.55email/s]
Importing inbox: 100%|██████████| 500/500 [00:00<00:00, 345267.04email/s]
Importing inbox: 100%|██████████| 499/499 [00:00<00:00, 705887.92email/s]
Importing inbox: 100%|██████████| 207/207 [00:00<00:00, 655261.08email/s]
Importing goodreads: 100%|██████████| 496/496 [00:00<00:00, 673914.73email/s]
Importing goodreads: 100%|██████████| 499/499 [00:00<00:00, 707797.67email/s]
Importing goodreads: 100%|██████████| 500/500 [00:00<00:00, 697655.36email/s]
Importing goodreads: 100%|██████████| 500/500 [00:00<00:00, 674542.30email/s]
Importing goodreads: 100%|██████████| 500/500 [00:00<00:00, 575508.23email/s]
Importing goodreads: 100%|██████████| 500/500 [00:00<00:00, 359717.32email/s]
Importing goodreads: 100%|██████████| 500/500 [00:00<00:00, 534851.31email/s]
Importing 

Processed 12432 emails
generated 12432 vectors





In [None]:
address="0x139d1fe7306dd2b22c95c8e8343e5163fcc8aa09"
client = connect_couchdb(cfg)
index = connect_pinecone(cfg)
embedder = load_embedder(cfg)
user_db = get_db_name(address)

import_to_pinecode(client, index, embedder, user_db, address, DEFAULT_FOLDERS, batch_size=250)

In [None]:
address = "0x139d1fe7306dd2b22c95c8e8343e5163fcc8aa09"

In [35]:
#| export

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the main function with an address argument.")
    parser.add_argument("address", type=str, help="The address to process")
    args = parser.parse_args()
    address = args.address
    if not address:
        print("Please provide an address")
        sys.exit(1)
    main(address)

usage: ipykernel_launcher.py [-h] address
ipykernel_launcher.py: error: the following arguments are required: address


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [36]:
#| hide

import nbdev; nbdev.nbdev_export()