In [6]:
import firebase_admin
from firebase_admin import credentials

# Load the service-account key and make sure the default Firebase app exists.
# initialize_app() raises ValueError when the default app has already been
# created, so re-running this cell in a live kernel used to fail; get_app()
# raises ValueError only when the app does NOT exist yet, which lets us
# initialise exactly once and reuse the app afterwards.
cred = credentials.Certificate("firebase-credentials.json")
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

<firebase_admin.App at 0x16ca67910>

In [7]:
from firebase_admin import firestore

# Firestore client shared by the cells below; no app is passed to client().
db = firestore.client()

In [9]:
# Print every document in the 'urls' collection as "<id> => <fields dict>".
all_url_docs = db.collection('urls').stream()
for doc in all_url_docs:
    print('{} => {}'.format(doc.id, doc.to_dict()))

test => {'date': DatetimeWithNanoseconds(2023, 2, 18, 17, 0, 0, 896000, tzinfo=datetime.timezone.utc), 'username': 'arvind6902@gmail.com', 'url': 'https://www.google.com/'}


In [27]:
# Show the stored fields of every URL document whose 'username' matches.
username = 'arvind6902@gmail.com'
user_urls_query = db.collection('urls').where('username', '==', username)
old_url_docs = user_urls_query.stream()
for doc in old_url_docs:
    fields = doc.to_dict()
    print(fields)

{'title': 'Apple', 'timestamp': '11:00', 'username': 'arvind6902@gmail.com', 'url': 'https://www.apple.com/'}
{'title': 'Google Maps', 'timestamp': '10:00', 'url': 'https://www.google.com/maps', 'username': 'arvind6902@gmail.com'}


In [32]:
# Scratch write check: set the throwaway field 'abc' on each of the user's
# cluster documents and print whatever each update() call returns.
clusters = db.collection('clusters').where('username', '==', username).stream()
for cluster in clusters:
    write_result = cluster.reference.update({'abc': 'def'})
    print(write_result)

update_time {
  seconds: 1676750069
  nanos: 980595000
}

update_time {
  seconds: 1676750070
  nanos: 58867000
}



In [98]:
from embeddings import run_kmeans_2
import uuid
import numpy as np
import math

In [99]:
# Sample pages used to build the initial clusters. Each record keeps one
# page's (url, title, timestamp) together; the parallel lists consumed by the
# later cells are derived from the records so they cannot drift out of step.
username = 'arvind6902@gmail.com'
seed_pages = [
    ('https://mitsloan.mit.edu/ideas-made-to-matter/machine-learning-explained',
     'Machine Learning Explained, MIT Sloan',
     9000),
    ('https://www.bankofamerica.com/',
     'Bank of America',
     10000),
    ('https://www.mygreatlearning.com/blog/what-is-machine-learning/',
     'What is Machine Learning? Defination, Types, Applications, and more',
     20000),
    ('https://www.citi.com/',
     'Citi Online Credit Cards',
     4000),
]
urls = [page[0] for page in seed_pages]
titles = [page[1] for page in seed_pages]
timestamps = [page[2] for page in seed_pages]


In [100]:
def generateId():
    """Return a new random document id: the hex string of a freshly generated UUID4."""
    fresh_uuid = uuid.uuid4()
    return fresh_uuid.hex

In [101]:
# Build every cluster and URL document for `username` from scratch:
# cluster the page titles, create one 'clusters' document per cluster and one
# 'urls' document per page, then write them in two batches (clusters first).
cluster_ids, cluster_objs = [], []
url_ids, url_objs = [], []

kmeans, embeddings, cluster_indices = run_kmeans_2(titles)
cluster_centers = kmeans.cluster_centers_

# cluster_indices is iterated for its keys and indexed by them to get the
# member row indices (presumably a dict {cluster number: [row, ...]} — confirm
# against run_kmeans_2).
for cluster_num in cluster_indices:
    cluster_id = generateId()
    cluster_ids.append(cluster_id)
    cluster_objs.append({
        'name': f'Unnamed Cluster {cluster_num + 1}',
        'username': username,
        'center': cluster_centers[cluster_num].tolist()
    })

    # One URL document per member page, linked back to its cluster by id.
    for idx in cluster_indices[cluster_num]:
        url_ids.append(generateId())
        url_objs.append({
            'title': titles[idx],
            'url': urls[idx],
            'timestamp': timestamps[idx],
            'username': username,
            'cluster_id': cluster_id,
            'embedding': embeddings[idx].tolist()
        })

# Stage all cluster writes, then all URL writes, before committing either.
clusters_collection = db.collection('clusters')
cluster_batch = db.batch()
for doc_id, cluster_payload in zip(cluster_ids, cluster_objs):
    cluster_batch.set(clusters_collection.document(doc_id), cluster_payload)

urls_collection = db.collection('urls')
url_batch = db.batch()
for doc_id, url_payload in zip(url_ids, url_objs):
    url_batch.set(urls_collection.document(doc_id), url_payload)

cluster_batch.commit()
url_batch.commit()



Cluster 0 has 2 vectors with indices: [0, 2]
Cluster 1 has 2 vectors with indices: [1, 3]


[update_time {
   seconds: 1676789855
   nanos: 605142000
 },
 update_time {
   seconds: 1676789855
   nanos: 605142000
 },
 update_time {
   seconds: 1676789855
   nanos: 605142000
 },
 update_time {
   seconds: 1676789855
   nanos: 605142000
 }]

In [102]:
# Second batch of sample pages, added incrementally on top of the clusters
# already stored. Each record is (url, title, timestamp); the parallel lists
# used by the next cell are derived from the records.
username = 'arvind6902@gmail.com'
incoming_pages = [
    ('https://www.baeldung.com/cs/bellman-ford',
     'Bellman Ford Shortest Path Algorithm',
     2000),
    ('https://www.moderntreasury.com/',
     'Money Movement APIs | Modern Treasury',
     3000),
    ('https://stripe.com/',
     'Stripe | Payment Infrastructure for the Internet',
     5000),
    ('https://edu.gcfglobal.org/en/computerbasics/understanding-operating-systems/1/',
     'Computer Basics: Understanding Operating Systems',
     7000),
]
urls = [page[0] for page in incoming_pages]
titles = [page[1] for page in incoming_pages]
timestamps = [page[2] for page in incoming_pages]

In [103]:
# Incrementally add the pages in `urls` / `titles` / `timestamps` for `username`:
#   1. load the user's stored clusters and URLs,
#   2. re-run k-means over the stored + new titles,
#   3. map every new cluster to the nearest stored cluster centre,
#   4. update those stored centres and create the new URL documents.

# Get existing clusters (old)
existing_cluster_objs = []
existing_cluster_ids = []
cluster_refs = db.collection('clusters').where('username', '==', username).stream()
for cluster_ref in cluster_refs:
    # Append directly instead of going through globals named `id` / `d`;
    # `id` shadowed the builtin for the rest of the kernel session.
    existing_cluster_ids.append(cluster_ref.id)
    existing_cluster_objs.append(cluster_ref.to_dict())

# With no stored cluster every new cluster would map to None, and the update
# batch below would target documents that do not exist. Fail early, before
# any clustering or writes, with a message that says what is missing.
if not existing_cluster_ids:
    raise ValueError(
        f'No existing clusters found for {username!r}; '
        'create the initial clusters before adding URLs incrementally.'
    )

# Append new URLs (new)
new_url_objs = []
new_url_ids = []
for idx, title in enumerate(titles):
    new_url_objs.append({
        'title': title,
        'url': urls[idx],
        'timestamp': timestamps[idx],
        'username': username,
        # 'embedding' and 'cluster_id' are filled in after clustering below.
    })
    new_url_ids.append(generateId())

# Get existing URLs (old)
existing_url_objs = []
existing_url_ids = []
url_refs = db.collection('urls').where('username', '==', username).stream()
for url_ref in url_refs:
    existing_url_ids.append(url_ref.id)
    existing_url_objs.append(url_ref.to_dict())

# Stored rows come first and new rows second, in all three lists, so row i of
# the clustering input is all_url_objs[i]. A stored URL without a 'title'
# field raises KeyError here.
all_url_objs = existing_url_objs + new_url_objs
all_url_ids = existing_url_ids + new_url_ids
all_titles = [url_obj['title'] for url_obj in existing_url_objs] + titles

# Calculate new clusters
new_old_cluster_mapping = {}
new_cluster_objs = []
new_cluster_ids = []
kmeans, embeddings, cluster_indices = run_kmeans_2(all_titles)
for cluster_idx in cluster_indices:
    new_cluster_mean = kmeans.cluster_centers_[cluster_idx].tolist()
    new_center_vec = np.array(new_cluster_mean)  # hoisted out of the inner loop

    # Nearest stored centre by Euclidean distance; strict '<' keeps the first
    # stored cluster on ties.
    best_old_mean_id, best_old_dist = None, math.inf
    for existing_cluster_id, existing_cluster_obj in zip(existing_cluster_ids, existing_cluster_objs):
        dist = np.linalg.norm(new_center_vec - np.array(existing_cluster_obj['center']))
        if dist < best_old_dist:
            best_old_mean_id, best_old_dist = existing_cluster_id, dist

    # NOTE(review): nothing prevents two new clusters from mapping to the same
    # stored cluster; the later update for that document would then win.
    new_old_cluster_mapping[cluster_idx] = best_old_mean_id
    new_cluster_objs.append({
        'center': new_cluster_mean
    })
    new_cluster_ids.append(best_old_mean_id)

# Add embedding and cluster_id for each new URL.
# New rows sit directly after the stored rows in the clustering input, so each
# row index is known by position. The previous search by URL string could pick
# up a stored row with the same URL, and indexed new_cluster_ids by the cluster
# key, which is only right when the keys are 0..k-1 in iteration order.
cluster_of_row = {
    row_idx: cluster_idx
    for cluster_idx in cluster_indices
    for row_idx in cluster_indices[cluster_idx]
}
first_new_row = len(existing_url_objs)
for offset, new_url_obj in enumerate(new_url_objs):
    row_idx = first_new_row + offset
    if row_idx not in cluster_of_row:
        # Same outcome as before for an unassigned row: no embedding/cluster_id.
        continue
    new_url_obj['embedding'] = embeddings[row_idx].tolist()
    new_url_obj['cluster_id'] = new_old_cluster_mapping[cluster_of_row[row_idx]]

# NOTE(review): stored URL documents keep their old 'cluster_id' even if the
# re-clustering placed them in a different cluster — confirm this is intended.

# Update cluster centers for existing clusters
update_clusters_batch = db.batch()
for new_cluster_id, new_cluster_obj in zip(new_cluster_ids, new_cluster_objs):
    ref = db.collection('clusters').document(new_cluster_id)
    update_clusters_batch.update(ref, new_cluster_obj)

# Push new URLs
create_urls_batch = db.batch()
for new_url_id, new_url_obj in zip(new_url_ids, new_url_objs):
    ref = db.collection('urls').document(new_url_id)
    create_urls_batch.set(ref, new_url_obj)

update_clusters_batch.commit()
create_urls_batch.commit()



Cluster 0 has 4 vectors with indices: [0, 3, 5, 6]
Cluster 1 has 4 vectors with indices: [1, 2, 4, 7]


[update_time {
   seconds: 1676789866
   nanos: 883557000
 },
 update_time {
   seconds: 1676789866
   nanos: 883557000
 },
 update_time {
   seconds: 1676789866
   nanos: 883557000
 },
 update_time {
   seconds: 1676789866
   nanos: 883557000
 }]

In [109]:
# Count the user's URL documents with Firestore's count() query; the number is
# read from the .value of the first entry of the first result.
user_urls_query = db.collection('urls').where('username', '==', username)
count_results = user_urls_query.count().get()
count_results[0][0].value

8