In [21]:
! pip install mysql-connector-python
! pip install numpy
! pip install scikit-learn
! pip install python-dotenv




In [22]:
import mysql.connector
#import dotenv and use it to load the environment variables
from dotenv import load_dotenv
load_dotenv()
import os
# Connect to MySQL
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_DATABASE")
db_user = os.getenv("DB_USERNAME")
db_password = os.getenv("DB_PASSWORD")

# Connect to MySQL using the loaded environment variables
db = mysql.connector.connect(
    host=db_host,
    port=db_port,
    database=db_name,
    user=db_user,
    password=db_password
)



In [23]:
import mysql.connector
import numpy as np
import logging
import json  # Add json to handle JSON conversion
from sklearn.cluster import KMeans

# Setup logging to log to the notebook and a file
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

db = mysql.connector.connect(
    host=db_host,
    port=db_port,
    database=db_name,
    user=db_user,
    password=db_password
)

cursor = db.cursor()

# Fetch normalized vectors
cursor.execute("SELECT normalized_vector FROM vectors")

vectors = []
for idx, v in enumerate(cursor.fetchall()):
    try:
        # Log progress in Jupyter
        if idx % 1000 == 0:
            print(f"Processing vector {idx}...")

        # Remove any brackets and whitespace, then split the string by commas
        cleaned_vector = v[0].replace('[', '').replace(']', '').strip()
        vector = np.array(list(map(float, cleaned_vector.split(','))))
        vectors.append(vector)
    except ValueError as e:
        print(f"Error converting vector at index {idx}: {v[0]}")
        logging.error(f"Error converting vector at index {idx}: {v[0]}")
        logging.exception(e)

# Convert to a NumPy array
vectors = np.array(vectors)

# Log the number of valid vectors
print(f"Successfully fetched and parsed {len(vectors)} vectors.")
logging.info(f"Successfully fetched and parsed {len(vectors)} vectors.")

# Run k-means
print("Starting K-means clustering...")
num_centroids = 100
kmeans = KMeans(n_clusters=num_centroids, max_iter=100, random_state=122342)
kmeans.fit(vectors)

# Log centroids creation completion
print(f"K-means clustering completed. Found {len(kmeans.cluster_centers_)} centroids.")
logging.info(f"K-means clustering completed. Found {len(kmeans.cluster_centers_)} centroids.")

db = mysql.connector.connect(
    host=db_host,
    port=db_port,
    database=db_name,
    user=db_user,
    password=db_password
)

cursor = db.cursor()

# Save centroids back to the database
centroids = kmeans.cluster_centers_
for index, centroid in enumerate(centroids):
    centroid_json = json.dumps(centroid.tolist())  # Convert the NumPy array to a list and then to JSON
    
    # First, check if the centroid with the given ID exists
    cursor.execute("SELECT id FROM centroids WHERE id = %s", (index + 1,))
    result = cursor.fetchone()

    if result:
        # If the centroid exists, update it
        cursor.execute(
            "UPDATE centroids SET vector = %s WHERE id = %s", 
            (centroid_json, index + 1)
        )
        logging.info(f"Updated Centroid {index + 1} in the database.")
    else:
        # If the centroid doesn't exist, insert it
        cursor.execute(
            "INSERT INTO centroids (id, vector) VALUES (%s, %s)", 
            (index + 1, centroid_json)
        )
        logging.info(f"Inserted new Centroid {index + 1} into the database.")

db.commit()
cursor.close()
db.close()

print("Centroid processing completed successfully.")
logging.info("Centroid processing completed successfully.")


Processing vector 0...
Processing vector 1000...
Processing vector 2000...
Processing vector 3000...
Processing vector 4000...
Processing vector 5000...
Processing vector 6000...
Processing vector 7000...
Processing vector 8000...
Processing vector 9000...
Processing vector 10000...
Processing vector 11000...
Processing vector 12000...
Processing vector 13000...
Processing vector 14000...
Processing vector 15000...
Processing vector 16000...
Processing vector 17000...
Processing vector 18000...
Processing vector 19000...
Processing vector 20000...
Processing vector 21000...
Processing vector 22000...
Processing vector 23000...
Processing vector 24000...
Processing vector 25000...
Processing vector 26000...
Processing vector 27000...
Processing vector 28000...
Processing vector 29000...
Processing vector 30000...
Processing vector 31000...
Processing vector 32000...
Processing vector 33000...
Processing vector 34000...
Processing vector 35000...
Processing vector 36000...
Processing vec

2024-10-03 23:47:22,151 - INFO - Successfully fetched and parsed 78389 vectors.


Successfully fetched and parsed 78389 vectors.
Starting K-means clustering...


2024-10-03 23:47:25,014 - INFO - K-means clustering completed. Found 100 centroids.


K-means clustering completed. Found 100 centroids.


2024-10-03 23:47:25,804 - INFO - Inserted new Centroid 1 into the database.
2024-10-03 23:47:25,979 - INFO - Inserted new Centroid 2 into the database.
2024-10-03 23:47:26,144 - INFO - Inserted new Centroid 3 into the database.
2024-10-03 23:47:26,321 - INFO - Inserted new Centroid 4 into the database.
2024-10-03 23:47:26,492 - INFO - Inserted new Centroid 5 into the database.
2024-10-03 23:47:26,671 - INFO - Inserted new Centroid 6 into the database.
2024-10-03 23:47:26,925 - INFO - Inserted new Centroid 7 into the database.
2024-10-03 23:47:27,093 - INFO - Inserted new Centroid 8 into the database.
2024-10-03 23:47:27,267 - INFO - Inserted new Centroid 9 into the database.
2024-10-03 23:47:27,435 - INFO - Inserted new Centroid 10 into the database.
2024-10-03 23:47:27,606 - INFO - Inserted new Centroid 11 into the database.
2024-10-03 23:47:27,773 - INFO - Inserted new Centroid 12 into the database.
2024-10-03 23:47:27,946 - INFO - Inserted new Centroid 13 into the database.
2024-10-

Centroid processing completed successfully.


In [27]:
import mysql.connector
import numpy as np
import logging
import json

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

db = mysql.connector.connect(
    host=db_host,
    port=db_port,
    database=db_name,
    user=db_user,
    password=db_password
)

cursor = db.cursor()

# Fetch centroids from the database
cursor.execute("SELECT id, vector FROM centroids")
centroids = cursor.fetchall()

# Convert centroids into NumPy arrays
centroid_ids = [c[0] for c in centroids]
centroid_vectors = np.array([np.array(json.loads(c[1])) for c in centroids])  # Centroids are pre-normalized

# Fetch all vectors in memory
cursor.execute("SELECT id, normalized_vector FROM vectors")
vectors = cursor.fetchall()

chunk_size = 100  # Adjust if necessary
total_vectors = len(vectors)
logging.info(f"Total vectors to process: {total_vectors}")

# Normalize vectors (assuming the normalized_vector might not be pre-normalized)
vector_ids = [v[0] for v in vectors]
vector_data = np.array([np.array(json.loads(v[1])) for v in vectors])
vector_magnitudes = np.linalg.norm(vector_data, axis=1)
vector_data = vector_data / vector_magnitudes[:, None]  # Normalize vectors

# Perform cosine similarity calculation
dot_products = np.dot(vector_data, centroid_vectors.T)  # No need to normalize centroids again
cosine_similarities = dot_products  # Since centroids are already normalized

# Get the index of the closest centroid for each vector
best_matches = np.argmax(cosine_similarities, axis=1)

# Group vectors by their closest centroid
grouped_vectors = {}
for idx, best_match_idx in enumerate(best_matches):
    best_centroid_id = centroid_ids[best_match_idx]
    if best_centroid_id not in grouped_vectors:
        grouped_vectors[best_centroid_id] = []
    grouped_vectors[best_centroid_id].append(vector_ids[idx])

# Perform batch updates for each group of vectors that share the same closest centroid
for centroid_id, vector_group in grouped_vectors.items():
    try:
        # Build the UPDATE query for this group
        update_query = f"UPDATE grant_vector SET centroid_id = {centroid_id} WHERE id IN ({', '.join(map(str, vector_group))})"
        
        # Log the query for debugging
        logging.debug(f"Executing query: {update_query}")
        
        # Execute the query
        cursor.execute(update_query)
        db.commit()
        
        logging.info(f"Updated {len(vector_group)} vectors for centroid {centroid_id}")
        
    except mysql.connector.Error as err:
        logging.error(f"Error updating vectors for centroid {centroid_id}: {err}")
        db.rollback()

# Close the cursor and connection
cursor.close()
db.close()

logging.info("Centroid assignment for vectors completed.")


2024-10-04 00:01:17,594 - INFO - Total vectors to process: 78389
2024-10-04 00:01:22,807 - INFO - Updated 779 vectors for centroid 61
2024-10-04 00:01:23,013 - INFO - Updated 952 vectors for centroid 21
2024-10-04 00:01:23,319 - INFO - Updated 3140 vectors for centroid 36
2024-10-04 00:01:23,490 - INFO - Updated 342 vectors for centroid 89
2024-10-04 00:01:23,692 - INFO - Updated 1814 vectors for centroid 55
2024-10-04 00:01:23,882 - INFO - Updated 610 vectors for centroid 35
2024-10-04 00:01:24,091 - INFO - Updated 2016 vectors for centroid 2
2024-10-04 00:01:24,302 - INFO - Updated 1486 vectors for centroid 86
2024-10-04 00:01:24,490 - INFO - Updated 516 vectors for centroid 80
2024-10-04 00:01:24,688 - INFO - Updated 1301 vectors for centroid 91
2024-10-04 00:01:24,895 - INFO - Updated 1296 vectors for centroid 70
2024-10-04 00:01:25,088 - INFO - Updated 1162 vectors for centroid 12
2024-10-04 00:01:25,290 - INFO - Updated 1058 vectors for centroid 30
2024-10-04 00:01:25,470 - INFO 