In [None]:
import json
import os
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sqlalchemy import create_engine

def load_credentials(path = "aws_rds_credentials.json"):
    """Load settings from a JSON file into ``os.environ``.

    Every top-level key of the JSON object becomes an environment variable
    with the same name, overwriting any existing variable of that name.

    Args:
        path: Location of the JSON file holding one flat object of settings.

    Returns:
        None. The function is called for its side effect on ``os.environ``.

    Raises:
        TypeError: If a value is neither a string nor a number (for example
            ``null``, a boolean, a list or a nested object), because
            ``os.environ`` only stores strings.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        config = json.load(file)

    for key, value in config.items():
        # os.environ rejects non-str values, so a numeric entry such as
        # "port": 5432 used to raise TypeError. Numbers are stored as text;
        # bool is excluded on purpose (it is an int subclass) so that it
        # still fails loudly like every other unsupported type.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            value = str(value)
        os.environ[key] = value

# Look-back window for the rows pulled from the tracking table.
time_interval = 90 #days

# Copies the connection settings from the credentials file into os.environ.
load_credentials()

# Percent-encode the user and password before placing them in the URL:
# reserved characters such as '@', ':', '/', '#' or '?' in a credential would
# otherwise be read as URL delimiters and the connection string mis-parsed.
# safe='' also encodes '/', which quote() leaves untouched by default.
db_user = quote(os.environ['user'], safe='')
db_password = quote(os.environ['password'], safe='')
aws_rds_url = (
    f"postgresql://{db_user}:{db_password}"
    f"@{os.environ['host']}:{os.environ['port']}/{os.environ['database']}?sslmode=require"
)

engine = create_engine(aws_rds_url)
# int() guarantees that only a whole number of days is ever spliced into the
# SQL text, even if time_interval is later edited to something unexpected.
sql_query = f"""SELECT brand_title, price_numeric, status, catalog_id, size_title, color1_id
               FROM public.tracking_staging 
               WHERE date >= CURRENT_DATE - INTERVAL '{int(time_interval)} days'
               """
data = pd.read_sql(sql_query, engine)
data

In [None]:
# Per-catalog feature table: one row per catalog_id holding the number of
# non-null prices, the median and standard deviation of price, and the number
# of distinct colours and sizes observed in that catalog.
# NOTE(review): rows are grouped by catalog_id (not brand_title), so
# catalog_id_unique is 1 for every row and carries no information.
catalog_stats = data.groupby('catalog_id').agg(
    product_count=('price_numeric', 'count'),
    price_median=('price_numeric', 'median'),
    price_std=('price_numeric', 'std'),
    catalog_id_unique=('catalog_id', 'nunique'),
    color1_id_unique=('color1_id', 'nunique'),
    size_title_unique=('size_title', 'nunique'),
)

# Keep only catalogs with more than 300 priced rows.
has_enough_products = catalog_stats["product_count"].gt(300)
catalog_stats = catalog_stats.loc[has_enough_products]
catalog_stats

In [None]:
# Project the per-catalog feature table onto its first two principal components.
# NOTE(review): the features go into PCA unscaled, so large-magnitude columns
# (presumably product_count) may dominate the components — confirm whether the
# inputs should be standardised first.
pca = PCA(n_components=2)
X_2d = pca.fit_transform(catalog_stats)

# Re-attach the catalog_id index so each projected point can be traced back to
# its catalog. No clustering is performed here: the "cluster" label used in
# the plot is simply the catalog_id of the row.
data_with_labels = pd.DataFrame(X_2d, index=catalog_stats.index, columns=['PC1', 'PC2'])

# Draw one scatter call per catalog so each point gets its own colour and
# legend entry. Rows are walked once; the previous version rebuilt a boolean
# mask over the whole frame for every index value.
fig, ax = plt.subplots(figsize=(8, 6))
for cluster_label, pc1, pc2 in data_with_labels.itertuples(index=True, name=None):
    ax.scatter(pc1, pc2, label=f'Cluster {cluster_label}')
ax.set_title('PCA to 2 Dimensions with Cluster Labels')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.grid(True)

# The labels were assigned but never shown. Render them in a legend placed
# outside the axes, wrapped into columns of at most 20 entries so a long list
# of catalogs neither hides the points nor produces an overly tall figure.
legend_columns = max(1, (len(data_with_labels) + 19) // 20)
ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1.0),
          ncol=legend_columns, fontsize='small')
plt.show()