In [None]:
import openai
import numpy as np
import pandas as pd
import os
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process
# environment so the key lookup below can find them.
load_dotenv()

# Take the OpenAI API key from the OPENAI_API_KEY environment variable.
# os.getenv returns None when the variable is unset; in that case no key is
# configured here. Prefer the environment / .env file over pasting a literal
# key into the notebook, so the secret is not saved or shared with it.
openai.api_key = os.getenv("OPENAI_API_KEY")

def get_embedding(word, model="text-embedding-3-large"):
    """
    Given a word, return its embedding using OpenAI API.

    Args:
        word: The text to embed. Must not be missing (None / NaN) or an
            empty string.
        model: Name of the OpenAI embedding model to query. Defaults to
            "text-embedding-3-large", the model this function has always used.

    Returns:
        The embedding of ``word``, taken from ``response.data[0].embedding``.

    Raises:
        ValueError: If ``word`` is None, NaN, or an empty string.
    """
    # Values read from a CSV column can be missing (None or float NaN) or
    # empty. The embeddings endpoint does not accept an empty input, so fail
    # locally with a clear message instead of paying for a network round-trip
    # that ends in an opaque remote error.
    # `word != word` is True only for NaN.
    is_missing = word is None or (isinstance(word, float) and word != word)
    is_empty_text = isinstance(word, str) and not word
    if is_missing or is_empty_text:
        raise ValueError(
            f"Cannot embed a missing or empty value: {word!r}"
        )

    response = openai.embeddings.create(
        model=model,
        input=word,
    )
    # A single input yields a single result, so the first entry is the one.
    return response.data[0].embedding

def apply_get_embedding(df, column_name, output_column='embedding'):
    """
    Apply get_embedding function to a column in a pandas dataframe row-wise.

    Args:
        df: DataFrame holding the text to embed. It is modified in place: the
            result column is added to (or overwritten on) this same object.
        column_name: Name of the column whose values are passed, one at a
            time, to ``get_embedding``.
        output_column: Name of the column that receives the embeddings.
            Defaults to 'embedding', the name this function has always used.

    Returns:
        The same ``df`` object, now carrying ``output_column``.
    """
    # get_embedding is called once per row, so this issues one request per
    # row of the frame.
    df[output_column] = df[column_name].apply(get_embedding)
    return df

def extract_pca_loadings(df, embedding_column, n_components=2):
    """
    Fit PCA on a column of word embeddings and add the projections to the dataframe.

    Note: despite the function name, the values stored are the rows projected
    onto the principal components (the output of ``fit_transform``, often
    called scores), not the component loadings. The loadings remain available
    on the returned estimator as ``pca.components_``.

    Args:
        df: DataFrame containing the embeddings. It is modified in place: one
            column ``PCA_<k>`` (k starting at 1) is added per component.
        embedding_column: Name of the column holding one embedding (a
            sequence of numbers) per row.
        n_components: Passed straight to ``sklearn.decomposition.PCA``. Any
            value PCA accepts works here, including a float variance
            fraction or None, because the number of output columns is read
            from the transformed data rather than from this argument.

    Returns:
        A ``(df, pca)`` tuple: the same DataFrame with the ``PCA_<k>`` columns
        added, and the fitted PCA estimator.
    """
    pca = PCA(n_components=n_components)
    # Stack the per-row embeddings into a 2D (rows x dimensions) array.
    embeddings = np.array(df[embedding_column].tolist())
    scores = pca.fit_transform(embeddings)
    # Iterate over the actual number of components produced; `n_components`
    # itself may be a float, None or 'mle', which `range` cannot consume.
    for i in range(scores.shape[1]):
        df[f'PCA_{i+1}'] = scores[:, i]
    return df, pca

def plot_pca(df, word_column, category_column, pca_columns):
    """
    Draw a labelled 2D scatter plot of two PCA columns, coloured by category.

    Args:
        df: DataFrame holding the words, their categories and the PCA columns.
        word_column: Name of the column whose values are written next to
            each point.
        category_column: Name of the column used for point colour and as the
            legend title.
        pca_columns: Sequence whose first two entries name the columns
            plotted on the x and y axes respectively.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(12, 8))

    x_values = df[pca_columns[0]]
    y_values = df[pca_columns[1]]
    sns.scatterplot(
        x=x_values,
        y=y_values,
        hue=df[category_column],
        palette='tab10',
        s=100,
        alpha=0.7,
        legend='full',
    )

    # Write every word at the coordinates of its own point.
    for x_pos, y_pos, label in zip(x_values, y_values, df[word_column]):
        plt.text(x_pos, y_pos, label, fontsize=9, alpha=0.9)

    plt.title('PCA of Word Embeddings')
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    plt.legend(title=category_column)
    plt.grid(True)
    plt.show()


In [None]:
# Load the Big Six Adjectives dataset.
# The steps below read the 'Adjective' and 'Category' columns from this file.
# NOTE(review): the path is hard-coded and absolute; adjust it when running
# outside the environment this notebook was written in.
df = pd.read_csv('/mnt/data/big_six_adjectives.csv')

# Embed every adjective; the result is stored in a new 'embedding' column.
# This calls the OpenAI API once per row.
df = apply_get_embedding(df, 'Adjective')

# Fit PCA on the embeddings with the default n_components=2, which adds the
# 'PCA_1' and 'PCA_2' columns to df; the fitted estimator is kept in `pca`.
df, pca = extract_pca_loadings(df, 'embedding')

# Scatter-plot the two PCA columns, labelling each point with its adjective
# and colouring it by category.
plot_pca(df, 'Adjective', 'Category', ['PCA_1', 'PCA_2'])
