In [2]:
import os

import matplotlib.pyplot as plt
import nbformat
import numpy as np
import openai
import pandas as pd
import plotly.express as px
import seaborn as sns
from bokeh.io import push_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Category10, Category20
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import factor_cmap
from dotenv import load_dotenv
from sklearn.decomposition import PCA



# Load variables from a local .env file (if one exists) into the process environment.
# load_dotenv was imported but never called, so a key stored in .env was never picked up
# and OPENAI_API_KEY was only found when already exported in the shell running the kernel.
load_dotenv()

# Read the OpenAI API key from the environment; do not paste the key into the notebook.
openai.api_key = os.getenv("OPENAI_API_KEY")



In [3]:
# Functions
def get_embedding(word):
    """
    Request the embedding vector for `word` from the OpenAI embeddings endpoint.

    Args:
        word: Text passed unchanged as the `input` of the embeddings request.

    Returns:
        The `embedding` attribute of the first item in the response's `data`.
    """
    result = openai.embeddings.create(input=word, model="text-embedding-3-large")
    first_item = result.data[0]
    return first_item.embedding

def apply_get_embedding(df, column_name):
    """
    Embed every value of one dataframe column and store the results on the dataframe.

    get_embedding is called once per row of `df[column_name]`. The results are
    written to a column named 'embedding' on `df` itself (the frame passed in
    is modified), and that same frame is returned.
    """
    embedded_values = df[column_name].apply(get_embedding)
    df['embedding'] = embedded_values
    return df

def extract_pca_loadings(df, embedding_column, n_components=2):
    """
    Fit a PCA on a column of embedding vectors and append the projections to the dataframe.

    Args:
        df: DataFrame whose `embedding_column` holds one vector per row.
        embedding_column: Name of the column with the embedding vectors.
        n_components: Number of principal components to keep (default 2).

    Returns:
        A tuple `(df, pca)`: the same dataframe, now carrying columns
        'PCA_1' ... 'PCA_<n_components>', and the fitted PCA object.

    Note:
        The stored values are the output of `pca.fit_transform`, i.e. each
        row's coordinates along the components, written onto `df` in place.
    """
    # Stack the per-row vectors into a single 2D array for the PCA fit.
    matrix = np.array(df[embedding_column].tolist())
    pca = PCA(n_components=n_components)
    projected = pca.fit_transform(matrix)

    # Column names are 1-based while array columns are 0-based.
    for component_number in range(1, n_components + 1):
        df[f'PCA_{component_number}'] = projected[:, component_number - 1]
    return df, pca

def plot_pca(df, word_column, category_column, pca_columns, annotate=False):
    """
    Create a static 2D scatter plot of two PCA columns, coloured by category.

    Args:
        df: DataFrame holding the words, their categories and the PCA columns.
        word_column: Name of the column with the text used to label points
            (only read when `annotate` is True).
        category_column: Name of the column used for point colour and as the
            legend title.
        pca_columns: Sequence whose first two entries name the x and y columns.
        annotate: When True, write each row's word next to its point.
            Defaults to False, which draws the scatter without text labels.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    x_column, y_column = pca_columns[0], pca_columns[1]

    # Explicit Axes object so every call below targets this figure only.
    _, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(
        x=df[x_column],
        y=df[y_column],
        hue=df[category_column],
        palette='tab10',
        s=100,
        alpha=0.7,
        legend='full',
        ax=ax,
    )

    # Optional per-point labels (previously only available as commented-out code).
    if annotate:
        for x_value, y_value, word in zip(df[x_column], df[y_column], df[word_column]):
            ax.text(x_value, y_value, str(word), fontsize=9, alpha=0.9)

    ax.set_title('PCA of Word Embeddings')
    # Label the axes after the columns actually plotted; 'PCA_1' renders as 'PCA 1'.
    ax.set_xlabel(str(x_column).replace('_', ' '))
    ax.set_ylabel(str(y_column).replace('_', ' '))
    ax.legend(title=category_column)
    ax.grid(True)
    plt.show()


def plot_pca_interactive(df, word_column, category_column, pca_columns):
    """
    Create an interactive 2D scatter plot of two PCA columns using Bokeh.

    Args:
        df: DataFrame holding the words, their categories and the PCA columns.
        word_column: Name of the column shown as "Word" in the hover tooltip.
        category_column: Name of the column used for point colour, the legend
            and the "Category" tooltip. Values are converted to strings before
            being used as colour-mapping factors.
        pca_columns: Sequence whose first two entries name the x and y columns.

    Returns:
        None. The figure is rendered inline in the notebook.
    """
    x_column, y_column = pca_columns[0], pca_columns[1]

    # Normalise categories to strings once and reuse the same values for both the
    # factor list and the plotted data, so the colour mapper and the source agree.
    category_values = df[category_column].astype(str)
    categories = category_values.unique().tolist()
    num_categories = len(categories)

    # Category10 / Category20 are dictionaries keyed by palette size.
    # NOTE: depending on the Bokeh version there may be no entries below 3, and
    # there are none above 10 / 20, so those sizes are handled explicitly.
    if num_categories < 3:
        palette = Category10[3][:num_categories]
    elif num_categories <= 10:
        palette = Category10[num_categories]
    elif num_categories <= 20:
        palette = Category20[num_categories]
    else:
        # More categories than distinct colours: cycle through the largest palette.
        base_colors = Category20[20]
        palette = [base_colors[i % len(base_colors)] for i in range(num_categories)]

    source = ColumnDataSource(data=dict(
        x=df[x_column],
        y=df[y_column],
        word=df[word_column],
        category=category_values
    ))

    hover = HoverTool(tooltips=[
        ("Word", "@word"),
        ("Category", "@category"),
    ])

    p = figure(title="PCA of Word Embeddings", tools=[hover, "pan,wheel_zoom,box_zoom,reset"])
    p.scatter('x', 'y', source=source, legend_field='category', fill_alpha=0.6, size=10,
              color=factor_cmap('category', palette=palette, factors=categories))

    # Label the axes after the columns actually plotted; 'PCA_1' renders as 'PCA 1'.
    p.xaxis.axis_label = str(x_column).replace('_', ' ')
    p.yaxis.axis_label = str(y_column).replace('_', ' ')
    p.legend.title = category_column
    p.legend.location = "top_left"

    output_notebook()
    show(p, notebook_handle=True)


In [4]:
# Load the personality-adjectives dataset from big_five_adjectives.csv.
# Later cells read its 'Adjective' and 'Category' columns.
# NOTE(review): this cell was previously labelled "Big Six" although the file name says
# "big_five" — confirm which trait model the CSV actually holds.
df = pd.read_csv('./big_five_adjectives.csv')

In [5]:
# Apply the embedding extraction: adds an 'embedding' column to df, with one
# get_embedding call (one OpenAI request) per row of the 'Adjective' column.
df = apply_get_embedding(df, 'Adjective')

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
# Save df, including the 'embedding' column, to CSV.
# NOTE(review): to_csv also writes the row index as an extra unnamed column by default,
# and the embedding values will be stored as text — presumably they need parsing when
# the file is read back; confirm before relying on this file as a cache.
df.to_csv("./big_five_cum_embeddings.csv")

In [None]:
# Fit a PCA with the default 2 components on the 'embedding' column. This adds
# 'PCA_1' and 'PCA_2' columns to df and keeps the fitted PCA object in `pca`.
df, pca = extract_pca_loadings(df, 'embedding')

In [None]:
# Plot 'PCA_1' against 'PCA_2' as an interactive Bokeh scatter, coloured by 'Category',
# with the adjective and its category shown on hover.
plot_pca_interactive(df, 'Adjective', 'Category', ['PCA_1', 'PCA_2'])


In [None]:
# Load the moral/epistemic adjectives dataset (the second dataset analysed here).
# Later cells read its 'Adjective' and 'Category' columns.
df2 = pd.read_csv('./moral_epistemic_adjectives.csv')

In [None]:
# Apply the embedding extraction: adds an 'embedding' column to df2, with one
# get_embedding call (one OpenAI request) per row of the 'Adjective' column.
df2 = apply_get_embedding(df2, 'Adjective')

In [None]:
# Save df2, including the 'embedding' column, to CSV.
# NOTE(review): to_csv also writes the row index as an extra unnamed column by default,
# and the embedding values will be stored as text — presumably they need parsing when
# the file is read back; confirm before relying on this file as a cache.
df2.to_csv("./moral_epistemic_cum_embeddings.csv")

In [None]:
# Fit a separate PCA with the default 2 components on df2's 'embedding' column. This adds
# 'PCA_1' and 'PCA_2' columns to df2 and keeps the fitted PCA object in `pca2`.
df2, pca2 = extract_pca_loadings(df2, 'embedding')

In [None]:
# Display df2 to inspect the columns added by the previous cells.
df2

In [None]:
# Plot 'PCA_1' against 'PCA_2' for df2 as an interactive Bokeh scatter, coloured by
# 'Category', with the adjective and its category shown on hover.
plot_pca_interactive(df2, 'Adjective', 'Category', ['PCA_1', 'PCA_2'])
