In [None]:
import os
import ast
import math
import time
import urllib
import numpy as np
import pandas as pd
from numpy.linalg import norm
from sklearn.decomposition import PCA

from dotenv import load_dotenv
load_dotenv()

import openai
openai.api_key = os.getenv('API_KEY')

In [None]:
def embed(input):
    """Request an embedding for `input` from the OpenAI embeddings endpoint.

    Calls `openai.Embedding.create` with the "text-embedding-ada-002" model
    and returns the 'embedding' field of the first entry in the response's
    'data' list.
    """
    response = openai.Embedding.create(input=input, model="text-embedding-ada-002")
    first_result = response['data'][0]
    return first_result['embedding']

def cosine_dist(arr1, arr2):
    """Return the cosine distance (1 - cosine similarity) of two vectors.

    The vectors do not need to be unit length; the dot product is divided
    by the product of their norms.
    """
    similarity = np.dot(arr1, arr2) / (norm(arr1) * norm(arr2))
    return 1 - similarity

def cosine_dist_norm(arr1, arr2):
    """Return the cosine distance of two vectors that are already unit length.

    No normalization is performed, so the result is simply one minus the
    dot product.
    """
    similarity = np.dot(arr1, arr2)
    return 1 - similarity

def euclidean_dist(arr1, arr2):
    """Return the Euclidean (L2) distance between two vectors.

    Both inputs are converted to numpy arrays first, so plain Python
    sequences are accepted.
    """
    delta = np.array(arr1) - np.array(arr2)
    return np.sqrt(np.dot(delta, delta))

def manhattan_dist(arr1, arr2):
    """Return the Manhattan (L1) distance between two vectors.

    Both inputs are converted to numpy arrays first, so plain Python
    sequences are accepted.
    """
    delta = np.array(arr1) - np.array(arr2)
    return np.abs(delta).sum()

def dist(type, arr1, arr2):
    """Compute the distance between two vectors using the metric named by `type`.

    Parameters
    ----------
    type : str
        Metric name. Only the first three letters are compared, ignoring
        case: 'cos' selects cosine_dist, 'euc' selects euclidean_dist and
        'man' selects manhattan_dist.
    arr1, arr2 : array-like
        The two vectors to compare.

    Returns
    -------
    The value returned by the selected distance function.

    Raises
    ------
    ValueError
        If `type` does not start with one of the recognised prefixes.
        Failing here avoids silently returning None, which would only
        surface later when the distances are sorted.
    """
    key = type.lower()[:3]
    if key == 'cos':
        return cosine_dist(arr1, arr2)
    if key == 'euc':
        return euclidean_dist(arr1, arr2)
    if key == 'man':
        return manhattan_dist(arr1, arr2)
    raise ValueError(
        f"Unknown distance type {type!r}; expected a name starting with 'cos', 'euc' or 'man'"
    )


In [None]:
def _count_list_items(list_series):
    """Count how often each item occurs across an iterable of lists, in first-seen order."""
    counts = {}
    for items in list_series:
        for item in items:
            counts[item] = counts.get(item, 0) + 1
    return counts


def generate_sheets(df, info):
    """Build a per-ID page-count sheet and a word-frequency sheet.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'ID' and 'Embedding' columns. The count for an ID is
        the number of its rows with a non-null 'Embedding'.
    info : pandas.DataFrame
        Must contain an 'ID' column plus 'Scientific Name' and 'Specialty'
        columns holding ', '-separated strings.

    Returns
    -------
    pages_df : pandas.DataFrame
        `info` restricted to IDs present in `df`, with a 'Count' column and
        with 'Scientific Name' / 'Specialty' split into lists. IDs whose
        count is zero are removed.
    sp_sn_count : pandas.DataFrame
        Columns 'Word', 'Count', 'Type' giving how many rows of `pages_df`
        mention each scientific name or specialty, sorted by 'Count'
        descending.
    """
    # Number of non-null embeddings per ID.
    id_count = df.groupby('ID')['Embedding'].count()
    count_df = id_count.rename('Count').reset_index()

    info = info[info['ID'].isin(id_count.index)].reset_index(drop=True)

    pages_df = pd.merge(info, count_df, how='inner', on='ID')
    # Keep only IDs that have at least one embedding. The filtered frame has to
    # be assigned back; DataFrame.drop/filtering never modifies in place.
    pages_df = pages_df[pages_df['Count'] != 0].reset_index(drop=True)

    list_columns = ('Scientific Name', 'Specialty')
    for column in list_columns:
        pages_df[column] = pages_df[column].apply(lambda x: x.split(', '))

    # One frequency table per list column, tagged with the column it came from.
    count_frames = []
    for column in list_columns:
        frame = pd.DataFrame.from_dict(
            _count_list_items(pages_df[column]), orient='index', columns=['Count']
        )
        frame['Type'] = column
        count_frames.append(frame)

    sp_sn_count = pd.concat(count_frames, axis=0, sort=False).reset_index()
    sp_sn_count.columns = ['Word', 'Count', 'Type']
    sp_sn_count = sp_sn_count.sort_values('Count', ascending=False).reset_index(drop=True)

    return pages_df, sp_sn_count

In [None]:
def reduce_embedding_dimensionality(df, num_components):
    """Project the 'Embedding' column onto `num_components` principal components.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Embedding' column of equal-length vectors.
        The input frame is not modified.
    num_components : int
        Number of PCA components to keep.

    Returns
    -------
    (pandas.DataFrame, PCA)
        A copy of `df` in which 'Embedding' is replaced by a column named
        'Embedding - <num_components>' holding numpy arrays, together with
        the fitted PCA object so other vectors can be projected the same way.
    """
    reduced_col = f'Embedding - {num_components}'
    result = df.copy()

    # Stack the per-row vectors into one 2-D matrix for scikit-learn.
    matrix = np.array(result['Embedding'].tolist())

    pca = PCA(n_components=num_components)
    pca.fit(matrix)
    projected = pca.transform(matrix)

    # Store one numpy array per row under the new column name.
    result[reduced_col] = projected.tolist()
    result[reduced_col] = result[reduced_col].apply(np.array)

    return result.drop(columns=['Embedding']), pca

In [None]:
# Load the stored embeddings; rows without an embedding cannot be searched.
df = pd.read_csv('openai_concat_embed_sample_smaller.csv').dropna(subset=['Embedding'])

# Each embedding is stored as text: strip the brackets, collapse double
# spaces, and parse the remaining numbers into a numpy array.
df['Embedding'] = df['Embedding'].apply(
    lambda raw: np.fromstring(
        raw.replace('[', '').replace(']', '').replace('  ', ' '),
        sep=', ',
    )
)

# Reference sheet keyed by ID; the two label columns hold ', '-separated
# strings that are split into lists.
info = pd.read_csv('Google Sheets.csv').set_index('ID')
info['Scientific Name'] = info['Scientific Name'].map(lambda names: names.split(', '))
info['Specialty'] = info['Specialty'].map(lambda labels: labels.split(', '))

In [103]:
# Bounds of the PCA sweep: target sizes run from DIM_MAX down to DIM_MIN
# (inclusive) in steps of DIM_STEP.
DIM_MIN = 700
DIM_MAX = 1300
DIM_STEP = 200

# Prompts that are looked up in the 'Specialty' lists.
sp_prompts = [
    'Rehabilitation',
    'Cardiology',
    'Woundcare & Dressing',
    'Medical Clothing',
    'Pediatrics',
    'Monitoring',
    'Pharmaceutical',
    'Urology & Nephrology',
    'Gastroenterology',
    'Rapid Tests',
]

# Prompts that are looked up in the 'Scientific Name' lists.
sn_prompts = [
    'Vitamin C',
    'Shockwave Therapy Machines',
    'Sunscreen',
]

# 1536 comes first (presumably the native embedding size -- TODO confirm),
# followed by the descending sweep.
dims = [1536, *range(DIM_MAX, DIM_MIN - 1, -DIM_STEP)]

# Floating-point types the stored embeddings are cast to.
pres = [np.float64, np.float32, np.float16]

# The distance metrics ('Cosine', 'Euclidean', 'Manhattan') are listed inline
# in the evaluation loops rather than configured here.

# One row per (dimensions, precision, distance, prompt) evaluation.
df_log = pd.DataFrame(
    columns=[
        'Dimensions',
        'Precision',
        'Distance',
        'Prompt',
        'Top 5',
        'Top 10',
        'Top 20',
        'Time',
    ]
)

In [None]:
# Embed every prompt once up front so the evaluation loops can reuse the
# vectors instead of calling the API again.
prompt_embeds = {p: embed(p) for p in (sp_prompts + sn_prompts)}

In [None]:
# Ground truth per prompt: the IDs (index of `info`) whose label list
# contains the prompt text exactly.
should_finds = {}
should_finds.update({
    p: info[info['Specialty'].apply(lambda labels: p in labels)].index.tolist()
    for p in sp_prompts
})
should_finds.update({
    p: info[info['Scientific Name'].apply(lambda names: p in names)].index.tolist()
    for p in sn_prompts
})

In [104]:
# Baseline: evaluate retrieval on the un-reduced embeddings at each precision.
#
# These rows get an explicit 'Dimensions' label. The loop variable `d` belongs
# to the PCA sweep and is not defined in this cell, so using it here either
# raises NameError on a fresh kernel or silently tags the baseline with a
# stale PCA size, merging it into that group when the log is aggregated.
BASELINE_DIMENSIONS = 'Original (no PCA)'

df_red = df.copy()
baseline_records = []
for p in pres:
    # Only the stored embeddings are cast; the prompt embedding keeps the
    # precision it was returned with.
    df_pre = df_red.copy()
    df_pre['Embedding'] = df_pre['Embedding'].apply(lambda x: x.astype(p))
    for dis in ['Cosine', 'Euclidean', 'Manhattan']:
        for prompt in (sp_prompts + sn_prompts):
            prompt_embed = prompt_embeds[prompt]
            relevant = set(should_finds[prompt])

            start = time.perf_counter()
            distances = [dist(dis, prompt_embed, emb) for emb in df_pre['Embedding']]
            closest_indices = np.argsort(distances)

            # The first column is used as the manufacturer ID. Read just that
            # column instead of converting every row (embeddings included) to
            # Python lists, then keep the first occurrence of each ID.
            ranked_ids = df_pre.iloc[closest_indices, 0].tolist()
            mfrs_returned = list(dict.fromkeys(ranked_ids))

            # Fraction of the top-k distinct IDs that are expected matches.
            mfrs_top_5 = sum(m in relevant for m in mfrs_returned[:5]) / 5
            mfrs_top_10 = sum(m in relevant for m in mfrs_returned[:10]) / 10
            mfrs_top_20 = sum(m in relevant for m in mfrs_returned[:20]) / 20

            end = time.perf_counter() - start

            baseline_records.append({
                'Dimensions': BASELINE_DIMENSIONS,
                'Precision': str(p).split("'")[1],
                'Distance': dis,
                'Prompt': prompt,
                'Top 5': mfrs_top_5,
                'Top 10': mfrs_top_10,
                'Top 20': mfrs_top_20,
                'Time': round(end, 3),
            })

# Append all rows in one go; concatenating inside the loop copies the whole
# log on every iteration.
df_log = pd.concat([df_log, pd.DataFrame(baseline_records)], ignore_index=True)

In [105]:
# PCA sweep: for each target dimensionality, reduce the embeddings, then
# evaluate retrieval at each precision and distance metric.
pca_records = []
for d in dims:
    df_red, pca = reduce_embedding_dimensionality(df, d)
    embed_col = f'Embedding - {d}'

    # Project each prompt into this PCA space once. The projection depends
    # only on `d`, not on the precision or the distance metric.
    reduced_prompts = {
        name: pca.transform(np.array(prompt_embeds[name]).reshape(1, -1))[0]
        for name in (sp_prompts + sn_prompts)
    }

    for p in pres:
        # Only the stored embeddings are cast; the projected prompt keeps the
        # precision returned by pca.transform.
        df_pre = df_red.copy()
        df_pre[embed_col] = df_pre[embed_col].apply(lambda x: x.astype(p))
        for dis in ['Cosine', 'Euclidean', 'Manhattan']:
            for prompt in (sp_prompts + sn_prompts):
                prompt_embed = reduced_prompts[prompt]
                relevant = set(should_finds[prompt])

                start = time.perf_counter()
                distances = [dist(dis, prompt_embed, emb) for emb in df_pre[embed_col]]
                closest_indices = np.argsort(distances)

                # The first column is used as the manufacturer ID. Read just
                # that column instead of converting every row to Python
                # lists, then keep the first occurrence of each ID.
                ranked_ids = df_pre.iloc[closest_indices, 0].tolist()
                mfrs_returned = list(dict.fromkeys(ranked_ids))

                # Fraction of the top-k distinct IDs that are expected matches.
                mfrs_top_5 = sum(m in relevant for m in mfrs_returned[:5]) / 5
                mfrs_top_10 = sum(m in relevant for m in mfrs_returned[:10]) / 10
                mfrs_top_20 = sum(m in relevant for m in mfrs_returned[:20]) / 20

                end = time.perf_counter() - start

                pca_records.append({
                    'Dimensions': d,
                    'Precision': str(p).split("'")[1],
                    'Distance': dis,
                    'Prompt': prompt,
                    'Top 5': mfrs_top_5,
                    'Top 10': mfrs_top_10,
                    'Top 20': mfrs_top_20,
                    'Time': round(end, 3),
                })

# Append all rows in one go; concatenating inside the loop copies the whole
# log on every iteration.
df_log = pd.concat([df_log, pd.DataFrame(pca_records)], ignore_index=True)

In [108]:
# Collapse the per-prompt log to one row per (Dimensions, Precision, Distance):
# the top-k scores are averaged over prompts and the times are summed.
df_log_grouped = (
    df_log
    .drop(columns=['Prompt'])
    .groupby(['Dimensions', 'Precision', 'Distance'])
    .agg({**dict.fromkeys(['Top 5', 'Top 10', 'Top 20'], 'mean'), 'Time': 'sum'})
    .reset_index()
)

In [107]:
# Persist the full per-prompt log (the row index is written as the first column).
df_log.to_csv(path_or_buf='test_log.csv', index=True)

In [110]:
# Persist the aggregated log (the row index is written as the first column).
df_log_grouped.to_csv(path_or_buf='test_log_grouped.csv', index=True)
