In [None]:
# This notebook automates the search for multiple prompts and generates a DataFrame with the results of each search.

In [None]:
import os
import ast
import math
import time
import urllib
import numpy as np
import pandas as pd
from numpy.linalg import norm
import openai

from dotenv import load_dotenv
# Let python-dotenv populate the process environment before the key is read.
load_dotenv()

# The OpenAI key is taken from the API_KEY environment variable
# (the value is None when that variable is not set).
openai.api_key = os.environ.get('API_KEY')

In [None]:
def embed(input):
    """Request an embedding for ``input`` from the OpenAI embeddings endpoint.

    Sends ``input`` to ``openai.Embedding.create`` with the
    ``text-embedding-ada-002`` model and returns the ``'embedding'`` field of
    the first item in the response's ``'data'`` list.
    """
    result = openai.Embedding.create(model="text-embedding-ada-002", input=input)
    first_item = result['data'][0]
    return first_item['embedding']
    
def dist(arr2, arr1):
    """Cosine similarity between each row of ``arr1`` and the vector ``arr2``.

    Note the argument order: the single vector comes first (``arr2``) and the
    2-D collection of vectors second (``arr1``). One value is returned per row
    of ``arr1``.
    """
    dot_products = np.dot(arr1, arr2)
    row_lengths = norm(arr1, axis=1)
    vector_length = norm(arr2)
    return dot_products / (row_lengths * vector_length)

def dist2(arr1, arr2):
    """Cosine similarity of two 1-D vectors: dot product over the product of their norms."""
    numerator = np.dot(arr1, arr2)
    denominator = norm(arr1) * norm(arr2)
    return numerator / denominator

In [None]:
def _split_labels(value):
    """Turn one comma-separated cell into a list of labels (missing cell -> [])."""
    if isinstance(value, str):
        return value.split(', ')
    if isinstance(value, (list, tuple)):
        return list(value)
    if pd.isna(value):
        # Blank CSV cells arrive as NaN/None; treat them as "no labels"
        # instead of failing on `.split`.
        return []
    return str(value).split(', ')


def generate_sheets(df, info):
    """Build the per-manufacturer page-count sheet and the label-frequency sheet.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'ID'`` and ``'Embedding'``. The number of
        non-null ``'Embedding'`` values per ``'ID'`` becomes that ID's ``'Count'``.
    info : pandas.DataFrame
        Must contain ``'ID'``, ``'Scientific Name'`` and ``'Specialty'``. The two
        label columns hold ``', '``-separated strings.

    Returns
    -------
    pages_df : pandas.DataFrame
        The rows of ``info`` whose ID occurs in ``df`` with at least one
        non-null embedding, plus a ``'Count'`` column. ``'Scientific Name'`` and
        ``'Specialty'`` are converted to lists of labels.
    sp_sn_count : pandas.DataFrame
        Columns ``'Word'``, ``'Count'``, ``'Type'``: how many rows of ``pages_df``
        carry each scientific name / specialty, sorted by ``'Count'`` descending.
        ``'Type'`` is ``'Scientific Name'`` or ``'Specialty'``.
    """
    from collections import Counter

    # Number of non-null embeddings per manufacturer ID.
    count_df = df.groupby('ID')['Embedding'].count().rename('Count').reset_index()

    # The inner merge keeps only the manufacturers that appear in both frames.
    pages_df = pd.merge(info, count_df, how='inner', on='ID')

    # Fix: the original called `pages_df.drop(...)` without keeping the result,
    # so manufacturers with zero embedded pages were never actually removed.
    pages_df = pages_df[pages_df['Count'] != 0].reset_index(drop=True)

    pages_df = pages_df.assign(**{
        'Scientific Name': pages_df['Scientific Name'].apply(_split_labels),
        'Specialty': pages_df['Specialty'].apply(_split_labels),
    })

    # Counter keeps first-seen order, matching the original dict-based counting.
    scientific_name_counts = Counter(
        name for names in pages_df['Scientific Name'] for name in names
    )
    specialty_counts = Counter(
        specialty for specialties in pages_df['Specialty'] for specialty in specialties
    )

    # Scientific names first, then specialties, before sorting by frequency.
    records = [(word, count, 'Scientific Name') for word, count in scientific_name_counts.items()]
    records += [(word, count, 'Specialty') for word, count in specialty_counts.items()]

    sp_sn_count = (
        pd.DataFrame(records, columns=['Word', 'Count', 'Type'])
        .sort_values('Count', ascending=False)
        .reset_index(drop=True)
    )

    return pages_df, sp_sn_count

In [None]:
# Read the page-level embeddings and discard rows that have no embedding.
df = pd.read_csv('openai_concat_embed_sample_smaller.csv').dropna(subset=['Embedding'])

# The embeddings are stored as text: strip the square brackets, collapse double
# spaces, and parse what is left as ', '-separated numbers into a numpy array.
df['Embedding'] = df['Embedding'].apply(
    lambda text: np.fromstring(
        text.replace('[', '').replace(']', '').replace('  ', ' '),
        sep=', ',
    )
)

# Build the manufacturer sheet and the label-frequency sheet, then index the
# manufacturer sheet by ID so rows can be looked up with `info.loc[id, ...]`.
info, test_cases = generate_sheets(df, pd.read_csv('Google Sheets.csv'))
info = info.set_index('ID')


In [None]:
def automated_test(test_list, n = None, threshold=0.8, n_chars=0, details = False):
    """Run every prompt through the embedding search and print a recall report.

    Relies on the notebook-level names ``info`` (manufacturer sheet indexed by
    ID, with list-valued ``'Scientific Name'`` / ``'Specialty'`` columns),
    ``df`` (page rows with an ``'Embedding'`` column), ``embed`` and ``dist2``.

    For each prompt the function:
      1. collects the manufacturer IDs in ``info`` that list the prompt as a
         specialty or scientific name (the IDs that "should" be found),
      2. scores every row of ``df`` by cosine similarity to the prompt embedding,
      3. sums, per manufacturer, the similarities of rows that reach
         ``threshold`` and whose text is longer than ``n_chars``,
      4. keeps the best-scoring manufacturers and prints how many expected
         manufacturers were found / missed and how many extra ones came back.

    Parameters
    ----------
    test_list : iterable of str
        Prompts to evaluate.
    n : int or None
        Maximum number of manufacturers to return per prompt. When ``None`` it
        defaults to 20% more than the number of expected manufacturers.
    threshold : float
        Minimum cosine similarity for a row to count.
    n_chars : int
        Rows whose text (4th column of ``df``) is not longer than this are ignored.
    details : bool
        When true, also print per-manufacturer metadata and matching pages.

    Returns
    -------
    None
        Results are printed.

    Notes
    -----
    Rows of ``df`` are read positionally: column 1 is used as the manufacturer
    ID, column 2 is printed as the page URL and column 3 as the text segment.
    NOTE(review): confirm this matches the CSV's column order.
    """
    # `import urllib` alone does not guarantee the `urllib.parse` submodule is
    # loaded, so bring the one function that is needed into scope explicitly.
    from urllib.parse import unquote

    def _print_manufacturer(position, mfr_id, pages):
        """Print metadata and matching pages for one returned manufacturer."""
        if mfr_id in info.index:
            name = info.loc[mfr_id, 'Manufacturer']
            url = info.loc[mfr_id, 'Website']
            sn = info.loc[mfr_id, 'Scientific Name']
            sp = info.loc[mfr_id, 'Specialty']
        else:
            # A manufacturer can appear in `df` without a row in `info`;
            # report it with empty metadata instead of raising KeyError.
            name = url = sn = sp = None

        print(f'\n\t--- {position} --- ')
        print(f'\t\tMFR Name: {name}')
        # Fix: the original printed the stale loop variable `id` here,
        # i.e. the manufacturer of the last product row, not this one.
        print(f'\t\tMFR ID: {mfr_id}')
        print(f'\t\tMFR URL: {url}')
        print(f'\t\tMFR SN: {sn}')
        print(f'\t\tMFR Specialty: {sp}')
        print('\n\t\t\tPages:')
        for page_url, segment, similarity in pages:
            print(f'\t\t\t\t Cosine Similarity: {similarity}')
            print(f'\t\t\t\t URL: {unquote(page_url)}')
            print(f'\t\t\t\t--- Segment: {segment}')
        print('\n\n')

    for prompt in test_list:
        # Retrieve the IDs of the MFRs that have the prompt as a scientific name or specialty
        should_find = info[info['Specialty'].apply(lambda x: prompt in x)].index.tolist()
        should_find.extend(info[info['Scientific Name'].apply(lambda x: prompt in x)].index.tolist())
        should_find = list(set(should_find))
        expected = set(should_find)

        # Fix: `top` used to be assigned only when n was None, so passing an
        # explicit n raised NameError. Default is 20% above the expected count.
        top = n if n is not None else math.ceil(1.20 * len(should_find))

        # Embed the prompt
        prompt_embed = embed(prompt)

        # Cosine similarity between the prompt and every page row
        distances = [dist2(prompt_embed, embedding) for embedding in df['Embedding']]

        # Rows ordered from most to least similar
        closest_indices = np.argsort(distances)[::-1]
        closest_rows = df.iloc[closest_indices].values.tolist()
        closest_distances = [distances[i] for i in closest_indices]

        manufacturer_scores = {}
        manufacturer_pages = {}

        for row, similarity in zip(closest_rows, closest_distances):
            # Rows below the similarity threshold do not count
            if similarity < threshold:
                continue

            mfr_id, page_url, segment = row[1], row[2], row[3]

            # Fix: only register a manufacturer once it has a page that passes
            # the length filter; previously it was registered with score 0 and
            # could be returned without a single qualifying page.
            if len(segment) > n_chars:
                manufacturer_scores[mfr_id] = manufacturer_scores.get(mfr_id, 0) + similarity
                manufacturer_pages.setdefault(mfr_id, []).append((page_url, segment, similarity))

        # Best-scoring manufacturers first, truncated to `top`
        sorted_manufacturers = sorted(manufacturer_scores.items(), key=lambda x: x[1], reverse=True)[:top]
        manufacturer_returned = [x[0] for x in sorted_manufacturers]
        returned = set(manufacturer_returned)

        manufacturers_found = [m for m in manufacturer_returned if m in expected]
        manufacturer_new = [m for m in manufacturer_returned if m not in expected]
        manufacturers_missed = [m for m in should_find if m not in returned]

        print('Prompt:', prompt)
        print(f'\n# of Manufacturers with prompt as SN/SP: {len(should_find)}')
        if details:
            for m in should_find:
                name = info.loc[m, 'Manufacturer']
                print(f'\tID: {str(m).ljust(10)} Name:{name}')

        # Percentages are only defined when something was expected
        if len(should_find) > 0:
            print(f'\n# of Missed Manufacturers: {len(manufacturers_missed)} / {len(should_find)} --> {round((len(manufacturers_missed) / len(should_find) * 100))}%')
            if details:
                for m in manufacturers_missed:
                    name = info.loc[m, 'Manufacturer']
                    print(f'\tID: {str(m).ljust(10)} Name:{name}')

            print(f'\n# of Manufacturers successfully found: {len(manufacturers_found)} / {len(should_find)} --> {round((len(manufacturers_found) / len(should_find) * 100))}%')
            if details:
                for i, m in enumerate(manufacturers_found):
                    _print_manufacturer(i + 1, m, manufacturer_pages[m])

        print(f'\n# of New Manufacturers: {len(manufacturer_new)} / {len(manufacturer_returned)}')
        if details:
            for i, m in enumerate(manufacturer_new):
                _print_manufacturer(i + 1, m, manufacturer_pages[m])
        print('-'*50)


In [None]:
def automated_test_df(test_list, n = None, threshold=0.8, n_chars=0, details = False):
    """Run every prompt through the embedding search and tabulate the results.

    Relies on the notebook-level names ``info`` (manufacturer sheet indexed by
    ID, with list-valued ``'Scientific Name'`` / ``'Specialty'`` columns),
    ``df`` (page rows with an ``'Embedding'`` column), ``embed`` and ``dist2``.

    Parameters
    ----------
    test_list : iterable of str
        Prompts to evaluate.
    n : int or None
        Maximum number of manufacturers to return per prompt. When ``None`` it
        defaults to 20% more than the number of expected manufacturers.
    threshold : float
        Minimum cosine similarity for a row of ``df`` to count.
    n_chars : int
        Rows whose text (4th column of ``df``) is not longer than this are ignored.
    details : bool
        Accepted for signature parity with ``automated_test``; not used here.

    Returns
    -------
    pandas.DataFrame
        One row per prompt with the columns ``'Prompt'``, ``'Type'``
        (``'Specialty'`` if the prompt occurs in any ``info['Specialty']`` list,
        otherwise ``'Scientific Name'``), ``'Should_Find'``, ``'Returned'``,
        ``'Found'``, ``'Missed'`` and ``'New'`` (all counts of manufacturers).

    Notes
    -----
    Rows of ``df`` are read positionally: column 1 is used as the manufacturer
    ID and column 3 as the text whose length is compared with ``n_chars``.
    NOTE(review): confirm this matches the CSV's column order.
    """
    columns = ['Prompt', 'Type', 'Should_Find', 'Returned', 'Found', 'Missed', 'New']

    # Loop-invariant: every label that occurs as a specialty. Previously this
    # was recomputed with pd.unique(...explode()) for each prompt.
    known_specialties = set(info['Specialty'].explode())

    # Rows are collected here and turned into a frame once at the end, instead
    # of growing a DataFrame with pd.concat on every iteration.
    records = []

    for position, prompt in enumerate(test_list):
        # Progress indicator
        print(position, end = '\t')

        # Retrieve the IDs of the MFRs that have the prompt as a scientific name or specialty
        should_find = info[info['Specialty'].apply(lambda x: prompt in x)].index.tolist()
        should_find.extend(info[info['Scientific Name'].apply(lambda x: prompt in x)].index.tolist())
        should_find = list(set(should_find))
        expected = set(should_find)

        # Fix: `top` used to be assigned only when n was None, so passing an
        # explicit n raised NameError. Default is 20% above the expected count.
        top = n if n is not None else math.ceil(1.20 * len(should_find))

        # Embed the prompt
        prompt_embed = embed(prompt)

        # Cosine similarity between the prompt and every page row
        distances = [dist2(prompt_embed, embedding) for embedding in df['Embedding']]

        # Rows ordered from most to least similar
        closest_indices = np.argsort(distances)[::-1]
        closest_rows = df.iloc[closest_indices].values.tolist()
        closest_distances = [distances[i] for i in closest_indices]

        manufacturer_scores = {}

        for row, similarity in zip(closest_rows, closest_distances):
            # Rows below the similarity threshold do not count
            if similarity < threshold:
                continue

            mfr_id, segment = row[1], row[3]

            # Fix: only register a manufacturer once it has a page that passes
            # the length filter; previously it was registered with score 0 and
            # could be counted as returned without a single qualifying page.
            if len(segment) > n_chars:
                manufacturer_scores[mfr_id] = manufacturer_scores.get(mfr_id, 0) + similarity

        # Best-scoring manufacturers first, truncated to `top`
        sorted_manufacturers = sorted(manufacturer_scores.items(), key=lambda x: x[1], reverse=True)[:top]
        manufacturer_returned = [x[0] for x in sorted_manufacturers]
        returned = set(manufacturer_returned)

        manufacturers_found = [m for m in manufacturer_returned if m in expected]
        manufacturer_new = [m for m in manufacturer_returned if m not in expected]
        manufacturers_missed = [m for m in should_find if m not in returned]

        prompt_type = 'Specialty' if prompt in known_specialties else 'Scientific Name'
        records.append({
            'Prompt': prompt,
            'Type': prompt_type,
            'Should_Find': len(should_find),
            'Returned': len(manufacturer_returned),
            'Found': len(manufacturers_found),
            'Missed': len(manufacturers_missed),
            'New': len(manufacturer_new),
        })

    # Passing `columns` keeps the expected header even when test_list is empty.
    return pd.DataFrame(records, columns=columns)


In [None]:
# Disabled alternative: use every scientific name and specialty in the sample
# all_sp = pd.unique(info['Specialty'].explode()).tolist()
# all_sn = pd.unique(info['Scientific Name'].explode()).tolist()
# to_test = all_sp
# to_test.extend(all_sn)
# to_test = list(set(to_test))

# Prompts = the first 30 words of test_cases, which generate_sheets returns
# sorted by Count in descending order (i.e. the 30 most frequent labels).
to_test = test_cases['Word'].head(30).tolist()
# Disabled alternative: restrict the prompts to scientific names only
# to_test = [x[0] for x in test_cases[['Word', 'Count', 'Type']].values.tolist() if x[2] == 'Scientific Name'][:30]

out_sp = automated_test_df(to_test, n_chars=100, threshold=0.8)