In [1]:
# Search the embedded results once they have been appended into one CSV

In [2]:
import ast
import math
import os
import time
import urllib
import urllib.parse
from collections import Counter

import numpy as np
import pandas as pd
from numpy.linalg import norm

import openai
from dotenv import load_dotenv
load_dotenv()

openai.api_key = os.getenv('API_KEY')

In [3]:
def embed(input):
    """Request an embedding for ``input`` from the OpenAI embeddings endpoint.

    Sends ``input`` to ``openai.Embedding.create`` using the
    ``text-embedding-ada-002`` model and returns the ``'embedding'`` field of
    the first entry in the response's ``'data'`` list.

    Args:
        input: Value forwarded unchanged as the ``input`` argument of the API
            call (presumably the text to embed; confirm against callers).

    Returns:
        The embedding stored in the first response item.
    """
    api_response = openai.Embedding.create(
        input=input,
        model="text-embedding-ada-002",
    )
    # Only the first returned item is used.
    first_item = api_response['data'][0]
    return first_item['embedding']
    
def dist(arr2, arr1):
    """Cosine similarity between each row of ``arr1`` and the vector ``arr2``.

    Note the argument order: the single vector comes first, the 2-D array
    second.

    Args:
        arr2: 1-D array-like (a single vector).
        arr1: 2-D array-like whose rows are compared against ``arr2``.

    Returns:
        Array with one similarity value per row of ``arr1``.
    """
    row_norms = norm(arr1, axis=1)   # one norm per row of arr1
    vector_norm = norm(arr2)
    dot_products = np.dot(arr1, arr2)
    return dot_products / (row_norms * vector_norm)

def dist2(arr1, arr2):
    """Cosine similarity between two 1-D vectors.

    Args:
        arr1: First vector (array-like).
        arr2: Second vector (array-like).

    Returns:
        The dot product of the two vectors divided by the product of their
        norms.
    """
    numerator = np.dot(arr1, arr2)
    denominator = norm(arr1) * norm(arr2)
    return numerator / denominator

In [14]:
def _split_terms(value):
    """Split a ', '-separated cell into a list of terms; non-strings (e.g. NaN) give []."""
    return value.split(', ') if isinstance(value, str) else []


def _count_terms(term_lists, kind):
    """Tally every term in an iterable of term lists into a frame with Count and Type columns."""
    tally = Counter(term for terms in term_lists for term in terms)
    counts_df = pd.DataFrame.from_dict(dict(tally), orient='index', columns=['Count'])
    counts_df['Type'] = kind
    return counts_df


def generate_sheets(df, info):
    """Build the per-ID page sheet and the term-frequency sheet.

    Args:
        df: DataFrame with at least the columns ``'ID'`` and ``'Embedding'``.
            Non-null ``'Embedding'`` values are counted per ``'ID'``.
        info: DataFrame with at least the columns ``'ID'``,
            ``'Scientific Name'`` and ``'Specialty'``; the latter two hold
            ``', '``-separated strings. Neither input frame is modified.

    Returns:
        tuple ``(pages_df, sp_sn_count)``:

        * ``pages_df`` - the rows of ``info`` whose ID occurs in ``df``, plus
          a ``'Count'`` column (non-null embeddings for that ID). Rows whose
          count is 0 are removed, and ``'Scientific Name'`` / ``'Specialty'``
          are converted to lists of terms (missing cells become ``[]``).
        * ``sp_sn_count`` - one row per distinct term with the columns
          ``'Word'``, ``'Count'`` and ``'Type'`` (``'Scientific Name'`` or
          ``'Specialty'``), sorted by ``'Count'`` in descending order.
    """
    # Number of non-null embeddings per ID, as a two-column ID/Count frame.
    page_counts = (
        df.groupby('ID')['Embedding']
        .count()
        .rename('Count')
        .reset_index()
    )

    # Keep only the info rows whose ID has embeddings in df.
    matched_info = info[info['ID'].isin(page_counts['ID'])].reset_index(drop=True)

    pages_df = pd.merge(matched_info, page_counts, how='inner', on='ID')
    # DataFrame.drop returns a new frame; the filtered result has to be kept,
    # otherwise rows with no embeddings stay in the sheet.
    pages_df = pages_df[pages_df['Count'] != 0].reset_index(drop=True)
    pages_df['Scientific Name'] = pages_df['Scientific Name'].apply(_split_terms)
    pages_df['Specialty'] = pages_df['Specialty'].apply(_split_terms)

    # Term frequencies for both columns, stacked into one table.
    sp_sn_count = pd.concat(
        [
            _count_terms(pages_df['Scientific Name'], 'Scientific Name'),
            _count_terms(pages_df['Specialty'], 'Specialty'),
        ],
        axis=0,
        sort=False,
    ).reset_index()
    sp_sn_count.columns = ['Word', 'Count', 'Type']
    sp_sn_count = sp_sn_count.sort_values('Count', ascending=False)

    return pages_df, sp_sn_count.reset_index(drop=True)

In [4]:
# Load the embedded segments and discard rows that have no embedding.
df = (
    pd.read_csv('openai_concat_embed_sample_smaller.csv')
    .dropna(subset=['Embedding'])
)
# Manufacturer sheet, keyed by ID.
info = pd.read_csv('Google Sheets.csv').set_index('ID')

# Embeddings are stored in the CSV as text; parse each one back into a Python literal.
df['Embedding'] = df['Embedding'].map(ast.literal_eval)

In [27]:
# Build the page sheet and the term-frequency table from a freshly read copy of
# the manufacturer CSV, then key the page sheet by ID for lookups.
info, test_cases = generate_sheets(df=df, info=pd.read_csv('Google Sheets.csv'))
info = info.set_index(keys='ID')

In [29]:
# Semantic search over the embedded segments: embed the query, score every
# segment against it, then print the best matches with their manufacturer details.
TOP_K = 300             # how many of the highest-scoring segments to consider
MIN_SEGMENT_CHARS = 85  # segments of this length or shorter are not printed

prompt = 'Vitamin C'
prompt_embed = embed(prompt)

# dist2 computes cosine similarity, so a larger score means a closer match.
distances = [dist2(prompt_embed, embedding) for embedding in df['Embedding']]

# Sort scores in descending order and keep the TOP_K best-matching rows.
closest_indices = np.argsort(distances)[::-1][:TOP_K]
closest_rows = df.iloc[closest_indices].values.tolist()
closest_distances = [distances[idx] for idx in closest_indices]

# Print the closest results in a readable layout.
print('Query:', prompt)
print('Closest Results:')
shown = 0  # running count of results actually printed
for rank, (row, distance) in enumerate(zip(closest_rows, closest_distances), start=1):
    # Rows are read positionally: row[0] is the ID used to index `info`,
    # row[1] is printed as the page (after percent-decoding) and row[2] as the segment.
    mfr_id, page, segment = row[0], row[1], row[2]
    if len(segment) <= MIN_SEGMENT_CHARS:
        continue
    shown += 1
    mfr = info.loc[mfr_id]
    name = mfr['Manufacturer']
    sn = mfr['Scientific Name']
    sp = mfr['Specialty']
    url = mfr['Website']
    print(f'\n\n\n --- {rank} --- ')   # rank among the TOP_K candidates
    print(f'----- {shown} -----')       # position among the printed results
    print(f'MFR Name: {name}')
    print(f'MFR ID: {mfr_id}')
    print('Cosine Distance:', distance)
    print()
    print(f'MFR URL: {url}')
    print()
    print(f'MFR SN: {sn}')
    print()
    print(f'MFR Specialty: {sp}')
    print()
    print('Page:', urllib.parse.unquote(page))
    print('\tSegment:', segment)

Query: Vitamin C
Closest Results:



 --- 1 --- 
----- 1 -----
MFR Name: mcePharma s.r.o.
MFR ID: 13576
Cosine Distance: 0.8693737146305163

MFR URL: https://www.mcepharma.com/

MFR SN: ['Colostrum', 'Iron', 'Pet Supplements']

MFR Specialty: ['Dietary and Herbal Supplements', 'Pharmaceutical']

Page: https://www.mcepharma.com/vitamin-c
	Segment: vitamin c | mcepharma s.r.o. - czech company operating worldwide, developing, producing and selling unique food supplements, pet supplements and cosmetics - mcepharma.. vitamin c. vitamin c significantly. to the normal function of our immune system and it´s also considered one of the most important vitamins in our body.. we offer vitamin c in two forms:. swallowing tablets or odt tablets. product description:. key ingredients. vitamin c is a strong antioxidant and water-soluble vitamin which has many impressive health benefits: reducing blood pressure, reducing heart disease risk, protecting against gout attacks, improving iron absorption, boo