In [2]:
!pip install openai pandas numpy scipy

import time
import traceback

import numpy as np
import openai
import pandas as pd
from scipy.spatial.distance import cosine
import heapq
import os

openai.api_key = os.environ['OPENAI_API_KEY']

Looking in indexes: https://mvega:****@vdevtools.appspot.com/vpm/simple/

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
model = "text-embedding-ada-002"


def generate_embeddings(input, max_retries=5, retry_delay=60):
    """Return the embedding of ``input`` as a NumPy array.

    The text is upper-cased before it is sent to the OpenAI embedding model named by the
    module-level ``model`` variable.

    Args:
        input: Text to embed. Missing values (``None``/NaN) are not embedded. Non-string
            values are converted with ``str`` first.
        max_retries: How many times a failed request is retried before giving up.
        retry_delay: Seconds to wait between attempts.

    Returns:
        ``numpy.ndarray`` holding the embedding, or ``None`` when ``input`` is missing.

    Raises:
        Exception: The error of the last attempt, once ``max_retries`` retries have failed.
    """
    if pd.isna(input):
        return None

    text = str(input).upper()
    failures = 0
    # A bounded loop replaces the former unbounded recursion, so a persistent failure
    # (bad key, malformed request, ...) ends with an error instead of retrying forever.
    while True:
        print(f"Category Embedding {input}")
        try:
            # Only the model and the text are sent: sampling options such as
            # temperature/max_tokens are not embedding parameters.
            response = openai.Embedding.create(engine=model, input=text)
            return np.array(response['data'][0]['embedding'])
        except openai.error.RateLimitError:
            print("Rate limiting...")
            if failures >= max_retries:
                raise
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
            print(traceback.format_exc())
            if failures >= max_retries:
                raise
        failures += 1
        time.sleep(retry_delay)


In [14]:
 def generate_embeddings_from_csv(vcat_csv, to_match_csv):
     """Load two header-less ``id,name`` CSV files and attach an embedding to every row.

     Args:
         vcat_csv: Path of the first CSV; its rows are returned as the first frame.
         to_match_csv: Path of the second CSV; its rows are returned as the second frame.

     Returns:
         ``(vcat_df, to_match_cats_df)``, each with columns ``id``, ``name`` and
         ``embedding``. Rows whose name is missing get ``None`` as embedding.
         ``(None, None)`` is returned when pandas cannot parse one of the files.
     """
     def _add_embeddings(frame):
         # Missing names are skipped instead of being formatted into the string "nan"
         # and embedded as if "NAN" were a real category name.
         frame["embedding"] = frame["name"].apply(
             lambda name: None if pd.isna(name) else generate_embeddings(str(name)))
         return frame

     try:
         vcat_df = pd.read_csv(vcat_csv, names=["id", "name"])
         to_match_cats_df = pd.read_csv(to_match_csv, names=["id", "name"])
     except pd.errors.ParserError as e:
         print(f"Error reading CSV file: {e}")
         return None, None

     # Same order as before: the categories to match are embedded first.
     to_match_cats_df = _add_embeddings(to_match_cats_df)
     vcat_df = _add_embeddings(vcat_df)
     return vcat_df, to_match_cats_df

In [5]:
class HeapObject:
    """A candidate category together with the score it is ranked by."""

    def __init__(self, score, category_id, category_name):
        self.score, self.category_id, self.category_name = (
            score, category_id, category_name)

    def __lt__(self, other):
        # Ordering considers the score only; id and name never take part in comparisons.
        return self.score < other.score

class MaxHeap:
    """Collection of scored objects that answers "largest score" queries.

    Stored objects must expose a ``score`` attribute and support ``<``. ``self.heap`` is
    kept as a ``heapq`` list, which is a *min*-heap (``heap[0]`` is the smallest item), so
    the max-oriented methods below look for the largest item explicitly instead of
    reading or popping the front of the list.
    """

    def __init__(self):
        self.heap = []

    def add(self, obj):
        """Insert ``obj`` into the collection."""
        heapq.heappush(self.heap, obj)

    def get_max(self):
        """Return the largest score without removing it, or ``None`` when empty."""
        return max(self.heap).score if self.heap else None

    def extract_max(self):
        """Remove the object with the largest score and return that score (``None`` when empty)."""
        if not self.heap:
            return None
        position = max(range(len(self.heap)), key=self.heap.__getitem__)
        largest = self.heap[position]
        # Fill the gap with the last element, then restore the heap invariant.
        self.heap[position] = self.heap[-1]
        self.heap.pop()
        heapq.heapify(self.heap)
        return largest.score

    def __str__(self):
        return str([obj.score for obj in self.heap])

    def get_top_values(self, n):
        """Return up to ``n`` largest scores, highest first, leaving the collection intact."""
        return [obj.score for obj in heapq.nlargest(n, self.heap)]

    def get_top_objects(self, n):
        """Return up to ``n`` objects with the largest scores, highest first."""
        return heapq.nlargest(n, self.heap)


In [18]:
def perfect_match(cat1, cat2):
    """Tell whether two category names are equal once both are upper-cased."""
    return cat1.upper() == cat2.upper()

def extract_last_substring(input_string):
    """Return the text after the last " > " separator, or the whole string if there is none."""
    # Splitting a string that has no separator yields a one-element list holding the
    # string itself, so taking the last piece covers both cases.
    return input_string.split(" > ")[-1]

def matcher(from_vector, to_vector, top_n=3, output_path="matches_output.csv"):
    """Find the best-scoring ``to_vector`` rows for every ``from_vector`` row.

    A candidate scores 1 when the last " > "-separated segment of both names is equal
    ignoring case; otherwise it scores ``1 - cosine distance`` of the two embeddings.
    Each candidate is scored exactly once, so it cannot appear twice among the top
    matches of the same row.

    Args:
        from_vector: DataFrame with ``id``, ``name`` and ``embedding`` columns. Its rows
            fill the "VCategory ID"/"VCategory Name" output columns, whichever frame
            is passed here.
        to_vector: DataFrame with the same columns; its rows fill "Match ID"/"Match Name".
        top_n: Number of best candidates kept per ``from_vector`` row.
        output_path: CSV file the result is written to.

    Returns:
        DataFrame with one row per kept match (also written to ``output_path``).
    """
    columns = ["VCategory ID", "Match ID", "VCategory Name", "Match Name",
               "Match", "Similarity Score"]

    def _is_missing(embedding):
        # Rows without an embedding (None/NaN) cannot be scored.
        return embedding is None or (isinstance(embedding, float) and np.isnan(embedding))

    # Loop-invariant work: read every candidate and extract its most specific name once.
    candidates = [
        (row.id, row.name, row.embedding, extract_last_substring(row.name))
        for row in to_vector.itertuples(index=False)
        if not _is_missing(row.embedding)
    ]

    matches_list = []
    for _, category1 in from_vector.iterrows():
        print(category1['name'])
        category1_embedding = category1['embedding']
        if _is_missing(category1_embedding):
            continue
        specific_cat1 = extract_last_substring(category1['name'])

        scored = []
        for match_id, match_name, match_embedding, specific_cat2 in candidates:
            if perfect_match(specific_cat1, specific_cat2):
                score = 1
            else:
                # scipy's `cosine` is a distance; 1 - distance is the similarity.
                score = 1 - cosine(category1_embedding, match_embedding)
            scored.append((score, match_id, match_name))

        top_matches = heapq.nlargest(top_n, scored, key=lambda entry: entry[0])
        for rank, (score, match_id, match_name) in enumerate(top_matches, start=1):
            matches_list.append({
                "VCategory ID": category1['id'],
                "Match ID": match_id,
                "VCategory Name": category1['name'],
                "Match Name": match_name,
                "Match": rank,
                "Similarity Score": score
            })

    # Explicit columns keep the CSV header stable even when there are no matches.
    matches_df = pd.DataFrame(matches_list, columns=columns)
    matches_df.to_csv(output_path, index=False)
    return matches_df
        

In [20]:
# Embed both category lists: the first returned frame holds the rows of the first CSV,
# the second frame the rows of the second CSV.
from_vector, to_vector = generate_embeddings_from_csv(
    "CATEGORIES-V_CATEGORY.csv",
    "DATA_AXLE_TO_ADD.csv",
)

Category Embedding Oilseed (except Soybean) Farming
Category Embedding Dry Pea and Bean Farming
Category Embedding Oilseed and Grain Combination Farming
Category Embedding Sugar Beet Farming
Category Embedding Hay Farming
Category Embedding Peanut Farming
Category Embedding Strawberry Farming
Category Embedding Orange Groves
Category Embedding Nursery and Tree Production
Category Embedding Shellfish Farming
Category Embedding Soil Preparation Planting & Cultivating
Category Embedding Veterinary Services
Category Embedding Landscape Architectural Services
Category Embedding Other Marine Fishing
Category Embedding Emergency Property Restoration
Category Embedding Heavy & Civil Engineering Construction
Category Embedding Foundation Structure & Building Exterior Contractors
Category Embedding Building Finishing Contractors
Category Embedding Meat Processing
Category Embedding Nonchocolate Confectionery Manufacturing
Category Embedding Roasted Nuts & Peanut Butter Manufacturing
Category Emb

In [21]:
# Writes the top matches to matches_output.csv.
# NOTE(review): the frames are passed in the opposite order of their names, so rows of
# `to_vector` (the DATA_AXLE_TO_ADD.csv frame) become matcher's `from_vector` and end up
# in the "VCategory ID"/"VCategory Name" output columns — confirm this is intended.
matcher(to_vector, from_vector)

Oilseed (except Soybean) Farming
Dry Pea and Bean Farming
Oilseed and Grain Combination Farming
Sugar Beet Farming
Hay Farming
Peanut Farming
Strawberry Farming
Orange Groves
Nursery and Tree Production
Shellfish Farming
Soil Preparation Planting & Cultivating
Veterinary Services
Landscape Architectural Services
Other Marine Fishing
Emergency Property Restoration
Heavy & Civil Engineering Construction
Foundation Structure & Building Exterior Contractors
Building Finishing Contractors
Meat Processing
Nonchocolate Confectionery Manufacturing
Roasted Nuts & Peanut Butter Manufacturing
Bottled Water Manufacturing
Tortilla Manufacturing
Apparel Knitting Mills
Fiber Yarn & Thread Mills
Cut & Sew Apparel Contractors
Cut & Sew Apparel-Except Contractors (Mfrs)
Curtain & Linen Mills
Cut Stock Resawing Lumber & Planing
Wood Window & Door Manufacturing
Wood Kitchen Cabinet & Countertop (Mfrs)
Wood Preservation
Showcase Partition Shelving & Locker (Mfrs)
Plastics Packaging Film & Sheet (Mfrs)
Gree