In [1]:
import sys
# Register the backend source directory on the import path exactly once, so
# re-running this cell does not pile up duplicate entries.
backend_path = '../backend'
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

In [2]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sqlalchemy import create_engine, select, values, update, and_, exists, text, distinct
from sqlalchemy.orm import sessionmaker, join
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink, NaicsCodes
from app.models.schema import NoticeBase, ResourceLinkBase, ResourceLinkSimple, NaicsCodeBase, NaicsCodeSimple
from dotenv import load_dotenv
import os
import requests
import json
import re
import pandas as pd
from pyspark.sql import SparkSession
import pendulum
from pyspark.sql.functions import col
from pydantic import BaseModel, ConfigDict
from typing_extensions import Optional, List
import tiktoken
from openai import OpenAI
import anthropic
import plotly.graph_objects as go


In [7]:

# Load variables from a local .env file (if one is found) into the process
# environment before reading them. `load_dotenv` is imported at the top of the
# notebook but is not called in any visible cell, so without this call a key
# stored only in .env would not be present in os.environ.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")  # None when the variable is unset

# Database
# NOTE(review): the connection URL embeds a username/password; this looks like a
# local development default — confirm before sharing or committing the notebook.
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"
engine = create_engine(DATABASE_URL)
# Session factory bound to the engine; autocommit and autoflush are both disabled.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# UTC date five days before now, formatted as YYYYMMDD.
selected_date = pendulum.now("utc").subtract(days=5).strftime("%Y%m%d")

In [8]:
# OpenAI API client built with default arguments (no API key is passed explicitly here).
client = OpenAI()

In [9]:
# Free-text search phrase; it is embedded and compared against NAICS code
# description embeddings in the query cell.
query = "cabinet making"

In [11]:
# Embed the search phrase with the OpenAI embeddings endpoint.
res = client.embeddings.create(input=query, model="text-embedding-3-small")
query_embed = res.data[0].embedding

# Select the 20 NAICS rows whose description embedding has the smallest
# L2 distance to the query embedding.
stmt = (
    select(NaicsCodes)
    .order_by(NaicsCodes.description_embedding.l2_distance(query_embed))
    .limit(20)
)

with SessionLocal() as db:
    result = db.execute(stmt)
    codes = result.scalars().all()
    # Validate while the session is still open so every ORM attribute is readable.
    data = [NaicsCodeBase.model_validate(row) for row in codes]

In [12]:
# Titles and embedding vectors are taken from `data` in the same order, so
# row k of `embeddings` corresponds to labels[k].
labels = [code.title for code in data]
embeddings = np.array([code.description_embedding for code in data])

In [13]:
# Display the titles of the retrieved NAICS codes.
labels

['Wood Kitchen Cabinet and Countertop Manufacturing',
 'Finish Carpentry Contractors',
 'Nonupholstered Wood Household Furniture Manufacturing',
 'Custom Architectural Woodwork and Millwork Manufacturing',
 'Wood Office Furniture Manufacturing',
 'Showcase, Partition, Shelving, and Locker Manufacturing',
 'Office Furniture (except Wood) Manufacturing',
 'Institutional Furniture Manufacturing',
 'All Other Miscellaneous Wood Product Manufacturing',
 'Other Building Finishing Contractors',
 'Cut Stock, Resawing Lumber, and Planing',
 'Other Millwork (including Flooring)',
 'Upholstered Household Furniture Manufacturing',
 'Other Building Material Dealers',
 'Household Furniture (except Wood and Upholstered) Manufacturing',
 'Reupholstery and Furniture Repair',
 'Sawmill, Woodworking, and Paper Machinery Manufacturing',
 'Furniture Retailers',
 'Special Die and Tool, Die Set, Jig, and Fixture Manufacturing',
 'Hardwood Veneer and Plywood Manufacturing']

In [16]:
# Shape of the first embedding vector (sanity check on the dimensionality).
embeddings[0].shape

(1536,)

In [None]:
# Pairwise dot products between all embedding rows: entry [i, j] is the dot
# product of embeddings[i] and embeddings[j].
similarity_matrix = embeddings @ embeddings.T


In [None]:
# Display the pairwise dot-product matrix of the embeddings.
similarity_matrix

Create Node Graph

In [None]:
# Build a complete weighted graph with one node per label; each edge weight is
# the dot-product similarity of the two corresponding embeddings.
G = nx.Graph()
G.add_nodes_from(labels)

for i in range(len(embeddings)):
    for j in range(i + 1, len(embeddings)):
        # Both endpoints must be label strings. Passing the bare index `j`
        # would make NetworkX create extra integer nodes instead of
        # connecting two labelled nodes.
        G.add_edge(labels[i], labels[j], weight=similarity_matrix[i, j])

In [None]:
# The spring layout starts from random positions; a fixed seed keeps the
# drawing identical across kernel restarts and re-runs.
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, with_labels=True, node_color='lightblue', edge_color='gray', font_size=8)
plt.show()

In [None]:
# Start over with a new empty graph; this rebinds G, discarding the graph built above.
G = nx.Graph()

In [None]:
# Nodes are keyed by their position in `labels`; the title is stored as the
# node's "label" attribute.
G.add_nodes_from(
    (index, {"label": title}) for index, title in enumerate(labels)
)

Add edges based on threshold

In [None]:
# Connect every pair (i < j) whose similarity is strictly above the threshold.
similarity_threshold = 0.5
G.add_edges_from(
    (i, j)
    for i in range(len(labels))
    for j in range(i + 1, len(labels))
    if similarity_matrix[i, j] > similarity_threshold
)

Positions for all the nodes

In [None]:
# The spring layout starts from random positions; a fixed seed makes the node
# coordinates (and therefore the figure) reproducible across runs.
pos = nx.spring_layout(G, seed=42)

Edges

In [None]:
# Flatten every edge into (start, end, None) coordinate triples; the None
# entries separate consecutive segments so each edge is its own line.
edge_x, edge_y = [], []
for source, target in G.edges():
    (x0, y0), (x1, y1) = pos[source], pos[target]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    hoverinfo='none',
    line=dict(width=0.5, color='#888'),
)

Nodes

In [None]:
# Node coordinates, in the iteration order of the layout dictionary.
node_x = [xy[0] for xy in pos.values()]
node_y = [xy[1] for xy in pos.values()]

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        reversescale=True,
        color=[],  # placeholder; assigned later from the per-node connection counts
        size=10,
        colorbar=dict(
            thickness=15,
            # The colorbar title is given as a nested dict: the flat `titleside`
            # attribute is deprecated and rejected by newer Plotly releases,
            # while this form is accepted by older ones as well.
            title=dict(text='Node Connections', side='right'),
            xanchor='left',
        ),
        line_width=2,
    ),
)

Color nodes by adjacencies

In [None]:
# Number of neighbours per node, in the graph's node iteration order.
node_adjacencies = [len(neighbors) for _, neighbors in G.adjacency()]

# Hover text pairs each count with the label at the same position.
node_text = [
    f'{labels[position]} (# of connections: {count})'
    for position, count in enumerate(node_adjacencies)
]

node_trace.marker.color = node_adjacencies
node_trace.text = node_text

In [None]:
# Assemble the figure: edges first so the node markers are drawn on top.
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        # Title text and font size are set through the nested `title` dict:
        # the flat `titlefont_size` shortcut is deprecated and rejected by
        # newer Plotly releases.
        title=dict(text='<br>Network graph made with Python', font=dict(size=16)),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        annotations=[
            dict(
                text="Python code: <a href='https://plotly.com/ipython-notebooks/network-graphs/'> https://plotly.com/ipython-notebooks/network-graphs/</a>",
                showarrow=False,
                xref="paper",
                yref="paper",
                x=0.005,
                y=-0.002,
            )
        ],
        # Hide grid, zero line and tick labels on both axes; the coordinates
        # come from the layout algorithm and carry no meaning of their own.
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    ),
)


In [None]:
# Render the interactive network figure.
fig.show()