In [1]:
import pandas as pd
import weaviate
import weaviate.classes as wvc
import os
import requests
import json
from pprint import pprint
import tqdm
import matplotlib
from main.keys import open_ai_key, weaviate_url, weaviate_key
from weaviate.classes.query import Move
import torch
from spacy import displacy
import spacy
import matplotlib.pyplot as plt

In [28]:
data_ner = pd.read_feather("data/postings_w_embeddings_v2.fth")
data_original = pd.read_feather("data/postings_w_embeddings_v2_original.fth")
print(data_ner.shape)
print(data_original.shape)

(99513, 52)
(9955, 51)


In [29]:
data_ner = data_ner.drop(columns=[ 'scraped','formatted_work_type',
       'location', 'applies', 'original_listed_time', 'remote_allowed',
       'application_type', 'expiry', 'inferred_benefits', 'skills_desc', 'views', 'job_region', 'listed_time', 'degree', 'state', 'country_company',
       'city', 'zip_code', 'address',
       'posting_domain','description_company', 'sponsored','closed_time','job_posting_url','application_url','url', 'description'])

In [30]:
data_ner.to_feather("data/postings_w_embeddings_v2_small.fth")

In [27]:
data_ner.columns

Index(['job_id', 'scraped', 'company_id', 'work_type', 'formatted_work_type',
       'location', 'applies', 'original_listed_time', 'remote_allowed',
       'application_type', 'expiry', 'inferred_benefits', 'closed_time',
       'formatted_experience_level', 'years_experience', 'title',
       'skills_desc', 'views', 'job_region', 'listed_time', 'degree',
       'posting_domain', 'sponsored', 'country', 'country_code',
       'job_functions', 'industry_names', 'company_name',
       'description_company', 'company_size', 'state', 'country_company',
       'city', 'zip_code', 'address', 'text', 'entities_COMPANY',
       'entities_METHODS', 'entities_TOOLS', 'entities_EXPERIENCE',
       'entities_LEVEL', 'entities_REMOTE', 'entities_RESPONSABILITY',
       'entities_TITLE', 'entities_QUALIFICATION', 'vector', 'wv_uuid',
       'annotations'],
      dtype='object')

In [31]:
data_ner.head()
data_ner.memory_usage(deep=True)

Index                               132
job_id                           796104
company_id                       796104
work_type                       6558260
formatted_experience_level      6666543
years_experience                 796104
title                           9526364
country                         6009910
country_code                    5871267
job_functions                  11941560
industry_names                 11941560
company_name                    7421784
company_size                     796104
text                          907788734
entities_COMPANY               11941560
entities_METHODS               11941560
entities_TOOLS                 11941560
entities_EXPERIENCE            11941560
entities_LEVEL                 11941560
entities_REMOTE                11941560
entities_RESPONSABILITY        11941560
entities_TITLE                 11941560
entities_QUALIFICATION         11941560
vector                         11941560
wv_uuid                         9254709


In [4]:
data = data_ner.copy()
data['entities_TOOLS'] = data['entities_TOOLS'].apply(lambda x: x.tolist())
data['entities_METHODS'] = data['entities_METHODS'].apply(lambda x: x.tolist())
data['entities_REMOTE'] = data['entities_REMOTE'].apply(lambda x: x.tolist())
data['entities_EXPERIENCE'] = data['entities_EXPERIENCE'].apply(lambda x: x.tolist())
data['entities_RESPONSABILITY'] = data['entities_RESPONSABILITY'].apply(lambda x: x.tolist())
data['job_functions'] = data['job_functions'].apply(lambda x: x.tolist())
data['industry_names'] = data['industry_names'].apply(lambda x: x.tolist())
data_ner = data.copy()

In [5]:
data = data_original.copy()
data['entities_TOOLS'] = data['entities_TOOLS'].apply(lambda x: x.tolist())
data['entities_METHODS'] = data['entities_METHODS'].apply(lambda x: x.tolist())
data['entities_REMOTE'] = data['entities_REMOTE'].apply(lambda x: x.tolist())
data['entities_EXPERIENCE'] = data['entities_EXPERIENCE'].apply(lambda x: x.tolist())
data['entities_RESPONSABILITY'] = data['entities_RESPONSABILITY'].apply(lambda x: x.tolist())
data['job_functions'] = data['job_functions'].apply(lambda x: x.tolist())
data['industry_names'] = data['industry_names'].apply(lambda x: x.tolist())
data_original = data.copy()

In [6]:
entities = data_ner[['company_name','title','formatted_experience_level','job_functions', 'industry_names', 'entities_TOOLS', 'entities_METHODS', 'entities_REMOTE', 'entities_EXPERIENCE', 'entities_RESPONSABILITY',]]

In [7]:
entities_dict = {}

for col in entities.columns:
    col_list = entities[col].tolist()
    print(col)
    
    col_list_flat = []
    for item in col_list:
        if isinstance(item, (list, tuple)):
            col_list_flat.extend([subitem for subitem in item if subitem is not None])
        elif item is not None:
            col_list_flat.append(item)
    
    entities_dict[col] = col_list_flat

company_name
title
formatted_experience_level
job_functions
industry_names
entities_TOOLS
entities_METHODS
entities_REMOTE
entities_EXPERIENCE
entities_RESPONSABILITY


In [8]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def calculate_embedding_metrics(df_sample):
    # Ensure 'description' column exists
    if 'description' not in df_sample.columns:
        raise ValueError("DataFrame must contain 'description' column")

    # List of NER columns
    ner_columns = [
        'company_name', 'title', 'formatted_experience_level', 'job_functions', 
        'industry_names', 'entities_TOOLS', 'entities_METHODS', 'entities_REMOTE', 
        'entities_EXPERIENCE', 'entities_RESPONSABILITY'
    ]

    # Calculate total length of descriptions
    total_description_length = df_sample['description'].str.len().sum()

    # Calculate total length of all NERs and collect all NERs
    total_ner_length = 0
    all_ners = []
    for col in ner_columns:
        if col in df_sample.columns:
            # Sum lengths of all entities in the list, handling None values
            total_ner_length += df_sample[col].apply(lambda x: sum(len(str(entity)) for entity in (x or []) if entity is not None)).sum()
            # Collect all non-None entities
            all_ners.extend([str(entity) for ner_list in df_sample[col].dropna() for entity in (ner_list or []) if entity is not None])

    # Calculate length of unique NERs
    unique_ners = set(all_ners)
    unique_ner_length = sum(len(entity) for entity in unique_ners)

    return {
        'full_description_length': total_description_length,
        'full_ner_length': total_ner_length,
        'unique_ner_length': unique_ner_length
    }

def generate_plot_data(df, sample_sizes):
    plot_data = []
    for size in sample_sizes:
        sample = df.sample(n=size, random_state=42)
        metrics = calculate_embedding_metrics(sample)
        metrics['sample_size'] = size
        plot_data.append(metrics)
    return pd.DataFrame(plot_data)

# Example usage:
# Assuming 'data_ner' is your full DataFrame
sample_sizes = [100, 500, 1000, 5000, 10000,20000,30000,40000,50000,60000,70000,80000,90000, len(data_ner)]  # Adjust as needed
plot_data = generate_plot_data(data_ner, sample_sizes)

# Create the Plotly figure
fig = make_subplots(rows=1, cols=1, shared_xaxes=True)

# Add traces for each metric
fig.add_trace(
    go.Scatter(x=plot_data['sample_size'], y=plot_data['full_description_length'], 
               mode='lines+markers', name='Full Description', line=dict(color='blue'))
)
fig.add_trace(
    go.Scatter(x=plot_data['sample_size'], y=plot_data['full_ner_length'], 
               mode='lines+markers', name='Full NERs', line=dict(color='red'))
)
fig.add_trace(
    go.Scatter(x=plot_data['sample_size'], y=plot_data['unique_ner_length'], 
               mode='lines+markers', name='Unique NERs', line=dict(color='green'))
)

# Update layout
fig.update_layout(
    title='Number of Tokens vs Listings Embedded',
    xaxis_title='Number of Listings',
    yaxis_title='Number of Tokens',
    legend_title='Embedding Method',
    hovermode='x unified',
    template='plotly_white'
)

# Add gridlines
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey')

# Show the plot
fig.show()


In [9]:
import pandas as pd

def generate_unique_ner_dict(df):
    # List of NER columns
    ner_columns = [
        'company_name', 'title', 'formatted_experience_level', 'job_functions', 
        'industry_names', 'entities_TOOLS', 'entities_METHODS', 'entities_REMOTE', 
        'entities_EXPERIENCE', 'entities_RESPONSABILITY'
    ]

    unique_ner_dict = {}

    for col in ner_columns:
        if col in df.columns:
            # Flatten the list of lists and get unique entries
            unique_entries = set()
            for entry_list in df[col].dropna():
                if isinstance(entry_list, list):
                    unique_entries.update(str(item) for item in entry_list if item is not None)
                else:
                    unique_entries.add(str(entry_list))
            
            # Store the unique entries in the dictionary
            unique_ner_dict[col] = sorted(list(unique_entries))

    return unique_ner_dict

# Example usage:
# Assuming 'data_ner' is your DataFrame
unique_ners = generate_unique_ner_dict(data_ner)

# Print the results
for column, entries in unique_ners.items():
    print(f"\n{column}:")
    print(f"Number of unique entries: {len(entries)}")
    print("Sample entries (up to 10):", entries[:10])


company_name:
Number of unique entries: 13750
Sample entries (up to 10): [' Coeur d’Alene Casino Resort Hotel', ' Farmers® Insurance of Michigan - District Office ', ' Free Rein Life Ltd', ' Georgia Pacific', ' Health Matching Account Services', ' Pilot Company', ' The Times-Picayune | Nola.com', '"A" Best International Placement Services, LLC ', '#DegreesNYC', '#twiceasnice Recruiting']

title:
Number of unique entries: 62527
Sample entries (up to 10): ['  Lead Frontend Engineer ', ' Account Manager Entry Level', ' Associate Technical Account Manager', ' Corporate Lawyer - London - 1-5PQE', ' Falcon 2000Easy', ' Field QA Specialist', ' Field Service Engineer', ' Head Teller (PC 14)', ' Instructional Designer and Technologist I', ' Intune Endpoint Engineer']

formatted_experience_level:
Number of unique entries: 6
Sample entries (up to 10): ['Associate', 'Director', 'Entry level', 'Executive', 'Internship', 'Mid-Senior level']

job_functions:
Number of unique entries: 35
Sample entrie

In [11]:
from main.keys import open_ai_key, weaviate_url, weaviate_key
import os
import weaviate
import openai
from typing import List, Dict
import numpy as np
from tqdm import tqdm

os.environ["OPENAI_APIKEY"] = open_ai_key
os.environ["WCD_URL"] = weaviate_url
os.environ["WCD_API_KEY"] = weaviate_key

openai_api_key = os.environ.get("OPENAI_APIKEY", "<your OpenAI API key if not set as env var>")
openai.api_key = openai_api_key

def vectorize(texts: List[str]) -> List[List[float]]:
    # print(texts)
    response = openai.embeddings.create(
        input=texts, model="text-embedding-3-small"
    )
    # print(f"{[item.embedding for item in response.data]=}")
    return [item.embedding for item in response.data]

def prepare_entity_texts(entities_dict):
    prepared_texts = {}
    for column, entities in entities_dict.items():
        if column.startswith('entities_'):
            prefix = column.split('_')[1] + ': '
        else:
            prefix = column + ': '
        prepared_texts[column] = [prefix + entity for entity in entities]
    return prepared_texts

def generate_embeddings_for_entities(entities_dict: Dict[str, List[str]], batch_size: int = 1000) -> Dict[str, Dict[str, List[float]]]:
    embeddings_dict = {}
    prepared_texts = prepare_entity_texts(entities_dict)
    
    for entity_type, texts in prepared_texts.items():
        print(f"Generating embeddings for {entity_type}")
        # if entity_type != "entities_TOOLS":
            # continue
        print(f"Generating embeddings for {entity_type}")
        entity_embeddings = {}
        
        for i in tqdm(range(0, len(texts), batch_size)):
            batch = texts[i:i+batch_size]
            try:
                embeddings = vectorize(batch)
                for text, embedding in zip(batch, embeddings):
                    entity = text.split(': ', 1)[1]  # Extract the entity part
                    entity_embeddings[entity] = embedding
            except Exception as e:
                print(f"Error in batch {i}-{i+batch_size} for {entity_type}: {str(e)}")
            # break
        
        embeddings_dict[entity_type] = entity_embeddings
    
    return embeddings_dict

# Generate embeddings for all unique entities
embeddings_dict = generate_embeddings_for_entities(unique_ners)


# Save embeddings to disk
np.save('data/entity_embeddings_ner.npy', embeddings_dict)

print("Embeddings generation completed and saved to disk.")


Generating embeddings for company_name
Generating embeddings for company_name


100%|██████████| 14/14 [00:39<00:00,  2.82s/it]


Generating embeddings for title
Generating embeddings for title


100%|██████████| 63/63 [02:54<00:00,  2.77s/it]


Generating embeddings for formatted_experience_level
Generating embeddings for formatted_experience_level


100%|██████████| 1/1 [00:00<00:00,  1.92it/s]


Generating embeddings for job_functions
Generating embeddings for job_functions


100%|██████████| 1/1 [00:00<00:00,  1.19it/s]


Generating embeddings for industry_names
Generating embeddings for industry_names


100%|██████████| 1/1 [00:01<00:00,  1.54s/it]


Generating embeddings for entities_TOOLS
Generating embeddings for entities_TOOLS


100%|██████████| 9/9 [00:23<00:00,  2.66s/it]


Generating embeddings for entities_METHODS
Generating embeddings for entities_METHODS


100%|██████████| 17/17 [00:45<00:00,  2.68s/it]


Generating embeddings for entities_REMOTE
Generating embeddings for entities_REMOTE


100%|██████████| 1/1 [00:02<00:00,  2.14s/it]


Generating embeddings for entities_EXPERIENCE
Generating embeddings for entities_EXPERIENCE


100%|██████████| 42/42 [02:11<00:00,  3.13s/it]


Generating embeddings for entities_RESPONSABILITY
Generating embeddings for entities_RESPONSABILITY


100%|██████████| 279/279 [13:34<00:00,  2.92s/it]


Embeddings generation completed and saved to disk.


In [12]:
embeddings_dict = np.load('data/entity_embeddings_ner.npy', allow_pickle=True).item()

In [13]:
import pandas as pd
import numpy as np

flattened_data = [
    {
        'type': entity_type,
        'value': entity_value,
        'embedding': np.array(embedding)
    }
    for entity_type, entities in embeddings_dict.items()
    for entity_value, embedding in entities.items()
]

df_embeddings = pd.DataFrame(flattened_data)

print(df_embeddings.head())
print(df_embeddings.shape)

           type                                              value  \
0  company_name                  Coeur d’Alene Casino Resort Hotel   
1  company_name   Farmers® Insurance of Michigan - District Off...   
2  company_name                                 Free Rein Life Ltd   
3  company_name                                    Georgia Pacific   
4  company_name                   Health Matching Account Services   

                                           embedding  
0  [0.018065080046653748, -0.0579984188079834, 0....  
1  [0.0029444394167512655, 0.0003384020528756082,...  
2  [0.05414898321032524, -0.026501184329390526, 0...  
3  [-0.00013810588279739022, -0.01255276892334222...  
4  [-0.004248972982168198, -0.017296072095632553,...  
(422783, 3)


In [14]:
df_embeddings_tools = df_embeddings.query('type == "entities_RESPONSABILITY"')
df_embeddings_tools.shape

(278540, 3)

In [15]:
embeddings_dict['formatted_experience_level']

{'Associate': [-0.021765707060694695,
  0.00182890216819942,
  0.020852241665124893,
  -0.01787559874355793,
  0.005594983696937561,
  0.0035042474046349525,
  0.03776712343096733,
  0.031372856348752975,
  0.005303619429469109,
  0.051815614104270935,
  0.026112549006938934,
  0.003783799707889557,
  -0.03154610097408295,
  0.04935870319604874,
  0.0330737940967083,
  0.07685720175504684,
  0.008551938459277153,
  -0.010024510324001312,
  -0.0009228179696947336,
  0.03392426669597626,
  0.006543886847794056,
  -0.006689568981528282,
  0.011347461491823196,
  0.02576606161892414,
  0.005008317530155182,
  -0.028112726286053658,
  -0.045295350253582,
  0.006043842528015375,
  0.037074148654937744,
  -0.015473811887204647,
  0.0581783801317215,
  -0.04217696562409401,
  0.07244735956192017,
  0.005409928038716316,
  0.002045456785708666,
  0.03764113038778305,
  0.04658680409193039,
  0.001396777224726975,
  0.026900021359324455,
  -0.020096268504858017,
  -0.00921341497451067,
  -0.0197

In [16]:
import pandas as pd

# Assuming your DataFrame is named 'df'

# List of columns we want to include in our dictionary
columns_to_include = [
    'company_name', 'title', 'formatted_experience_level', 'job_functions', 
    'industry_names', 'entities_TOOLS', 'entities_METHODS', 'entities_REMOTE', 
    'entities_EXPERIENCE', 'entities_RESPONSABILITY'
]

# Convert DataFrame to dictionary
job_dict = {}

for _, row in data_original.iterrows():
    job_id = row['job_id']
    job_dict[job_id] = {col: row[col] for col in columns_to_include}

    # Convert list-like strings to actual lists
    for col in columns_to_include:
        if isinstance(job_dict[job_id][col], str) and job_dict[job_id][col].startswith('[') and job_dict[job_id][col].endswith(']'):
            try:
                job_dict[job_id][col] = eval(job_dict[job_id][col])
            except:
                # If eval fails, keep the original string
                pass

# Print a sample of the dictionary to verify
sample_job_id = next(iter(job_dict))
sample_job_id = next(iter(job_dict))
print(f"Sample entry for job_id {sample_job_id}:")
print(job_dict[sample_job_id])

# Print the total number of jobs in the dictionary
print(f"\nTotal number of jobs: {len(job_dict)}")


Sample entry for job_id 3940522647:
{'company_name': 'AlliedTravelCareers', 'title': 'Travel PT - $2,410 per week in Plano, TX', 'formatted_experience_level': 'Mid-Senior level', 'job_functions': ['Management', 'Manufacturing'], 'industry_names': ['Hospitals and Health Care'], 'entities_TOOLS': ['BluePipes.', 'BluePipes.'], 'entities_METHODS': ['Partner of the Year, Largest Patient Impact, and Most Engaged Travel Associate.'], 'entities_REMOTE': [], 'entities_EXPERIENCE': [], 'entities_RESPONSABILITY': []}

Total number of jobs: 9955


In [17]:
data_original.query('job_id == 3940522647').description.values

array(["AlliedTravelCareers is working with FlexCare Medical Staffing to find a qualified PT in Plano, Texas, 75023! Pay Information $2,410 per week About The Position FlexCare is a nationwide leader in the staffing of travel nurses and clinicians. With access to thousands of facilities around the country, our mission is to deliver premier travel staffing solutions with integrity and transparency. FlexCare’s exceptional service and reputation make it one of the most awarded healthcare staffing companies in the industry each year, ranking as one of the top travel nursing companies in the nation from industry authorities like Highway Hypodermics, Travel Nursing Central, and BluePipes. FlexCare has also been recognized by hospital MSP's with awards such as; Partner of the Year, Largest Patient Impact, and Most Engaged Travel Associate. 8881465EXPPLAT Benefits FlexCare's robust benefits package for all travel assignments includes: MaxPay - our commitment to offer the maximum pay package wi

In [18]:
embeddings_dict['company_name']['AlliedTravelCareers']

[0.01218738779425621,
 -0.043197814375162125,
 0.08243010193109512,
 0.05049438402056694,
 -0.03584836795926094,
 -0.004097711760550737,
 0.00018433507648296654,
 0.03579549491405487,
 0.0357426218688488,
 -0.017131078988313675,
 -0.026873057708144188,
 -0.02826099283993244,
 0.03896791860461235,
 -0.0165626872330904,
 0.015333373099565506,
 0.02693914994597435,
 -0.013760380446910858,
 -0.003345913952216506,
 0.0007984755211509764,
 -0.020396029576659203,
 0.026621907949447632,
 -0.000733622582629323,
 -0.049701280891895294,
 -0.02836674079298973,
 0.017421884462237358,
 -0.018267864361405373,
 -0.015346591360867023,
 -0.004286074545234442,
 0.008942265063524246,
 -0.04108286648988724,
 0.08549677580595016,
 -0.050679441541433334,
 0.023621326312422752,
 0.02678052894771099,
 0.018704071640968323,
 0.04798288270831108,
 0.029715020209550858,
 0.02306615188717842,
 0.018492577597498894,
 -0.015346591360867023,
 0.008486228995025158,
 -0.003896130947396159,
 0.021162698045372963,
 0.069

In [19]:
import numpy as np

# Assuming you have your job dictionary as 'job_dict' and your embeddings dictionary as 'embeddings_dict'

def get_embedding(key, value, embeddings_dict):
    if key not in embeddings_dict:
        return []
    
    if isinstance(value, list):
        return [embeddings_dict[key].get(item, []) for item in value if item in embeddings_dict[key]]
    elif isinstance(value, str):
        return embeddings_dict[key].get(value, [])
    else:
        print('defaulting')
        return []

# Transform the job dictionary
transformed_job_dict = {}

for job_id, job_data in job_dict.items():
    transformed_job_data = {}
    transformed_job_data['embeddings'] = {}
    for key, value in job_data.items():
        if key in ['company_name', 'title', 'formatted_experience_level']:
            # For single string fields
            embedding = get_embedding(key, value, embeddings_dict)
            transformed_job_data['embeddings'][key] = {value: embedding} if embedding else {}
        elif key in ['job_functions', 'industry_names']:
            # For list fields that are not entity fields
            embeddings = get_embedding(key, value, embeddings_dict)
            transformed_job_data['embeddings'][key] = dict(zip(value, embeddings)) if embeddings else {}
        elif key.startswith('entities_'):
            # For entity fields
            embeddings = get_embedding(key, value, embeddings_dict)
            transformed_job_data['embeddings'][key] = dict(zip(value, embeddings)) if embeddings else {}
    
    transformed_job_dict[job_id] = transformed_job_data
    # break  # Remove this line if you want to process all jobs, not just the first one

# Print a sample of the transformed dictionary to verify
sample_job_id = next(iter(transformed_job_dict))
print(f"Sample entry for job_id {sample_job_id}:")
for key, value in transformed_job_dict[sample_job_id].items():
    print(f"{key}: {value}")

# Print the total number of jobs in the transformed dictionary
print(f"\nTotal number of jobs: {len(transformed_job_dict)}")


defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting
defaulting

In [20]:
# Step 2: Add average embeddings
for job_id, job_data in transformed_job_dict.items():
    all_embeddings = []
    for key, value in job_data['embeddings'].items():
        if value == {}:
            continue
        if isinstance(value, dict):
            emb_data = np.stack([emb for emb in value.values()])
            emb_data = np.mean(emb_data, axis=0).tolist()
            all_embeddings.extend([emb_data])
        elif isinstance(value, list):
            all_embeddings.extend([emb for emb in value if emb != []])
    
    all_embeddings = np.stack(all_embeddings, axis=0)
    avg_embedding = np.mean(all_embeddings, axis=0).tolist()
    transformed_job_dict[job_id]['average_embedding'] = avg_embedding
    transformed_job_dict[job_id]['num_embeddings'] = all_embeddings.shape[0]
    # break

# Print a sample of the transformed dictionary to verify
sample_job_id = next(iter(transformed_job_dict))
print(f"Sample entry for job_id {sample_job_id}:")
print("Embeddings:")
for key, value in transformed_job_dict[sample_job_id]['embeddings'].items():
    print(f"  {key}: {value}")
print("Average Embedding:")
print(transformed_job_dict[sample_job_id]['average_embedding'])  # Printing first 5 elements

# Print the total number of jobs in the transformed dictionary
print(f"\nTotal number of jobs: {len(transformed_job_dict)}")

Sample entry for job_id 3940522647:
Embeddings:
  company_name: {'AlliedTravelCareers': [0.01218738779425621, -0.043197814375162125, 0.08243010193109512, 0.05049438402056694, -0.03584836795926094, -0.004097711760550737, 0.00018433507648296654, 0.03579549491405487, 0.0357426218688488, -0.017131078988313675, -0.026873057708144188, -0.02826099283993244, 0.03896791860461235, -0.0165626872330904, 0.015333373099565506, 0.02693914994597435, -0.013760380446910858, -0.003345913952216506, 0.0007984755211509764, -0.020396029576659203, 0.026621907949447632, -0.000733622582629323, -0.049701280891895294, -0.02836674079298973, 0.017421884462237358, -0.018267864361405373, -0.015346591360867023, -0.004286074545234442, 0.008942265063524246, -0.04108286648988724, 0.08549677580595016, -0.050679441541433334, 0.023621326312422752, 0.02678052894771099, 0.018704071640968323, 0.04798288270831108, 0.029715020209550858, 0.02306615188717842, 0.018492577597498894, -0.015346591360867023, 0.008486228995025158, -0.00

In [21]:
data_ner['average_embedding'] = data_ner['job_id'].apply(lambda x: transformed_job_dict[x]['average_embedding'])
data_ner['num_embedding'] = data_ner['job_id'].apply(lambda x: transformed_job_dict[x]['num_embeddings'])

KeyError: 3940764958

In [None]:
data_ner['num_embedding']

1254348    7
1635670    7
1156879    8
1246363    6
2419261    8
          ..
1446578    7
8873       6
889210     6
840379     7
76326      7
Name: num_embedding, Length: 9955, dtype: int64

In [None]:
from typing import List
import os
import openai
from main.keys import open_ai_key, weaviate_url, weaviate_key
import os
import weaviate
import openai

os.environ["OPENAI_APIKEY"] = open_ai_key
os.environ["WCD_URL"] = weaviate_url
os.environ["WCD_API_KEY"] = weaviate_key

openai_api_key = os.environ.get("OPENAI_APIKEY", "<your OpenAI API key if not set as env var>")
openai.api_key = openai_api_key
# Define a function to call the endpoint and obtain embeddings
def vectorize(texts: List[str]) -> List[List[float]]:

    response = openai.embeddings.create(
        input=texts, model="text-embedding-3-small"
    )

    return response.data[0].embedding

x_text = "Machine Learning Engineer"
x_vector = vectorize([x_text])

y_text = "Data Scientist"
y_vector = vectorize([y_text])

z_text = "Accountant"
z_vector = vectorize([z_text])

In [None]:
def cosine_distance( a, b):
    #Cosine distance is 1 - cosine similarity
    # cos(theta) = (a . b) / (||a|| ||b||)
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*(np.linalg.norm(b)))
    return 1 - cos_sim

In [None]:
df = data_ner.copy()

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import plot
# Assuming df is your DataFrame and it has columns 'tsne-2d-one', 'tsne-2d-two' and 'title'
df = df.copy()
df['x_dist'] = df['average_embedding'].apply(lambda x: cosine_distance(x, x_vector))
df['y_dist'] = df['average_embedding'].apply(lambda x: cosine_distance(x, y_vector))
df['z_dist'] = df['average_embedding'].apply(lambda x: cosine_distance(x, z_vector))

distance_df = df.copy()
x = distance_df['x_dist'].values
y = distance_df['y_dist'].values
z = distance_df['z_dist'].values
num_embeddings = distance_df['num_embedding'].values
global_min = min(x.min(), y.min(), z.min())
global_max = max(x.max(), y.max(), z.max())
fig = go.Figure(data=go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker=dict(
        size=8,
        color=x+y+z ,  # set color to cluster values
        colorscale='Viridis',  # choose a colorscale
        opacity=0.8,
        colorbar=dict(title='Semantic Distance')
    ),
    # marker=dict(size=8, opacity=0.5),
    text=distance_df['title']+' | '+distance_df['company_name']+' | '+distance_df['num_embedding'].astype(str),  # this will set the hover text
    hoverinfo='text',
    name='Semantic Distance'

    
))

fig.update_layout(title='t-SNE plot',
                  scene=dict(
                      xaxis_title=x_text,
                      yaxis_title=y_text,
                      zaxis_title=z_text,
                      aspectmode='cube',
                      xaxis=dict(range=[global_min, global_max]),  # set range for x axis
                      yaxis=dict(range=[global_min, global_max]),  # set range for y axis
                      zaxis=dict(range=[global_min, global_max]),  # set range for z axis
                  ),
                  hovermode='closest')


fig.show()

plot(fig)

'temp-plot.html'

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import plot
# Assuming df is your DataFrame and it has columns 'tsne-2d-one', 'tsne-2d-two' and 'title'
df = df.copy()
df['x_dist'] = df['vector'].apply(lambda x: cosine_distance(x, x_vector))
df['y_dist'] = df['vector'].apply(lambda x: cosine_distance(x, y_vector))
df['z_dist'] = df['vector'].apply(lambda x: cosine_distance(x, z_vector))

distance_df = df.copy()
x = distance_df['x_dist'].values
y = distance_df['y_dist'].values
z = distance_df['z_dist'].values
num_embeddings = distance_df['num_embedding'].values
global_min = min(x.min(), y.min(), z.min())
global_max = max(x.max(), y.max(), z.max())
fig = go.Figure(data=go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker=dict(
        size=8,
        color=x+y+z ,  # set color to cluster values
        colorscale='Viridis',  # choose a colorscale
        opacity=0.8,
        colorbar=dict(title='Semantic Distance')
    ),
    # marker=dict(size=8, opacity=0.5),
    text=distance_df['title']+' | '+distance_df['company_name']+' | '+distance_df['num_embedding'].astype(str),  # this will set the hover text
    hoverinfo='text',
    name='Semantic Distance'

    
))

fig.update_layout(title='t-SNE plot',
                  scene=dict(
                      xaxis_title=x_text,
                      yaxis_title=y_text,
                      zaxis_title=z_text,
                      aspectmode='cube',
                      xaxis=dict(range=[global_min, global_max]),  # set range for x axis
                      yaxis=dict(range=[global_min, global_max]),  # set range for y axis
                      zaxis=dict(range=[global_min, global_max]),  # set range for z axis
                  ),
                  hovermode='closest')


fig.show()

plot(fig)

'temp-plot.html'