In [None]:
import os
import time
import threading
import json
import requests
import pandas as pd
from pandas import json_normalize
from dotenv import load_dotenv
import http.client
import urllib.parse
import snowflake.connector
from sqlalchemy import create_engine
from joblib import Parallel, delayed
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


In [None]:
# Load variables from a local .env file (python-dotenv) before reading them.
load_dotenv()

# Snowflake password taken from the environment; None when the variable is unset.
snowflake_password = os.environ.get("SNOWFLAKE_PASSWORD")

# Rate limit control: timestamp of the most recent API request and the lock
# that guards it (only referenced by the commented-out GPT classification cell).
last_request_time = 0
rate_lock = threading.Lock()

## Connect to the Snowflake database for initial raw-data processing

In [None]:
#Establish a connection to Snowflake

def connect_to_snowflake():
    """Open a Snowflake connection to the ``linkedin_db.linkedin_raw`` schema.

    The password comes from the module-level ``snowflake_password``; user,
    account, warehouse, database and schema are fixed in the settings below.

    Returns:
        The connection object on success, or ``None`` when any exception is
        raised while connecting (the error is printed, not propagated).
    """
    try:
        settings = {
            "user": "NIKKILW2025",
            "password": snowflake_password,
            "account": "gbszkwp-by30611",
            "warehouse": "SNOWFLAKE_LEARNING_WH",
            "database": "linkedin_db",
            "schema": "linkedin_raw",
        }
        connection = snowflake.connector.connect(**settings)
        print("Connection to Snowflake established successfully.")
    except Exception as err:
        # Best-effort: report the failure and let the caller see None.
        print(f"Error connecting to Snowflake: {err}")
        return None
    return connection

conn = connect_to_snowflake()
# connect_to_snowflake() signals failure by returning None; stop here so the
# query cell does not fail later with an unrelated error on a None connection.
if conn is None:
    raise RuntimeError("Snowflake connection failed; see the error printed above.")

In [None]:
#query the seniority data
def query_raw_api_data(conn):
    """Return the distinct, lower-cased seniority labels from the cleaned job table.

    Args:
        conn: Open database connection accepted by ``pandas.read_sql``.

    Returns:
        DataFrame with a single ``SENIORITY`` column, one row per distinct
        lower-cased label, indexed 0..n-1. A null label, if present, is kept.

    Raises:
        ValueError: If ``conn`` is ``None`` (e.g. the connection step failed).
    """
    if conn is None:
        raise ValueError("conn is None; establish the Snowflake connection first.")

    query = """
        SELECT DISTINCT SENIORITY FROM LINKEDIN_JOB_API_CLEANED_DATA
    """

    raw = pd.read_sql(query, conn)

    # DISTINCT runs before the lower-casing below, so labels that differ only by
    # case (e.g. 'Executive' / 'executive') would otherwise come back as
    # duplicate rows. De-duplicate again after normalising.
    df_lvl = (
        raw.assign(SENIORITY=raw["SENIORITY"].str.lower())
        .drop_duplicates(subset="SENIORITY")
        .reset_index(drop=True)
    )
    print(df_lvl.shape)
    return df_lvl

# Fetch the lower-cased seniority labels and preview the first rows.
df_lvl = query_raw_api_data(conn)
df_lvl.head()

In [23]:
# Show every distinct seniority label present in the frame.
df_lvl.SENIORITY.unique()


array(['medium-high level', 'it does not correspond', 'executive',
       'not valid', 'senior level', 'minimum experience', 'management',
       'without experience', 'middle head manager', 'assistant',
       'entry level', 'senior employee', 'mid-senior level',
       'middle level', 'associate', 'not applicable',
       'exaggeration and manager', 'full-senior',
       'medium-upper level manager', 'association', 'base level',
       'some responsibility', 'unprecedented', 'beginner level',
       'trainee', 'middle level seniors', 'medium-level',
       'average seniority level', 'cadre', 'intermediate', 'internship',
       'junior', 'first job', 'director'], dtype=object)

In [None]:
# # Call the OpenAI chat completions API (model: gpt-4-1106-preview)
# def job_seniority_category_single_gpt(lvl):
#     global last_request_time

#     # Concurrency Rate Limit: No more than 2 times per second
#     with rate_lock:
#         elapsed = time.time() - last_request_time
#         if elapsed < 0.5:
#             time.sleep(0.5 - elapsed)
#         last_request_time = time.time()

#     # Process empty values
#     if pd.isna(lvl) or not lvl:
#         return "Not Applicable"

#     processed_input = str(lvl).lower().replace("_", " ")

#     session = requests.Session()
#     retries = Retry(
#         total=5,
#         backoff_factor=2,
#         status_forcelist=[500, 502, 503, 504]
#     )
#     session.mount('https://', HTTPAdapter(max_retries=retries))

#     try:
#         print(f"\nProcessing: {lvl}")
#         response = session.post(
#             "https://api.openai.com/v1/chat/completions",
#             headers={
#                 "Authorization": f"Bearer {os.getenv('CHATGPT_API_KEY')}",
#                 "Content-Type": "application/json",
#                 "User-Agent": "JobClassification/1.0 (Python)"
#             },
#             json={
#                 "model": "gpt-4-1106-preview",
#                 "messages": [
#                     {
#                         "role": "system",
#                         "content": (
#                             "You are an advanced job title classifier. "
#                             "Categorize the job experience into EXACTLY ONE of:\n"
#                             "'No Experience', 'Entry Level', 'Intermediate', 'Senior', 'Manager/Executive', 'Not Applicable'\n"
#                             "Special Rules:\n"
#                             "- Contains 'Minimum experience' (case insensitive) → Entry Level\n"
#                             "- Contains 'association' or 'associate' (case insensitive) → Entry Level\n"
#                             "Examples:\n"
#                             "- 'medium-high level' → Intermediate\n"
#                             "- 'executive' → Manager/Executive\n"
#                             "- 'mid-senior level' → Intermediate\n"
#                             "- 'entry level' → Entry Level\n"
#                             "- 'minimum experience' → Entry Level\n"
#                             "- 'association Level' → Entry Level\n"
#                             "- 'associate' → Entry Level\n"
#                             "- 'first job' → Entry Level\n"
#                             "Definitions:\n"
#                             "- Entry Level: 0-2 years experience\n"
#                             "- Intermediate: 3-5 years experience\n"
#                             "- Senior: 6+ years experience\n"
#                             "Output ONLY the category name, no explanations."
#                         ),
#                     },
#                     {
#                         "role": "user",
#                         "content": f"Job experience: {processed_input}"
#                     }
#                 ],
#                 "temperature": 0.3
#             },
#             timeout=45
#         )

#         print(f"Status Code: {response.status_code}")
#         if response.status_code != 200:
#             print(f"Error Response: {response.text[:200]}")

#         if response.status_code == 200:
#             try:
#                 result = response.json()
#                 final_result = result["choices"][0]["message"]["content"].strip().lower().replace(" ", "")
#                 print(f"API Response: {final_result}")

#                 # mapping returned value
#                 mapping = {
#                     'noexperience': 'No Experience',
#                     'entrylevel': 'Entry Level',
#                     'entry level': 'Entry Level',
#                     'intermediate': 'Intermediate',
#                     'seniorlevel': 'Senior',
#                     'senior': 'Senior',
#                     'manager': 'Manager/Executive',
#                     'executive': 'Manager/Executive',
#                     'manager/executive': 'Manager/Executive'
#                 }
#                 return mapping.get(final_result, "Not Applicable")

#             except json.JSONDecodeError:
#                 print(f"Invalid JSON response: {response.text[:200]}")
#                 return "Not Applicable"

#         return "Not Applicable"

#     except requests.exceptions.RequestException as e:
#         print(f"API Error ({type(e).__name__}): {str(e)}")
#         return "Not Applicable"


# # Multithread process, lower the N-job number
# start = time.time()
# df_lvl['SENIORITY_STANDARDIZED'] = Parallel(n_jobs=4, backend="threading")(
#     delayed(job_seniority_category_single_gpt)(lvl) for lvl in df_lvl['SENIORITY']
# )
# print("\nFinal Results:")
# print(df_lvl.tail())
# print(df_lvl.shape)
# print(f"\nExecution time: {time.time() - start:.2f} seconds")

In [None]:
# Preview df_lvl again before the next cell writes it to Snowflake.
df_lvl.head()

In [None]:
#write to a new snowflake table for seniority

def load_to_snowflake(df_lvl, table_name="job_seniority"):
    """Write the seniority frame to a Snowflake table, replacing any existing one.

    Args:
        df_lvl: DataFrame to upload; its columns are written, its index is not.
        table_name: Destination table in ``linkedin_db.linkedin_raw``.
            Defaults to ``"job_seniority"``.

    Raises:
        ValueError: If the module-level ``snowflake_password`` is empty or unset.
    """
    if not snowflake_password:
        # Without this check a missing password would be formatted into the
        # URL as the literal text "None".
        raise ValueError(
            "SNOWFLAKE_PASSWORD is not set; cannot build the Snowflake connection URL."
        )

    # Percent-encode the password so characters such as '@', ':', '/' or '%'
    # cannot be mistaken for URL delimiters when the connection URL is parsed.
    encoded_password = urllib.parse.quote(snowflake_password, safe="")

    # Create a Snowflake connection engine
    engine = create_engine(
        'snowflake://{user}:{password}@{account}/{database}/{schema}?warehouse={warehouse}'.format(
            user="NIKKILW2025",
            password=encoded_password,
            account="gbszkwp-by30611",
            warehouse="SNOWFLAKE_LEARNING_WH",
            database="linkedin_db",
            schema="linkedin_raw"
        )
    )

    try:
        df_lvl.to_sql(
            name=table_name,
            con=engine,
            if_exists='replace',
            index=False
        )
    finally:
        # Release the engine's pooled connections even if the upload fails.
        engine.dispose()

    print(f"Data loaded to Snowflake table {table_name} successfully.")


# NOTE(review): the GPT classification cell above is commented out, so unless
# SENIORITY_STANDARDIZED was added some other way, df_lvl only carries SENIORITY
# here and if_exists='replace' will overwrite job_seniority with that single
# column. Confirm this is intended before running.
load_to_snowflake(df_lvl)