In [1]:
import pandas as pd
from pandas import json_normalize
import requests
import json
import snowflake.connector

from dotenv import load_dotenv
import os
import http.client
import urllib.parse

from sqlalchemy import create_engine

In [2]:
# Load environment variables through python-dotenv before reading any secrets.
load_dotenv()
# The Snowflake password comes from the environment; it is None when the variable is unset.
snowflake_password = os.environ.get('SNOWFLAKE_PASSWORD')

## Connect to the Snowflake database for initial processing of the raw data

In [3]:
#Establish a connection to Snowflake

def connect_to_snowflake():
    """Open a Snowflake connection to the ``linkedin_db.linkedin_raw`` schema.

    Returns:
        The connection object on success, or ``None`` when any exception is
        raised while connecting (the error is printed, not re-raised).
    """
    try:
        # Everything is resolved inside the ``try`` so that any failure is
        # reported through the same printed message.
        open_connection = snowflake.connector.connect
        connection_settings = dict(
            user="NIKKILW2025",
            password=snowflake_password,
            account="gbszkwp-by30611",
            warehouse="SNOWFLAKE_LEARNING_WH",
            database="linkedin_db",
            schema="linkedin_raw",
        )
        connection = open_connection(**connection_settings)
        print("Connection to Snowflake established successfully.")
    except Exception as error:
        print(f"Error connecting to Snowflake: {error}")
        connection = None
    return connection

# Open the notebook-wide connection; this is None when the connection attempt failed.
conn = connect_to_snowflake()

Connection to Snowflake established successfully.


In [4]:
#query the seniority data
def query_raw_api_data(conn):
    """Fetch the distinct raw seniority labels from the cleaned job-API table.

    Args:
        conn: An open database connection accepted by ``pandas.read_sql``.

    Returns:
        pandas.DataFrame: One row per distinct ``SENIORITY`` value found in
        ``LINKEDIN_JOB_API_CLEANED_DATA``. The frame's shape is printed as a
        quick sanity check.

    Raises:
        ValueError: If ``conn`` is ``None``.
    """
    # connect_to_snowflake() signals failure by returning None; fail here with
    # a clear message instead of handing None to pandas.
    if conn is None:
        raise ValueError(
            "No database connection available: 'conn' is None. "
            "Check that the Snowflake connection was established."
        )

    query = """
        SELECT DISTINCT SENIORITY FROM LINKEDIN_JOB_API_CLEANED_DATA
    """

    df_lvl = pd.read_sql(query, conn)
    print(df_lvl.shape)
    return df_lvl

# Pull the distinct seniority labels and preview the first rows.
df_lvl = query_raw_api_data(conn)
df_lvl.head()

  df_lvl = pd.read_sql(query, conn)


(33, 1)


Unnamed: 0,SENIORITY
0,Medium-high level
1,It does not correspond
2,Executive
3,Not valid
4,Senior level


In [5]:
# Classify the raw seniority labels with the DeepSeek chat-completions API


def ds_job_seniority_category(exp_lvl_raw, timeout=60):
    """Map raw seniority descriptions to standard categories via DeepSeek.

    All descriptions are sent in a single chat-completion request, one per
    line, and the reply is expected to contain one category per line in the
    same order.

    Args:
        exp_lvl_raw: Iterable of raw seniority descriptions (e.g. a pandas
            Series). Each item is rendered with ``str`` formatting.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        list[str] | None: The non-empty, stripped lines of the model reply,
        one per input description. Returns ``[]`` for empty input (no request
        is made) and ``None`` when the API answers with a non-200 status
        (the status and body are printed).

    Raises:
        ValueError: If the number of categories in the reply differs from the
            number of input descriptions, since the result could then not be
            aligned with the inputs.
    """
    # Materialize once so the input can be counted and iterated safely.
    descriptions = list(exp_lvl_raw)
    if not descriptions:
        return []

    deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')
    deepseek_api_url = "https://api.deepseek.com/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {deepseek_api_key}",
        "Content-Type": "application/json",
    }

    # Construct the prompt: fixed instructions followed by one description per line.
    instructions = (
        "Below are several desired job experience descriptions. \n"
        "For each one, please categorize as 'No Experience', 'Entry_Level', 'Intermediate', 'Senior', 'Manager/Executive', or 'Not Applicable'. \n"
        "Only return the category for each line, e.g.: 'No Experience'. Output should be one category per line, in the same order as the input.\n\n"
    )
    user_content = instructions + "".join(f"{lvl}\n" for lvl in descriptions)

    data = {
        "model": "deepseek-chat",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": user_content},
        ],
        "stream": False,
    }

    # A timeout keeps the notebook from hanging forever on a stalled request.
    response = requests.post(
        deepseek_api_url,
        headers=headers,
        data=json.dumps(data),
        timeout=timeout,
    )
    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None

    result = response.json()
    reply = result["choices"][0]["message"]["content"]
    exp_lvl = [line.strip() for line in reply.strip().split('\n') if line.strip()]

    # The caller pairs results with inputs by position, so a count mismatch
    # means the labels cannot be trusted.
    if len(exp_lvl) != len(descriptions):
        raise ValueError(
            f"DeepSeek returned {len(exp_lvl)} categories for "
            f"{len(descriptions)} descriptions; cannot align results with inputs."
        )
    return exp_lvl

# Classify the raw labels and attach the answers as a new column, then preview it.
exp_lvl_raw = df_lvl['SENIORITY']
df_lvl['SENIORITY_STANDARDIZED'] = ds_job_seniority_category(exp_lvl_raw)
df_lvl['SENIORITY_STANDARDIZED'].head()

0         Intermediate
1       Not Applicable
2    Manager/Executive
3       Not Applicable
4               Senior
Name: SENIORITY_STANDARDIZED, dtype: object

In [6]:
#write to a new snowflake table for seniority

def load_to_snowflake(df_lvl, table_name="job_seniority", if_exists="append"):
    """Write a DataFrame to a table in the ``linkedin_db.linkedin_raw`` schema.

    Args:
        df_lvl: DataFrame to load; written without its index.
        table_name: Target table name.
        if_exists: Passed to ``DataFrame.to_sql``. The default ``"append"``
            adds rows to an existing table, so repeated calls with the same
            frame insert the same rows again.

    Raises:
        RuntimeError: If the Snowflake password has not been loaded.
    """
    if not snowflake_password:
        raise RuntimeError(
            "SNOWFLAKE_PASSWORD is not set; cannot connect to Snowflake."
        )

    # The password is embedded in a URL, so characters such as '@', '/', ':'
    # or '%' must be percent-encoded or the URL is parsed incorrectly.
    engine = create_engine(
        'snowflake://{user}:{password}@{account}/{database}/{schema}?warehouse={warehouse}'.format(
            user="NIKKILW2025",
            password=urllib.parse.quote_plus(snowflake_password),
            account="gbszkwp-by30611",
            warehouse="SNOWFLAKE_LEARNING_WH",
            database="linkedin_db",
            schema="linkedin_raw",
        )
    )

    try:
        df_lvl.to_sql(
            name=table_name,
            con=engine,
            if_exists=if_exists,
            index=False,
        )
    finally:
        # Release pooled connections whether or not the load succeeded.
        engine.dispose()

    print(f"Data loaded to Snowflake table {table_name} successfully.")


# NOTE: the load appends to the target table, so re-running this cell inserts the same rows again.
load_to_snowflake(df_lvl)

Data loaded to Snowflake table job_seniority successfully.
