In [2]:
import pandas as pd
from pandas import json_normalize
import requests
import json
import snowflake.connector

from dotenv import load_dotenv
import os
import http.client
import urllib.parse


from googletrans import Translator
import string
import re

from sqlalchemy import create_engine

In [3]:
load_dotenv()
rapidapi_key = os.getenv('RAPIDAPI_KEY')
rapidapi_host = "linkedin-job-search-api.p.rapidapi.com"
snowflake_password = os.getenv('SNOWFLAKE_PASSWORD')

In [4]:
def extract_linkedin_job_data():

    headers = {
        'x-rapidapi-key': rapidapi_key,
        'x-rapidapi-host': rapidapi_host
    }

    location = "Australia"
    limit = 100
    offset = 0
    titles = ["Data Engineer" , "Data Scientist", "Data Analyst"]

    df_daily_all = pd.DataFrame()

    for title_filter in titles:

        #URL code the title filter and location filter
        title_encoded=urllib.parse.quote(title_filter)
        location_encoded=urllib.parse.quote(location)

        #API endpoint 7day data query
        #base_url = f"/active-jb-7d?limit={limit}&offset={offset}&title_filter={title_encoded}&location_filter={location_encoded}"

        #API endpoint last 24 hour data query
        base_url = f"/active-jb-24h?limit={limit}&offset={offset}&title_filter={title_encoded}&location_filter={location_encoded}"
        url =  f"https://{rapidapi_host}{base_url}"


        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            data = response.json()
            # Convert the JSON data to a DataFrame
            df_daily = json_normalize(data)
            df_daily['job_category'] = title_filter
            print(title_filter, df_daily.shape)
            df_daily_all = pd.concat([df_daily_all, df_daily], ignore_index=True)


        else:
            print(f"Error: {response.status_code}")

    return df_daily_all

df_daily_all = extract_linkedin_job_data()
# df_daily_all

Data Engineer (23, 51)
Data Scientist (2, 45)
Data Analyst (53, 51)


In [5]:
# df_daily_all = pd.read_csv('linkedin_jobs_daily.csv')
df_daily_all.head()

Unnamed: 0,id,date_posted,date_created,title,organization,organization_url,date_validthrough,locations_raw,location_type,location_requirements_raw,...,seniority,directapply,linkedin_org_slug,salary_raw.@type,salary_raw.currency,salary_raw.value.@type,salary_raw.value.minValue,salary_raw.value.maxValue,salary_raw.value.unitText,job_category
0,1639894859,2025-05-09T12:46:03,2025-05-09T13:05:48.901434,Lead Data Engineer,Wesfarmers OneDigital,https://www.linkedin.com/company/wesfarmers-on...,2025-06-08T12:46:03,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Mid-Senior level,False,wesfarmers-onedigital,,,,,,,Data Engineer
1,1639890667,2025-05-09T12:40:18,2025-05-09T13:03:24.403216,Senior Data Engineer,Launch Recruitment,https://www.linkedin.com/company/launch-recrui...,2025-06-08T12:40:18,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Mid-Senior level,False,launch-recruitment,,,,,,,Data Engineer
2,1639797145,2025-05-09T12:23:08,2025-05-09T12:33:22.158931,Market Data Systems Senior Engineer,Westpac Institutional Bank,https://www.linkedin.com/company/westpac-insti...,2025-06-08T12:23:08,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Not Applicable,False,westpac-institutional-bank,,,,,,,Data Engineer
3,1639863801,2025-05-09T12:13:54,2025-05-09T12:54:43.027777,GCP Data Engineer,Ingrity,https://www.linkedin.com/company/ingrity-pty-ltd,2025-06-08T12:13:53,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Mid-Senior level,False,ingrity-pty-ltd,,,,,,,Data Engineer
4,1639788025,2025-05-09T12:07:21,2025-05-09T12:24:43.170971,Full Stack Developer/ Senior Data Engineer,Peoplebank,https://www.linkedin.com/company/peoplebank,2025-06-08T12:07:21,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Not Applicable,False,peoplebank,,,,,,,Data Engineer


In [6]:
def get_clean_data_jobs(df_daily_all):
     pattern=re.compile(r'\bData Engineer\b|\bData Scientist\b|\bData Analyst\b' , re.IGNORECASE)
     df_daily_all = df_daily_all[df_daily_all['title'].str.contains(pattern)]
     return df_daily_all

df_daily_all = get_clean_data_jobs(df_daily_all)
df_daily_all.shape

(64, 51)

In [7]:
df_daily_all.to_csv('linkedin_jobs_daily.csv', index=False)

In [8]:
df_daily_all = pd.read_csv('linkedin_jobs_daily.csv')
df_daily_all.shape

(64, 51)

In [9]:
def update_columns(df_daily_all):
    df_daily_all.columns = df_daily_all.columns.str.upper()
    df_daily_all = df_daily_all[['ID', 'DATE_POSTED', 'DATE_CREATED', 'TITLE', 'JOB_CATEGORY',
       'ORGANIZATION', 'ORGANIZATION_URL', 'DATE_VALIDTHROUGH', 'LOCATIONS_RAW',
       'LOCATION_TYPE', 'LOCATION_REQUIREMENTS_RAW', 'EMPLOYMENT_TYPE', 'URL',
       'SOURCE_TYPE', 'SOURCE', 'SOURCE_DOMAIN', 'ORGANIZATION_LOGO',
       'CITIES_DERIVED', 'REGIONS_DERIVED', 'COUNTRIES_DERIVED',
       'LOCATIONS_DERIVED', 'TIMEZONES_DERIVED', 'LATS_DERIVED',
       'LNGS_DERIVED', 'REMOTE_DERIVED', 'RECRUITER_NAME', 'RECRUITER_TITLE',
       'RECRUITER_URL', 'LINKEDIN_ORG_EMPLOYEES', 'LINKEDIN_ORG_URL',
       'LINKEDIN_ORG_SIZE', 'LINKEDIN_ORG_SLOGAN', 'LINKEDIN_ORG_INDUSTRY',
       'LINKEDIN_ORG_FOLLOWERS', 'LINKEDIN_ORG_HEADQUARTERS',
       'LINKEDIN_ORG_TYPE', 'LINKEDIN_ORG_FOUNDEDDATE',
       'LINKEDIN_ORG_SPECIALTIES', 'LINKEDIN_ORG_LOCATIONS',
       'LINKEDIN_ORG_DESCRIPTION', 'LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED',
       'SENIORITY', 'DIRECTAPPLY', 'LINKEDIN_ORG_SLUG']]
    return df_daily_all

df_daily_all = update_columns(df_daily_all)
df_daily_all.head(2)

Unnamed: 0,ID,DATE_POSTED,DATE_CREATED,TITLE,JOB_CATEGORY,ORGANIZATION,ORGANIZATION_URL,DATE_VALIDTHROUGH,LOCATIONS_RAW,LOCATION_TYPE,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
0,1639894859,2025-05-09T12:46:03,2025-05-09T13:05:48.901434,Lead Data Engineer,Data Engineer,Wesfarmers OneDigital,https://www.linkedin.com/company/wesfarmers-on...,2025-06-08T12:46:03,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"Docklands, Victoria",Public Company,,[''],"['699 Collins St, Docklands, Victoria 3008, AU']",Wesfarmers OneDigital is the digital driver of...,False,Mid-Senior level,False,wesfarmers-onedigital
1,1639890667,2025-05-09T12:40:18,2025-05-09T13:03:24.403216,Senior Data Engineer,Data Engineer,Launch Recruitment,https://www.linkedin.com/company/launch-recrui...,2025-06-08T12:40:18,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,Sydney,Privately Held,,"['Search and Selection', 'Executive Search', '...","['Level 23, 9 Castlereagh Street, Sydney, 2000...",Welcome to Launch Recruitment! \n\nWe are one ...,True,Mid-Senior level,False,launch-recruitment


### Process the df_daily_all data 


In [10]:
def extract_job_date(date_created):
    #extract the job_date from job_created
    df_daily_all['job_date'] = pd.to_datetime(df_daily_all['DATE_CREATED']).dt.date
    return df_daily_all['job_date']

#Apply the function to the DataFrame
df_daily_all['job_date'] = extract_job_date(df_daily_all['DATE_CREATED'])
df_daily_all['job_date'][:5]


0    2025-05-09
1    2025-05-09
2    2025-05-09
3    2025-05-09
4    2025-05-09
Name: job_date, dtype: object

In [11]:
#Extrac the job city and state from the LOCATIONS_RAW field

def extract_city(list):
    """
    Extracts the city from the given text using regex.
    """
    # Regex pattern to match the city in the LOCATIONS_RAW field
    text = str(list)
    city_pattern = r"'addressLocality':\s*'(.*)',\s'addressRegion':"
    match = re.search(city_pattern, text)
    if match:
        city = match.group(1)
        if 'sidney' in city.lower() or 'sídney' in city.lower() or '悉尼' in city.lower(): #fix wrong city name
            return "Sydney"
        return city
    else:
        return None



def extract_state(list):
    """
    Extracts the state from the given text using regex.
    """
    # Regex pattern to match the region/state in the LOCATIONS_RAW field
    text = str(list)
    state_pattern = r"'addressRegion':\s*(.*)',\s'streetAddress'"
    match = re.search(state_pattern, text)
    if match:
        state = match.group(1)
        state= state.replace("'", "").strip()
        return state
    else:
        return None


#Extract city and state from Locations Raw
df_daily_all['city'] = df_daily_all['LOCATIONS_RAW'].apply(extract_city)


df_daily_all['state'] = df_daily_all['LOCATIONS_RAW'].apply(extract_state)



df_daily_all.head()

Unnamed: 0,ID,DATE_POSTED,DATE_CREATED,TITLE,JOB_CATEGORY,ORGANIZATION,ORGANIZATION_URL,DATE_VALIDTHROUGH,LOCATIONS_RAW,LOCATION_TYPE,...,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG,job_date,city,state
0,1639894859,2025-05-09T12:46:03,2025-05-09T13:05:48.901434,Lead Data Engineer,Data Engineer,Wesfarmers OneDigital,https://www.linkedin.com/company/wesfarmers-on...,2025-06-08T12:46:03,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,[''],"['699 Collins St, Docklands, Victoria 3008, AU']",Wesfarmers OneDigital is the digital driver of...,False,Mid-Senior level,False,wesfarmers-onedigital,2025-05-09,Melbourne,VIC
1,1639890667,2025-05-09T12:40:18,2025-05-09T13:03:24.403216,Senior Data Engineer,Data Engineer,Launch Recruitment,https://www.linkedin.com/company/launch-recrui...,2025-06-08T12:40:18,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"['Search and Selection', 'Executive Search', '...","['Level 23, 9 Castlereagh Street, Sydney, 2000...",Welcome to Launch Recruitment! \n\nWe are one ...,True,Mid-Senior level,False,launch-recruitment,2025-05-09,Sydney,NSW
2,1639863801,2025-05-09T12:13:54,2025-05-09T12:54:43.027777,GCP Data Engineer,Data Engineer,Ingrity,https://www.linkedin.com/company/ingrity-pty-ltd,2025-06-08T12:13:53,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"['Analytics Advisory', 'Data Strategy', 'Data ...","['Level 6, 478 George Street, Sydney, NSW 2000...",INGRITY is a progressive data and analytics co...,False,Mid-Senior level,False,ingrity-pty-ltd,2025-05-09,Millers Point,NSW
3,1639788025,2025-05-09T12:07:21,2025-05-09T12:24:43.170971,Full Stack Developer/ Senior Data Engineer,Data Engineer,Peoplebank,https://www.linkedin.com/company/peoplebank,2025-06-08T12:07:21,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"['Digital / Digital Transformation', 'Business...","['345 George Street, Level 13, Sydney, NSW 200...","Peoplebank, part of RGF Staffing ANZ is a lead...",True,Not Applicable,False,peoplebank,2025-05-09,Woden,
4,1639748347,2025-05-09T11:37:35,2025-05-09T12:03:29.08066,Lead Data Engineer | CIE,Data Engineer,Macquarie Group,https://www.linkedin.com/company/macquariegroup,2025-06-25T08:02:54,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"['Advisory and capital raising', 'financing', ...","['1 Elizabeth St, Sydney, NSW 2000, AU', '660 ...","At Macquarie, we empower people to innovate an...",False,Mid-Senior level,False,macquariegroup,2025-05-09,Sydney,NSW


In [12]:
#Extract the employment type from the EMPLOYMENT_TYPE field

def extract_employment_type(df_daily_all):
    """
    Extracts the employment type from the given text using regex.
    """
    df_daily_all['employment_type'] = (
        df_daily_all['EMPLOYMENT_TYPE']
        .astype(str)
        .str.replace(r"[\[\]']", '', regex=True)
        .str.strip()
    )

    df_daily_all.drop(columns=['EMPLOYMENT_TYPE'], inplace=True)

    return df_daily_all['employment_type']


df_daily_all['employment_type'] = extract_employment_type(df_daily_all)
df_daily_all['employment_type'].tail()

59    CONTRACTOR
60     FULL_TIME
61     FULL_TIME
62     FULL_TIME
63    CONTRACTOR
Name: employment_type, dtype: object

In [13]:
#Extract the employment size
def extract_employee_size(LINKEDIN_ORG_SIZE):
    """
    Extracts the employee size from the given text using regex.
    """
    df_daily_all['org_size'] = (
        df_daily_all['LINKEDIN_ORG_SIZE']
        .astype(str)
        .str.replace(r"employees", '', regex=True)
        .str.strip()
    )

    return df_daily_all['org_size']


df_daily_all['org_size'] = extract_employee_size(df_daily_all['LINKEDIN_ORG_SIZE'])
df_daily_all['org_size'].tail()

59         201-500
60         10,001+
61           11-50
62       501-1,000
63    5,001-10,000
Name: org_size, dtype: object

In [14]:
df_daily_all.columns = df_daily_all.columns.str.upper()
df_daily_all.head(2)

Unnamed: 0,ID,DATE_POSTED,DATE_CREATED,TITLE,JOB_CATEGORY,ORGANIZATION,ORGANIZATION_URL,DATE_VALIDTHROUGH,LOCATIONS_RAW,LOCATION_TYPE,...,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORG_SIZE
0,1639894859,2025-05-09T12:46:03,2025-05-09T13:05:48.901434,Lead Data Engineer,Data Engineer,Wesfarmers OneDigital,https://www.linkedin.com/company/wesfarmers-on...,2025-06-08T12:46:03,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,Wesfarmers OneDigital is the digital driver of...,False,Mid-Senior level,False,wesfarmers-onedigital,2025-05-09,Melbourne,VIC,FULL_TIME,201-500
1,1639890667,2025-05-09T12:40:18,2025-05-09T13:03:24.403216,Senior Data Engineer,Data Engineer,Launch Recruitment,https://www.linkedin.com/company/launch-recrui...,2025-06-08T12:40:18,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,Welcome to Launch Recruitment! \n\nWe are one ...,True,Mid-Senior level,False,launch-recruitment,2025-05-09,Sydney,NSW,FULL_TIME,51-200


In [15]:
#Only keep the relevant columns

df_daily_all = df_daily_all[['ID', 'TITLE', 'JOB_CATEGORY',
       'JOB_DATE', 'CITY', 'STATE', 'EMPLOYMENT_TYPE' ,
       'ORGANIZATION', 'ORGANIZATION_URL', 'URL',
       'SOURCE_TYPE', 'SOURCE', 'SOURCE_DOMAIN',
       'ORGANIZATION_LOGO', 'REMOTE_DERIVED', 'RECRUITER_NAME', 'RECRUITER_TITLE',
       'RECRUITER_URL', 'LINKEDIN_ORG_URL',
       'ORG_SIZE', 'LINKEDIN_ORG_INDUSTRY',
       'LINKEDIN_ORG_HEADQUARTERS',
       'LINKEDIN_ORG_TYPE', 'LINKEDIN_ORG_FOUNDEDDATE',
       'LINKEDIN_ORG_SPECIALTIES', 'LINKEDIN_ORG_LOCATIONS',
       'LINKEDIN_ORG_DESCRIPTION','LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED',
       'SENIORITY', 'DIRECTAPPLY',
       'LINKEDIN_ORG_SLUG']]


df_daily_all.head()

Unnamed: 0,ID,TITLE,JOB_CATEGORY,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORGANIZATION,ORGANIZATION_URL,URL,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
0,1639894859,Lead Data Engineer,Data Engineer,2025-05-09,Melbourne,VIC,FULL_TIME,Wesfarmers OneDigital,https://www.linkedin.com/company/wesfarmers-on...,https://au.linkedin.com/jobs/view/lead-data-en...,...,"Docklands, Victoria",Public Company,,[''],"['699 Collins St, Docklands, Victoria 3008, AU']",Wesfarmers OneDigital is the digital driver of...,False,Mid-Senior level,False,wesfarmers-onedigital
1,1639890667,Senior Data Engineer,Data Engineer,2025-05-09,Sydney,NSW,FULL_TIME,Launch Recruitment,https://www.linkedin.com/company/launch-recrui...,https://au.linkedin.com/jobs/view/senior-data-...,...,Sydney,Privately Held,,"['Search and Selection', 'Executive Search', '...","['Level 23, 9 Castlereagh Street, Sydney, 2000...",Welcome to Launch Recruitment! \n\nWe are one ...,True,Mid-Senior level,False,launch-recruitment
2,1639863801,GCP Data Engineer,Data Engineer,2025-05-09,Millers Point,NSW,CONTRACTOR,Ingrity,https://www.linkedin.com/company/ingrity-pty-ltd,https://au.linkedin.com/jobs/view/gcp-data-eng...,...,"Sydney, NSW",Privately Held,2018.0,"['Analytics Advisory', 'Data Strategy', 'Data ...","['Level 6, 478 George Street, Sydney, NSW 2000...",INGRITY is a progressive data and analytics co...,False,Mid-Senior level,False,ingrity-pty-ltd
3,1639788025,Full Stack Developer/ Senior Data Engineer,Data Engineer,2025-05-09,Woden,,CONTRACTOR,Peoplebank,https://www.linkedin.com/company/peoplebank,https://au.linkedin.com/jobs/view/full-stack-d...,...,"Sydney, NSW",Privately Held,1990.0,"['Digital / Digital Transformation', 'Business...","['345 George Street, Level 13, Sydney, NSW 200...","Peoplebank, part of RGF Staffing ANZ is a lead...",True,Not Applicable,False,peoplebank
4,1639748347,Lead Data Engineer | CIE,Data Engineer,2025-05-09,Sydney,NSW,FULL_TIME,Macquarie Group,https://www.linkedin.com/company/macquariegroup,https://au.linkedin.com/jobs/view/lead-data-en...,...,"Sydney, NSW",Public Company,1969.0,"['Advisory and capital raising', 'financing', ...","['1 Elizabeth St, Sydney, NSW 2000, AU', '660 ...","At Macquarie, we empower people to innovate an...",False,Mid-Senior level,False,macquariegroup


## Connect to  Snowflake database for raw data initial processing

In [16]:
#Establish a connection to Snowflake

def connect_to_snowflake():
    try:

        conn = snowflake.connector.connect(
            user="NIKKILW2025",
            password=snowflake_password,
            account="gbszkwp-by30611",
            warehouse="SNOWFLAKE_LEARNING_WH",
            database="linkedin_db",
            schema="linkedin_raw"
        )
        print("Connection to Snowflake established successfully.")
        return conn
    except Exception as e:
        print(f"Error connecting to Snowflake: {e}")
        return None

conn = connect_to_snowflake()

Connection to Snowflake established successfully.


In [17]:
#query the raw data and narrow down to DE, DS and DA roles
#This data is filtered by relevant roles, and translated into English (but no city, state and seniority fix)
def query_existing_job_data(conn):
    query = """
        SELECT * FROM LINKEDIN_JOB_API_CLEANED_DATA
        WHERE (
            lower(TITLE) LIKE '%data engineer%'
            OR lower(TITLE) LIKE '%data scientist%'
            OR lower(TITLE) LIKE '%data analyst%'
            )
    """

    df = pd.read_sql(query, conn)
    print(df.shape)
    return df

df = query_existing_job_data(conn)
df.head()

  df = pd.read_sql(query, conn)


(371, 31)


Unnamed: 0,ID,TITLE,JOB_CATEGORY,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORGANIZATION,ORGANIZATION_URL,URL,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
0,1599037955,Senior Data Engineer,Data Engineer,2025-04-28,Greater Sydney Area,,FULL_TIME,One51 | Data & Analytics Consultancy,https://www.linkedin.com/company/one51consulting,https://au.linkedin.com/jobs/view/senior-data-...,...,"Sydney, NSW",Privately Held,2020.0,"['Business Intelligence', 'Data Warehousing', ...","['333 George Street, Level 13, Sydney, NSW 200...",Drawing on a wealth of expertise and a deep un...,False,Mid-Senior level,False,one51consulting
1,1598878880,Senior Data Engineer,Data Engineer,2025-04-28,Brisbane,QLD,CONTRACTOR,Data#3,https://www.linkedin.com/company/data3,https://au.linkedin.com/jobs/view/senior-data-...,...,"Toowong, Queensland",Public Company,1977.0,"['Cloud Solutions', 'Mobility Solutions', 'Sec...","['555 Coronation Dr, Toowong, Queensland 4066,...","Data#3 Limited (DTL), is focused on helping cu...",False,Medium-high level,False,data3
2,1598880069,Data Engineer,Data Engineer,2025-04-28,Sydney,NSW,CONTRACTOR,Whizdom,https://www.linkedin.com/company/whizdom-recru...,https://au.linkedin.com/jobs/view/data-enginee...,...,"Canberra, Australian Capital Territory",Privately Held,2006.0,"['IT Recruitment', 'Recruitment for Government...","['28-34 Thynne St, Unit 7, Canberra, Australia...","Established in 2006, Whizdom is an Australian ...",True,Middle level,True,whizdom-recruitment
3,1598878894,Senior Data Engineer and Business Intelligence...,Data Engineer,2025-04-28,Moreton Bay,QLD,FULL_TIME,University of the Sunshine Coast,https://www.linkedin.com/school/university-of-...,https://au.linkedin.com/jobs/view/senior-data-...,...,,,,,,,,Not Applicable,False,
4,1598880083,Senior Data Engineer and Business Intelligence...,Data Engineer,2025-04-28,Sunshine Coast,QLD,FULL_TIME,University of the Sunshine Coast,https://www.linkedin.com/school/university-of-...,https://au.linkedin.com/jobs/view/senior-data-...,...,,,,,,,,Not Applicable,False,


In [18]:
df.to_csv('api_cleaned_job_id_snowflake.csv', index=False)

In [19]:
list_daily = df_daily_all.columns.tolist()
list_df = df.columns.tolist()
list_daily ==  list_df

True

In [20]:
#Check the Job ID from df and only keep those new jobs based on the Job IDs
def keep_new_jobs():
    existing_job_ids = df.ID.unique().tolist()

    df_new_jobs = df_daily_all[~(df_daily_all['ID'].isin(existing_job_ids))].reset_index(drop=True)
    print(f'{df_new_jobs.shape[0]} new jobs ready to load to Snowflake')
    return df_new_jobs

df_new_jobs = keep_new_jobs()
df_new_jobs.tail()

54 new jobs ready to load to Snowflake


Unnamed: 0,ID,TITLE,JOB_CATEGORY,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORGANIZATION,ORGANIZATION_URL,URL,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
49,1639158377,Data Analyst,Data Analyst,2025-05-09,Brisbane,QLD,FULL_TIME,Michael Page,https://www.linkedin.com/company/michael-page,https://au.linkedin.com/jobs/view/data-analyst...,...,"Addlestone, Weybridge",Public Company,,[''],"['PageGroup, Addlestone, Weybridge KT15 2QW, G...",Welcome to the Michael Page global company pro...,True,Entry level,False,michael-page
50,1638909805,Principal Data Analyst,Data Analyst,2025-05-09,Sydney,NSW,FULL_TIME,Ubank,https://www.linkedin.com/company/ubank-au,https://au.linkedin.com/jobs/view/principal-da...,...,"Sydney, New South Wales",Privately Held,,[''],"['2 Carrington St, Level 4, Sydney, New South ...","At Ubank, we’re all about helping you be more ...",False,Mid-Senior level,False,ubank-au
51,1638904876,Revenue Optimisation Data Analyst,Data Analyst,2025-05-09,Melbourne,VIC,FULL_TIME,Alfred Health,https://www.linkedin.com/company/alfred-hospital,https://au.linkedin.com/jobs/view/revenue-opti...,...,"Melbourne, VIC",Educational,,[''],"['55 Commercial Road, Melbourne, VIC 3004, AU'...",Alfred Health is a leading metropolitan health...,False,Entry level,False,alfred-hospital
52,1638870930,Data Analyst,Data Analyst,2025-05-09,Sydney,NSW,FULL_TIME,Lagardere AWPL,https://www.linkedin.com/company/lagardere-awpl,https://au.linkedin.com/jobs/view/data-analyst...,...,"Sydney, New South Wales",Partnership,,[''],"['580 George St, Level 8, Sydney, New South Wa...",LagardèreAWPL is the Pacific region's largest ...,False,Not Applicable,False,lagardere-awpl
53,1638898041,Performance & Data Analyst,Data Analyst,2025-05-09,Parramatta,NSW,FULL_TIME,Pepper Money ANZ,https://www.linkedin.com/company/peppermoneyanz,https://au.linkedin.com/jobs/view/performance-...,...,"North Sydney, NSW",Public Company,2000.0,"['Lending', 'Asset Management', 'Asset Servici...","['Level 27, 177 Pacific Highway, North Sydney,...",As one of Australia's leading non-bank lenders...,False,No corresponde,False,peppermoneyanz


In [21]:
#Translate new job's business name, city, state and seniority to English

translator = Translator()

def translate_text(text, target_language='en'):
    try:
        if not text or pd.isna(text):
            return 'NA'
        translated_text = translator.translate(str(text), dest=target_language)
        return translated_text.text
    except Exception as e:
        print(f"Error translating text: {e} (Text: {text})")
        return text

for col in ['CITY', 'STATE', 'ORGANIZATION', 'SENIORITY']:
    unique_values = df_new_jobs[col].dropna().unique()
    translation_map = {val: translate_text(val, target_language='en') for val in unique_values}
    df_new_jobs[col] = df_new_jobs[col].map(translation_map).fillna('NA')

In [22]:
df_new_jobs['CITY'].unique()

array(['Melbourne', 'Sydney', 'Millers Point', 'Woden', 'Parramatta',
       'Brisbane', 'Murray Bridge', 'Nowra', 'Lismore', 'Mackay',
       'Newcastle', 'Hervey Bay', 'Geelong', 'Warrnambool', 'Gladstone',
       'Toowoomba', 'Burnie', 'Maryborough', 'Wagga Wagga', 'Bendigo',
       'Ballarat', 'Townsville', 'Launceston', 'Wollongong', 'Gold Coast',
       'Darwin', 'Cairns', 'Hobart', 'Canberra', 'Adelaide', 'Perth',
       'New South Wales'], dtype=object)

In [23]:
df_new_jobs['STATE'].unique()

array(['VIC', 'NSW', 'NA', 'QLD', 'on', 'That', 'NT', 'Of', 'Australia'],
      dtype=object)

In [24]:

def load_to_snowflake(df_new_jobs):
    # Create a Snowflake connection engine
    engine = create_engine(
        'snowflake://{user}:{password}@{account}/{database}/{schema}?warehouse={warehouse}'.format(
            user="NIKKILW2025",
            password=snowflake_password,
            account="gbszkwp-by30611",
            warehouse="SNOWFLAKE_LEARNING_WH",
            database="linkedin_db",
            schema="linkedin_raw"
        )
    )

    table_name = "linkedin_job_api_cleaned_data"

    df_new_jobs.to_sql(
        name=table_name,
        con=engine,
        if_exists='append', #append data
        index=False
    )

    print(f"Data loaded to Snowflake table {table_name} successfully.")


load_to_snowflake(df_new_jobs)

Data loaded to Snowflake table linkedin_job_api_cleaned_data successfully.
