In [1]:
import pandas as pd
from pandas import json_normalize
import requests
import json
import snowflake.connector

from dotenv import load_dotenv
import os
import http.client
import urllib.parse


from googletrans import Translator
import string
import re

from sqlalchemy import create_engine

In [2]:
# Load settings via python-dotenv so secrets stay out of the notebook
# (presumably from a local .env file — confirm where it lives).
load_dotenv()
# os.getenv returns None when the variable is not set.
rapidapi_key = os.getenv('RAPIDAPI_KEY')
# API host; used both to build the request URL and as the x-rapidapi-host header.
rapidapi_host = "linkedin-job-search-api.p.rapidapi.com"
# Password used for both the Snowflake connector and the SQLAlchemy engine below.
snowflake_password = os.getenv('SNOWFLAKE_PASSWORD')

In [3]:
def extract_linkedin_job_data(titles=None, location="Australia", limit=100, offset=0, timeout=30):
    """
    Fetch job postings from the last 24 hours, one API request per job title.

    Parameters
    ----------
    titles : list of str, optional
        Title filters to query. Defaults to Data Engineer, Data Scientist
        and Data Analyst.
    location : str
        Value sent as ``location_filter``.
    limit, offset : int
        Paging values sent to the API.
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        All successful responses stacked together, with a ``job_category``
        column holding the title filter that produced each row. Empty if
        no request succeeded.

    Failed requests (transport error or non-200 status) are reported with
    print() and skipped, so one failing title does not lose the others.
    """
    if titles is None:
        titles = ["Data Engineer", "Data Scientist", "Data Analyst"]

    headers = {
        'x-rapidapi-key': rapidapi_key,
        'x-rapidapi-host': rapidapi_host
    }

    # The location does not change between titles, so encode it once.
    location_encoded = urllib.parse.quote(location)

    frames = []
    for title_filter in titles:
        title_encoded = urllib.parse.quote(title_filter)

        # Last-24-hours endpoint; "/active-jb-7d" is the 7-day variant.
        base_url = f"/active-jb-24h?limit={limit}&offset={offset}&title_filter={title_encoded}&location_filter={location_encoded}"
        url = f"https://{rapidapi_host}{base_url}"

        try:
            # Without a timeout a stalled connection would block the notebook forever.
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"Error: request for {title_filter} failed: {e}")
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            continue

        # Flatten the nested JSON payload into columns.
        df_daily = json_normalize(response.json())
        df_daily['job_category'] = title_filter
        print(title_filter, df_daily.shape)
        frames.append(df_daily)

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

df_daily_all = extract_linkedin_job_data()
# df_daily_all

Data Engineer (15, 51)
Data Scientist (3, 45)
Data Analyst (10, 45)


In [5]:
# df_daily_all = pd.read_csv('linkedin_jobs_daily.csv')
df_daily_all.head()

Unnamed: 0,id,date_posted,date_created,title,organization,organization_url,date_validthrough,locations_raw,location_type,location_requirements_raw,...,directapply,linkedin_org_slug,salary_raw.@type,salary_raw.currency,salary_raw.value.@type,salary_raw.value.minValue,salary_raw.value.maxValue,salary_raw.value.unitText,salary_raw,job_category
0,1627378650,2025-05-06T03:01:54,2025-05-06T03:06:33.413229,Senior Data Engineer,Talenza,https://www.linkedin.com/company/talenza,2025-06-05T03:01:16,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,True,talenza,MonetaryAmount,AUD,QuantitativeValue,1200.0,1200.0,DAY,,Data Engineer
1,1627328620,2025-05-06T02:26:25,2025-05-06T02:52:30.631983,Senior Data Engineer,Exco Partners,https://www.linkedin.com/company/excopartners,2025-05-16T02:26:24,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,False,excopartners,,,,,,,,Data Engineer
2,1627026158,2025-05-06T00:57:01,2025-05-06T01:03:51.755077,Data Engineer (Databricks experience),Halo Labs,https://www.linkedin.com/company/halolabs-au,2025-11-02T00:57:01,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,True,halolabs-au,,,,,,,,Data Engineer
3,1626936927,2025-05-06T00:25:33,2025-05-06T00:35:16.136102,Senior Data Engineer - TSPV - Canberra,Vertical Scope Group,https://www.linkedin.com/company/vertical-scop...,2025-06-05T00:25:33,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,True,vertical-scope-group,,,,,,,,Data Engineer
4,1626866445,2025-05-05T23:45:20,2025-05-06T00:03:12.175043,Data Engineer,Blinq,https://www.linkedin.com/company/blinq-me,2025-06-04T23:45:20,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,True,blinq-me,,,,,,,,Data Engineer


In [6]:
def get_clean_data_jobs(df_daily_all):
    """
    Keep only rows whose title mentions one of the three target roles.

    The match is case-insensitive and on whole words ("Data Engineer",
    "Data Scientist", "Data Analyst"). Rows with a missing title are
    dropped rather than raising.

    Returns a new DataFrame (a copy), so later column assignments do not
    write into a slice of the input.
    """
    pattern = re.compile(r'\bData Engineer\b|\bData Scientist\b|\bData Analyst\b', re.IGNORECASE)
    # na=False: a NaN title would otherwise put NaN in the mask and make the
    # boolean indexing below fail.
    is_target_role = df_daily_all['title'].str.contains(pattern, na=False)
    return df_daily_all[is_target_role].copy()

df_daily_all = get_clean_data_jobs(df_daily_all)
df_daily_all.shape

(20, 51)

In [7]:
df_daily_all.to_csv('linkedin_jobs_daily.csv', index=False)

In [8]:
df_daily_all = pd.read_csv('linkedin_jobs_daily.csv')
df_daily_all.shape

(20, 51)

In [9]:
def update_columns(df_daily_all):
    """
    Return a copy of the frame with upper-cased column names, restricted to
    the columns used by the rest of the pipeline.

    The input frame is left untouched (its column labels are not renamed).
    Raises KeyError if any of the expected columns is missing.
    """
    selected_columns = [
        'ID', 'DATE_POSTED', 'DATE_CREATED', 'TITLE', 'JOB_CATEGORY',
        'ORGANIZATION', 'ORGANIZATION_URL', 'DATE_VALIDTHROUGH', 'LOCATIONS_RAW',
        'LOCATION_TYPE', 'LOCATION_REQUIREMENTS_RAW', 'EMPLOYMENT_TYPE', 'URL',
        'SOURCE_TYPE', 'SOURCE', 'SOURCE_DOMAIN', 'ORGANIZATION_LOGO',
        'CITIES_DERIVED', 'REGIONS_DERIVED', 'COUNTRIES_DERIVED',
        'LOCATIONS_DERIVED', 'TIMEZONES_DERIVED', 'LATS_DERIVED',
        'LNGS_DERIVED', 'REMOTE_DERIVED', 'RECRUITER_NAME', 'RECRUITER_TITLE',
        'RECRUITER_URL', 'LINKEDIN_ORG_EMPLOYEES', 'LINKEDIN_ORG_URL',
        'LINKEDIN_ORG_SIZE', 'LINKEDIN_ORG_SLOGAN', 'LINKEDIN_ORG_INDUSTRY',
        'LINKEDIN_ORG_FOLLOWERS', 'LINKEDIN_ORG_HEADQUARTERS',
        'LINKEDIN_ORG_TYPE', 'LINKEDIN_ORG_FOUNDEDDATE',
        'LINKEDIN_ORG_SPECIALTIES', 'LINKEDIN_ORG_LOCATIONS',
        'LINKEDIN_ORG_DESCRIPTION', 'LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED',
        'SENIORITY', 'DIRECTAPPLY', 'LINKEDIN_ORG_SLUG',
    ]
    # set_axis returns a new object, so the caller's frame keeps its labels.
    upper_cased = df_daily_all.set_axis(df_daily_all.columns.str.upper(), axis=1)
    # .copy() so that columns added in later cells do not write into a slice.
    return upper_cased[selected_columns].copy()

df_daily_all = update_columns(df_daily_all)
df_daily_all.head(2)

Unnamed: 0,ID,DATE_POSTED,DATE_CREATED,TITLE,JOB_CATEGORY,ORGANIZATION,ORGANIZATION_URL,DATE_VALIDTHROUGH,LOCATIONS_RAW,LOCATION_TYPE,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
0,1627378650,2025-05-06T03:01:54,2025-05-06T03:06:33.413229,Senior Data Engineer,Data Engineer,Talenza,https://www.linkedin.com/company/talenza,2025-06-05T03:01:16,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"Sydney, New South Wales",Privately Held,,"['Permanent Recruitment', 'Contract Recruitmen...","['10 Spring St, Level 7, Sydney, New South Wal...",Talenza is a full-service technology and busin...,True,Not Applicable,True,talenza
1,1627328620,2025-05-06T02:26:25,2025-05-06T02:52:30.631983,Senior Data Engineer,Data Engineer,Exco Partners,https://www.linkedin.com/company/excopartners,2025-05-16T02:26:24,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"Melbourne, Victoria",Privately Held,2000.0,[''],"['350 Collins St, Level 14, Melbourne, Victori...",Exco Partners is an Australian Consulting and ...,False,Mid-Senior level,False,excopartners


### Process the df_daily_all data 


In [10]:
def extract_job_date(date_created):
    """
    Convert a Series of creation timestamps to calendar dates.

    Parameters
    ----------
    date_created : pandas.Series
        Timestamps (strings or datetimes) accepted by pd.to_datetime.

    Returns
    -------
    pandas.Series
        datetime.date values on the same index, named 'job_date'.
    """
    # Work on the argument itself instead of reaching for a global frame, so
    # the function has no hidden dependency and no side effect.
    return pd.to_datetime(date_created).dt.date.rename('job_date')

#Apply the function to the DataFrame
df_daily_all['job_date'] = extract_job_date(df_daily_all['DATE_CREATED'])
df_daily_all['job_date'][:5]


0    2025-05-06
1    2025-05-06
2    2025-05-06
3    2025-05-06
4    2025-05-06
Name: job_date, dtype: object

In [11]:
#Extract the job city and state from the LOCATIONS_RAW field

def extract_city(list):
    """
    Extracts the city from the given text using regex.

    The argument is converted with str() and searched for the first
    'addressLocality' value that is directly followed by 'addressRegion'.
    Known misspellings/translations of Sydney are normalised to "Sydney".
    Returns None when no city is found.

    Note: the parameter name shadows the built-in ``list``; it is kept for
    backward compatibility with existing callers.
    """
    text = str(list)
    # [^']* keeps the capture inside one quoted value. A greedy (.*) would run
    # from the first addressLocality to the LAST addressRegion when the posting
    # lists several locations.
    city_pattern = r"'addressLocality':\s*'([^']*)',\s*'addressRegion':"
    match = re.search(city_pattern, text)
    if not match:
        return None

    city = match.group(1)
    lowered = city.lower()
    # fix wrong city name
    if any(variant in lowered for variant in ('sidney', 'sídney', '悉尼')):
        return "Sydney"
    return city



def extract_state(list):
    """
    Extracts the state from the given text using regex.

    The argument is converted with str() and searched for the first quoted
    'addressRegion' value that is directly followed by 'streetAddress'.
    Returns None when there is no quoted region (e.g. the region is None).

    Note: the parameter name shadows the built-in ``list``; it is kept for
    backward compatibility with existing callers.
    """
    text = str(list)
    # Capture only the quoted value. A greedy (.*) would swallow everything up
    # to the LAST streetAddress when the posting lists several locations.
    state_pattern = r"'addressRegion':\s*'([^']*)',\s*'streetAddress'"
    match = re.search(state_pattern, text)
    if not match:
        return None
    return match.group(1).strip()


#Extract city and state from Locations Raw
df_daily_all['city'] = df_daily_all['LOCATIONS_RAW'].apply(extract_city)


df_daily_all['state'] = df_daily_all['LOCATIONS_RAW'].apply(extract_state)



df_daily_all.head()

Unnamed: 0,ID,DATE_POSTED,DATE_CREATED,TITLE,JOB_CATEGORY,ORGANIZATION,ORGANIZATION_URL,DATE_VALIDTHROUGH,LOCATIONS_RAW,LOCATION_TYPE,...,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG,job_date,city,state
0,1627378650,2025-05-06T03:01:54,2025-05-06T03:06:33.413229,Senior Data Engineer,Data Engineer,Talenza,https://www.linkedin.com/company/talenza,2025-06-05T03:01:16,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"['Permanent Recruitment', 'Contract Recruitmen...","['10 Spring St, Level 7, Sydney, New South Wal...",Talenza is a full-service technology and busin...,True,Not Applicable,True,talenza,2025-05-06,Melbourne,VIC
1,1627328620,2025-05-06T02:26:25,2025-05-06T02:52:30.631983,Senior Data Engineer,Data Engineer,Exco Partners,https://www.linkedin.com/company/excopartners,2025-05-16T02:26:24,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,[''],"['350 Collins St, Level 14, Melbourne, Victori...",Exco Partners is an Australian Consulting and ...,False,Mid-Senior level,False,excopartners,2025-05-06,Melbourne,VIC
2,1627026158,2025-05-06T00:57:01,2025-05-06T01:03:51.755077,Data Engineer (Databricks experience),Data Engineer,Halo Labs,https://www.linkedin.com/company/halolabs-au,2025-11-02T00:57:01,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,[''],"['60 Martin Pl, Sydney, New South Wales 2000, ...","Halo Labs is a future focused, end-to-end data...",False,Mid-Senior level,True,halolabs-au,2025-05-06,Brisbane,QLD
3,1626936927,2025-05-06T00:25:33,2025-05-06T00:35:16.136102,Senior Data Engineer - TSPV - Canberra,Data Engineer,Vertical Scope Group,https://www.linkedin.com/company/vertical-scop...,2025-06-05T00:25:33,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"['Security Cleared Recruitment', 'Security Cle...","['135 Bamfield Rd Heidelberg Heights, Unit 121...",Vertical Scope Group (VSG) are a security clea...,True,Mid-Senior level,True,vertical-scope-group,2025-05-06,Canberra,
4,1626866445,2025-05-05T23:45:20,2025-05-06T00:03:12.175043,Data Engineer,Data Engineer,Blinq,https://www.linkedin.com/company/blinq-me,2025-06-04T23:45:20,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,[''],"['4 Bank Pl, Level 3, Melbourne, Victoria 3000...","At Blinq, our vision is to power the start of ...",False,Not Applicable,True,blinq-me,2025-05-06,Sydney,NSW


In [12]:
#Extract the employment type from the EMPLOYMENT_TYPE field

def extract_employment_type(df_daily_all):
    """
    Build a plain-text 'employment_type' column from 'EMPLOYMENT_TYPE'.

    The raw values are stringified, stripped of square brackets and single
    quotes, and trimmed. The frame is modified in place: the new lower-case
    column is added and the original 'EMPLOYMENT_TYPE' column is dropped.

    Returns the new 'employment_type' Series.
    """
    raw_as_text = df_daily_all['EMPLOYMENT_TYPE'].astype(str)
    without_list_syntax = raw_as_text.str.replace(r"[\[\]']", '', regex=True)
    df_daily_all['employment_type'] = without_list_syntax.str.strip()

    # The raw column is removed so that upper-casing all column names later
    # does not produce two 'EMPLOYMENT_TYPE' columns.
    df_daily_all.drop(columns=['EMPLOYMENT_TYPE'], inplace=True)

    return df_daily_all['employment_type']


df_daily_all['employment_type'] = extract_employment_type(df_daily_all)
df_daily_all['employment_type'].tail()

15     FULL_TIME
16     FULL_TIME
17     FULL_TIME
18     FULL_TIME
19    CONTRACTOR
Name: employment_type, dtype: object

In [13]:
#Extract the organization's employee size
def extract_employee_size(LINKEDIN_ORG_SIZE):
    """
    Strip the word "employees" from organisation-size labels.

    Parameters
    ----------
    LINKEDIN_ORG_SIZE : pandas.Series
        Size labels such as "51-200 employees".

    Returns
    -------
    pandas.Series
        Trimmed labels (e.g. "51-200") on the same index, named 'org_size'.
        Values are stringified first, so a missing value becomes "nan".
    """
    # Use the argument itself instead of a global frame: no hidden dependency,
    # no side effect on the caller's data.
    return (
        LINKEDIN_ORG_SIZE
        .astype(str)
        .str.replace("employees", '', regex=False)  # plain literal, no regex needed
        .str.strip()
        .rename('org_size')
    )


df_daily_all['org_size'] = extract_employee_size(df_daily_all['LINKEDIN_ORG_SIZE'])
df_daily_all['org_size'].tail()

15     1,001-5,000
16    5,001-10,000
17     1,001-5,000
18         201-500
19         10,001+
Name: org_size, dtype: object

In [14]:
df_daily_all.columns = df_daily_all.columns.str.upper()
df_daily_all.head(2)

Unnamed: 0,ID,DATE_POSTED,DATE_CREATED,TITLE,JOB_CATEGORY,ORGANIZATION,ORGANIZATION_URL,DATE_VALIDTHROUGH,LOCATIONS_RAW,LOCATION_TYPE,...,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORG_SIZE
0,1627378650,2025-05-06T03:01:54,2025-05-06T03:06:33.413229,Senior Data Engineer,Data Engineer,Talenza,https://www.linkedin.com/company/talenza,2025-06-05T03:01:16,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,Talenza is a full-service technology and busin...,True,Not Applicable,True,talenza,2025-05-06,Melbourne,VIC,CONTRACTOR,51-200
1,1627328620,2025-05-06T02:26:25,2025-05-06T02:52:30.631983,Senior Data Engineer,Data Engineer,Exco Partners,https://www.linkedin.com/company/excopartners,2025-05-16T02:26:24,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,Exco Partners is an Australian Consulting and ...,False,Mid-Senior level,False,excopartners,2025-05-06,Melbourne,VIC,FULL_TIME,51-200


In [15]:
# Keep only the columns that are loaded to Snowflake. The order matters: a
# later cell compares this column list with the one read back from the
# existing Snowflake table. A missing column raises KeyError here.

df_daily_all = df_daily_all[['ID', 'TITLE', 'JOB_CATEGORY',
       'JOB_DATE', 'CITY', 'STATE', 'EMPLOYMENT_TYPE' ,
       'ORGANIZATION', 'ORGANIZATION_URL', 'URL',
       'SOURCE_TYPE', 'SOURCE', 'SOURCE_DOMAIN',
       'ORGANIZATION_LOGO', 'REMOTE_DERIVED', 'RECRUITER_NAME', 'RECRUITER_TITLE',
       'RECRUITER_URL', 'LINKEDIN_ORG_URL',
       'ORG_SIZE', 'LINKEDIN_ORG_INDUSTRY',
       'LINKEDIN_ORG_HEADQUARTERS',
       'LINKEDIN_ORG_TYPE', 'LINKEDIN_ORG_FOUNDEDDATE',
       'LINKEDIN_ORG_SPECIALTIES', 'LINKEDIN_ORG_LOCATIONS',
       'LINKEDIN_ORG_DESCRIPTION','LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED',
       'SENIORITY', 'DIRECTAPPLY',
       'LINKEDIN_ORG_SLUG']]


df_daily_all.head()

Unnamed: 0,ID,TITLE,JOB_CATEGORY,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORGANIZATION,ORGANIZATION_URL,URL,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
0,1627378650,Senior Data Engineer,Data Engineer,2025-05-06,Melbourne,VIC,CONTRACTOR,Talenza,https://www.linkedin.com/company/talenza,https://au.linkedin.com/jobs/view/senior-data-...,...,"Sydney, New South Wales",Privately Held,,"['Permanent Recruitment', 'Contract Recruitmen...","['10 Spring St, Level 7, Sydney, New South Wal...",Talenza is a full-service technology and busin...,True,Not Applicable,True,talenza
1,1627328620,Senior Data Engineer,Data Engineer,2025-05-06,Melbourne,VIC,FULL_TIME,Exco Partners,https://www.linkedin.com/company/excopartners,https://au.linkedin.com/jobs/view/senior-data-...,...,"Melbourne, Victoria",Privately Held,2000.0,[''],"['350 Collins St, Level 14, Melbourne, Victori...",Exco Partners is an Australian Consulting and ...,False,Mid-Senior level,False,excopartners
2,1627026158,Data Engineer (Databricks experience),Data Engineer,2025-05-06,Brisbane,QLD,FULL_TIME,Halo Labs,https://www.linkedin.com/company/halolabs-au,https://au.linkedin.com/jobs/view/data-enginee...,...,"Sydney, New South Wales",Privately Held,2021.0,[''],"['60 Martin Pl, Sydney, New South Wales 2000, ...","Halo Labs is a future focused, end-to-end data...",False,Mid-Senior level,True,halolabs-au
3,1626936927,Senior Data Engineer - TSPV - Canberra,Data Engineer,2025-05-06,Canberra,,FULL_TIME,Vertical Scope Group,https://www.linkedin.com/company/vertical-scop...,https://au.linkedin.com/jobs/view/senior-data-...,...,"Melbourne, VIC",Privately Held,2020.0,"['Security Cleared Recruitment', 'Security Cle...","['135 Bamfield Rd Heidelberg Heights, Unit 121...",Vertical Scope Group (VSG) are a security clea...,True,Mid-Senior level,True,vertical-scope-group
4,1626866445,Data Engineer,Data Engineer,2025-05-06,Sydney,NSW,FULL_TIME,Blinq,https://www.linkedin.com/company/blinq-me,https://au.linkedin.com/jobs/view/data-enginee...,...,"Melbourne, Victoria",Privately Held,,[''],"['4 Bank Pl, Level 3, Melbourne, Victoria 3000...","At Blinq, our vision is to power the start of ...",False,Not Applicable,True,blinq-me


## Connect to the Snowflake database for initial raw-data processing

In [16]:
#Establish a connection to Snowflake

def connect_to_snowflake():
    """
    Open a Snowflake connection to linkedin_db / linkedin_raw.

    Returns the connection object on success. Any exception raised while
    connecting is printed and None is returned instead, so callers must
    check the result before using it.
    """
    try:
        connection_settings = dict(
            user="NIKKILW2025",
            password=snowflake_password,
            account="gbszkwp-by30611",
            warehouse="SNOWFLAKE_LEARNING_WH",
            database="linkedin_db",
            schema="linkedin_raw",
        )
        connection = snowflake.connector.connect(**connection_settings)
        print("Connection to Snowflake established successfully.")
        return connection
    except Exception as e:
        # Best effort: report the failure and signal it with None.
        print(f"Error connecting to Snowflake: {e}")
        return None

conn = connect_to_snowflake()

Connection to Snowflake established successfully.


In [17]:
#query the raw data and narrow down to DE, DS and DA roles
#This data is filtered by relevant roles, and translated into English (but no city, state and seniority fix)
def query_existing_job_data(conn):
    """
    Read the already-loaded jobs for the three target roles.

    Runs a SELECT on LINKEDIN_JOB_API_CLEANED_DATA through the given
    connection, keeping rows whose lower-cased TITLE contains
    'data engineer', 'data scientist' or 'data analyst'. Prints the shape
    of the result and returns it as a DataFrame.
    """
    query = """
        SELECT * FROM LINKEDIN_JOB_API_CLEANED_DATA
        WHERE (
            lower(TITLE) LIKE '%data engineer%'
            OR lower(TITLE) LIKE '%data scientist%'
            OR lower(TITLE) LIKE '%data analyst%'
            )
    """

    existing_jobs = pd.read_sql(query, conn)
    print(existing_jobs.shape)
    return existing_jobs

df = query_existing_job_data(conn)
df.head()

  df = pd.read_sql(query, conn)


(291, 31)


Unnamed: 0,ID,TITLE,JOB_CATEGORY,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORGANIZATION,ORGANIZATION_URL,URL,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
0,1599037955,Senior Data Engineer,Data Engineer,2025-04-28,Greater Sydney Area,,FULL_TIME,One51 | Data & Analytics Consultancy,https://www.linkedin.com/company/one51consulting,https://au.linkedin.com/jobs/view/senior-data-...,...,"Sydney, NSW",Privately Held,2020.0,"['Business Intelligence', 'Data Warehousing', ...","['333 George Street, Level 13, Sydney, NSW 200...",Drawing on a wealth of expertise and a deep un...,False,Mid-Senior level,False,one51consulting
1,1598878880,Senior Data Engineer,Data Engineer,2025-04-28,Brisbane,QLD,CONTRACTOR,Data#3,https://www.linkedin.com/company/data3,https://au.linkedin.com/jobs/view/senior-data-...,...,"Toowong, Queensland",Public Company,1977.0,"['Cloud Solutions', 'Mobility Solutions', 'Sec...","['555 Coronation Dr, Toowong, Queensland 4066,...","Data#3 Limited (DTL), is focused on helping cu...",False,Medium-high level,False,data3
2,1598880069,Data Engineer,Data Engineer,2025-04-28,Sydney,NSW,CONTRACTOR,Whizdom,https://www.linkedin.com/company/whizdom-recru...,https://au.linkedin.com/jobs/view/data-enginee...,...,"Canberra, Australian Capital Territory",Privately Held,2006.0,"['IT Recruitment', 'Recruitment for Government...","['28-34 Thynne St, Unit 7, Canberra, Australia...","Established in 2006, Whizdom is an Australian ...",True,Middle level,True,whizdom-recruitment
3,1598878894,Senior Data Engineer and Business Intelligence...,Data Engineer,2025-04-28,Moreton Bay,QLD,FULL_TIME,University of the Sunshine Coast,https://www.linkedin.com/school/university-of-...,https://au.linkedin.com/jobs/view/senior-data-...,...,,,,,,,,Not Applicable,False,
4,1598880083,Senior Data Engineer and Business Intelligence...,Data Engineer,2025-04-28,Sunshine Coast,QLD,FULL_TIME,University of the Sunshine Coast,https://www.linkedin.com/school/university-of-...,https://au.linkedin.com/jobs/view/senior-data-...,...,,,,,,,,Not Applicable,False,


In [18]:
df.to_csv('api_cleaned_job_id_snowflake.csv', index=False)

In [19]:
list_daily = df_daily_all.columns.tolist()
list_df = df.columns.tolist()
list_daily ==  list_df

True

In [20]:
#Check the Job ID from df and only keep those new jobs based on the Job IDs
def keep_new_jobs(df_daily=None, df_existing=None):
    """
    Return the daily jobs whose ID is not already stored, one row per ID.

    Parameters
    ----------
    df_daily : pandas.DataFrame, optional
        Freshly extracted jobs with an 'ID' column. Defaults to the
        notebook-level ``df_daily_all``.
    df_existing : pandas.DataFrame, optional
        Jobs already loaded, with an 'ID' column. Defaults to the
        notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        New jobs with a fresh 0..n-1 index. If the same ID appears more
        than once in the daily data, only its first row is kept.
    """
    daily = df_daily_all if df_daily is None else df_daily
    existing = df if df_existing is None else df_existing

    existing_job_ids = existing['ID'].unique()
    is_new = ~daily['ID'].isin(existing_job_ids)

    # The same posting can be returned by more than one title query, so
    # de-duplicate on ID before loading to avoid inserting it twice.
    df_new_jobs = (
        daily[is_new]
        .drop_duplicates(subset='ID')
        .reset_index(drop=True)
    )
    print(f'{df_new_jobs.shape[0]} new jobs ready to load to Snowflake')
    return df_new_jobs

df_new_jobs = keep_new_jobs()
df_new_jobs.tail()

15 new jobs ready to load to Snowflake


Unnamed: 0,ID,TITLE,JOB_CATEGORY,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORGANIZATION,ORGANIZATION_URL,URL,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
10,1627100229,Senior Data Scientist | Sydney |Full-Time,Data Scientist,2025-05-06,Sydney,NSW,FULL_TIME,ETT CareerMove,https://www.linkedin.com/company/ett-careermove,https://au.linkedin.com/jobs/view/senior-data-...,...,Bengaluru,Privately Held,2013.0,['Executive Search and Leadership Hiring'],"['Bengaluru, IN']",ETT CareerMove is a leadership recruitment com...,True,Mid-Senior level,True,ett-careermove
11,1625665270,Data Scientist-Advanced Analytics,Data Scientist,2025-05-05,No Regrets,QLD,FULL_TIME,IBM,https://www.linkedin.com/company/ibm,https://au.linkedin.com/jobs/view/data-scienti...,...,"Armonk, New York, NY",Public Company,,"['Cloud', 'Mobile', 'Cognitive', 'Security', '...","['International Business Machines Corp., New O...","At IBM, we do more than work. We create. We cr...",False,Mid-Senior level,False,ibm
12,1624482129,Senior Data Scientist,Data Scientist,2025-05-05,Parque de Macquarie,NSW,FULL_TIME,Anglicare Sydney,https://www.linkedin.com/company/anglicare-sydney,https://au.linkedin.com/jobs/view/senior-data-...,...,"Norwest, New South Wales",Nonprofit,1856.0,"['Caring for Australians for over 160 years', ...","['62 Norwest Blvd, Level 2, Century Corporate ...","Anglicare Sydney provides retirement living, r...",False,Intermedio,False,anglicare-sydney
13,1626598173,Security Data Analyst,Data Analyst,2025-05-05,Мельбурн,VIC,FULL_TIME,KPMG Australia,https://www.linkedin.com/company/kpmg-australia,https://au.linkedin.com/jobs/view/security-dat...,...,"Sydney, NSW",Partnership,,"['Audit', 'Tax', 'Advisory Services', 'Managem...","['Level 38, Tower Three, International Towers ...",Welcome to KPMG. We are a global network of pr...,False,Молодой специалист,True,kpmg-australia
14,1626088003,Strategic Reform Data Analyst,Data Analyst,2025-05-05,Darwin,NT,FULL_TIME,Department of Education and Training,https://www.linkedin.com/company/department-of...,https://au.linkedin.com/jobs/view/strategic-re...,...,"Darwin, Northern Territory",Educational,,"['School', 'Education', 'Aboriginal Affairs', ...","['Mitchell Street, Darwin, Northern Territory ...","At the Department of Education and Training, w...",False,Mid-Senior level,False,department-of-education-nt


In [21]:
#Translate new job's business name, city, state and seniority to English

translator = Translator()

def translate_text(text, target_language='en'):
    """
    Translate a single value with the notebook-level ``translator``.

    Empty or missing values give 'NA'. If anything fails during the check
    or the translation, the error is printed and the original value is
    returned unchanged.
    """
    try:
        is_blank = not text or pd.isna(text)
        if is_blank:
            return 'NA'
        return translator.translate(str(text), dest=target_language).text
    except Exception as e:
        # Best effort: keep the untranslated value rather than stopping the run.
        print(f"Error translating text: {e} (Text: {text})")
        return text

# Translate each text column via a per-column lookup table, so every distinct
# value is sent to translate_text only once.
for col in ['CITY', 'STATE', 'ORGANIZATION', 'SENIORITY']:
    # Distinct non-null values of this column.
    unique_values = df_new_jobs[col].dropna().unique()
    translation_map = {val: translate_text(val, target_language='en') for val in unique_values}
    # Values absent from the map (the nulls dropped above) come back as NaN
    # from .map and are replaced with the string 'NA'.
    df_new_jobs[col] = df_new_jobs[col].map(translation_map).fillna('NA')

In [22]:
df_new_jobs['CITY'].unique()

array(['Melbourne', 'Brisbane', 'Canberra', 'Sydney', 'No Regrets',
       'Woden', 'Perth', 'Macquarie park', 'Darwin'], dtype=object)

In [23]:
df_new_jobs['STATE'].unique()

array(['VIC', 'QLD', 'NA', 'NSW', 'Of', 'NT'], dtype=object)

In [24]:

def load_to_snowflake(df_new_jobs):
    """
    Append the given jobs to the linkedin_job_api_cleaned_data table.

    Parameters
    ----------
    df_new_jobs : pandas.DataFrame
        Rows to append. An empty frame is a no-op: no connection is opened.

    Raises
    ------
    RuntimeError
        If the Snowflake password is not available.
    """
    table_name = "linkedin_job_api_cleaned_data"

    if df_new_jobs.empty:
        print(f"No new jobs to load to Snowflake table {table_name}.")
        return

    if snowflake_password is None:
        raise RuntimeError("SNOWFLAKE_PASSWORD is not set; cannot connect to Snowflake.")

    # Create a Snowflake connection engine. The password is URL-escaped because
    # characters such as '@', ':' or '/' would otherwise corrupt the URL.
    engine = create_engine(
        'snowflake://{user}:{password}@{account}/{database}/{schema}?warehouse={warehouse}'.format(
            user="NIKKILW2025",
            password=urllib.parse.quote_plus(snowflake_password),
            account="gbszkwp-by30611",
            warehouse="SNOWFLAKE_LEARNING_WH",
            database="linkedin_db",
            schema="linkedin_raw"
        )
    )

    try:
        df_new_jobs.to_sql(
            name=table_name,
            con=engine,
            if_exists='append', #append data
            index=False
        )
    finally:
        # Release pooled connections whether or not the insert succeeded.
        engine.dispose()

    print(f"Data loaded to Snowflake table {table_name} successfully.")


load_to_snowflake(df_new_jobs)

Data loaded to Snowflake table linkedin_job_api_cleaned_data successfully.
