In [23]:
import pandas as pd
from pandas import json_normalize
import requests
import snowflake.connector


from dotenv import load_dotenv
import os
import http.client
import urllib.parse

from langdetect import detect, LangDetectException
from googletrans import Translator
import string
import re

from sqlalchemy import create_engine


In [24]:
# Load variables from a .env file into the process environment.
load_dotenv()
# RapidAPI key and host; both are sent as request headers by the extraction step.
rapidapi_key = os.getenv('RAPIDAPI_KEY')
rapidapi_host = "linkedin-job-search-api.p.rapidapi.com"
# os.getenv returns None when SNOWFLAKE_PASSWORD is not defined.
snowflake_password = os.getenv('SNOWFLAKE_PASSWORD')

In [25]:
def extract_linkedin_job_data():
    """Fetch the last 24 hours of job postings for each tracked job title.

    One GET request per title is sent to the host in the module-level
    ``rapidapi_host`` (authenticated with ``rapidapi_key``). Each JSON
    response is flattened with ``json_normalize`` and tagged with the title
    that was searched for in a ``job_category`` column.

    Returns:
        pandas.DataFrame: rows of every successful response stacked with a
        fresh integer index. An empty DataFrame is returned when no request
        succeeds.
    """
    headers = {
        'x-rapidapi-key': rapidapi_key,
        'x-rapidapi-host': rapidapi_host
    }

    location = "Australia"
    limit = 100
    offset = 0
    titles = ["Data Engineer" , "Data Scientist", "Data Analyst"]
    # requests has no default timeout; without one a stalled connection
    # blocks the notebook indefinitely.
    request_timeout = 30

    # URL-encode the location once; it is the same for every title.
    location_encoded = urllib.parse.quote(location)

    # Collect the per-title frames and concatenate once at the end instead of
    # re-copying the accumulated frame on every iteration.
    frames = []

    for title_filter in titles:

        # URL-encode the title filter
        title_encoded = urllib.parse.quote(title_filter)

        #API endpoint 7day data query
        #base_url = f"/active-jb-7d?limit={limit}&offset={offset}&title_filter={title_encoded}&location_filter={location_encoded}"

        #API endpoint last 24 hour data query
        # NOTE(review): only the first page (offset 0, up to `limit` rows) is
        # requested; confirm whether a title can exceed `limit` postings per day.
        base_url = f"/active-jb-24h?limit={limit}&offset={offset}&title_filter={title_encoded}&location_filter={location_encoded}"
        url = f"https://{rapidapi_host}{base_url}"

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            # Treat a network failure like a non-200 answer: report it and
            # carry on with the remaining titles.
            print(f"Error: {title_filter}: {exc}")
            continue

        if response.status_code == 200:
            data = response.json()
            # Convert the JSON data to a DataFrame
            df_daily = json_normalize(data)
            df_daily['job_category'] = title_filter
            print(title_filter, df_daily.shape)
            frames.append(df_daily)
        else:
            print(f"Error: {response.status_code}")

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Run the extraction and show the combined frame as the cell output.
df_daily_all = extract_linkedin_job_data()
df_daily_all

Data Engineer (14, 51)
Data Scientist (4, 45)
Data Analyst (22, 51)


Unnamed: 0,id,date_posted,date_created,title,organization,organization_url,date_validthrough,locations_raw,location_type,location_requirements_raw,...,seniority,directapply,linkedin_org_slug,salary_raw.@type,salary_raw.currency,salary_raw.value.@type,salary_raw.value.minValue,salary_raw.value.maxValue,salary_raw.value.unitText,job_category
0,1604628941,2025-04-30T01:09:33,2025-04-30T02:22:46.095216,AI / ML Engineer (H/F)- Équipe Data,WINAMAX,https://www.linkedin.com/company/winamax,2025-05-30T01:09:33,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Mid-Senior level,False,winamax,,,,,,,Data Engineer
1,1604169133,2025-04-29T23:52:27,2025-04-30T00:03:54.940518,Data Engineer,TechnologyOne,https://www.linkedin.com/company/technology-one,2025-05-29T23:51:43,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Mid-Senior level,False,technology-one,,,,,,,Data Engineer
2,1602292235,2025-04-29T14:47:36,2025-04-29T15:03:59.322895,Data Centre Engineer,Peoplebank,https://www.linkedin.com/company/peoplebank,2025-05-29T14:47:36,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Mid-Senior level,False,peoplebank,,,,,,,Data Engineer
3,1601928637,2025-04-29T10:52:47,2025-04-29T12:26:12.438087,Data Engineer,Downer,https://www.linkedin.com/company/downer,2025-05-29T10:52:47,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Entry level,False,downer,,,,,,,Data Engineer
4,1601759296,2025-04-29T09:36:00,2025-04-29T10:27:22.051813,Data Engineer,Ampstek,https://www.linkedin.com/company/ampstek,2025-05-29T09:36:00,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Mid-Senior level,True,ampstek,,,,,,,Data Engineer
5,1601626948,2025-04-29T08:46:05,2025-04-29T08:55:00.488792,Data Engineer,Avance Consulting,https://www.linkedin.com/company/avance-services,2025-05-29T08:46:05,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Mid-Senior level,True,avance-services,,,,,,,Data Engineer
6,1601624177,2025-04-29T08:36:11,2025-04-29T08:53:17.756168,Data Centre Engineer,Peoplebank,https://www.linkedin.com/company/peoplebank,2025-05-29T08:36:11,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Mid-Senior level,False,peoplebank,,,,,,,Data Engineer
7,1601584498,2025-04-29T07:54:26,2025-04-29T08:24:48.346662,Data Engineer (Databricks + Azure & Python),CareCone Group,https://www.linkedin.com/company/carecone,2025-05-29T07:54:26,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Mellannivå,True,carecone,,,,,,,Data Engineer
8,1601414070,2025-04-29T06:06:46,2025-04-29T06:26:44.673404,Data Engineer,Queensland Government,https://www.linkedin.com/company/queensland-go...,2025-05-29T06:06:46,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,Not Applicable,False,queensland-government,,,,,,,Data Engineer
9,1601410503,2025-04-29T05:38:35,2025-04-29T06:24:28.089925,Data Streaming Engineer,Experis Australia,https://www.linkedin.com/company/experis-austr...,2025-05-13T05:38:00,"[{'@type': 'Place', 'address': {'@type': 'Post...",,,...,中高级,True,experis-australia,MonetaryAmount,AUD,QuantitativeValue,75.0,95.0,HOUR,Data Engineer


In [26]:
# Snapshot the day's pull to disk (the DataFrame index is not written).
df_daily_all.to_csv('linkedin_jobs_daily.csv', index=False)

In [27]:
# Re-read the snapshot so the following cells work from the file on disk.
df_daily_all = pd.read_csv('linkedin_jobs_daily.csv')
df_daily_all.shape

(40, 51)

In [28]:
def update_columns(df_daily_all):
    """Upper-case the column names and keep only the tracked raw columns.

    Args:
        df_daily_all: DataFrame whose column labels, once upper-cased,
            include every name in the list below.

    Returns:
        pandas.DataFrame: an independent copy containing the selected columns
        in the listed order. The input frame, including its column labels, is
        left unchanged.

    Raises:
        KeyError: if any listed column is missing after upper-casing.
    """
    keep_columns = ['ID', 'DATE_POSTED', 'DATE_CREATED', 'TITLE', 'JOB_CATEGORY',
       'ORGANIZATION', 'ORGANIZATION_URL', 'DATE_VALIDTHROUGH', 'LOCATIONS_RAW',
       'LOCATION_TYPE', 'LOCATION_REQUIREMENTS_RAW', 'EMPLOYMENT_TYPE', 'URL',
       'SOURCE_TYPE', 'SOURCE', 'SOURCE_DOMAIN', 'ORGANIZATION_LOGO',
       'CITIES_DERIVED', 'REGIONS_DERIVED', 'COUNTRIES_DERIVED',
       'LOCATIONS_DERIVED', 'TIMEZONES_DERIVED', 'LATS_DERIVED',
       'LNGS_DERIVED', 'REMOTE_DERIVED', 'RECRUITER_NAME', 'RECRUITER_TITLE',
       'RECRUITER_URL', 'LINKEDIN_ORG_EMPLOYEES', 'LINKEDIN_ORG_URL',
       'LINKEDIN_ORG_SIZE', 'LINKEDIN_ORG_SLOGAN', 'LINKEDIN_ORG_INDUSTRY',
       'LINKEDIN_ORG_FOLLOWERS', 'LINKEDIN_ORG_HEADQUARTERS',
       'LINKEDIN_ORG_TYPE', 'LINKEDIN_ORG_FOUNDEDDATE',
       'LINKEDIN_ORG_SPECIALTIES', 'LINKEDIN_ORG_LOCATIONS',
       'LINKEDIN_ORG_DESCRIPTION', 'LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED',
       'SENIORITY', 'DIRECTAPPLY', 'LINKEDIN_ORG_SLUG']

    # set_axis returns a relabelled frame, so the caller's column labels are
    # not rewritten as a side effect of calling this function.
    relabelled = df_daily_all.set_axis(df_daily_all.columns.str.upper(), axis=1)

    # Return an explicit copy so later column assignments on the result do not
    # target a slice of another frame.
    return relabelled[keep_columns].copy()

# Rebind df_daily_all to the upper-cased, column-restricted frame and preview it.
df_daily_all = update_columns(df_daily_all)
df_daily_all.head()

Unnamed: 0,ID,DATE_POSTED,DATE_CREATED,TITLE,JOB_CATEGORY,ORGANIZATION,ORGANIZATION_URL,DATE_VALIDTHROUGH,LOCATIONS_RAW,LOCATION_TYPE,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
0,1604628941,2025-04-30T01:09:33,2025-04-30T02:22:46.095216,AI / ML Engineer (H/F)- Équipe Data,Data Engineer,WINAMAX,https://www.linkedin.com/company/winamax,2025-05-30T01:09:33,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,PARIS CEDEX 07,Partnership,2006.0,"['Poker en ligne', ""Jeux d'argent"", 'Poker Liv...","['Libre réponse 80986, PARIS CEDEX 07, 75342, ...",Winamax est une entreprise dynamique et innova...,False,Mid-Senior level,False,winamax
1,1604169133,2025-04-29T23:52:27,2025-04-30T00:03:54.940518,Data Engineer,Data Engineer,TechnologyOne,https://www.linkedin.com/company/technology-one,2025-05-29T23:51:43,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"Fortitude Valley, QLD",Public Company,,"['Enterprise software as a service', 'Informat...","['TechnologyOne HQ, 540 Wickham Street, Fortit...",TechnologyOne is here to make life simple for ...,False,Mid-Senior level,False,technology-one
2,1602292235,2025-04-29T14:47:36,2025-04-29T15:03:59.322895,Data Centre Engineer,Data Engineer,Peoplebank,https://www.linkedin.com/company/peoplebank,2025-05-29T14:47:36,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"Sydney, NSW",Privately Held,1990.0,"['Digital / Digital Transformation', 'Business...","['345 George Street, Level 13, Sydney, NSW 200...","Peoplebank, part of RGF Staffing ANZ is a lead...",True,Mid-Senior level,False,peoplebank
3,1601928637,2025-04-29T10:52:47,2025-04-29T12:26:12.438087,Data Engineer,Data Engineer,Downer,https://www.linkedin.com/company/downer,2025-05-29T10:52:47,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"North Ryde, NSW",Public Company,1933.0,"['Infrastructure', 'Construction', 'Renewables...","['Triniti Business Campus, 39 Delhi Road, Nort...",Enabling communities to thrive. \n\nIt’s what ...,False,Entry level,False,downer
4,1601759296,2025-04-29T09:36:00,2025-04-29T10:27:22.051813,Data Engineer,Data Engineer,Ampstek,https://www.linkedin.com/company/ampstek,2025-05-29T09:36:00,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"Princeton, NJ",Privately Held,2014.0,"['Information Technologies', 'Big Data', 'Hado...","['103 Carnegie Center Drive, Suite 300, Prince...",Ampstek supplies thousands of tech and digital...,True,Mid-Senior level,True,ampstek


## Connect to the Snowflake database for initial raw-data processing

In [29]:
#Establish a connection to Snowflake

def connect_to_snowflake():
    """Open a connection to the linkedin_db / linkedin_raw schema in Snowflake.

    The password is taken from the module-level ``snowflake_password``.

    Returns:
        The connection object on success. Any exception is reported on stdout
        and ``None`` is returned instead.
    """
    try:
        settings = {
            "user": "NIKKILW2025",
            "password": snowflake_password,
            "account": "gbszkwp-by30611",
            "warehouse": "SNOWFLAKE_LEARNING_WH",
            "database": "linkedin_db",
            "schema": "linkedin_raw",
        }
        connection = snowflake.connector.connect(**settings)
        print("Connection to Snowflake established successfully.")
    except Exception as e:
        print(f"Error connecting to Snowflake: {e}")
        connection = None

    return connection

# Connection reused by the query cell; it is None when the connection attempt failed.
conn = connect_to_snowflake()


Connection to Snowflake established successfully.


In [30]:
#query the raw data
def query_raw_api_data(conn):
    """Read every row of linkedin_job_api_cleaned_data into a DataFrame.

    Args:
        conn: open database connection handed to ``pd.read_sql``.

    Returns:
        pandas.DataFrame: the full table contents. Its shape is printed
        before returning.
    """
    query = """
        SELECT * FROM linkedin_job_api_cleaned_data
    """

    existing_rows = pd.read_sql(sql=query, con=conn)
    print(existing_rows.shape)
    return existing_rows

# Rows currently stored in linkedin_job_api_cleaned_data; merged with the daily pull further down.
df = query_raw_api_data(conn)
df.head()

  df = pd.read_sql(query, conn)


(213, 31)


Unnamed: 0,ID,TITLE,JOB_CATEGORY,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORGANIZATION,ORGANIZATION_URL,URL,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
0,1599037799,Infrastructure DevOps Engineer-AWS Cloud Engin...,Data Engineer,2025-04-28,Melbourne,VIC,FULL_TIME,Renaissance InfoSystems,https://www.linkedin.com/company/renaissance-i...,https://au.linkedin.com/jobs/view/infrastructu...,...,"Melbourne, Victoria",Privately Held,2006.0,[''],"['Suite 508/Level 5, 343 Little Collins Street...",Renaissance InfoSystems is a technology and di...,True,Mid-Senior level,True,renaissance-infosystems
1,1599037955,Senior Data Engineer,Data Engineer,2025-04-28,Greater Sydney Area,,FULL_TIME,One51 | Data & Analytics Consultancy,https://www.linkedin.com/company/one51consulting,https://au.linkedin.com/jobs/view/senior-data-...,...,"Sydney, NSW",Privately Held,2020.0,"['Business Intelligence', 'Data Warehousing', ...","['333 George Street, Level 13, Sydney, NSW 200...",Drawing on a wealth of expertise and a deep un...,False,Mid-Senior level,False,one51consulting
2,1598878880,Senior Data Engineer,Data Engineer,2025-04-28,Brisbane,QLD,CONTRACTOR,Data#3,https://www.linkedin.com/company/data3,https://au.linkedin.com/jobs/view/senior-data-...,...,"Toowong, Queensland",Public Company,1977.0,"['Cloud Solutions', 'Mobility Solutions', 'Sec...","['555 Coronation Dr, Toowong, Queensland 4066,...","Data#3 Limited (DTL), is focused on helping cu...",False,Livello medio-alto,False,data3
3,1598879223,Senior Data Operations Engineer,Data Engineer,2025-04-28,Melbourne,VIC,FULL_TIME,Bupa,https://www.linkedin.com/company/bupa,https://au.linkedin.com/jobs/view/senior-data-...,...,"UK, Australia, Spain, Chile, Poland, New Zeala...",Privately Held,1947.0,"['health', 'healthcare', 'hospitals', 'health ...","['Main locations, Main locations, UK, Australi...","Bupa's purpose is helping people live longer, ...",False,Intermedio,False,bupa
4,1598880069,Data Engineer,Data Engineer,2025-04-28,Sydney,NSW,CONTRACTOR,Whizdom,https://www.linkedin.com/company/whizdom-recru...,https://au.linkedin.com/jobs/view/data-enginee...,...,"Canberra, Australian Capital Territory",Privately Held,2006.0,"['IT Recruitment', 'Recruitment for Government...","['28-34 Thynne St, Unit 7, Canberra, Australia...","Established in 2006, Whizdom is an Australian ...",True,Mellannivå,True,whizdom-recruitment


### Process the df_daily_all data 


In [31]:
def extract_job_date(date_created):
    """Convert creation timestamps to calendar dates.

    Args:
        date_created: pandas Series of timestamps (strings or datetimes)
            parseable by ``pd.to_datetime``.

    Returns:
        pandas.Series: ``datetime.date`` objects on the same index as the
        input, named ``job_date``.
    """
    # Work from the argument instead of the notebook-global frame, so the
    # function can be applied to any Series and has no hidden side effects.
    return pd.to_datetime(date_created).dt.date.rename('job_date')

# Apply the function to the DataFrame: store the calendar date of DATE_CREATED
# in a new job_date column and preview the first five values.
df_daily_all['job_date'] = extract_job_date(df_daily_all['DATE_CREATED'])
df_daily_all['job_date'][:5]


0    2025-04-30
1    2025-04-30
2    2025-04-29
3    2025-04-29
4    2025-04-29
Name: job_date, dtype: object

In [32]:
#Extract the job city and state from the LOCATIONS_RAW field

def extract_city(list):
    """Extract the city of the first usable entry in a LOCATIONS_RAW value.

    Args:
        list: the LOCATIONS_RAW value; it is converted with ``str()`` and
            searched as text. (The parameter name shadows the builtin; it is
            kept so existing callers keep working.)

    Returns:
        str | None: the first quoted ``addressLocality`` that is immediately
        followed by ``addressRegion``, with known misspellings of Sydney
        normalised to "Sydney"; ``None`` when no such value is present.
    """
    text = str(list)
    # [^']* stops at the closing quote of the value. A greedy .* would run on
    # to the last 'addressRegion' when the field holds several locations.
    city_pattern = r"'addressLocality':\s*'([^']*)',\s'addressRegion':"
    match = re.search(city_pattern, text)
    if match is None:
        return None

    city = match.group(1)
    lowered = city.lower()
    # Fix wrong/localised spellings of Sydney.
    if any(variant in lowered for variant in ('sidney', 'sídney', '悉尼')):
        return "Sydney"
    return city



def extract_state(list):
    """Extract the state/region of the first usable entry in a LOCATIONS_RAW value.

    Args:
        list: the LOCATIONS_RAW value; it is converted with ``str()`` and
            searched as text. (The parameter name shadows the builtin; it is
            kept so existing callers keep working.)

    Returns:
        str | None: the first quoted ``addressRegion`` that is immediately
        followed by ``streetAddress``, stripped of surrounding whitespace;
        ``None`` when no such value is present (for example when the region
        is ``None``).
    """
    text = str(list)
    # Capture only the quoted value. A greedy .* would swallow everything up
    # to the last 'streetAddress' when the field holds several locations.
    state_pattern = r"'addressRegion':\s*'([^']*)',\s'streetAddress'"
    match = re.search(state_pattern, text)
    if match is None:
        return None
    return match.group(1).strip()


#Extract city and state from Locations Raw
# Each helper is applied per row to the LOCATIONS_RAW value and yields None
# when its pattern does not match.
df_daily_all['city'] = df_daily_all['LOCATIONS_RAW'].apply(extract_city)


df_daily_all['state'] = df_daily_all['LOCATIONS_RAW'].apply(extract_state)



df_daily_all.head()

Unnamed: 0,ID,DATE_POSTED,DATE_CREATED,TITLE,JOB_CATEGORY,ORGANIZATION,ORGANIZATION_URL,DATE_VALIDTHROUGH,LOCATIONS_RAW,LOCATION_TYPE,...,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG,job_date,city,state
0,1604628941,2025-04-30T01:09:33,2025-04-30T02:22:46.095216,AI / ML Engineer (H/F)- Équipe Data,Data Engineer,WINAMAX,https://www.linkedin.com/company/winamax,2025-05-30T01:09:33,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"['Poker en ligne', ""Jeux d'argent"", 'Poker Liv...","['Libre réponse 80986, PARIS CEDEX 07, 75342, ...",Winamax est une entreprise dynamique et innova...,False,Mid-Senior level,False,winamax,2025-04-30,Other Side Of The Moon,
1,1604169133,2025-04-29T23:52:27,2025-04-30T00:03:54.940518,Data Engineer,Data Engineer,TechnologyOne,https://www.linkedin.com/company/technology-one,2025-05-29T23:51:43,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"['Enterprise software as a service', 'Informat...","['TechnologyOne HQ, 540 Wickham Street, Fortit...",TechnologyOne is here to make life simple for ...,False,Mid-Senior level,False,technology-one,2025-04-30,Brisbane,QLD
2,1602292235,2025-04-29T14:47:36,2025-04-29T15:03:59.322895,Data Centre Engineer,Data Engineer,Peoplebank,https://www.linkedin.com/company/peoplebank,2025-05-29T14:47:36,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"['Digital / Digital Transformation', 'Business...","['345 George Street, Level 13, Sydney, NSW 200...","Peoplebank, part of RGF Staffing ANZ is a lead...",True,Mid-Senior level,False,peoplebank,2025-04-29,Melbourne,VIC
3,1601928637,2025-04-29T10:52:47,2025-04-29T12:26:12.438087,Data Engineer,Data Engineer,Downer,https://www.linkedin.com/company/downer,2025-05-29T10:52:47,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"['Infrastructure', 'Construction', 'Renewables...","['Triniti Business Campus, 39 Delhi Road, Nort...",Enabling communities to thrive. \n\nIt’s what ...,False,Entry level,False,downer,2025-04-29,Brisbane,QLD
4,1601759296,2025-04-29T09:36:00,2025-04-29T10:27:22.051813,Data Engineer,Data Engineer,Ampstek,https://www.linkedin.com/company/ampstek,2025-05-29T09:36:00,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,"['Information Technologies', 'Big Data', 'Hado...","['103 Carnegie Center Drive, Suite 300, Prince...",Ampstek supplies thousands of tech and digital...,True,Mid-Senior level,True,ampstek,2025-04-29,Melbourne,VIC


In [33]:
#Extract the employment type from the EMPLOYMENT_TYPE field

def extract_employment_type(df_daily_all):
    """Derive a plain-text ``employment_type`` column from ``EMPLOYMENT_TYPE``.

    The source values are cast to ``str``, stripped of square brackets and
    single quotes, and trimmed. The frame is modified in place: the cleaned
    column is added and the original ``EMPLOYMENT_TYPE`` column is removed.

    Args:
        df_daily_all: DataFrame containing an ``EMPLOYMENT_TYPE`` column.

    Returns:
        pandas.Series: the new ``employment_type`` column.
    """
    as_text = df_daily_all['EMPLOYMENT_TYPE'].astype(str)
    without_brackets = as_text.str.replace(r"[\[\]']", '', regex=True)
    df_daily_all['employment_type'] = without_brackets.str.strip()

    # Remove the raw column from the caller's frame as well.
    df_daily_all.drop(columns=['EMPLOYMENT_TYPE'], inplace=True)

    return df_daily_all['employment_type']


# The call drops EMPLOYMENT_TYPE from df_daily_all, so this cell raises a
# KeyError if it is executed a second time on the same frame.
df_daily_all['employment_type'] = extract_employment_type(df_daily_all)
df_daily_all['employment_type'].tail()

35    CONTRACTOR
36     FULL_TIME
37     TEMPORARY
38     FULL_TIME
39    CONTRACTOR
Name: employment_type, dtype: object

In [34]:
#Extract the organization's employee size
def extract_employee_size(LINKEDIN_ORG_SIZE):
    """Strip the word "employees" from organisation-size labels.

    Args:
        LINKEDIN_ORG_SIZE: pandas Series of size labels such as
            "201-500 employees".

    Returns:
        pandas.Series: the labels cast to ``str`` with "employees" removed
        and surrounding whitespace trimmed, on the same index, named
        ``org_size``. Missing values become the string "nan" because of the
        ``str`` cast.
    """
    # Work from the argument instead of the notebook-global frame, so the
    # function can be applied to any Series and has no hidden side effects.
    return (
        LINKEDIN_ORG_SIZE
        .astype(str)
        # The search term is a literal, so no regex engine is needed.
        .str.replace("employees", '', regex=False)
        .str.strip()
        .rename('org_size')
    )


# Store the cleaned size label in a new org_size column and preview the last rows.
df_daily_all['org_size'] = extract_employee_size(df_daily_all['LINKEDIN_ORG_SIZE'])
df_daily_all['org_size'].tail()

35    1,001-5,000
36        10,001+
37           2-10
38          11-50
39        201-500
Name: org_size, dtype: object

In [35]:
# Upper-case the lower-case columns added in the preceding cells (job_date,
# city, state, employment_type, org_size) so every label follows one convention.
df_daily_all.columns = df_daily_all.columns.str.upper()
df_daily_all.head(2)

Unnamed: 0,ID,DATE_POSTED,DATE_CREATED,TITLE,JOB_CATEGORY,ORGANIZATION,ORGANIZATION_URL,DATE_VALIDTHROUGH,LOCATIONS_RAW,LOCATION_TYPE,...,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORG_SIZE
0,1604628941,2025-04-30T01:09:33,2025-04-30T02:22:46.095216,AI / ML Engineer (H/F)- Équipe Data,Data Engineer,WINAMAX,https://www.linkedin.com/company/winamax,2025-05-30T01:09:33,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,Winamax est une entreprise dynamique et innova...,False,Mid-Senior level,False,winamax,2025-04-30,Other Side Of The Moon,,FULL_TIME,201-500
1,1604169133,2025-04-29T23:52:27,2025-04-30T00:03:54.940518,Data Engineer,Data Engineer,TechnologyOne,https://www.linkedin.com/company/technology-one,2025-05-29T23:51:43,"[{'@type': 'Place', 'address': {'@type': 'Post...",,...,TechnologyOne is here to make life simple for ...,False,Mid-Senior level,False,technology-one,2025-04-30,Brisbane,QLD,FULL_TIME,"1,001-5,000"


In [36]:
#Only keep the relevant columns
# The selection below lists 31 columns, starting with the derived
# JOB_DATE / CITY / STATE / EMPLOYMENT_TYPE fields and using ORG_SIZE in place
# of LINKEDIN_ORG_SIZE. A missing label raises KeyError.

df_daily_all = df_daily_all[['ID', 'TITLE', 'JOB_CATEGORY',
       'JOB_DATE', 'CITY', 'STATE', 'EMPLOYMENT_TYPE' ,
       'ORGANIZATION', 'ORGANIZATION_URL', 'URL',
       'SOURCE_TYPE', 'SOURCE', 'SOURCE_DOMAIN',
       'ORGANIZATION_LOGO', 'REMOTE_DERIVED', 'RECRUITER_NAME', 'RECRUITER_TITLE',
       'RECRUITER_URL', 'LINKEDIN_ORG_URL',
       'ORG_SIZE', 'LINKEDIN_ORG_INDUSTRY',
       'LINKEDIN_ORG_HEADQUARTERS',
       'LINKEDIN_ORG_TYPE', 'LINKEDIN_ORG_FOUNDEDDATE',
       'LINKEDIN_ORG_SPECIALTIES', 'LINKEDIN_ORG_LOCATIONS',
       'LINKEDIN_ORG_DESCRIPTION','LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED',
       'SENIORITY', 'DIRECTAPPLY',
       'LINKEDIN_ORG_SLUG']]


df_daily_all.head()

Unnamed: 0,ID,TITLE,JOB_CATEGORY,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORGANIZATION,ORGANIZATION_URL,URL,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
0,1604628941,AI / ML Engineer (H/F)- Équipe Data,Data Engineer,2025-04-30,Other Side Of The Moon,,FULL_TIME,WINAMAX,https://www.linkedin.com/company/winamax,https://au.linkedin.com/jobs/view/ai-ml-engine...,...,PARIS CEDEX 07,Partnership,2006.0,"['Poker en ligne', ""Jeux d'argent"", 'Poker Liv...","['Libre réponse 80986, PARIS CEDEX 07, 75342, ...",Winamax est une entreprise dynamique et innova...,False,Mid-Senior level,False,winamax
1,1604169133,Data Engineer,Data Engineer,2025-04-30,Brisbane,QLD,FULL_TIME,TechnologyOne,https://www.linkedin.com/company/technology-one,https://au.linkedin.com/jobs/view/data-enginee...,...,"Fortitude Valley, QLD",Public Company,,"['Enterprise software as a service', 'Informat...","['TechnologyOne HQ, 540 Wickham Street, Fortit...",TechnologyOne is here to make life simple for ...,False,Mid-Senior level,False,technology-one
2,1602292235,Data Centre Engineer,Data Engineer,2025-04-29,Melbourne,VIC,CONTRACTOR,Peoplebank,https://www.linkedin.com/company/peoplebank,https://au.linkedin.com/jobs/view/data-centre-...,...,"Sydney, NSW",Privately Held,1990.0,"['Digital / Digital Transformation', 'Business...","['345 George Street, Level 13, Sydney, NSW 200...","Peoplebank, part of RGF Staffing ANZ is a lead...",True,Mid-Senior level,False,peoplebank
3,1601928637,Data Engineer,Data Engineer,2025-04-29,Brisbane,QLD,FULL_TIME,Downer,https://www.linkedin.com/company/downer,https://au.linkedin.com/jobs/view/data-enginee...,...,"North Ryde, NSW",Public Company,1933.0,"['Infrastructure', 'Construction', 'Renewables...","['Triniti Business Campus, 39 Delhi Road, Nort...",Enabling communities to thrive. \n\nIt’s what ...,False,Entry level,False,downer
4,1601759296,Data Engineer,Data Engineer,2025-04-29,Melbourne,VIC,CONTRACTOR,Ampstek,https://www.linkedin.com/company/ampstek,https://au.linkedin.com/jobs/view/data-enginee...,...,"Princeton, NJ",Privately Held,2014.0,"['Information Technologies', 'Big Data', 'Hado...","['103 Carnegie Center Drive, Suite 300, Prince...",Ampstek supplies thousands of tech and digital...,True,Mid-Senior level,True,ampstek


In [37]:
#Merge the new data with the existing data

def merge_duplicates(df, df_daily_all):
    """Stack two frames and keep one row per ``ID``.

    Args:
        df: rows that come first in the stacked result.
        df_daily_all: rows appended after ``df``; when an ``ID`` appears more
            than once, the last occurrence in stacking order is the one kept.

    Returns:
        pandas.DataFrame: a new de-duplicated frame with a fresh 0..n-1
        index. Neither input is modified.
    """
    # Row-wise stack, then de-duplicate on 'ID' keeping the last occurrence.
    stacked = pd.concat([df, df_daily_all], axis=0)
    deduplicated = stacked.drop_duplicates(subset=['ID'], keep='last')

    # Discard the index labels inherited from the inputs.
    return deduplicated.reset_index(drop=True)

# Existing table rows first, daily pull second: for a repeated ID the daily row is kept.
df_merged = merge_duplicates(df, df_daily_all)
df_merged.shape


(241, 31)

In [38]:
# Inspect the last rows of the merged frame.
df_merged.tail()

Unnamed: 0,ID,TITLE,JOB_CATEGORY,JOB_DATE,CITY,STATE,EMPLOYMENT_TYPE,ORGANIZATION,ORGANIZATION_URL,URL,...,LINKEDIN_ORG_HEADQUARTERS,LINKEDIN_ORG_TYPE,LINKEDIN_ORG_FOUNDEDDATE,LINKEDIN_ORG_SPECIALTIES,LINKEDIN_ORG_LOCATIONS,LINKEDIN_ORG_DESCRIPTION,LINKEDIN_ORG_RECRUITMENT_AGENCY_DERIVED,SENIORITY,DIRECTAPPLY,LINKEDIN_ORG_SLUG
236,1601227461,Technical Business Analyst – Data Migration,Data Analyst,2025-04-29,Sydney,NSW,CONTRACTOR,Ampstek,https://www.linkedin.com/company/ampstek,https://au.linkedin.com/jobs/view/technical-bu...,...,"Princeton, NJ",Privately Held,2014.0,"['Information Technologies', 'Big Data', 'Hado...","['103 Carnegie Center Drive, Suite 300, Prince...",Ampstek supplies thousands of tech and digital...,True,Mid-Senior level,True,ampstek
237,1601228059,Lead Data Analyst,Data Analyst,2025-04-29,Newcastle,NSW,FULL_TIME,Endava,https://www.linkedin.com/company/endava,https://au.linkedin.com/jobs/view/lead-data-an...,...,"London, UK",Public Company,,"['Digital Strategy', 'Continuous Delivery & De...","['125 Old Broad Street, London, UK EC2N 1AR, G...","For over two decades, we have been harnessing ...",False,Mid-Senior level,True,endava
238,1601227998,CRM Data Analyst,Data Analyst,2025-04-29,Brisbane,QLD,TEMPORARY,Rainy Day Recruitment,https://www.linkedin.com/company/rainydayrecru...,https://au.linkedin.com/jobs/view/crm-data-ana...,...,"Tonsley , South Australia",Privately Held,2020.0,"['Recruitment', 'Diversity Consultation', 'Rec...","['6 MAB Eastern Promenade, Suite 29, Tonsley I...",VISION > To be recognised as the recruiter of ...,True,Not Applicable,True,rainydayrecruitment
239,1601320428,Reliability Centred Maintenance (RCM) – Senior...,Data Analyst,2025-04-29,Burwood,NSW,FULL_TIME,Powerdata Group Consulting,https://www.linkedin.com/company/powerdata-gro...,https://au.linkedin.com/jobs/view/reliability-...,...,"Armadale, Victoria",Privately Held,2003.0,"['big data', 'data analytics', 'cyber security...","['501 Dandenong Rd, Armadale, Victoria 3143, A...",PowerData Group Consulting is dedicated to emp...,False,Mid-Senior level,True,powerdata-group-consulting
240,1604409159,Mining Data Analyst - FIFO,Data Analyst,2025-04-30,Perth,WA,CONTRACTOR,Peoplebank,https://www.linkedin.com/company/peoplebank,https://au.linkedin.com/jobs/view/mining-data-...,...,"Sydney, NSW",Privately Held,1990.0,"['Digital / Digital Transformation', 'Business...","['345 George Street, Level 13, Sydney, NSW 200...","Peoplebank, part of RGF Staffing ANZ is a lead...",True,Not Applicable,False,peoplebank


In [39]:
#Load processed data into Snowflake

def load_to_snowflake(df_merged):
    """Write the merged frame to the linkedin_job_api_cleaned_data table.

    A SQLAlchemy engine is created for the linkedin_db / linkedin_raw schema
    using the module-level ``snowflake_password``. The table is written with
    ``if_exists='replace'``, so any existing table of that name is replaced by
    the contents of ``df_merged``.

    Args:
        df_merged: DataFrame to store; its index is not written.

    Raises:
        RuntimeError: if ``snowflake_password`` is empty or ``None``.
    """
    if not snowflake_password:
        # Without this guard the literal text "None" would be sent as the password.
        raise RuntimeError("SNOWFLAKE_PASSWORD is not set; cannot connect to Snowflake.")

    # Create a Snowflake connection engine.
    # The password is percent-encoded because characters such as '@', '/',
    # ':' or '%' would otherwise be read as part of the URL structure.
    engine = create_engine(
        'snowflake://{user}:{password}@{account}/{database}/{schema}?warehouse={warehouse}'.format(
            user="NIKKILW2025",
            password=urllib.parse.quote(snowflake_password, safe=""),
            account="gbszkwp-by30611",
            warehouse="SNOWFLAKE_LEARNING_WH",
            database="linkedin_db",
            schema="linkedin_raw"
        )
    )

    table_name = "linkedin_job_api_cleaned_data"

    try:
        df_merged.to_sql(
            name=table_name,
            con=engine,
            if_exists='replace',
            index=False
        )
    finally:
        # Release the engine's pooled connections whether or not the load succeeded.
        engine.dispose()

    print(f"Data loaded to Snowflake table {table_name} successfully.")


# Replace the Snowflake table with the merged frame.
load_to_snowflake(df_merged)

Data loaded to Snowflake table linkedin_job_api_cleaned_data successfully.


In [None]:
# #Translate non English info into English

# translator = Translator()

# #Example text to detect if language is English or not
# def is_non_english(text):
#     try:
#         if pd.isnull(text):
#             return False

#         text = str(text).strip()
#         if len(text) == 0 or all(c in string.punctuation for c in text):
#             return False

#         lang = detect(str(text))
#         return lang != 'en'

#     except LangDetectException:
#         return False
#     except Exception as e:
#         print(f"Error detecting language: {e}")
#         return False


# #Only translate if the text is not English
# def translate_text(text):
#     try:
#         if pd.isnull(text):
#             return text
#         text = str(text).strip()
#         if len(text) == 0 or all(c in string.punctuation for c in text):
#             return text
#         if is_non_english(text):
#             translated = translator.translate(text, dest='en')
#             return translated.text
#         else:
#             return text
#     except Exception as e:
#         print(f"Error translating text: {e}")
#         return text


# #Apply the translation function to the DataFrame
# def translate_dataframe(df):
#     try:
#         df_copy = df.copy()
#         #only process columns that are text datatype
#         text_cols = df_copy.select_dtypes(include=['object']).columns
#         for col in text_cols:
#             df_copy[col] = df_copy[col].apply(lambda x: translate_text(x))
#         return df_copy
#     except Exception as e:
#         print(f"Error translating DataFrame: {e}")
#         return df


# df_translated = translate_dataframe(df)
# df_translated.head()
