In [1]:
%%capture
!pip install polars lets-plot pyarrow numpy

In [2]:
import polars as pl
from lets_plot import *

LetsPlot.setup_html()

In [3]:
data = pl.read_csv('data/jobs.csv')
print('Data shape:', data.shape)
data.head()

Data shape: (34430012, 5)


title,org_name,org_link,date_posted,date_valid
str,str,str,str,str
"""AI Engineer - Machine Learning""","""Cynch AI""","""https://www.linkedin.com/compa…","""2024-03-15T23:15:41.000Z""","""2024-10-22T10:29:32.000Z"""
"""Payments Business Analyst""","""Ztek Consulting""","""https://www.linkedin.com/compa…","""2024-05-15T22:38:01.000Z""","""2024-06-14T22:38:01.000Z"""
"""HR | Payroll Coordinator (Part…","""Weitzman""","""https://www.linkedin.com/compa…","""2024-05-15T21:45:07.000Z""","""2024-11-11T21:45:07.000Z"""
"""Associate Recruiter Work from …","""HCA Healthcare""","""https://www.linkedin.com/compa…","""2024-05-03T22:04:02.000Z""","""2024-06-02T22:04:02.000Z"""
"""Office Manager""","""LGI Homes""","""https://www.linkedin.com/compa…","""2024-05-22T17:14:46.000Z""","""2024-06-21T17:14:46.000Z"""


In [4]:
# convert date_posted and date_valid to datetime, format: 2024-10-22T10:29:32.000Z
# clean up the title
data = data.with_columns(
    date_posted=pl.col('date_posted').str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S.%fZ", strict=False, ambiguous='null').dt.date(),
    date_valid=pl.col('date_valid').str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S.%fZ", strict=False, ambiguous='null').dt.date(),
    title=pl.col('title').str.to_lowercase().str.replace_all(r'[^a-zA-Z ]', '').str.replace_all(r'[\s]+',' ').str.strip_chars(' '),
)
# drop na values and duplicates
data = data.drop_nulls().unique(subset=['title', 'org_name', 'date_posted', 'date_valid'])
print('Data shape:', data.shape)
data.head()

  date_posted=pl.col('date_posted').str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S.%fZ", strict=False, ambiguous='null').dt.date(),
  date_valid=pl.col('date_valid').str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S.%fZ", strict=False, ambiguous='null').dt.date(),


Data shape: (22026803, 5)


title,org_name,org_link,date_posted,date_valid
str,str,str,date,date
"""tmobile retail associate manag…","""Arch Telecom""","""https://www.linkedin.com/compa…",2024-05-03,2024-06-02
"""assistant branch manager bilin…","""Heights Finance Corporation""","""https://www.linkedin.com/compa…",2024-09-11,2024-10-11
"""forklift operator st shift""","""Mullican Flooring""","""https://www.linkedin.com/compa…",2024-07-01,2024-07-28
"""locum physician mddo internal …","""LocumJobsOnline""","""https://www.linkedin.com/compa…",2025-02-02,2025-03-04
"""talent manager contract financ…","""Robert Half Executive Search.""","""https://www.linkedin.com/compa…",2025-03-19,2025-04-18


In [5]:
# group the data by year and month posted
data.with_columns(
    year=pl.col('date_posted').dt.year(),
    month=pl.col('date_posted').dt.month(),
) \
    .group_by(['year', 'month']) \
    .len() \
    .sort(['year', 'month']) \
    .pivot(
        index='year',
        on='month',
        values='len',
    )

year,10,1,2,3,4,5,6,7,8,9,11,12
i32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
2020,8.0,,,,,,,,,,,
2021,83.0,2.0,14.0,11.0,14.0,39.0,84.0,84.0,19.0,16.0,27.0,33.0
2022,134.0,31.0,27.0,34.0,35.0,375.0,52.0,51.0,224.0,187.0,118.0,196.0
2023,17303.0,263.0,460.0,9243.0,11086.0,12751.0,19343.0,22140.0,13941.0,13890.0,22703.0,41973.0
2024,1506238.0,54870.0,95306.0,168096.0,398400.0,1690043.0,1147579.0,1496382.0,2007163.0,1776077.0,1470437.0,1495242.0
2025,,1474540.0,1530008.0,1853935.0,2189690.0,1485803.0,,,,,,


Let's plot the number of job post weekly from 2024-01-01

In [6]:
from datetime import date, datetime
# plot number of jobs weekly
(ggplot(
    {
        'epoch': data['date_posted'].dt.epoch(time_unit='s').to_list(),
    },
    aes(x='epoch')
) + geom_histogram(
    binwidth=86400 * 7, # interval of 1 week
    fill='blue',
    color='black',
    alpha=0.5,
) + labs(
    title='Jobs posted weekly from 2024-01-01',
    x='Date',
    y='Number of jobs',
) + scale_x_continuous(
        breaks=pl.date_range(date(2024, 1, 1), date(2025, 6, 1), "1mo", eager=True).dt.epoch(time_unit='s').to_list(),
        labels=pl.date_range(date(2024, 1, 1), date(2025, 6, 1), "1mo", eager=True).dt.strftime('%Y-%m').to_list(),
        limits=[
            datetime(2024, 1, 1).timestamp(),
            datetime(2025, 6, 1).timestamp()
        ]
) + ggsize(1200, 400))

Our data is very likely not complete before 2024-05-01

In [7]:
# if within 6 months, there are 3 or more jobs posted with the same title, org_name, then consider it as a fake job
# loop through the data with a sliding window of 6 months

date_range = pl.date_range(
    data['date_posted'].min(),
    data['date_posted'].max(),
    interval='1w',
    eager=True,
).to_list()

fake_jobs = []

for d in date_range:
    fakes = data.filter(
        (pl.col('date_posted') >= d) & (pl.col('date_posted') <= d + pl.duration(days=180))
    )\
        .group_by(['title', 'org_link']) \
        .len() \
        .filter(pl.col('len') >= 3) \
        .select(['title', 'org_link'])
    fake_jobs.append(fakes)

fake_jobs = pl.concat(fake_jobs).unique(subset=['title', 'org_link']).\
    with_columns(
        is_fake=pl.lit(1),
    )
fake_jobs.head()

title,org_link,is_fake
str,str,i32
"""class a cdl driver""","""https://www.linkedin.com/compa…",1
"""lake ranger""","""https://www.linkedin.com/compa…",1
"""distinguished engineer risk ma…","""https://www.linkedin.com/compa…",1
"""business development represent…","""https://www.linkedin.com/compa…",1
"""travel licensed clinical socia…","""https://www.linkedin.com/compa…",1


In [8]:
# percentage of fake jobs from 2024-05-01
df = data\
    .filter((pl.col('date_posted') >= datetime(2024, 5, 1))) \
    .join(fake_jobs, on=['title', 'org_link'], how='left')\
    .group_by(['is_fake']) \
    .len().with_columns(
        is_fake = pl.col('is_fake').fill_null(0),
    )
perc = df.filter(pl.col('is_fake') == 1).select(pl.col('len')).to_numpy()[0][0] / df['len'].sum() * 100
print('Percentage of fake jobs:', perc)

Percentage of fake jobs: 42.08978997769129


In [9]:
# percentage of companies with fake jobs from 2024-05-01
df = data\
    .filter((pl.col('date_posted') >= datetime(2024, 5, 1))) \
    .join(fake_jobs, on=['title', 'org_link'], how='left')\
    .with_columns(
        is_fake=pl.col('is_fake').fill_null(0),
    ) \
    .group_by(['org_link']) \
    .agg([
        pl.col('is_fake').sum().alias('is_fake'),
    ])
perc = df.filter(pl.col('is_fake') > 0).shape[0] / df.shape[0] * 100
print('Percentage of companies with fake jobs:', perc)

Percentage of companies with fake jobs: 17.81409458899046


In [10]:
import numpy as np
# plot in histogram
df = data\
    .filter((pl.col('date_posted') >= datetime(2024, 5, 1))) \
    .join(fake_jobs, on=['title', 'org_link'], how='left')\
    .with_columns(
        is_fake=pl.col('is_fake').fill_null(0),
    )

(ggplot() + geom_histogram(
    data= {
        'epoch': df['date_posted'].dt.epoch(time_unit='s').to_list(),
        'fill': np.full(len(df), 'all'),
    },
    mapping = aes(x='epoch', fill='fill'),
    binwidth=86400 * 7, # interval of 1 week
) + geom_histogram(
    data={
        'epoch': df.filter(pl.col('is_fake') == 1)['date_posted'].dt.epoch(time_unit='s').to_list(),
        'fill': np.full(len(df.filter(pl.col('is_fake') == 1)), 'fake'),
    },
    mapping = aes(x='epoch', fill='fill'),
    binwidth=86400 * 7, # interval of 1 week
) + labs(
    title='Jobs posted weekly from 2024-01-01',
    x='Date',
    y='Number of jobs',
) + scale_x_continuous(
    breaks=pl.date_range(date(2024, 1, 1), date(2025, 6, 1), "1mo", eager=True).dt.epoch(time_unit='s').to_list(),
    labels=pl.date_range(date(2024, 1, 1), date(2025, 6, 1), "1mo", eager=True).dt.strftime('%Y-%m').to_list(),
    limits=[
        datetime(2024, 5, 1).timestamp(),
        datetime(2025, 6, 1).timestamp()
    ]
) + scale_fill_manual(
    name='Jobs',
    values={
        'all': '#165baa',
        'fake': '#a155b9',
    },
    labels=['Real', 'Fake'],
) + theme(
    legend_position='right',
    legend_box='vertical',
) + ggsize(800, 600))