In [1]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Load the job-postings dataset and turn its 'train' split into a DataFrame,
# parsing 'job_posted_date' into datetimes as part of the same expression.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas().assign(
    job_posted_date=lambda frame: pd.to_datetime(frame['job_posted_date'])
)

In [None]:
# Keep only the postings whose country is the United States.
# .copy() yields an independent frame, so columns added later do not touch df.
df_usa = df.loc[df['job_country'].eq('United States')].copy()

df_usa

In [None]:
# Find which years appear in the 'job_posted_date' column.
# We only want to investigate a single year.
df_usa = df_usa.assign(
    # Coerce to datetime so the .dt accessor is available
    job_posted_date=lambda frame: pd.to_datetime(frame['job_posted_date']),
    # Calendar year of each posting, taken from the converted column above
    year=lambda frame: frame['job_posted_date'].dt.year,
)

# (earliest year, latest year)
year_range = (df_usa['year'].min(), df_usa['year'].max())

year_range

In [None]:
# Add the full month name ('%B') of each posting date as its own column.
df_usa = df_usa.assign(
    job_posted_month=lambda frame: frame['job_posted_date'].dt.strftime('%B')
)

df_usa

In [None]:
# Count postings per month name (rows) and short job title (columns).
# aggfunc='size' counts the rows in each month/title group; fill_value=0
# records a month/title pair with no postings as 0 instead of leaving NaN,
# which would otherwise leave gaps in the line plots drawn from this table.
df_usa_pivot = df_usa.pivot_table(
    index='job_posted_month',
    columns='job_title_short',
    aggfunc='size',
    fill_value=0,
)

df_usa_pivot

In [6]:
# Move the month names out of the index into a regular column, then derive a
# numeric month (1-12) by parsing each name with '%B'; used for sorting.
df_usa_pivot = df_usa_pivot.reset_index().assign(
    month_no=lambda table: pd.to_datetime(table['job_posted_month'], format='%B').dt.month
)



In [None]:
# Inspect the pivot table now that 'job_posted_month' is a column and 'month_no' has been added
df_usa_pivot

In [8]:
# Order the rows by month number, put the month name back as the index,
# and discard the helper column that was only needed for sorting.
df_usa_pivot = (
    df_usa_pivot
    .sort_values('month_no')
    .set_index('job_posted_month')
    .drop(columns='month_no')
)

In [None]:
# Inspect the pivot table after sorting by 'month_no', re-indexing by month name, and dropping 'month_no'
df_usa_pivot

In [None]:
# Line chart with one line per job-title column, months along the x-axis
df_usa_pivot.plot.line()

In [None]:
# Names of the three most frequent job titles among the US postings
# (value_counts() orders titles from most to least frequent).
top_3 = (
    df_usa['job_title_short']
    .value_counts()
    .head(3)
    .index
    .tolist()
)
top_3

In [None]:
# Plot monthly counts for the top three titles only and label the figure.
ax = df_usa_pivot[top_3].plot(kind='line')
ax.set_title('Monthly Job Postings for Top Data Jobs in the US')
ax.set_xlabel('2023')
ax.set_ylabel('Job Count')
# Slant the month labels and anchor them on their right edge
plt.xticks(rotation=45, ha='right')
# Rebuild the legend from the plotted lines
ax.legend()
plt.show()