In [2]:
import pandas as pd
import altair as alt

# Download the "jobs-in-data" dataset archive with the Kaggle CLI.
# When a more recently modified local copy already exists the CLI skips the
# download (see the cell output); pass --force to download it again.
!kaggle datasets download -d hummaamqaasim/jobs-in-data

jobs-in-data.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!unzip jobs-in-data.zip

In [3]:
# Read the extracted CSV into a DataFrame and show how many rows it contains.
df = pd.read_csv("jobs_in_data.csv")
len(df)

9355

In [4]:
# Keep only the rows whose work_year is 2023, then draw a reproducible
# (fixed random_state) 60% random sample of them for plotting.
is_2023 = df["work_year"] == 2023
data = df.loc[is_2023].sample(frac=0.6, random_state=9)
data.shape

(4472, 12)

In [5]:
# Preview the first five sampled rows.
data.head(n=5)

Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
1508,2023,Data Scientist,Data Science and Research,USD,130000,130000,United States,Senior,Full-time,Remote,United States,M
817,2023,Data Scientist,Data Science and Research,USD,160000,160000,United States,Senior,Full-time,In-person,United States,M
5914,2023,Data Analyst,Data Analysis,USD,120000,120000,United States,Senior,Full-time,Remote,United States,M
5002,2023,Data Analyst,Data Analysis,USD,80000,80000,United States,Senior,Full-time,Remote,United States,M
2845,2023,Data Scientist,Data Science and Research,USD,130500,130500,United States,Mid-level,Full-time,In-person,United States,M


In [43]:
# Dynamic-query filtering: clicking a bar in the overview selects a
# job_category, and the detail view is filtered down to that category.
selection = alt.selection_point(fields=["job_category"])

# Both panels are rendered at the same size, so define it once.
panel_size = dict(width=600, height=800)

# Shared starting point (data + title) for the two views.
base = alt.Chart(data, title="Salaries for Data Related Jobs")

# Overview: median salary per job category.
# Categories are ordered by their number of rows (op='count'), not by the
# median salary that the bar length encodes.
bars = (
    base.mark_bar(size=70)
    .encode(
        x=alt.X("median(salary_in_usd)", title='Median of Salary (USD)'),
        y=alt.Y(field='job_category', type='nominal', title='Job Category',
                sort=alt.EncodingSortField(field='salary_in_usd', op='count', order='descending')),
        # Selected bars are highlighted; the others are greyed out.
        color=alt.condition(selection, alt.value('seagreen'), alt.value('lightgray')),
    )
    .add_params(selection)
    .properties(**panel_size)
    .interactive()
)

# Detail: one point per row, grouped by job title, restricted to the
# category picked in the overview.
detail = (
    base.mark_circle(size=25)
    .encode(
        y=alt.Y(field='job_title', type='nominal', title='Job Title',
                sort=alt.EncodingSortField(field='salary_in_usd', op='count', order='descending')),
        # Spread points vertically inside each job-title band so that equal
        # salaries do not sit exactly on top of each other.
        yOffset="jitter:Q",
        x=alt.X("salary_in_usd", title='Salary (USD)'),
        tooltip=["job_title", "salary_in_usd", "employee_residence"],
        # Colour by title; the y axis already names each title, so no legend.
        color=alt.Color('job_title', scale=alt.Scale(scheme='category20')).legend(None),
    )
    .transform_calculate(
        # Generate Gaussian jitter with a Box-Muller transform
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
    .transform_filter(selection)
    .properties(**panel_size)
    .interactive()
)

# Place the overview and the detail view side by side and export the result
# as a standalone HTML page.
viz = alt.hconcat(bars, detail)
viz.save('index.html')