In [13]:
import pandas as pd

from datetime import datetime
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np

dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [None]:
df[pd.notna(df['salary_year_avg'])]['salary_year_avg']

In [None]:
df_salary = df[pd.notna(df['salary_year_avg'])].copy()

def projected_salary(salary):
    return salary * 1.03

df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply([projected_salary])
df_salary[['salary_year_avg', 'salary_year_inflated']]

In [None]:
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

df_salary[['salary_year_avg', 'salary_year_inflated']]

In [None]:
df['job_skills'][1]

In [None]:
import ast

In [None]:
def clean_list(skill_list):
    if pd.notna(skill_list):
        return ast.literal_eval(skill_list)

df['job_skills'] = df['job_skills'].apply(clean_list)

In [23]:
type(df['job_skills'][1])

list

In [27]:
df['job_skills'] = df['job_skills'].apply(lambda skill_list: ast.literal_eval(skill_list) if pd.notna(skill_list) else skill_list)

In [29]:
type(df['job_skills'][1])

list

In [35]:
df_salary = df[pd.notna(df['salary_year_avg'])].copy()

def projected_salary(row):
    if 'Senior' in row['job_title_short']:
        return row['salary_year_avg'] * 1.03
    else:
        return row['salary_year_avg'] * 1.05

df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis = 1)
df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,114975.00
77,Data Engineer,140000.0,147000.00
92,Data Engineer,120000.0,126000.00
100,Data Scientist,228222.0,239633.10
109,Data Analyst,89000.0,93450.00
...,...,...,...
785624,Data Engineer,139216.0,146176.80
785641,Data Engineer,150000.0,157500.00
785648,Data Scientist,221875.0,232968.75
785682,Data Scientist,157500.0,165375.00


## Practice Problems

Convert the job_posted_date column to a string format 'YYYY-MM-DD' and create a new column job_posted_date_str.

In [None]:
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
df['str_job_posted'] = df['job_posted_date'].apply(lambda posted_date: posted_date.strftime('%Y-%m-%d'))

df[['job_posted_date', 'str_job_posted']]

Unnamed: 0,job_posted_date,str_job_posted
0,2023-06-16 13:44:15,2023-06-16
1,2023-01-14 13:18:07,2023-01-14
2,2023-10-10 13:14:55,2023-10-10
3,2023-07-04 13:01:41,2023-07-04
4,2023-08-07 14:29:36,2023-08-07
...,...,...
785736,2023-03-13 06:16:16,2023-03-13
785737,2023-03-12 06:18:18,2023-03-12
785738,2023-03-12 06:32:36,2023-03-12
785739,2023-03-12 06:32:15,2023-03-12


Calculate the number of days since each job was posted. Create a new column days_since_posted that contains this value. Use the job_posted_date column.

Note: You need to import the datetime library and use the datetime module to get the current date using .now().

In [None]:
from datetime import datetime

current_date = datetime.now()
df['days_passed'] = df['job_posted_date'].apply(lambda date: (current_date - date).days)
df[['job_posted_date', 'days_passed']]

Unnamed: 0,job_posted_date,days_passed
0,2023-06-16 13:44:15,514
1,2023-01-14 13:18:07,667
2,2023-10-10 13:14:55,398
3,2023-07-04 13:01:41,496
4,2023-08-07 14:29:36,462
...,...,...
785736,2023-03-13 06:16:16,609
785737,2023-03-12 06:18:18,610
785738,2023-03-12 06:32:36,610
785739,2023-03-12 06:32:15,610


1. Create a copy of the DataFrame called df_filtered and drop the NaN values for salary_year_avg.
2. Then, create a new column salary_category that categorizes the salary_year_avg into three categories: 'Low' for salaries less than 60,000, 'Medium' for salaries between 60,000 and 100,000, and 'High' for salaries greater than 100,000.
3. Then show the df_filtered DataFrame and the salary_year_avg and salary_category columns.

In [44]:
df_filtered = df.dropna(subset=['salary_year_avg']).copy()

df_filtered['salary_category'] = df['salary_year_avg'].apply(lambda salary: 'Low' if salary < 60000 else 'Medium' if salary <= 100000 else 'High')
df_filtered[['salary_year_avg', 'salary_category']]

Unnamed: 0,salary_year_avg,salary_category
28,109500.0,High
77,140000.0,High
92,120000.0,High
100,228222.0,High
109,89000.0,Medium
...,...,...
785624,139216.0,High
785641,150000.0,High
785648,221875.0,High
785682,157500.0,High


## Extra Practices

In [52]:
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
df['job_posted_str'] = df['job_posted_date'].apply(lambda date: date.strftime('%Y-%m-%d'))
df[['job_posted_str', 'job_posted_date']]

Unnamed: 0,job_posted_str,job_posted_date
0,2023-06-16,2023-06-16 13:44:15
1,2023-01-14,2023-01-14 13:18:07
2,2023-10-10,2023-10-10 13:14:55
3,2023-07-04,2023-07-04 13:01:41
4,2023-08-07,2023-08-07 14:29:36
...,...,...
785736,2023-03-13,2023-03-13 06:16:16
785737,2023-03-12,2023-03-12 06:18:18
785738,2023-03-12,2023-03-12 06:32:36
785739,2023-03-12,2023-03-12 06:32:15


In [53]:
from datetime import datetime

In [61]:
current_date = datetime.now()
df['days_passed'] = df['job_posted_date'].apply(lambda date: (current_date - date).days)
df[['job_posted_date', 'days_passed']]

Unnamed: 0,job_posted_date,days_passed
0,2023-06-16 13:44:15,514
1,2023-01-14 13:18:07,667
2,2023-10-10 13:14:55,398
3,2023-07-04 13:01:41,496
4,2023-08-07 14:29:36,462
...,...,...
785736,2023-03-13 06:16:16,609
785737,2023-03-12 06:18:18,610
785738,2023-03-12 06:32:36,610
785739,2023-03-12 06:32:15,610


In [67]:
filter_df = df.dropna(subset='salary_year_avg').copy()
filter_df['salary_category'] = df['salary_year_avg'].apply(lambda salary:'Low' if salary < 60000 else 'Medium' if salary <= 100000 else 'High')
filter_df[['salary_year_avg', 'salary_category']]

Unnamed: 0,salary_year_avg,salary_category
28,109500.0,High
77,140000.0,High
92,120000.0,High
100,228222.0,High
109,89000.0,Medium
...,...,...
785624,139216.0,High
785641,150000.0,High
785648,221875.0,High
785682,157500.0,High


Extra Practice 2

In [11]:
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
df['job_posted_converted'] = df['job_posted_date'].apply(lambda date: date.strftime('%Y-%m-%b'))
df[['job_posted_date', 'job_posted_converted']].head()

Unnamed: 0,job_posted_date,job_posted_converted
0,2023-06-16 13:44:15,2023-06-Jun
1,2023-01-14 13:18:07,2023-01-Jan
2,2023-10-10 13:14:55,2023-10-Oct
3,2023-07-04 13:01:41,2023-07-Jul
4,2023-08-07 14:29:36,2023-08-Aug


In [18]:
current_time = datetime.now()
df['days_passed'] = df['job_posted_date'].apply(lambda date: (current_time - date).days)
df[['job_posted_date', 'days_passed']].head()

Unnamed: 0,job_posted_date,days_passed
0,2023-06-16 13:44:15,514
1,2023-01-14 13:18:07,667
2,2023-10-10 13:14:55,398
3,2023-07-04 13:01:41,496
4,2023-08-07 14:29:36,462


In [23]:
df_filtered = df.dropna(subset='salary_year_avg').copy()
df_filtered['salary_category'] = df['salary_year_avg'].apply(lambda salary: 'Low' if salary < 60000 else 'Medium' if salary <=100000 else 'High')
df_filtered[['salary_year_avg', 'salary_category']]

Unnamed: 0,salary_year_avg,salary_category
28,109500.0,High
77,140000.0,High
92,120000.0,High
100,228222.0,High
109,89000.0,Medium
...,...,...
785624,139216.0,High
785641,150000.0,High
785648,221875.0,High
785682,157500.0,High
