In [1]:
# Importing the libraries
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt

# loading the data
df = load_dataset('lukebarousse/data_jobs')['train'].to_pandas()

# cleaning data
df['job_posted_date'] = pd.to_datetime(df.job_posted_date)

In [36]:
df[df.salary_year_avg.notna()]['salary_year_avg']

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [46]:
df[pd.notna(df['salary_year_avg'])]['salary_year_avg']

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

## Calculate The Projected Salary for Next Year based on Inflation

In [None]:
# I can just do that, as it's a simple problem. But I want to practise apply
df['salary_year_inflated'] = df['salary_year_avg'] * 1.03 # create a simple adjusted value column

`.apply()`: Apply functions to columns or rows.

this is for the column

In [55]:
df['salary_year_inflated'] = df['salary_year_avg'].apply(lambda salary: salary * 1.03)
df[df['salary_year_avg'].notna()][['salary_year_avg','salary_year_inflated']] # put the desired columns u wanna see inside [[]] because they are two

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [60]:
def inflation(salary):
    return salary*1.03
df['salary_inflated'] = df['salary_year_avg'].apply(inflation)
df[df['salary_year_avg'].notna()][['salary_year_avg','salary_inflated']]

Unnamed: 0,salary_year_avg,salary_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


### Data Cleaning Converting job_skills from a string to a list

In [67]:
[df['job_skills'][1]] # it's between '' so a string and it should be a list


["['python', 'databricks', 'azure', 'spark']"]

In [73]:
from ast import literal_eval    # ast -> abstract syntax tree not really important
ast.literal_eval(df.job_skills[1])
# I can't use literal_eval for the whole column I've tried, that's why u should use apply

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

For the whole column

In [81]:
def clean_list (skil):
    if pd.notna(skil): 
        return ast.literal_eval(skil) 
    else: pass
df['job_skills'] = df['job_skills'].apply(clean_list)
df['job_skills']

0                                                      None
1                [r, python, sql, nosql, power bi, tableau]
2         [python, sql, c#, azure, airflow, dax, docker,...
3         [python, c++, java, matlab, aws, tensorflow, k...
4         [bash, python, oracle, aws, ansible, puppet, j...
                                ...                        
785736    [bash, python, perl, linux, unix, kubernetes, ...
785737                               [sas, sas, sql, excel]
785738                                  [powerpoint, excel]
785739    [python, go, nosql, sql, mongo, shell, mysql, ...
785740                                          [aws, flow]
Name: job_skills, Length: 785741, dtype: object

Since we have nan values lets adjust our code to add in a condition to check if the value is not NaN.

- If it's not NaN it returns `True` and applies `ast.literal_eval()` function on it.
- if it's a Nan value then it returns `False` and the NaN value doesn't change.

In [92]:
# Convert string representation to actual list, checking for NaN values first

df['job_skills'] = df['job_skills'].apply(lambda skill_list: ast.literal_eval(skill_list) if pd.notna(skill_list) else skill_list)
                                          # to the variable skill        # what we wanna do to the variable

SyntaxError: incomplete input (861994453.py, line 3)

### Calculate projected salaries next year, but:

- For senior roles (e.g., Senior Data Analysts), assume the rate is 5%
- For all other roles, assume rate is 3%

this is for the row

In [116]:
def a_rise(row):
    if 'Senior' in row['job_title_short']:
        return row['salary_year_avg'] * 1.05 
    else: return  row['salary_year_avg'] * 1.03
df['salary_rise'] = df.apply(a_rise, axis= 1)   # notice that i didn't specify any column before apply that's because i am modyfing rows and that was done in the function def.
df[df['salary_year_avg'].notna()][['job_title_short','salary_year_avg','salary_rise']].tail(10)

Unnamed: 0,job_title_short,salary_year_avg,salary_rise
785340,Senior Data Scientist,196800.0,206640.0
785395,Data Engineer,64800.0,66744.0
785488,Data Scientist,115000.0,118450.0
785515,Data Analyst,105000.0,108150.0
785563,Data Scientist,136400.0,140492.0
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.0
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.0
785692,Data Scientist,157500.0,162225.0


In [None]:
def projected_salary(row):
    if 'Senior' in row['job_title_short']:
        return  1.05 * row['salary_year_avg']
    else:
        return  1.03 * row['salary_year_avg']

df['salary_year_inflated'] = df.apply(projected_salary, axis=1) # notice that i didn't specify any column before apply that's because i am modyfing rows and that was done in the function def

df[pd.notna(df['salary_year_avg'])][['job_title_short', 'salary_year_avg', 'salary_year_inflated']].head(20)

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.0
77,Data Engineer,140000.0,144200.0
92,Data Engineer,120000.0,123600.0
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.0
116,Data Scientist,114000.0,117420.0
146,Data Engineer,129500.0,133385.0
180,Data Analyst,90250.0,92957.5
212,Data Scientist,157500.0,162225.0
257,Data Scientist,103128.0,106221.84


In [None]:
df['salary_year_inflated'] = df.apply(lambda row: 1.05 * row['salary_year_avg'] if 'Senior' in row['job_title_short'] else 1.03 * row['salary_year_avg'], axis=1)

df[pd.notna(df['salary_year_avg'])][['job_title_short', 'salary_year_avg', 'salary_year_inflated']]