In [1]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Loading Data
# Fetch the 'lukebarousse/data_jobs' dataset and convert its 'train' split
# into a pandas DataFrame named `df`.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
# Parse the 'job_posted_date' column into pandas datetime values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [3]:
# Keep only the rows whose 'salary_year_avg' is not NaN and return just that
# column. The first argument to .loc is the boolean row mask (in square
# brackets, not parentheses); the second is the column to return.
df.loc[df['salary_year_avg'].notna(), 'salary_year_avg']

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

## Calculate Projected Salary Next Year
This is done by applying the inflation/recession rate of the current year. 
In this case, we are going to apply the inflation rate of 3%

In [4]:
# Print the built-in documentation for DataFrame.apply.
help(df.apply)

Help on method apply in module pandas.core.frame:

apply(func: 'AggFuncType', axis: 'Axis' = 0, raw: 'bool' = False, result_type: "Literal['expand', 'reduce', 'broadcast'] | None" = None, args=(), by_row: "Literal[False, 'compat']" = 'compat', engine: "Literal['python', 'numba']" = 'python', engine_kwargs: 'dict[str, bool] | None' = None, **kwargs) method of pandas.core.frame.DataFrame instance
    Apply a function along an axis of the DataFrame.
    
    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument.
    
    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:
 

In [7]:
# We first keep only the rows where salary_year_avg is not NaN (as a copy), then create a function and apply it to every value in that salary_year_avg column to get the projected salary (with the 3% inflation rate).
df_salary = df[pd.notna(df['salary_year_avg'])].copy()

def projected_salary(salary, inflation_factor=1.03):
    """Return the projected salary after applying an inflation multiplier.

    Parameters
    ----------
    salary : float
        The current yearly salary.
    inflation_factor : float, optional
        Multiplier applied to ``salary``. The default of 1.03 corresponds to
        a 3% increase.

    Returns
    -------
    float
        ``salary * inflation_factor``. A NaN salary stays NaN.
    """
    return salary * inflation_factor

df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(projected_salary)

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [None]:
# here, we are writing the same code as above but with an anonymous/lambda function
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)
# in the above lambda function, I am naming the variable as 'salary'
df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [11]:
# You can also rewrite it as a vectorized operation, without apply, as follows:
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'] * 1.03

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [17]:
# Loading Data
# Same load as in the first cell; running it again rebinds `df` to a freshly
# loaded DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
# Parse the 'job_posted_date' column into pandas datetime values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [44]:
type(df['job_skills'])

pandas.core.series.Series

In [None]:
import ast #the ast module stands for Abstract Syntax Trees and it's a part of the Python Standard Library

ast.literal_eval(df['job_skills'][1])
# This raises an error because the values in my job_skills column are already lists, and ast.literal_eval can't convert a list into a list.
# ast.literal_eval is meant to parse a str (e.g. "['r', 'python']") into the Python object it represents, such as a list.

ValueError: malformed node or string: ['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [51]:
type(df['job_skills'][1])

list

In [None]:
# You cannot run the ast.literal_eval function on an entire column, so you create a function for it instead and apply it to each value, as shown below:
def clean_list(skill_list):
    """Parse a string representation of a skill list into a Python list.

    Missing values (None / NaN) are not passed to ast.literal_eval, which
    cannot handle them; for those the function returns None instead.
    """
    # Guard clause: bail out early on missing values to avoid a ValueError
    # from ast.literal_eval.
    if not pd.notna(skill_list):
        return None
    return ast.literal_eval(skill_list)

df['job_skills'] = df['job_skills'].apply(clean_list)

# According to the video this should have worked, but it didn't, because my skill_list values are already lists. pd.notna() applies itself to each item inside the list and returns an array of True/False values — and Python doesn't know how to evaluate an array as a single if condition.

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

The cells below are the solution suggested by ChatGPT for the error above:

In [54]:
import ast

def clean_list(skill_list):
    """Return ``skill_list`` as a Python list, parsing it from text if needed.

    Parameters
    ----------
    skill_list : list, str, or any other value
        A list is returned unchanged. A string is parsed with
        ``ast.literal_eval``. Anything else (e.g. None or NaN) is treated as
        "no skills".

    Returns
    -------
    list
        The original list, the list parsed from the string, or an empty list
        when the value is missing, cannot be parsed, or does not parse to a
        list.
    """
    if isinstance(skill_list, list):
        return skill_list

    if isinstance(skill_list, str):
        try:
            parsed = ast.literal_eval(skill_list)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Catch only the errors literal_eval raises on malformed input; a
            # bare `except:` would also swallow KeyboardInterrupt/SystemExit.
            return []
        # A string such as "5" parses fine but is not a list; keep the
        # column consistently list-typed.
        return parsed if isinstance(parsed, list) else []

    return []


In [55]:
df['job_skills'] = df['job_skills'].apply(clean_list)

In [56]:
print(type(df['job_skills'][0]))  # Should be <class 'list'>
print(df['job_skills'].head())   # Should show actual lists like ['python', 'sql']

<class 'list'>
0                                                   []
1           [r, python, sql, nosql, power bi, tableau]
2    [python, sql, c#, azure, airflow, dax, docker,...
3    [python, c++, java, matlab, aws, tensorflow, k...
4    [bash, python, oracle, aws, ansible, puppet, j...
Name: job_skills, dtype: object


Going back to everything done in the video:

In [None]:
df['job_skills'] = df['job_skills'].apply(lambda skill_list: ast.literal_eval(skill_list) if pd.notna(skill_list) else skill_list)
# This is just a way to do the same thing using a lambda. It gives me an error because my skill_list values were already lists, so pd.notna() returns an array instead of a single True/False — the same kind of error I got before using the ChatGPT fix.

ValueError: The truth value of an empty array is ambiguous. Use `array.size > 0` to check that an array is not empty.

## Calculate Projected Salary Next Year
Now we are going to use the apply method on a row (earlier we used it on a column - job_skills).
- We assume the following yearly increases:
  - Senior roles: 5%
  - All other roles: 3%

In [None]:
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

df_salary[['salary_year_avg', 'salary_year_inflated']]

In [None]:
def projected_salary(row, senior_factor=1.05, default_factor=1.03):
    """Return the projected salary for one row, based on its job title.

    NOTE: this reuses the name of the per-value ``projected_salary`` function
    defined earlier in the notebook, so running this cell replaces it.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide 'job_title_short' and 'salary_year_avg'.
    senior_factor : float, optional
        Multiplier used when the title contains 'Senior' (default 1.05, a 5%
        increase).
    default_factor : float, optional
        Multiplier used for every other title (default 1.03, a 3% increase).

    Returns
    -------
    float
        The salary multiplied by the factor that matches the title.
    """
    title = row['job_title_short']
    # A missing title (None / NaN) is not a string, and `'Senior' in <float>`
    # would raise TypeError, so such rows fall back to the default factor.
    is_senior = isinstance(title, str) and 'Senior' in title
    # This could also be written inline as a lambda passed to apply.
    factor = senior_factor if is_senior else default_factor
    return factor * row['salary_year_avg']

df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis=1)

df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00
