In [1]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "lukebarousse/data_jobs" dataset and convert its "train" split to a DataFrame
dataset = load_dataset("lukebarousse/data_jobs")
train_split = dataset["train"]
df = train_split.to_pandas()

In [3]:
# Convert the `job_posted_date` column to pandas datetime values
df["job_posted_date"] = df["job_posted_date"].pipe(pd.to_datetime)

`SettingWithCopyWarning` is still raised when using `df_notna = df.loc[df["salary_year_avg"].notna()]` because `pandas` cannot tell whether `df_notna` is a view or a copy of the original DataFrame: the `loc[]` indexer can return either one, depending on the specific indexing operation.

**View**:

- When selecting a **single** row/column using `loc[]`, `pandas` returns a view of the original DataFrame. <br> For example: `df.loc[0, 'column_name']` or `df.loc[:, 'column_name']`
- When selecting a **range** of rows/columns using `loc[]`, `pandas` returns a view of the original DataFrame. <br> For example: `df.loc[0:10, 'column_name']` or `df.loc[:, 'column_name1':'column_name2']`

**Copy**:

- When selecting a subset of **rows & columns** using `loc[]`, `pandas` returns a copy of the original DataFrame. <br> For example: `df.loc[0:10, ['column_name1', 'column_name2']]`
- When using a conditional statement to select rows using `loc[]`, `pandas` **likely** returns a copy of the original DataFrame. <br> For example: `df.loc[df['column_name'] > 0, 'column_name']`

**Ambiguous cases**:

- When using a **combination** of indexing operations, such as `df.loc[df['column_name'] > 0, 'column_name1':'column_name2']`, `pandas` may return **either** a view or a copy of the original DataFrame, depending on the specific indexing operation or DataFrame's internal structure

The decision by `pandas` to return a view or copy is complex and depends on factors like:

- Data types of all columns
- Whether the original DataFrame is a view itself
- The specific indexing operations used
- Internal pandas representation (block manager, etc.)

**Sources / further reading:** <br>
<https://stackoverflow.com/questions/48173980/pandas-knowing-when-an-operation-affects-the-original-dataframe> <br>
<https://stackoverflow.com/questions/23296282/what-rules-does-pandas-use-to-generate-a-view-vs-a-copy> <br>
<https://stackoverflow.com/questions/57490268/pandas-returning-a-view-versus-a-copy-warning-when-constructing-a-new-datafra> <br>
<https://www.dataquest.io/blog/settingwithcopywarning/>


Using `apply()` with a row-wise condition:

If using `axis=0` in `apply()`: `project_salary()` receives a `Series` representing a single **column**, not a row containing both `job_title_short` and `salary_year_avg`. The index labels of that `Series` are the row index of the `DataFrame` (in this case a range starting from 0, with a step of 1 and a length equal to the number of rows in the original `DataFrame`), so looking up `row["job_title_short"]` raises `KeyError: 'job_title_short'`.

If using `axis=1`: `apply()` iterates through each row of the `DataFrame`. For each row, it creates a `Series` object where the **column names become the index labels**, and the values in the row become the values of the `Series`. This `Series` is then passed as the `row` argument to `project_salary()`.


In [40]:
# Keep only the rows that have a yearly salary. `reset_index()` is not in-place by
# default, so it returns a new DataFrame instead of modifying `df`.
has_salary = df["salary_year_avg"].notna()
df_notna = df.loc[has_salary].reset_index(drop=True)


def project_salary(row):
    """Return the row's average yearly salary after a projected increase.

    Rows whose ``job_title_short`` contains "Senior" are multiplied by 1.05;
    every other row is multiplied by 1.03.

    Args:
        row: Mapping-like object (e.g. a DataFrame row passed by
            ``apply(axis=1)``) with ``job_title_short`` and
            ``salary_year_avg`` entries.

    Returns:
        ``row["salary_year_avg"]`` scaled by the applicable multiplier.
    """
    multiplier = 1.05 if "Senior" in row["job_title_short"] else 1.03
    return row["salary_year_avg"] * multiplier


# Apply the projection row by row (axis=1), passing only the two columns it reads
salary_inputs = df_notna[["job_title_short", "salary_year_avg"]]
df_notna["salary_year_inflated"] = salary_inputs.apply(project_salary, axis=1)
df_notna[["job_title_short", "salary_year_avg", "salary_year_inflated"]].tail(10)

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
21993,Senior Data Scientist,196800.0,206640.0
21994,Data Engineer,64800.0,66744.0
21995,Data Scientist,115000.0,118450.0
21996,Data Analyst,105000.0,108150.0
21997,Data Scientist,136400.0,140492.0
21998,Data Engineer,139216.0,143392.48
21999,Data Engineer,150000.0,154500.0
22000,Data Scientist,221875.0,228531.25
22001,Data Scientist,157500.0,162225.0
22002,Data Scientist,157500.0,162225.0


In [None]:
# Same projection written inline: a lambda passed to apply() replaces the named function
df_notna["salary_year_projected"] = df_notna.apply(
    lambda row: row["salary_year_avg"]
    * (1.05 if "Senior" in row["job_title_short"] else 1.03),
    axis=1,
)
df_notna[["job_title_short", "salary_year_avg", "salary_year_projected"]].tail(10)

Unnamed: 0,job_title_short,salary_year_avg,salary_year_projected
21993,Senior Data Scientist,196800.0,206640.0
21994,Data Engineer,64800.0,66744.0
21995,Data Scientist,115000.0,118450.0
21996,Data Analyst,105000.0,108150.0
21997,Data Scientist,136400.0,140492.0
21998,Data Engineer,139216.0,143392.48
21999,Data Engineer,150000.0,154500.0
22000,Data Scientist,221875.0,228531.25
22001,Data Scientist,157500.0,162225.0
22002,Data Scientist,157500.0,162225.0


`ValueError` from `ast.literal_eval()`: this is likely caused by the data type of the `job_skills` column changing after the first execution of the code. From the 2nd execution onwards, the values in `job_skills` are already lists.

ValueError when using `pd.notna()`: `pd.notna()` is designed to work with scalar values or array-like objects. Applying it to a list returns an element-wise boolean array, and using that array as an `if` condition is ambiguous, which leads to the ValueError.

ValueError when using `job_skill is not None`: `ast.literal_eval()` is designed to work on strings, not on lists. This mismatch causes `ValueError: malformed node or string` because it tries to parse a list object as a string.
--> Solution: use a separate function that checks the data type before applying `ast.literal_eval()`, and then call that function from inside `apply()`.

`pdb`, the Python debugger, can be used to step through the code and see where the error is occurring.


In [5]:
import ast  # for converting a list-like object inside a string container to a list


def convert_to_list(job_skill):
    """Parse a stringified Python literal (e.g. ``"['sql', 'python']"``).

    The function is safe to run repeatedly on the same column: values that are
    not strings (already-converted lists, ``None``, NaN, ...) pass through
    untouched, so re-executing the cell does not raise.

    Args:
        job_skill: A single cell value from the ``job_skills`` column.

    Returns:
        The object produced by ``ast.literal_eval`` when ``job_skill`` is a
        string containing a valid Python literal; otherwise ``job_skill``
        itself, unchanged.
    """
    # Only strings can hold a literal to parse; everything else is returned as-is.
    if not isinstance(job_skill, str):
        return job_skill
    try:
        return ast.literal_eval(job_skill)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises ValueError for non-literal content, SyntaxError for
        # malformed text (e.g. an unclosed bracket) and TypeError for things like
        # unhashable dict keys. All are treated as "not parseable" and the raw
        # string is kept. Use separate `except` blocks instead if the different
        # failure kinds need to be inspected individually.
        return job_skill


# Convert every `job_skills` cell with convert_to_list and store the result back
raw_job_skills = df["job_skills"]
df["job_skills"] = raw_job_skills.apply(convert_to_list)

In [6]:
# NOTE: this cell is a bare string literal, so none of the code inside it is executed;
# it is kept only as a reference variant of convert_to_list.
# The inline hint `type(job_skill) == "str"` inside the string compares a type object
# with a string and is therefore always False; the working spelling would be
# `type(job_skill) is str` (or the `isinstance` check that the snippet already uses).
""" Alternative less robust approach:
def convert_to_list(job_skill):
    if isinstance(job_skill, str):  # or `if type(job_skill) == "str"`
        try:
            return ast.literal_eval(
                job_skill
            )  # might lose some granularity in understanding potential issues with data by combining `ValueError` with `TypeError` in `except` block
        except (ValueError, TypeError):
            return job_skill
    else:
        return job_skill


df["job_skills"] = df["job_skills"].apply(convert_to_list)
"""

' Alternative less robust approach:\ndef convert_to_list(job_skill):\n    if isinstance(job_skill, str):  # or `if type(job_skill) == "str"`\n        try:\n            return ast.literal_eval(\n                job_skill\n            )  # might lose some granularity in understanding potential issues with data by combining `ValueError` with `TypeError` in `except` block\n        except (ValueError, TypeError):\n            return job_skill\n    else:\n        return job_skill\n\n\ndf["job_skills"] = df["job_skills"].apply(convert_to_list)\n'

In [7]:
# NOTE: this cell is a bare string literal, so the snippet inside it is never executed;
# it is kept only for comparison with convert_to_list.
# The snippet has no try/except, so any exception raised by `ast.literal_eval` would
# propagate out of `apply()`. Once the column already holds lists, `pd.notna(job_skill)`
# is evaluated on a list rather than a scalar, which is the ValueError scenario
# described in the notes above.
""" The least robust approach as it doesn't handle other invalid Python literal
df["job_skills"] = df["job_skills"].apply(
    lambda job_skill: ast.literal_eval(job_skill) if pd.notna(job_skill) else job_skill
)
"""

' The least robust approach as it doesn\'t handle other invalid Python literal\ndf["job_skills"] = df["job_skills"].apply(\n    lambda job_skill: ast.literal_eval(job_skill) if pd.notna(job_skill) else job_skill\n)\n'