In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
# Read the three salary CSV files, in order, into df1, df2 and df3.
df1, df2, df3 = map(pd.read_csv, (
    "../data/1_DataScience_salaries_2024.csv",
    "../data/2_ds_salaries.csv",
    "../data/3_jobs_in_data.csv",
))

# Lift the column cap so that wide frames are displayed with every column.
pd.set_option('display.max_columns', None)

## 1. Data cleansing

### 1.1. Overview of the 3 datasets

##### DF1

In [34]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appends a stray "None" line to the output.
df1.info()  # Non-null count and dtype of each column
df1.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14838 entries, 0 to 14837
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           14838 non-null  int64 
 1   experience_level    14838 non-null  object
 2   employment_type     14838 non-null  object
 3   job_title           14838 non-null  object
 4   salary              14838 non-null  int64 
 5   salary_currency     14838 non-null  object
 6   salary_in_usd       14838 non-null  int64 
 7   employee_residence  14838 non-null  object
 8   remote_ratio        14838 non-null  int64 
 9   company_location    14838 non-null  object
 10  company_size        14838 non-null  object
dtypes: int64(4), object(7)
memory usage: 1.2+ MB
None


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2021,MI,FT,Data Scientist,30400000,CLP,40038,CL,100,CL,L
1,2021,MI,FT,BI Data Analyst,11000000,HUF,36259,HU,50,US,L
2,2020,MI,FT,Data Scientist,11000000,HUF,35735,HU,50,HU,L
3,2021,MI,FT,ML Engineer,8500000,JPY,77364,JP,50,JP,S
4,2022,SE,FT,Lead Machine Learning Engineer,7500000,INR,95386,IN,50,IN,L


##### DF2

In [35]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appends a stray "None" line to the output.
df2.info()  # Non-null count and dtype of each column
df2.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB
None


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


##### DF3

In [36]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appends a stray "None" line to the output.
df3.info()  # Non-null count and dtype of each column
df3.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9355 entries, 0 to 9354
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           9355 non-null   int64 
 1   job_title           9355 non-null   object
 2   job_category        9355 non-null   object
 3   salary_currency     9355 non-null   object
 4   salary              9355 non-null   int64 
 5   salary_in_usd       9355 non-null   int64 
 6   employee_residence  9355 non-null   object
 7   experience_level    9355 non-null   object
 8   employment_type     9355 non-null   object
 9   work_setting        9355 non-null   object
 10  company_location    9355 non-null   object
 11  company_size        9355 non-null   object
dtypes: int64(3), object(9)
memory usage: 877.2+ KB
None


Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
0,2023,Data DevOps Engineer,Data Engineering,EUR,88000,95012,Germany,Mid-level,Full-time,Hybrid,Germany,L
1,2023,Data Architect,Data Architecture and Modeling,USD,186000,186000,United States,Senior,Full-time,In-person,United States,M
2,2023,Data Architect,Data Architecture and Modeling,USD,81800,81800,United States,Senior,Full-time,In-person,United States,M
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,United States,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,United States,Senior,Full-time,In-person,United States,M


### 1.2. Adapting the DataFrames

Before unifying, I make some modifications to the DataFrames to avoid having NaNs when I put them together.

First, I decide which columns I am interested in. Then I edit each DataFrame so that it keeps those columns and uses the same column names and the same value representation as the other two datasets.

#### DF1

In [37]:
# Rewrite selected columns into a form that is easier to understand.

# 'experience_level': trim leading and trailing whitespace from every value.
df1['experience_level'] = df1.experience_level.str.strip()

def cat_experience_level_1(level):
    """Translate an abbreviated 'experience_level' code into its label.

    Mapping: 'EN' -> 'Junior', 'MI' -> 'Intermediate', 'SE' -> 'Senior'.
    A value that is already one of those labels is returned unchanged, so
    applying the function twice (e.g. re-running the cell) does not turn
    every row into 'Expert'. Any other value yields 'Expert'.

    Parameters
    ----------
    level : object
        A single cell value of the 'experience_level' column.

    Returns
    -------
    object
        'Junior', 'Intermediate', 'Senior' or 'Expert'.
    """
    labels = {'EN': 'Junior', 'MI': 'Intermediate', 'SE': 'Senior'}

    # Already translated: pass it through so the mapping is idempotent.
    if level in labels.values():
        return level

    # NOTE(review): every unrecognised value (not only the expert code) still
    # falls back to 'Expert', exactly as before - confirm that is intended.
    return labels.get(level, 'Expert')

# Replace every experience code with the value cat_experience_level_1 returns.
df1['experience_level'] = df1.experience_level.apply(cat_experience_level_1)


# 'employment_type': trim leading and trailing whitespace from every value.
df1['employment_type'] = df1.employment_type.str.strip()

def cat_employment_type(type):
    """Translate an abbreviated 'employment_type' code into its label.

    Mapping: 'FT' -> 'Full-time', 'PT' -> 'Part-time', 'CT' -> 'Contract'.
    A value that is already one of those labels is returned unchanged, so
    applying the function twice (e.g. re-running the cell) does not turn
    every row into 'Freelance'. Any other value yields 'Freelance'.

    Parameters
    ----------
    type : object
        A single cell value of the 'employment_type' column. The name shadows
        the builtin ``type`` inside this function; it is kept so the signature
        stays unchanged.

    Returns
    -------
    object
        'Full-time', 'Part-time', 'Contract' or 'Freelance'.
    """
    labels = {'FT': 'Full-time', 'PT': 'Part-time', 'CT': 'Contract'}

    # Already translated: pass it through so the mapping is idempotent.
    if type in labels.values():
        return type

    # NOTE(review): every unrecognised value (not only the freelance code)
    # still falls back to 'Freelance', exactly as before - confirm intended.
    return labels.get(type, 'Freelance')

# Replace every employment code with the value cat_employment_type returns.
df1['employment_type'] = df1.employment_type.apply(cat_employment_type)


# Edit column 'remote_ratio'
def cat_remote_ratio(ratio):
    """Translate a 'remote_ratio' value into a work-setting label.

    Mapping: 0 -> 'In-person', 50 -> 'Hybrid'. A value that is already one of
    those labels is returned unchanged, so applying the function twice (e.g.
    re-running the cell) does not turn every row into 'Remote'. Any other
    value yields 'Remote'.

    Parameters
    ----------
    ratio : object
        A single cell value of the 'remote_ratio' column.

    Returns
    -------
    object
        'In-person', 'Hybrid' or 'Remote'.
    """
    labels = {0: 'In-person', 50: 'Hybrid'}

    # Already translated: pass it through so the mapping is idempotent.
    if ratio in labels.values():
        return ratio

    # NOTE(review): every other value (not only a fully remote ratio) still
    # falls back to 'Remote', exactly as before - confirm that is intended.
    return labels.get(ratio, 'Remote')

# Take 'remote_ratio' out of the frame, translate each value with
# cat_remote_ratio, and store the result as the new last column 'work_setting'.
df1['work_setting'] = df1.pop('remote_ratio').apply(cat_remote_ratio)

In [38]:
# Show the first five rows of df1 (head() defaults to n=5).
df1.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,company_location,company_size,work_setting
0,2021,Intermediate,Full-time,Data Scientist,30400000,CLP,40038,CL,CL,L,Remote
1,2021,Intermediate,Full-time,BI Data Analyst,11000000,HUF,36259,HU,US,L,Hybrid
2,2020,Intermediate,Full-time,Data Scientist,11000000,HUF,35735,HU,HU,L,Hybrid
3,2021,Intermediate,Full-time,ML Engineer,8500000,JPY,77364,JP,JP,S,Hybrid
4,2022,Senior,Full-time,Lead Machine Learning Engineer,7500000,INR,95386,IN,IN,L,Hybrid


#### DF2

In [39]:
# 'experience_level': trim surrounding whitespace, then translate each code.
df2['experience_level'] = df2['experience_level'].str.strip().apply(cat_experience_level_1)

# 'employment_type': trim surrounding whitespace, then translate each code.
df2['employment_type'] = df2['employment_type'].str.strip().apply(cat_employment_type)

# 'remote_ratio': take the column out of the frame, translate each value, and
# store the result as the new last column 'work_setting'.
df2['work_setting'] = df2.pop('remote_ratio').apply(cat_remote_ratio)

In [40]:
# Show the first five rows of df2 (head() defaults to n=5).
df2.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,company_location,company_size,work_setting
0,2023,Senior,Full-time,Principal Data Scientist,80000,EUR,85847,ES,ES,L,Remote
1,2023,Intermediate,Contract,ML Engineer,30000,USD,30000,US,US,S,Remote
2,2023,Intermediate,Contract,ML Engineer,25500,USD,25500,US,US,S,Remote
3,2023,Senior,Full-time,Data Scientist,175000,USD,175000,CA,CA,M,Remote
4,2023,Senior,Full-time,Data Scientist,120000,USD,120000,CA,CA,M,Remote


#### DF3

In [41]:
# Count how often each distinct 'experience_level' value occurs in df3.
df3['experience_level'].value_counts()

experience_level
Senior         6709
Mid-level      1869
Entry-level     496
Executive       281
Name: count, dtype: int64

In [42]:
# Count how often each distinct 'employment_type' value occurs in df3.
df3['employment_type'].value_counts()

employment_type
Full-time    9310
Contract       19
Part-time      15
Freelance      11
Name: count, dtype: int64

In [43]:
# 'experience_level': trim leading and trailing whitespace from every value.
df3['experience_level'] = df3.experience_level.str.strip()

def cat_experience_level_2(level):
    """Translate a descriptive 'experience_level' value into the unified label.

    Mapping: 'Entry-level' -> 'Junior', 'Mid-level' -> 'Intermediate',
    'Executive' -> 'Expert'. A value that is already one of those labels is
    returned unchanged, so applying the function twice (e.g. re-running the
    cell) does not turn every row into 'Senior'. Any other value yields
    'Senior'.

    Parameters
    ----------
    level : object
        A single cell value of the 'experience_level' column.

    Returns
    -------
    object
        'Junior', 'Intermediate', 'Expert' or 'Senior'.
    """
    labels = {
        'Entry-level': 'Junior',
        'Mid-level': 'Intermediate',
        'Executive': 'Expert',
    }

    # Already translated: pass it through so the mapping is idempotent.
    if level in labels.values():
        return level

    # NOTE(review): every unrecognised value (not only 'Senior') still falls
    # back to 'Senior', exactly as before - confirm that is intended.
    return labels.get(level, 'Senior')

# Replace every experience value with the one cat_experience_level_2 returns.
df3['experience_level'] = df3.experience_level.apply(cat_experience_level_2)


# 'employment_type': trim leading and trailing whitespace from every value.
df3['employment_type'] = df3.employment_type.str.strip()

In [44]:
# Show the first five rows of df3 (head() defaults to n=5).
df3.head()

Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
0,2023,Data DevOps Engineer,Data Engineering,EUR,88000,95012,Germany,Intermediate,Full-time,Hybrid,Germany,L
1,2023,Data Architect,Data Architecture and Modeling,USD,186000,186000,United States,Senior,Full-time,In-person,United States,M
2,2023,Data Architect,Data Architecture and Modeling,USD,81800,81800,United States,Senior,Full-time,In-person,United States,M
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,United States,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,United States,Senior,Full-time,In-person,United States,M
