In [893]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [894]:
# Cargamos todos los DataFrames con los que vamos a trabajar
df1 = pd.read_csv("./data/1_DataScience_salaries_2024.csv")
df2 = pd.read_csv("./data/2_ds_salaries.csv")
df3 = pd.read_csv("./data/3_jobs_in_data.csv")

# Configuramos pandas para que muestre todas las columnas
pd.set_option('display.max_columns', None)

### 1. Visualización de los 3 datasets

##### DF1

In [895]:
print(df1.info()) # Vemos si hay nulos y el tipo de los datos de cada columna
df1.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14838 entries, 0 to 14837
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           14838 non-null  int64 
 1   experience_level    14838 non-null  object
 2   employment_type     14838 non-null  object
 3   job_title           14838 non-null  object
 4   salary              14838 non-null  int64 
 5   salary_currency     14838 non-null  object
 6   salary_in_usd       14838 non-null  int64 
 7   employee_residence  14838 non-null  object
 8   remote_ratio        14838 non-null  int64 
 9   company_location    14838 non-null  object
 10  company_size        14838 non-null  object
dtypes: int64(4), object(7)
memory usage: 1.2+ MB
None


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2021,MI,FT,Data Scientist,30400000,CLP,40038,CL,100,CL,L
1,2021,MI,FT,BI Data Analyst,11000000,HUF,36259,HU,50,US,L
2,2020,MI,FT,Data Scientist,11000000,HUF,35735,HU,50,HU,L
3,2021,MI,FT,ML Engineer,8500000,JPY,77364,JP,50,JP,S
4,2022,SE,FT,Lead Machine Learning Engineer,7500000,INR,95386,IN,50,IN,L


##### DF2

In [896]:
print(df2.info()) # Vemos si hay nulos y el tipo de los datos de cada columna
df2.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB
None


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


##### DF3

In [897]:
print(df3.info()) # Vemos si hay nulos y el tipo de los datos de cada columna
df3.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9355 entries, 0 to 9354
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           9355 non-null   int64 
 1   job_title           9355 non-null   object
 2   job_category        9355 non-null   object
 3   salary_currency     9355 non-null   object
 4   salary              9355 non-null   int64 
 5   salary_in_usd       9355 non-null   int64 
 6   employee_residence  9355 non-null   object
 7   experience_level    9355 non-null   object
 8   employment_type     9355 non-null   object
 9   work_setting        9355 non-null   object
 10  company_location    9355 non-null   object
 11  company_size        9355 non-null   object
dtypes: int64(3), object(9)
memory usage: 877.2+ KB
None


Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
0,2023,Data DevOps Engineer,Data Engineering,EUR,88000,95012,Germany,Mid-level,Full-time,Hybrid,Germany,L
1,2023,Data Architect,Data Architecture and Modeling,USD,186000,186000,United States,Senior,Full-time,In-person,United States,M
2,2023,Data Architect,Data Architecture and Modeling,USD,81800,81800,United States,Senior,Full-time,In-person,United States,M
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,United States,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,United States,Senior,Full-time,In-person,United States,M


### 2. Adaptación de los DataFrames

Antes de unificar, hago unas modificaciones en los DataFrames para evitar tener NaNs cuando los junte.

Primero pienso en las columnas que me interesan y edito cada DataFrame para quedarme con esos datos e igualar el nombre de las columnas y el tipo de representación de cada una en los 3 datasets.

#### DF1

In [899]:
# Modifico las columnas que creo convenientes para una mejor comprensión

# Edición columna 'experience_level'
df1['experience_level'] = df1['experience_level'].str.strip() # Elimina cualquier espacio en blanco que esté al principio o al final

def cat_experience_level(level): # Función para editar la columna 'experience_level'
    if level == 'EN':
        return 'Junior'
    elif level == 'MI':
        return 'Intermediate'
    elif level == 'SE':
        return 'Senior'
    return 'Expert'

df1['experience_level'] = df1['experience_level'].apply(cat_experience_level) # Modificación de la columna 'experience_level'


# Edición columna 'employment_type'
df1['employment_type'] = df1['employment_type'].str.strip() # Elimina cualquier espacio en blanco que esté al principio o al final

def cat_employment_type(type): # Función para editar la columna 'employment_type'
    if type == 'FT':
        return 'Full-time'
    elif type == 'PT':
        return 'Part-time'
    elif type == 'CT':
        return 'Contract'
    return 'Freelance'

df1['employment_type'] = df1['employment_type'].apply(cat_employment_type) # Modificación de la columna 'employment_type'


# Edición columna 'remote_ratio'
def cat_remote_ratio(ratio): # Función para editar la columna 'remote_ratio'
    if ratio == 0:
        return 'In-person'
    elif ratio == 50:
        return 'Hybrid'
    return 'Remote'

df1['remote_ratio'] = df1['remote_ratio'].apply(cat_remote_ratio) # Modificación de la columna 'remote_ratio'
df1['work_setting'] = df1['remote_ratio'] # Creación de una columna equivalente a 'work_setting' con el nombre que queremos
df1.drop(columns = 'remote_ratio', inplace = True) # Eliminación de la antigua columna

In [900]:
df1.head(5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,company_location,company_size,work_setting
0,2021,Intermediate,Full-time,Data Scientist,30400000,CLP,40038,CL,CL,L,Remote
1,2021,Intermediate,Full-time,BI Data Analyst,11000000,HUF,36259,HU,US,L,Hybrid
2,2020,Intermediate,Full-time,Data Scientist,11000000,HUF,35735,HU,HU,L,Hybrid
3,2021,Intermediate,Full-time,ML Engineer,8500000,JPY,77364,JP,JP,S,Hybrid
4,2022,Senior,Full-time,Lead Machine Learning Engineer,7500000,INR,95386,IN,IN,L,Hybrid


#### DF2

In [902]:
# Edición columna 'experience_level'
df2['experience_level'] = df2['experience_level'].str.strip() # Elimina cualquier espacio en blanco que esté al principio o al final
df2['experience_level'] = df2['experience_level'].apply(cat_experience_level)

# Edición columna 'employment_type'
df2['employment_type'] = df2['employment_type'].str.strip() # Elimina cualquier espacio en blanco que esté al principio o al final
df2['employment_type'] = df2['employment_type'].apply(cat_employment_type)

# Modificación de la columna 'remote_ratio'
df2['remote_ratio'] = df2['remote_ratio'].apply(cat_remote_ratio)
df2['work_setting'] = df2['remote_ratio'] # Creación de una columna equivalente a 'work_setting' con el nombre que queremos
df2.drop(columns = 'remote_ratio', inplace = True) # Eliminación de la antigua columna

In [903]:
df2.head(5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,company_location,company_size,work_setting
0,2023,Senior,Full-time,Principal Data Scientist,80000,EUR,85847,ES,ES,L,Remote
1,2023,Intermediate,Contract,ML Engineer,30000,USD,30000,US,US,S,Remote
2,2023,Intermediate,Contract,ML Engineer,25500,USD,25500,US,US,S,Remote
3,2023,Senior,Full-time,Data Scientist,175000,USD,175000,CA,CA,M,Remote
4,2023,Senior,Full-time,Data Scientist,120000,USD,120000,CA,CA,M,Remote


#### DF3

In [905]:
df3.experience_level.value_counts()

experience_level
Senior         6709
Mid-level      1869
Entry-level     496
Executive       281
Name: count, dtype: int64

In [906]:
df3.employment_type.value_counts()

employment_type
Full-time    9310
Contract       19
Part-time      15
Freelance      11
Name: count, dtype: int64

In [907]:
# Modifico las columnas que creo convenientes para una mejor comprensión

# Edición columna 'experience_level'
df3['experience_level'] = df3['experience_level'].str.strip() # Elimina cualquier espacio en blanco que esté al principio o al final

def cat_experience_level(level): # Función para editar la columna 'experience_level'
    if level == 'Entry-level':
        return 'Junior'
    elif level == 'Mid-level':
        return 'Intermediate'
    elif level == 'Executive':
        return 'Expert'
    return 'Senior'

df3['experience_level'] = df3['experience_level'].apply(cat_experience_level) # Modificación de la columna 'experience_level'


# Edición columna 'employment_type'
df3['employment_type'] = df3['employment_type'].str.strip() # Elimina cualquier espacio en blanco que esté al principio o al final

In [908]:
df3.head(5)

Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
0,2023,Data DevOps Engineer,Data Engineering,EUR,88000,95012,Germany,Intermediate,Full-time,Hybrid,Germany,L
1,2023,Data Architect,Data Architecture and Modeling,USD,186000,186000,United States,Senior,Full-time,In-person,United States,M
2,2023,Data Architect,Data Architecture and Modeling,USD,81800,81800,United States,Senior,Full-time,In-person,United States,M
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,United States,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,United States,Senior,Full-time,In-person,United States,M


### 3. Unificación de los DataFrames

In [909]:
# Unir los datasets
df_combined = pd.concat([df1, df2, df3], ignore_index = True)
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27948 entries, 0 to 27947
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           27948 non-null  int64 
 1   experience_level    27948 non-null  object
 2   employment_type     27948 non-null  object
 3   job_title           27948 non-null  object
 4   salary              27948 non-null  int64 
 5   salary_currency     27948 non-null  object
 6   salary_in_usd       27948 non-null  int64 
 7   employee_residence  27948 non-null  object
 8   company_location    27948 non-null  object
 9   company_size        27948 non-null  object
 10  work_setting        27948 non-null  object
 11  job_category        9355 non-null   object
dtypes: int64(3), object(9)
memory usage: 2.6+ MB


##### - Descripción de lo que representa cada columna del dataset:

- *job_title*: Título del trabajo o puesto del empleado.   
Ejemplo: Data Scientist, Machine Learning Engineer...

- *job_category*: Categoría del trabajo.   
Ejemplo: Data Science and Research, Data Management and Strategy...

- *experience_level*: Nivel de experiencia del empleado.   
Se codifica de la siguiente manera:
    - EN: Entry-level / Junior.
    - MI: Mid-level / Intermediate.
    - SE: Senior-level / Senior.
    - EX: Executive-level / Director.

- *salary*: Salario bruto del empleado, sin conversiones de moneda.

- *salary_currency*: Moneda en la que se paga el salario.
Ejemplo: USD (dólar estadounidense), EUR (euro), GBP (libra esterlina)...

- *salary_in_usd*: Salario convertido a dólares estadounidenses (USD) para facilitar la comparación.

- *company_location*: País en el que se encuentra la sede de la empresa.
Ejemplo: US (Estados Unidos), ES (España)...

- *employee_residence*: País de residencia del empleado.
Ejemplo: US (Estados Unidos), ES (España)...

- *work_setting*: Proporción del trabajo que se realiza de forma remota.   
Se codifica de la siguiente manera:
    - 0: Sin trabajo remoto.
    - 50: Trabajo remoto parcial.
    - 100: Trabajo remoto completo.

- *employment_type*: Tipo de empleo, indicando si es a tiempo completo, a tiempo parcial, etc.   
Se codifica de la siguiente manera:
    - FT: Full-time (Tiempo completo).
    - PT: Part-time (Medio tiempo).
    - CT: Contract (Contrato).
    - FL: Freelance (Autónomo).

- *company_size*: Tamaño de la empresa, normalmente indicado por el número de empleados.   
Las categorías son las siguientes:
    - S: Small (Pequeña) - 1 a 50 empleados.
    - M: Medium (Mediana) - 51 a 250 empleados.
    - L: Large (Grande) - Más de 250 empleados.

- *work_year*: Año en el que se realizó el trabajo o se obtuvo el salario registrado.

In [910]:
# Ordeno las columnas según el orden que creo conveniente
new_order = ['job_title', 'job_category', 'experience_level', 'salary', 'salary_currency', 'salary_in_usd', 'company_location', 'employee_residence',
             'work_setting', 'employment_type', 'company_size', 'work_year']

# Reordeno las columnas
df_combined = df_combined[new_order]

df_combined

Unnamed: 0,job_title,job_category,experience_level,salary,salary_currency,salary_in_usd,company_location,employee_residence,work_setting,employment_type,company_size,work_year
0,Data Scientist,,Intermediate,30400000,CLP,40038,CL,CL,Remote,Full-time,L,2021
1,BI Data Analyst,,Intermediate,11000000,HUF,36259,US,HU,Hybrid,Full-time,L,2021
2,Data Scientist,,Intermediate,11000000,HUF,35735,HU,HU,Hybrid,Full-time,L,2020
3,ML Engineer,,Intermediate,8500000,JPY,77364,JP,JP,Hybrid,Full-time,S,2021
4,Lead Machine Learning Engineer,,Senior,7500000,INR,95386,IN,IN,Hybrid,Full-time,L,2022
...,...,...,...,...,...,...,...,...,...,...,...,...
27943,Data Specialist,Data Management and Strategy,Senior,165000,USD,165000,United States,United States,Remote,Full-time,L,2021
27944,Data Scientist,Data Science and Research,Senior,412000,USD,412000,United States,United States,Remote,Full-time,L,2020
27945,Principal Data Scientist,Data Science and Research,Intermediate,151000,USD,151000,United States,United States,Remote,Full-time,L,2021
27946,Data Scientist,Data Science and Research,Junior,105000,USD,105000,United States,United States,Remote,Full-time,S,2020


In [911]:
# Modificación de la columna 'company_location' y 'employee_residence'
set_company_location = set(df_combined.company_location.to_list())
set_employee_residence = set(df_combined.employee_residence.to_list())

# Unir los sets para obtener todos los elementos únicos
set_combined = set_company_location.union(set_employee_residence)

# Imprimir el conjunto combinado
print(set_combined)

{'BO', 'SE', 'FR', 'JE', 'US', 'Philippines', 'SI', 'Dominican Republic', 'Bolivia', 'Brazil', 'LB', 'Kuwait', 'United Kingdom', 'Russia', 'RO', 'BG', 'Japan', 'Malta', 'Switzerland', 'Andorra', 'Pakistan', 'AU', 'ES', 'HR', 'GH', 'Gibraltar', 'MT', 'EG', 'UG', 'SK', 'DZ', 'Ukraine', 'PE', 'Egypt', 'DK', 'Costa Rica', 'HN', 'Argentina', 'Bulgaria', 'Luxembourg', 'CR', 'United Arab Emirates', 'Algeria', 'RS', 'Belgium', 'Finland', 'India', 'PK', 'IQ', 'Germany', 'Malaysia', 'Chile', 'Ghana', 'Slovenia', 'CA', 'IR', 'Moldova', 'UA', 'GB', 'NZ', 'GI', 'DE', 'FI', 'Poland', 'AL', 'Portugal', 'Latvia', 'Ireland', 'RU', 'China', 'Kenya', 'MK', 'Nigeria', 'Uganda', 'Central African Republic', 'AT', 'Thailand', 'Qatar', 'Bosnia and Herzegovina', 'CO', 'Lithuania', 'VN', 'IT', 'UZ', 'Canada', 'TH', 'Czech Republic', 'Honduras', 'Peru', 'CN', 'MU', 'NL', 'KR', 'CL', 'Indonesia', 'OM', 'DO', 'JP', 'Greece', 'MY', 'HK', 'Armenia', 'PR', 'Hong Kong', 'PT', 'NG', 'Georgia', 'KE', 'CH', 'GR', 'New Ze

In [912]:
# Me quedo solo con los países que están reflejados con su diminutivo (dos caracteres)
diminutives = {country for country in set_combined if len(country) == 2}
diminutives

{'AD',
 'AE',
 'AL',
 'AM',
 'AR',
 'AS',
 'AT',
 'AU',
 'BA',
 'BE',
 'BG',
 'BO',
 'BR',
 'BS',
 'CA',
 'CF',
 'CH',
 'CL',
 'CN',
 'CO',
 'CR',
 'CY',
 'CZ',
 'DE',
 'DK',
 'DO',
 'DZ',
 'EC',
 'EE',
 'EG',
 'ES',
 'FI',
 'FR',
 'GB',
 'GE',
 'GH',
 'GI',
 'GR',
 'HK',
 'HN',
 'HR',
 'HU',
 'ID',
 'IE',
 'IL',
 'IN',
 'IQ',
 'IR',
 'IT',
 'JE',
 'JP',
 'KE',
 'KR',
 'KW',
 'LB',
 'LT',
 'LU',
 'LV',
 'MA',
 'MD',
 'MK',
 'MT',
 'MU',
 'MX',
 'MY',
 'NG',
 'NL',
 'NO',
 'NZ',
 'OM',
 'PE',
 'PH',
 'PK',
 'PL',
 'PR',
 'PT',
 'QA',
 'RO',
 'RS',
 'RU',
 'SA',
 'SE',
 'SG',
 'SI',
 'SK',
 'TH',
 'TN',
 'TR',
 'UA',
 'UG',
 'US',
 'UZ',
 'VN',
 'ZA'}

In [913]:
country_mapping = {
    'AD': 'Andorra',
    'AE': 'Emiratos Árabes Unidos',
    'AL': 'Albania',
    'AM': 'Armenia',
    'AR': 'Argentina',
    'AS': 'Samoa Americana',
    'AT': 'Austria',
    'AU': 'Australia',
    'BA': 'Bosnia y Herzegovina',
    'BE': 'Bélgica',
    'BG': 'Bulgaria',
    'BO': 'Bolivia',
    'BR': 'Brasil',
    'BS': 'Bahamas',
    'CA': 'Canadá',
    'CF': 'República Centroafricana',
    'CH': 'Suiza',
    'CL': 'Chile',
    'CN': 'China',
    'CO': 'Colombia',
    'CR': 'Costa Rica',
    'CY': 'Chipre',
    'CZ': 'República Checa',
    'DE': 'Alemania',
    'DK': 'Dinamarca',
    'DO': 'República Dominicana',
    'DZ': 'Argelia',
    'EC': 'Ecuador',
    'EE': 'Estonia',
    'EG': 'Egipto',
    'ES': 'España',
    'FI': 'Finlandia',
    'FR': 'Francia',
    'GB': 'Reino Unido',
    'GE': 'Georgia',
    'GH': 'Ghana',
    'GI': 'Gibraltar',
    'GR': 'Grecia',
    'HK': 'Hong Kong',
    'HN': 'Honduras',
    'HR': 'Croacia',
    'HU': 'Hungría',
    'ID': 'Indonesia',
    'IE': 'Irlanda',
    'IL': 'Israel',
    'IN': 'India',
    'IQ': 'Irak',
    'IR': 'Irán',
    'IT': 'Italia',
    'JE': 'Jersey',
    'JP': 'Japón',
    'KE': 'Kenia',
    'KR': 'Corea del Sur',
    'KW': 'Kuwait',
    'LB': 'Líbano',
    'LT': 'Lituania',
    'LU': 'Luxemburgo',
    'LV': 'Letonia',
    'MA': 'Marruecos',
    'MD': 'Moldavia',
    'MK': 'Macedonia del Norte',
    'MT': 'Malta',
    'MU': 'Mauricio',
    'MX': 'México',
    'MY': 'Malasia',
    'NG': 'Nigeria',
    'NL': 'Países Bajos',
    'NO': 'Noruega',
    'NZ': 'Nueva Zelanda',
    'OM': 'Omán',
    'PE': 'Perú',
    'PH': 'Filipinas',
    'PK': 'Pakistán',
    'PL': 'Polonia',
    'PR': 'Puerto Rico',
    'PT': 'Portugal',
    'QA': 'Catar',
    'RO': 'Rumanía',
    'RS': 'Serbia',
    'RU': 'Rusia',
    'SA': 'Arabia Saudita',
    'SE': 'Suecia',
    'SG': 'Singapur',
    'SI': 'Eslovenia',
    'SK': 'Eslovaquia',
    'TH': 'Tailandia',
    'TN': 'Túnez',
    'TR': 'Turquía',
    'UA': 'Ucrania',
    'UG': 'Uganda',
    'US': 'Estados Unidos',
    'UZ': 'Uzbekistán',
    'VN': 'Vietnam',
    'ZA': 'Sudáfrica'
}

df_combined['company_location'] = df_combined['company_location'].replace(country_mapping)
df_combined['employee_residence'] = df_combined['employee_residence'].replace(country_mapping)
df_combined.head(5)

Unnamed: 0,job_title,job_category,experience_level,salary,salary_currency,salary_in_usd,company_location,employee_residence,work_setting,employment_type,company_size,work_year
0,Data Scientist,,Intermediate,30400000,CLP,40038,Chile,Chile,Remote,Full-time,L,2021
1,BI Data Analyst,,Intermediate,11000000,HUF,36259,Estados Unidos,Hungría,Hybrid,Full-time,L,2021
2,Data Scientist,,Intermediate,11000000,HUF,35735,Hungría,Hungría,Hybrid,Full-time,L,2020
3,ML Engineer,,Intermediate,8500000,JPY,77364,Japón,Japón,Hybrid,Full-time,S,2021
4,Lead Machine Learning Engineer,,Senior,7500000,INR,95386,India,India,Hybrid,Full-time,L,2022


In [914]:
set(df_combined.job_title.to_list())

{'3D Computer Vision Researcher',
 'AI Architect',
 'AI Developer',
 'AI Engineer',
 'AI Product Manager',
 'AI Programmer',
 'AI Research Engineer',
 'AI Research Scientist',
 'AI Scientist',
 'AI Software Engineer',
 'AWS Data Architect',
 'Admin & Data Analyst',
 'Analytics Engineer',
 'Analytics Engineering Manager',
 'Applied Data Scientist',
 'Applied Machine Learning Engineer',
 'Applied Machine Learning Scientist',
 'Applied Research Scientist',
 'Applied Scientist',
 'Autonomous Vehicle Technician',
 'Azure Data Engineer',
 'BI Analyst',
 'BI Data Analyst',
 'BI Data Engineer',
 'BI Developer',
 'Big Data Architect',
 'Big Data Developer',
 'Big Data Engineer',
 'Business Data Analyst',
 'Business Intelligence',
 'Business Intelligence Analyst',
 'Business Intelligence Data Analyst',
 'Business Intelligence Developer',
 'Business Intelligence Engineer',
 'Business Intelligence Lead',
 'Business Intelligence Manager',
 'Business Intelligence Specialist',
 'CRM Data Analyst',
 '

In [915]:
df_combined.job_category.value_counts()

job_category
Data Science and Research         3014
Data Engineering                  2260
Data Analysis                     1457
Machine Learning and AI           1428
Leadership and Management          503
BI and Visualization               313
Data Architecture and Modeling     259
Data Management and Strategy        61
Data Quality and Operations         55
Cloud and Database                   5
Name: count, dtype: int64

In [916]:
# Eliminamos los NaNs de la columna 'job_category' y le damos el valor que le corresponde

# Importamos la librería necesaria para usar expresiones regulares
import re

def classify_job_title(title): # Clasifica 'job_category' según la columna 'job_title'
    
    title = title.lower() # Convierte el título a minúsculas para hacer la búsqueda insensible a mayúsculas/minúsculas
    
    # Categorías específicas basadas en patrones de títulos
    if re.search(r'\bdata scientist\b|\bdata science\b|\bresearch\b', title):
        return 'Data Science and Research'
    elif re.search(r'\bmachine learning\b|\bml\b|\bai\b|\bdeep learning\b', title):
        return 'Machine Learning and AI'
    elif re.search(r'\bbi\b|\bvisualization\b|\breporting\b|\bpower bi\b', title):
        return 'BI and Visualization'
    elif re.search(r'\bdata engineer\b|\bdata architect\b|\bdata platform\b|\bcloud\b', title):
        return 'Data Engineering'
    elif re.search(r'\bdata analysis\b|\bdata analyst\b|\banalyst\b|\bdata analytics\b', title):
        return 'Data Analysis'
    elif re.search(r'\bcloud\b|\bdatabase\b|\bdevops\b|\baws\b|\bazure\b', title):
        return 'Cloud and Database'
    elif re.search(r'\bleadership\b|\bmanager\b|\bmanagement\b|\bdirector\b', title):
        return 'Leadership and Management'
    elif re.search(r'\bdata quality\b|\boperations\b|\bdata governance\b', title):
        return 'Data Quality and Operations'
    elif re.search(r'\bdata architecture\b|\bmodeling\b|\bdata modeller\b', title):
        return 'Data Architecture and Modeling'
    elif re.search(r'\bstrategy\b|\bdata management\b|\bdata strategy\b', title):
        return 'Data Management and Strategy'
    else:
        return 'Unknown'  # Categoría por defecto si no coincide con ninguna

# Aplicamos la función para llenar los NaNs en job_category
df_combined['job_category'] = df_combined.apply(lambda row: classify_job_title(row['job_title']) if pd.isna(row['job_category']) else row['job_category'], axis = 1)

df_combined


Unnamed: 0,job_title,job_category,experience_level,salary,salary_currency,salary_in_usd,company_location,employee_residence,work_setting,employment_type,company_size,work_year
0,Data Scientist,Data Science and Research,Intermediate,30400000,CLP,40038,Chile,Chile,Remote,Full-time,L,2021
1,BI Data Analyst,BI and Visualization,Intermediate,11000000,HUF,36259,Estados Unidos,Hungría,Hybrid,Full-time,L,2021
2,Data Scientist,Data Science and Research,Intermediate,11000000,HUF,35735,Hungría,Hungría,Hybrid,Full-time,L,2020
3,ML Engineer,Machine Learning and AI,Intermediate,8500000,JPY,77364,Japón,Japón,Hybrid,Full-time,S,2021
4,Lead Machine Learning Engineer,Machine Learning and AI,Senior,7500000,INR,95386,India,India,Hybrid,Full-time,L,2022
...,...,...,...,...,...,...,...,...,...,...,...,...
27943,Data Specialist,Data Management and Strategy,Senior,165000,USD,165000,United States,United States,Remote,Full-time,L,2021
27944,Data Scientist,Data Science and Research,Senior,412000,USD,412000,United States,United States,Remote,Full-time,L,2020
27945,Principal Data Scientist,Data Science and Research,Intermediate,151000,USD,151000,United States,United States,Remote,Full-time,L,2021
27946,Data Scientist,Data Science and Research,Junior,105000,USD,105000,United States,United States,Remote,Full-time,S,2020


In [917]:
# Veo si hay filas duplicadas
duplicates = sum(df_combined.duplicated())
print(f'Hay {duplicates} filas duplicadas')

Hay 13400 filas duplicadas


In [918]:
# Elimino las filas duplicadas
df_combined.drop_duplicates(inplace = True)

In [919]:
# Verificamos que ya no hay nulos y cómo ha quedado el DataFrame después de limpiarlo
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14548 entries, 0 to 27947
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   job_title           14548 non-null  object
 1   job_category        14548 non-null  object
 2   experience_level    14548 non-null  object
 3   salary              14548 non-null  int64 
 4   salary_currency     14548 non-null  object
 5   salary_in_usd       14548 non-null  int64 
 6   company_location    14548 non-null  object
 7   employee_residence  14548 non-null  object
 8   work_setting        14548 non-null  object
 9   employment_type     14548 non-null  object
 10  company_size        14548 non-null  object
 11  work_year           14548 non-null  int64 
dtypes: int64(3), object(9)
memory usage: 1.4+ MB


In [920]:
df_combined

Unnamed: 0,job_title,job_category,experience_level,salary,salary_currency,salary_in_usd,company_location,employee_residence,work_setting,employment_type,company_size,work_year
0,Data Scientist,Data Science and Research,Intermediate,30400000,CLP,40038,Chile,Chile,Remote,Full-time,L,2021
1,BI Data Analyst,BI and Visualization,Intermediate,11000000,HUF,36259,Estados Unidos,Hungría,Hybrid,Full-time,L,2021
2,Data Scientist,Data Science and Research,Intermediate,11000000,HUF,35735,Hungría,Hungría,Hybrid,Full-time,L,2020
3,ML Engineer,Machine Learning and AI,Intermediate,8500000,JPY,77364,Japón,Japón,Hybrid,Full-time,S,2021
4,Lead Machine Learning Engineer,Machine Learning and AI,Senior,7500000,INR,95386,India,India,Hybrid,Full-time,L,2022
...,...,...,...,...,...,...,...,...,...,...,...,...
27943,Data Specialist,Data Management and Strategy,Senior,165000,USD,165000,United States,United States,Remote,Full-time,L,2021
27944,Data Scientist,Data Science and Research,Senior,412000,USD,412000,United States,United States,Remote,Full-time,L,2020
27945,Principal Data Scientist,Data Science and Research,Intermediate,151000,USD,151000,United States,United States,Remote,Full-time,L,2021
27946,Data Scientist,Data Science and Research,Junior,105000,USD,105000,United States,United States,Remote,Full-time,S,2020
