#### Imports

In [1708]:
import pandas as pd
import numpy as np
import re

#### Read linkedin csv

In [1709]:
linkedin = pd.read_csv('../data/linkedin.csv')
linkedin.sample(3)

Unnamed: 0,job_id,job_title,company_name,company_state,salary_range,remote_ratio,employment_type,experience_level,company_size
386,3770133317,Data Engineer,The Fountain Group,"· Orange, CA",20 $/h - 35 $/h,Presencial,Prácticas,Prácticas,De 501 a 1.000 empleados · Dotación y selecció...
46,3728891680,Deep Learning Engineer - Robustness Against Ad...,Expedition Technology Inc,"· Herndon, VA",120.000 $/año - 220.000 $/año,Presencial,Jornada completa,Sin experiencia,De 51 a 200 empleados · Fabricación de product...
177,3761481929,Software Company DevOps Engineer,QDStaff,"· Boca Ratón, FL",60.000 $/año - 100.000 $/año,Presencial,Jornada completa,Algo de responsabilidad,Entre 1 y 10 empleados · Dotación y selección ...


#### Cleaning 'company_state' column

In [1710]:
states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'NC', 'SC', 'CO', 'CT', 'ND', 'SD', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA',
          'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NJ', 'NY', 'NH', 'NM', 'OH', 'OK', 'OR', 'PA', 'RI', 'TN', 'TX', 'UT',
          'UT', 'VT', 'VA', 'WV', 'WA', 'WI', 'WY']
linkedin['company_state'] = linkedin['company_state'].apply(lambda row: 'Estados Unidos' if row == '· Estados Unidos' else row)
linkedin['company_state'].value_counts()

company_state
Estados Unidos               82
· Nueva York, NY             31
· Seattle, WA                19
· San Francisco, CA          19
· Dallas, TX                 14
                             ..
· Somerville, MA              1
· Lake Buena Vista, FL        1
· Inver Grove Heights, MN     1
· Portland, OR                1
· Parsippany, NJ              1
Name: count, Length: 239, dtype: int64

In [1711]:
def fclean_company_state(df, states):
    for i in states:
        linkedin['company_state'] = linkedin['company_state'].apply(lambda row: i if i in str(row) else row)
    return df

linkedin = fclean_company_state(linkedin, states)

In [1712]:
def sclean_company_state(df, states):
    for key, value in states.items():
        linkedin['company_state'] = linkedin['company_state'].apply(lambda row: value if key in str(row) else row)
    return df

left_states = {
    'Virginia': 'VA',
    'Minnesota': 'MN',
    'Washington': 'WA',
    'San Francisco': 'CA',
    'Nebraska': 'NE',
    'California': 'CA',
    'Texas': 'TX',
    'Nueva York': 'NY',
    'Michigan': 'MI',
    'Luisiana': 'LA',
    'Florida': 'FL',
    'Maryland': 'MD',
    'Nevada': 'NV',
    'Oregón': 'OR',
    'Oklahoma': 'OK',
    'Georgia': 'GA',
    'Carolina del Sur': 'SC',
    'Delaware': 'DE',
    'Ohio': 'OH',
    'Arkansas': 'AR',
    'Iowa': 'IA',
    'Carolina del Norte': 'NC',
    'Indiana': 'IN',
    'Nashville': 'TN',
    'Massachusetts': 'MA',
    'Tennessee': 'TN',
    'Enid': 'OK',
    'Chicago': 'IL',
    'América del Norte': 'Estados Unidos',
    'Des Moines': 'IA',
    'Pittsburgh': 'PA',
    'Cincinnati': 'OH',
    'St. Louis': 'MO'
}
linkedin = sclean_company_state(linkedin, left_states)
linkedin['company_state'].unique()

array(['NC', 'VA', 'Estados Unidos', 'CA', 'MI', 'WA', 'NE', 'TX', 'MN',
       'MA', 'NY', 'IL', 'OH', 'GA', 'FL', 'WI', 'PA', 'OR', 'NJ', 'MD',
       'AZ', 'MO', 'KS', 'CT', 'UT', '· Unión Europea', 'IN', 'TN', 'KY',
       'LA', 'NV', 'OK', 'CO', 'IA', 'VT', '· Illinois, Estados Unidos',
       'ID', 'SC', 'HI', 'DE', '· Jacksonville y alrededores', 'SD', 'AR',
       'RI', 'NM', 'NH'], dtype=object)

#### Cleaning 'remote_ratio' column

In [1713]:
def clean_remote_ratio(df, dict_):
    for key, val in dict_.items():
        df['remote_ratio'] = df['remote_ratio'].apply(lambda row: key if key in str(row) else str(row))
    df['remote_ratio'] = df['remote_ratio'].apply(lambda row: np.nan if str(row) == 'nan' else str(row))
    return df

remote_dict = {
    'Presencial': 0,
    'Híbrido': 50,
    'En remoto': 100
}
linkedin = clean_remote_ratio(linkedin, remote_dict)
linkedin['remote_ratio'].value_counts()

remote_ratio
En remoto     257
Híbrido       201
Presencial    172
Name: count, dtype: int64

In [1714]:
linkedin.loc[linkedin['remote_ratio'] == 'nan', 'remote_ratio'] = np.nan
linkedin['remote_ratio'].value_counts()

remote_ratio
En remoto     257
Híbrido       201
Presencial    172
Name: count, dtype: int64

#### Cleaning 'experience_level' column

In [1715]:
linkedin.sample(5)
linkedin['experience_level'].value_counts()

experience_level
Intermedio                 276
Sin experiencia            166
Prácticas                  104
Algo de responsabilidad     80
Director                    18
Ejecutivo                    2
Name: count, dtype: int64

#### Cleaning 'employment_type' column

In [1716]:
def clean_employment_type(df, list_):
    for i in list_:
        df['employment_type'] = df['employment_type'].apply(lambda row: i if i in str(row) else str(row))
    return df

remote_dict = [
    'Jornada completa',
    'Contrato por obra',
    'Media jornada',
    'Prácticas'
]
linkedin = clean_employment_type(linkedin, remote_dict)
linkedin['employment_type'].value_counts()

employment_type
Jornada completa     404
Contrato por obra    111
Prácticas             82
Media jornada         54
Name: count, dtype: int64

#### Cleaning 'company_size' column

In [1717]:
linkedin['company_size'].value_counts()

company_size
De 51 a 200 empleados · Servicios y consultoría de TI                       42
De 51 a 200 empleados · Dotación y selección de personal                    39
De 1.001 a 5.000 empleados · Servicios y consultoría de TI                  27
De 11 a 50 empleados · Dotación y selección de personal                     25
Más de 10.001 empleados · Hospitales y atención sanitaria                   24
                                                                            ..
De 1.001 a 5.000 empleados · Servicios profesionales                         1
De 1.001 a 5.000 empleados · Servicios de diseño                             1
Ve una comparación con los otros 661 solicitantes. Probar Premium gratis     1
Ve una comparación con los otros 306 solicitantes. Probar Premium gratis     1
De 201 a 500 empleados · Tecnología, información e internet                  1
Name: count, Length: 159, dtype: int64

In [1718]:
linkedin['employees'] = linkedin['company_size'].apply(lambda row: row.split('·')[0].strip())
linkedin['employees'].value_counts()

employees
De 51 a 200 empleados                                                       137
Más de 10.001 empleados                                                     132
De 1.001 a 5.000 empleados                                                  112
De 11 a 50 empleados                                                         69
De 201 a 500 empleados                                                       58
De 501 a 1.000 empleados                                                     56
De 5.001 a 10.000 empleados                                                  42
Entre 1 y 10 empleados                                                       37
Nagesh Kumar Gona ✈ busca personal para este empleo                           1
Ve una comparación con los otros 137 solicitantes. Probar Premium gratis      1
Ve una comparación con los otros 306 solicitantes. Probar Premium gratis      1
Ve una comparación con los otros 661 solicitantes. Probar Premium gratis      1
Ve una comparación con los otr

In [1719]:
pattern1 = r"Más de (\d+\.?\d*) empleados"
pattern2 = r"De (\d+\.?\d*) a (\d+\.?\d*) empleados"
pattern3 = r"Entre (\d+\.?\d*) y (\d+\.?\d*) empleados"
def clean_employee(row):
    match1 = re.match(pattern1, str(row))
    match2 = re.match(pattern2, str(row))
    match3 = re.match(pattern3, str(row))
    if match1:
        return match1.group(1)
    elif match2:
        return match2.group(2)
    elif match3:
        return match3.group(2)
    else:
        return np.nan

linkedin['employees'] = linkedin['employees'].apply(clean_employee)
linkedin['employees'].value_counts()

employees
200       137
10.001    132
5.000     112
50         69
500        58
1.000      56
10.000     42
10         37
Name: count, dtype: int64

In [1720]:
def standarize_company_size(row):
    if pd.notnull(row):
        if int(row.replace('.', '')) < 50:
            return 'S'
        elif int(row.replace('.', '')) <= 250:
            return 'M'
        elif int(row.replace('.', '')) > 250:
            return 'L'
        else:
            return np.nan

linkedin['company_size'] = linkedin['employees'].apply(standarize_company_size)
linkedin['company_size'].value_counts()

company_size
L    400
M    206
S     37
Name: count, dtype: int64

#### Cleaning 'salary_range' column

In [1721]:
linkedin['salary_range'].unique()

array(['100.000 $/año - 150.000 $/año', '283.780 $/año - 331.640 $/año',
       '145.000 $/año - 180.000 $/año', '95.000 $/año - 130.000 $/año',
       '167.200 $/año - 250.800 $/año', '200.000 $/año - 240.000 $/año',
       '70.000 $/yr', nan, '200.000 $/año - 300.000 $/año',
       '120.000 $/año - 220.000 $/año', '144.000 $/año - 270.250 $/año',
       '170.112 $/año - 237.000 $/año', '170.000 $/año - 190.000 $/año',
       '20 $/hr', '145.000 $/año - 195.000 $/año',
       '67.733 $/año - 91.440 $/año', '95.000 $/año - 120.000 $/año',
       '78 $/h - 104 $/h', '122.000 $/año - 190.000 $/año',
       '101.000 $/año - 179.000 $/año', '48 $/h - 65 $/h',
       '125.000 $/año - 160.000 $/año', '850.000 $/año - 1.100.000 $/año',
       '35 $/h - 55 $/h', '75 $/h - 85 $/h', '45 $/h - 55 $/h',
       '150.000 $/año - 250.000 $/año', '40 $/h - 43 $/h',
       '150.000 $/año - 300.000 $/año', '90 $/hr',
       '260.000 $/año - 300.000 $/año', '150.000 $/año - 200.000 $/año',
       '90.000

In [1722]:
hour_pattern = r"(\d+) \$\/hr"
year_pattern = r"(\d+\.?\d*) \$\/yr"
hour_range_pattern = r"(\d+\,?\d*) \$\/h - (\d+\,?\d*) \$\/h"
year_range_pattern = r"(\d+\.?\d*) \$\/año - (\d+\.?\d*) \$\/año"
month_range_pattern = r"(\d+\.?\d*) \$\/mes - (\d+\.?\d*) \$\/mes"


def clean_salary(row):
    match_h = re.match(hour_pattern, str(row))
    match_y = re.match(year_pattern, str(row))
    match_rh = re.match(hour_range_pattern, str(row))
    match_ry = re.match(year_range_pattern, str(row))
    match_rm = re.match(month_range_pattern, str(row))
    if match_h:
        num_h = match_h.group(1)
        num_w = int(num_h) * 40
        return int(num_w * 48)
    elif match_y:
        return int(match_y.group(1).replace('.', ''))
    elif match_rh:
        min_ = match_rh.group(1)
        max_ = match_rh.group(2)
        num_h = float(min_.replace(',', '.')) + float(max_.replace(',', '.')) / 2
        num_w = num_h * 40
        return int(num_w * 48)
    elif match_ry:
        min_ = match_ry.group(1)
        max_ = match_ry.group(2)
        if len(min_) <= 3:
            min_ += '000'
        if len(max_) <= 3:
            max_ += '000'
        num_y = float(min_.replace('.', '')) + float(max_.replace('.', '')) / 2
        return int(num_y)
    elif match_rm:
        min_ = match_rm.group(1)
        max_ = match_rm.group(2)
        num_m = float(min_.replace('.', '')) + float(max_.replace('.', '')) / 2
        return int(num_m * 12)
    else:
        return np.nan

linkedin['salary_range'] = linkedin['salary_range'].apply(clean_salary)
linkedin['salary_range'].value_counts()

salary_range
230000.0    23
207750.0    21
250000.0    19
279125.0    19
225000.0    18
            ..
277500.0     1
224640.0     1
208000.0     1
235200.0     1
177600.0     1
Name: count, Length: 127, dtype: int64

In [1723]:
linkedin.sample(5)

Unnamed: 0,job_id,job_title,company_name,company_state,salary_range,remote_ratio,employment_type,experience_level,company_size,employees
514,3773895778,Data Analyst I,Centene Corporation,IA,207750.0,En remoto,Jornada completa,Sin experiencia,L,10.001
109,3755427585,Machine Learning Engineer,hackajob,CA,205000.0,En remoto,Jornada completa,Intermedio,M,200.0
260,3765447371,Predictive Modeler/Data Scientist 212176,Medix™,IL,107500.0,En remoto,Contrato por obra,Intermedio,L,1.0
196,3761926222,Data Analyst,Hexaware Technologies,VA,265000.0,Híbrido,Jornada completa,Intermedio,L,10.001
558,3775534052,Online Data Analyst,TELUS International,TN,,En remoto,Media jornada,Sin experiencia,L,10.001


#### Cleaning 'job_title' column

In [1724]:
linkedin['job_title'].value_counts()

job_title
Machine Learning Engineer                       60
Data Analyst                                    52
Data Engineer                                   48
Data Scientist                                  39
Data Analyst  I                                 19
                                                ..
Business Data Scientist, Early Career (2024)     1
Data Scientist - Investment Management           1
Python Data Engineer                             1
AI Research Engineer                             1
Sr. Data Scientist                               1
Name: count, Length: 356, dtype: int64

In [1725]:
def standarize_job(row):
    title = str(row).lower()
    if 'data engineer' in title:
        return 'Data Engineer'
    elif 'data analyst' in title:
        return 'Data Analyst'
    elif 'data scientist' in title:
        return 'Data Scientist'
    elif 'data' in title and 'junior' not in title:
        if 'engineer' in title:
            return 'Data Engineer'
        elif 'analyst' in title:
            return 'Data Analyst'
    elif ('machine' in title and 'learning' in title) or ('ml' in title and 'engineer' in title):
        return 'Machine Learning Engineer'
    elif 'deep' in title and 'learning' in title and 'engineer' in title:
        return 'Deep Learning Engineer'
    elif 'junior' in title:
        if 'data' in title and 'engineer' in title:
            return 'Data Engineer'
        elif 'data' in title and 'analyst' in title:
            return 'Data Analyst'
    elif 'business' in title:
        if 'intelligence' in title and 'analyst' in title:
            return 'BI Analyst'
        elif 'analyst' in title:
            return 'Business Analyst'
        else:
            return 'BI Engineer'
    elif 'artificial intelligence' in title or 'ai' in title:
        if 'research engineer' in title:
            return 'Research Engineer'
        elif 'ml' in title:
            return 'Machine Learning Engineer'
        elif 'engineer' in title:
            return 'AI Engineer'
        else:
            return title
    else:
        return str(row)


linkedin['job_title'] = linkedin['job_title'].apply(standarize_job)
linkedin['job_title'].unique()

array(['Machine Learning Engineer', 'CYBER INTEL ANALYST - PEN TESTER',
       'Deep Learning Engineer',
       'Senior Algorithms Engineer (Image Processing)', 'Data Engineer',
       'Data Scientist',
       'Computer Vision Engineer (Multimodal with Large Language Models)',
       'BI Analyst', 'Data Analyst', 'Artificial Learning Engineer',
       'Research Engineer', 'Computer Vision Engineer',
       'RF Computational Engineer 3', 'Project Scheduler',
       'Adaptive Optics Senior Software Engineer',
       'Finance Analyst (Benelux)', 'Deep Learning Research Intern',
       'Core Infrastructure Engineer - Networking', 'Analytics Engineer',
       'Applied Scientist', None, 'Quantitative Researcher',
       'Computer Vision and Optimization Engineer', 'AI Engineer',
       'Software Company DevOps Engineer',
       'summer intern - r&d/artificial intelligence',
       'Remote Cruise Planner - Entry Level',
       'Remote Travel Consultant - Entry Level',
       'Remote Customer 

    Categories for 'job_title' column

In [1726]:
categories = [
    'Data Engineer',
    'Data Analyst',
    'Data Scientist',
    'Machine Learning Engineer',
    'Deep Learning Engineer',
    'BI Analyst',
    'Business Analyst',
    'BI Engineer',
    'Research Engineer',
    'AI Engineer'
]

In [1727]:
linkedin.shape

(651, 10)

In [1728]:
linkedin = linkedin[linkedin['job_title'].isin(categories)]
linkedin = linkedin.drop(columns=['employees'])

In [1729]:
linkedin.shape

(562, 9)

In [1730]:
linkedin[linkedin['job_id'] == 3756135992]

Unnamed: 0,job_id,job_title,company_name,company_state,salary_range,remote_ratio,employment_type,experience_level,company_size
116,3756135992,Data Scientist,Adobe,CA,120000.0,Híbrido,Prácticas,Prácticas,L


In [1731]:
linkedin.to_csv('../data/linkedin_standarized.csv', index=False)