#### Imports

In [2282]:
import pandas as pd
import numpy as np
import re

#### Read linkedin csv

In [2283]:
linkedin = pd.read_csv('../data/linkedin.csv')
linkedin.sample(3)

Unnamed: 0,job_id,job_title,company_name,company_state,salary_range,remote_ratio,employment_type,experience_level,company_size
571,3775377578,Senior Machine Learning Engineer,Jobot,"· Nueva York, NY",125.000 $/año - 175.000 $/año,Presencial,Jornada completa,Intermedio,De 501 a 1.000 empleados · Dotación y selecció...
583,3775617153,Data Labeling Analyst II,Facebook,"· Nueva York, NY",20 $/h - 24 $/h,En remoto,Contrato por obra,Sin experiencia,Más de 10.001 empleados · Desarrollo de software
168,3759719867,Machine Learning Engineer,Akkodis,"· Dearborn, MI",60 $/h - 68 $/h,Presencial,Contrato por obra,Intermedio,Más de 10.001 empleados · Servicios y consulto...


#### Cleaning 'company_state' column

In [2284]:
states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'NC', 'SC', 'CO', 'CT', 'ND', 'SD', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA',
          'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NJ', 'NY', 'NH', 'NM', 'OH', 'OK', 'OR', 'PA', 'RI', 'TN', 'TX', 'UT',
          'UT', 'VT', 'VA', 'WV', 'WA', 'WI', 'WY']
linkedin['company_state'].value_counts()

company_state
Estados Unidos                 87
· Nueva York, NY               32
· San Francisco, CA            20
· Seattle, WA                  19
· Dallas, TX                   14
                               ..
· Plano, TX                     1
· Lexington, KY                 1
Hedge Fund · Nueva York, NY     1
· Montpelier, VT                1
· Parsippany, NJ                1
Name: count, Length: 243, dtype: int64

In [2285]:
def fclean_company_state(df, states):
    for i in states:
        linkedin['company_state'] = linkedin['company_state'].apply(lambda row: i if i in str(row) else row)
    return df

linkedin = fclean_company_state(linkedin, states)

In [2286]:
def sclean_company_state(df, states):
    for key, value in states.items():
        linkedin['company_state'] = linkedin['company_state'].apply(lambda row: value if key in str(row) else row)
    return df

left_states = {
    'Virginia': 'VA',
    'Minnesota': 'MN',
    'Washington': 'WA',
    'San Francisco': 'CA',
    'Nebraska': 'NE',
    'California': 'CA',
    'Texas': 'TX',
    'Nueva York': 'NY',
    'Michigan': 'MI',
    'Luisiana': 'LA',
    'Florida': 'FL',
    'Maryland': 'MD',
    'Nevada': 'NV',
    'Oregón': 'OR',
    'Oklahoma': 'OK',
    'Georgia': 'GA',
    'Carolina del Sur': 'SC',
    'Delaware': 'DE',
    'Ohio': 'OH',
    'Arkansas': 'AR',
    'Iowa': 'IA',
    'Carolina del Norte': 'NC',
    'Indiana': 'IN',
    'Nashville': 'TN',
    'Massachusetts': 'MA',
    'Tennessee': 'TN',
    'Enid': 'OK',
    'Chicago': 'IL',
    'América del Norte': 'USA',
    'Estados Unidos': 'USA',
    'Des Moines': 'IA',
    'Pittsburgh': 'PA',
    'Cincinnati': 'OH',
    'St. Louis': 'MO',
    'Illinois': 'IL',
    'Jacksonville': 'FL'
}
linkedin = sclean_company_state(linkedin, left_states)
linkedin = linkedin[~linkedin['company_state'].str.contains('Unión Europea', na=False)]
linkedin['company_state'].unique()

array(['NC', 'VA', 'USA', 'CA', 'MI', 'WA', 'NE', 'TX', 'MN', 'MA', 'NY',
       'IL', 'OH', 'GA', 'FL', 'WI', 'PA', 'OR', 'NJ', 'MD', 'AZ', 'MO',
       'KS', 'CT', 'UT', 'IN', 'TN', 'KY', 'CO', 'LA', 'NV', 'OK', 'IA',
       'VT', 'ID', 'SC', 'HI', 'DE', 'SD', 'AR', 'RI', 'NM', 'NH'],
      dtype=object)

#### Cleaning 'remote_ratio' column

In [2287]:
def clean_remote_ratio(df, list_):
    for i in list_:
        df['remote_ratio'] = df['remote_ratio'].apply(lambda row: i if i in str(row) else str(row))
    df['remote_ratio'] = df['remote_ratio'].apply(lambda row: np.nan if str(row) == 'nan' else str(row))
    return df

remote_dict = [
    'Presencial',
    'Híbrido',
    'En remoto'
]
linkedin = clean_remote_ratio(linkedin, remote_dict)
linkedin['remote_ratio'].value_counts()

remote_ratio
En remoto     256
Híbrido       216
Presencial    175
Name: count, dtype: int64

In [2288]:
linkedin.loc[linkedin['remote_ratio'] == 'nan', 'remote_ratio'] = np.nan
linkedin['remote_ratio'].value_counts()

remote_ratio
En remoto     256
Híbrido       216
Presencial    175
Name: count, dtype: int64

#### Cleaning 'experience_level' column

In [2289]:
linkedin.sample(5)
linkedin['experience_level'].value_counts()

experience_level
Intermedio                 291
Sin experiencia            174
Prácticas                  102
Algo de responsabilidad     75
Director                    19
Ejecutivo                    2
Name: count, dtype: int64

#### Cleaning 'employment_type' column

In [2290]:
def clean_employment_type(df, list_):
    for i in list_:
        df['employment_type'] = df['employment_type'].apply(lambda row: i if i in str(row) else str(row))
    return df

remote_dict = [
    'Jornada completa',
    'Contrato por obra',
    'Media jornada',
    'Prácticas'
]
linkedin = clean_employment_type(linkedin, remote_dict)
linkedin['employment_type'].value_counts()

employment_type
Jornada completa     420
Contrato por obra    113
Prácticas             80
Media jornada         55
Name: count, dtype: int64

#### Cleaning 'company_size' column

In [2291]:
linkedin['company_size'].value_counts()

company_size
De 51 a 200 empleados · Servicios y consultoría de TI                       42
De 51 a 200 empleados · Dotación y selección de personal                    41
De 1.001 a 5.000 empleados · Servicios y consultoría de TI                  30
De 11 a 50 empleados · Dotación y selección de personal                     27
Más de 10.001 empleados · Hospitales y atención sanitaria                   24
                                                                            ..
De 1.001 a 5.000 empleados · Servicios de diseño                             1
Ve una comparación con los otros 661 solicitantes. Probar Premium gratis     1
Ve una comparación con los otros 308 solicitantes. Probar Premium gratis     1
De 1.001 a 5.000 empleados · Fabricación de alimentos y bebidas              1
De 201 a 500 empleados · Tecnología, información e internet                  1
Name: count, Length: 160, dtype: int64

In [2292]:
linkedin['employees'] = linkedin['company_size'].apply(lambda row: row.split('·')[0].strip())
linkedin['employees'].value_counts()

employees
De 51 a 200 empleados                                                       139
Más de 10.001 empleados                                                     134
De 1.001 a 5.000 empleados                                                  116
De 11 a 50 empleados                                                         74
De 201 a 500 empleados                                                       61
De 501 a 1.000 empleados                                                     57
De 5.001 a 10.000 empleados                                                  42
Entre 1 y 10 empleados                                                       37
Nagesh Kumar Gona ✈ busca personal para este empleo                           1
Ve una comparación con los otros 137 solicitantes. Probar Premium gratis      1
Ve una comparación con los otros 308 solicitantes. Probar Premium gratis      1
Ve una comparación con los otros 661 solicitantes. Probar Premium gratis      1
Ve una comparación con los otr

In [2293]:
pattern1 = r"Más de (\d+\.?\d*) empleados"
pattern2 = r"De (\d+\.?\d*) a (\d+\.?\d*) empleados"
pattern3 = r"Entre (\d+\.?\d*) y (\d+\.?\d*) empleados"
def clean_employee(row):
    match1 = re.match(pattern1, str(row))
    match2 = re.match(pattern2, str(row))
    match3 = re.match(pattern3, str(row))
    if match1:
        return match1.group(1)
    elif match2:
        return match2.group(2)
    elif match3:
        return match3.group(2)
    else:
        return np.nan

linkedin['employees'] = linkedin['employees'].apply(clean_employee)
linkedin['employees'].value_counts()

employees
200       139
10.001    134
5.000     116
50         74
500        61
1.000      57
10.000     42
10         37
Name: count, dtype: int64

In [2294]:
def standarize_company_size(row):
    if pd.notnull(row):
        if int(row.replace('.', '')) < 50:
            return 'S'
        elif int(row.replace('.', '')) <= 250:
            return 'M'
        elif int(row.replace('.', '')) > 250:
            return 'L'
        else:
            return np.nan

linkedin['company_size'] = linkedin['employees'].apply(standarize_company_size)
linkedin['company_size'].value_counts()

company_size
L    410
M    213
S     37
Name: count, dtype: int64

#### Cleaning 'salary_range' column

In [2295]:
linkedin['salary_range'].unique()

array(['100.000 $/año - 150.000 $/año', '283.780 $/año - 331.640 $/año',
       '145.000 $/año - 180.000 $/año', '95.000 $/año - 130.000 $/año',
       '167.200 $/año - 250.800 $/año', '200.000 $/año - 240.000 $/año',
       '70.000 $/yr', nan, '200.000 $/año - 300.000 $/año',
       '120.000 $/año - 220.000 $/año', '144.000 $/año - 270.250 $/año',
       '150.000 $/año - 200.000 $/año', '170.112 $/año - 237.000 $/año',
       '170.000 $/año - 190.000 $/año', '20 $/hr',
       '145.000 $/año - 195.000 $/año', '67.733 $/año - 91.440 $/año',
       '95.000 $/año - 120.000 $/año', '78 $/h - 104 $/h',
       '122.000 $/año - 190.000 $/año', '101.000 $/año - 179.000 $/año',
       '92.500 $/año - 119.500 $/año', '48 $/h - 65 $/h',
       '125.000 $/año - 160.000 $/año', '850.000 $/año - 1.100.000 $/año',
       '35 $/h - 55 $/h', '75 $/h - 85 $/h', '45 $/h - 55 $/h',
       '150.000 $/año - 250.000 $/año', '40 $/h - 43 $/h',
       '115.000 $/año - 130.000 $/año', '150.000 $/año - 300.000 $

In [2296]:
hour_pattern = r"(\d+) \$\/hr"
year_pattern = r"(\d+\.?\d*) \$\/yr"
hour_range_pattern = r"(\d+\,?\d*) \$\/h - (\d+\,?\d*) \$\/h"
year_range_pattern = r"(\d+\.?\d*) \$\/año - (\d+\.?\d*) \$\/año"
month_range_pattern = r"(\d+\.?\d*) \$\/mes - (\d+\.?\d*) \$\/mes"

In [2297]:
def clean_salary(row, type=None):
    match_h = re.match(hour_pattern, str(row))
    match_y = re.match(year_pattern, str(row))
    match_rh = re.match(hour_range_pattern, str(row))
    match_ry = re.match(year_range_pattern, str(row))
    match_rm = re.match(month_range_pattern, str(row))
    if match_h:
        num_h = match_h.group(1)
        # if type == 'max':
        #     num_h = int(num_h) + 5
        # elif type == 'min':
        #     num_h = int(num_h) - 5
        num_w = int(num_h) * 40
        return int(num_w * 48)
    elif match_y:
        sal = int(match_y.group(1).replace('.', ''))
        # if type == 'max':
        #     sal += 2000
        # elif type == 'min':
        #     sal -= 2000
        return sal
    elif match_rh:
        min_ = match_rh.group(1)
        max_ = match_rh.group(2)
        num_h = (float(min_.replace(',', '.')) + float(max_.replace(',', '.'))) / 2
        # if type == 'max':
        #     num_h = float(max_.replace(',', '.'))
        # elif type == 'min':
        #     num_h = float(min_.replace(',', '.'))
        num_w = num_h * 40
        return int(num_w * 48)
    elif match_ry:
        min_ = match_ry.group(1)
        max_ = match_ry.group(2)
        if '.' not in max_:
            max_ += '000'
        if '.' not in min_:
            min_ += '000'
        num_y = (float(min_.replace('.', '')) + float(max_.replace('.', ''))) / 2
        # if type == 'max':
        #     num_y = float(max_.replace('.', ''))
        # elif type == 'min':
        #     num_y = float(min_.replace('.', ''))
        return int(num_y)
    elif match_rm:
        min_ = match_rm.group(1)
        max_ = match_rm.group(2)
        num_m = (float(min_.replace('.', '')) + float(max_.replace('.', ''))) / 2
        # if type == 'max':
        #     num_m = float(max_.replace('.', ''))
        # elif type == 'min':
        #     num_m = float(min_.replace('.', ''))
        return int(num_m * 12)
    else:
        return np.nan

# linkedin['min_salary'] = linkedin['salary_range'].apply(lambda row: clean_salary(row, 'min'))
linkedin['salary'] = linkedin['salary_range'].apply(clean_salary)
# linkedin['max_salary'] = linkedin['salary_range'].apply(lambda row: clean_salary(row, 'max'))

In [2298]:
linkedin['salary'].unique()

array([125000., 307710., 162500., 112500., 209000., 220000.,  70000.,
           nan, 250000., 170000., 207125., 175000., 203556., 180000.,
        38400.,  79586., 107500., 174720., 156000., 140000., 106000.,
       108480., 142500.,  86400., 153600.,  96000., 200000.,  79680.,
       122500., 225000., 172800., 280000., 110000., 122880.,  77500.,
       150000.,  54720., 187250.,  80000., 155000., 182400., 100000.,
       160000., 550000., 152500., 121750., 103000., 102720., 124800.,
        62500., 105000.,  66240.,  52800.,  97500., 275000., 148800.,
        72500., 110400., 400000.,  49920., 214500., 118080.,  99000.,
       300000., 145000., 105600., 100800.,  75000.,  33600.,  11145.,
        87360., 149500., 135000., 129600.,  74668.,  41280., 106560.,
        60480., 120000.,  46080.,  34560., 139200., 117500., 187500.,
        43100.,  68500.,  90000., 350000., 151680., 184500., 185000.,
        67500.,  57600.,  87500., 500000., 140400., 158400., 144000.,
        97227., 1418

In [2299]:
linkedin.sample(5)

Unnamed: 0,job_id,job_title,company_name,company_state,salary_range,remote_ratio,employment_type,experience_level,company_size,employees,salary
599,3775780334,Senior Machine Learning (ML) Engineer,EVONA,CA,130.000 $/año - 205.000 $/año,Híbrido,Jornada completa,Intermedio,M,200.0,167500.0
224,3762687470,Data Scientist,Acunor,NY,140.000 $/año - 180.000 $/año,Híbrido,Jornada completa,Intermedio,M,50.0,160000.0
505,3773304594,Deep Learning Engineer Co-Founder,Stealth,FL,500.000 $/yr,Presencial,Jornada completa,Ejecutivo,S,10.0,500000.0
366,3769019542,Machine Learning Engineer,Kforce Inc,TX,40 $/h - 51 $/h,Presencial,Contrato por obra,Sin experiencia,L,5.0,87360.0
281,3765833525,Remote Travel Consultant - Entry Level,Travel by Tilly,IL,,En remoto,Media jornada,Sin experiencia,S,10.0,


#### Cleaning 'job_title' column

In [2300]:
linkedin['job_title'].value_counts()

job_title
Machine Learning Engineer                                    60
Data Analyst                                                 52
Data Engineer                                                48
Data Scientist                                               45
Data Analyst  I                                              19
                                                             ..
AI Research Engineer                                          1
Machine Learning Software Engineer                            1
Data Engineer with Mortgage Exp_Remote (Only W2 Contract)     1
Principal Data Scientist (healthcare)                         1
Sr. Data Scientist                                            1
Name: count, Length: 363, dtype: int64

In [2301]:
def standarize_job(row):
    title = str(row).lower()
    if 'data engineer' in title:
        return 'Data Engineer'
    elif 'data analyst' in title:
        return 'Data Analyst'
    elif 'data scientist' in title:
        return 'Data Scientist'
    elif 'data' in title and 'junior' not in title:
        if 'engineer' in title:
            return 'Data Engineer'
        elif 'analyst' in title:
            return 'Data Analyst'
    elif ('machine' in title and 'learning' in title) or ('ml' in title and 'engineer' in title):
        return 'Machine Learning Engineer'
    elif 'deep' in title and 'learning' in title and 'engineer' in title:
        return 'Deep Learning Engineer'
    elif 'junior' in title:
        if 'data' in title and 'engineer' in title:
            return 'Data Engineer'
        elif 'data' in title and 'analyst' in title:
            return 'Data Analyst'
    elif 'business' in title:
        if 'intelligence' in title and 'analyst' in title:
            return 'BI Analyst'
        elif 'analyst' in title:
            return 'Business Analyst'
        else:
            return 'BI Engineer'
    elif 'artificial intelligence' in title or 'ai' in title:
        if 'research engineer' in title:
            return 'Research Engineer'
        elif 'ml' in title:
            return 'Machine Learning Engineer'
        elif 'engineer' in title:
            return 'AI Engineer'
        else:
            return title
    else:
        return str(row)


linkedin['job_title'] = linkedin['job_title'].apply(standarize_job)
linkedin['job_title'].unique()

array(['Machine Learning Engineer', 'CYBER INTEL ANALYST - PEN TESTER',
       'Deep Learning Engineer',
       'Senior Algorithms Engineer (Image Processing)', 'Data Engineer',
       'Data Scientist',
       'Computer Vision Engineer (Multimodal with Large Language Models)',
       'BI Analyst', 'Data Analyst', 'Artificial Learning Engineer',
       'Research Engineer', 'Computer Vision Engineer',
       'RF Computational Engineer 3', 'Project Scheduler',
       'Adaptive Optics Senior Software Engineer',
       'Finance Analyst (Benelux)', 'Deep Learning Research Intern',
       'Analytics Engineer', 'Applied Scientist', None,
       'Quantitative Researcher',
       'Computer Vision and Optimization Engineer', 'AI Engineer',
       'Software Company DevOps Engineer',
       'summer intern - r&d/artificial intelligence',
       'Remote Cruise Planner - Entry Level',
       'Remote Travel Consultant - Entry Level',
       'Remote Customer Service Sales Rep - Entry Level',
       'Rem

    Categories for 'job_title' column

In [2302]:
categories = [
    'Data Engineer',
    'Data Analyst',
    'Data Scientist',
    'Machine Learning Engineer',
    'Deep Learning Engineer',
    'BI Analyst',
    'Business Analyst',
    'BI Engineer',
    'Research Engineer',
    'AI Engineer'
]

In [2303]:
linkedin.shape

(668, 11)

In [2304]:
linkedin = linkedin[linkedin['job_title'].isin(categories)]
linkedin = linkedin.drop(columns=['employees', 'salary_range'])

In [2305]:
linkedin.shape

(580, 9)

In [2306]:
linkedin[linkedin['job_id'] == 3756135992]

Unnamed: 0,job_id,job_title,company_name,company_state,remote_ratio,employment_type,experience_level,company_size,salary
123,3756135992,Data Scientist,Adobe,CA,Híbrido,Prácticas,Prácticas,L,86400.0


In [2307]:
linkedin.to_csv('../data/linkedin_standarized.csv', index=False)