#### Imports

In [1]:
import pandas as pd
import numpy as np
import re

#### Read linkedin csv

In [6]:
linkedin = pd.read_csv('../data/linkedin_boolean.csv')
linkedin.sample(3)
linkedin.shape

(293, 10)

#### Cleaning 'company_state' column

In [5]:
states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'NC', 'SC', 'CO', 'CT', 'ND', 'SD', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA',
          'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NJ', 'NY', 'NH', 'NM', 'OH', 'OK', 'OR', 'PA', 'RI', 'TN', 'TX', 'UT',
          'UT', 'VT', 'VA', 'WV', 'WA', 'WI', 'WY']
linkedin['company_state'].value_counts()

company_state
· United States ·        28
· New York, NY ·         17
· Santa Clara, CA ·      11
· San Francisco, CA ·    11
· Boston, MA ·           11
                         ..
· Monroe, WI ·            1
· Marysville, OH ·        1
· Orlando, FL ·           1
· Newport Beach, CA ·     1
· Louisville, KY ·        1
Name: count, Length: 143, dtype: int64

In [82]:
def fclean_company_state(df, states):
    for i in states:
        linkedin['company_state'] = linkedin['company_state'].apply(lambda row: i if i in str(row) else row)
    return df

linkedin = fclean_company_state(linkedin, states)

In [83]:
def sclean_company_state(df, states):
    for key, value in states.items():
        linkedin['company_state'] = linkedin['company_state'].apply(lambda row: value if key in str(row) else row)
    return df

left_states = {
    'Virginia': 'VA',
    'Minnesota': 'MN',
    'Washington': 'WA',
    'San Francisco': 'CA',
    'Nebraska': 'NE',
    'California': 'CA',
    'Texas': 'TX',
    'Nueva York': 'NY',
    'Michigan': 'MI',
    'Luisiana': 'LA',
    'Florida': 'FL',
    'Maryland': 'MD',
    'Nevada': 'NV',
    'Oregón': 'OR',
    'Oklahoma': 'OK',
    'Georgia': 'GA',
    'Carolina del Sur': 'SC',
    'Delaware': 'DE',
    'Ohio': 'OH',
    'Arkansas': 'AR',
    'Iowa': 'IA',
    'Carolina del Norte': 'NC',
    'Indiana': 'IN',
    'Nashville': 'TN',
    'Massachusetts': 'MA',
    'Tennessee': 'TN',
    'Enid': 'OK',
    'Chicago': 'IL',
    'América del Norte': 'USA',
    'Estados Unidos': 'USA',
    'Des Moines': 'IA',
    'Pittsburgh': 'PA',
    'Cincinnati': 'OH',
    'St. Louis': 'MO',
    'Illinois': 'IL',
    'Jacksonville': 'FL',
    'Cleveland': 'OH',
    'Los Ángeles': 'CA',
    'Urbana-Champaign': 'IL',
    'Houston': 'TX',
    'Richmond': 'VA'
}
linkedin = sclean_company_state(linkedin, left_states)
linkedin = linkedin[~linkedin['company_state'].str.contains('Unión Europea', na=False)]
linkedin['company_state'].unique()

array(['CA', 'NC', 'VA', 'NY', 'USA', 'WA', 'MI', 'NE', 'TX', 'IN', 'GA',
       'AZ', 'UT', 'OR', 'IL', 'MN', 'MA', 'PA', 'MD', 'NJ', 'CT', 'OH',
       'WI', 'FL', 'KS', 'SC', 'AR', 'TN', 'MO', 'CO', 'KY', 'LA', 'NV',
       'OK', 'IA', 'SD', 'VT', 'ID', 'AL', 'HI', 'DE', 'RI', 'NM', 'NH'],
      dtype=object)

#### Cleaning 'remote_ratio' column

In [84]:
def clean_remote_ratio(df, list_):
    for i in list_:
        df['remote_ratio'] = df['remote_ratio'].apply(lambda row: i if i in str(row) else str(row))
    df['remote_ratio'] = df['remote_ratio'].apply(lambda row: np.nan if str(row) == 'nan' else str(row))
    return df

remote_dict = [
    'Presencial',
    'Híbrido',
    'En remoto'
]
linkedin = clean_remote_ratio(linkedin, remote_dict)
linkedin['remote_ratio'].value_counts()

remote_ratio
En remoto     352
Híbrido       324
Presencial    299
Name: count, dtype: int64

#### Cleaning 'experience_level' column

In [86]:
linkedin.sample(5)
linkedin['experience_level'].value_counts()

experience_level
Intermedio                 461
Sin experiencia            302
Prácticas                  126
Algo de responsabilidad     88
Director                    17
Ejecutivo                    2
Name: count, dtype: int64

#### Cleaning 'employment_type' column

In [87]:
def clean_employment_type(df, list_):
    for i in list_:
        df['employment_type'] = df['employment_type'].apply(lambda row: i if i in str(row) else str(row))
    return df

remote_dict = [
    'Jornada completa',
    'Contrato por obra',
    'Media jornada',
    'Prácticas',
    'Temporal'
]
linkedin = clean_employment_type(linkedin, remote_dict)
linkedin['employment_type'].value_counts()

employment_type
Jornada completa     715
Contrato por obra    133
Prácticas            100
Media jornada         52
Name: count, dtype: int64

#### Cleaning 'company_size' column

In [88]:
linkedin['company_size'].value_counts()

company_size
De 51 a 200 empleados · Servicios y consultoría de TI            43
De 51 a 200 empleados · Dotación y selección de personal         41
Más de 10.001 empleados · Proveedores de entretenimiento         38
Más de 10.001 empleados · Hospitales y atención sanitaria        37
De 11 a 50 empleados · Dotación y selección de personal          33
                                                                 ..
De 1.001 a 5.000 empleados · Servicios públicos                   1
Más de 10.001 empleados · Aeronáutica y aviación                  1
De 1.001 a 5.000 empleados · Servicios profesionales              1
De 1.001 a 5.000 empleados · Servicios de diseño                  1
Más de 10.001 empleados · Transporte de mercancías y paquetes     1
Name: count, Length: 234, dtype: int64

In [89]:
linkedin['company_size'] = linkedin['company_size'].apply(lambda row: row.split('·')[0].strip())
linkedin['company_size'].value_counts()

company_size
Más de 10.001 empleados                                307
De 1.001 a 5.000 empleados                             168
De 51 a 200 empleados                                  164
De 11 a 50 empleados                                    96
De 201 a 500 empleados                                  80
De 501 a 1.000 empleados                                79
De 5.001 a 10.000 empleados                             60
Entre 1 y 10 empleados                                  44
Nagesh Kumar Gona ✈ busca personal para este empleo      1
Fabricación de productos de plástico                     1
Name: count, dtype: int64

In [90]:
def filter_employee(row):
    pattern1 = r"(Más de \d+\.?\d* empleados)"
    pattern2 = r"(De \d+\.?\d* a \d+\.?\d* empleados)"
    pattern3 = r"(Entre \d+\.?\d* y \d+\.?\d* empleados)"

    match1 = re.search(pattern1, str(row))
    match2 = re.search(pattern2, str(row))
    match3 = re.search(pattern3, str(row))

    if match1:
        return match1.group(1)
    elif match2:
        return match2.group(1)
    elif match3:
        return match3.group(1)
    else:
        return np.nan

def clean_employee(row):
    pattern1 = r"Más de (\d+\.?\d*) empleados"
    pattern2 = r"De (\d+\.?\d*) a (\d+\.?\d*) empleados"
    pattern3 = r"Entre (\d+\.?\d*) y (\d+\.?\d*) empleados"

    match1 = re.match(pattern1, str(row))
    match2 = re.match(pattern2, str(row))
    match3 = re.match(pattern3, str(row))

    if match1:
        return match1.group(1)
    elif match2:
        return match2.group(2)
    elif match3:
        return match3.group(2)
    else:
        return np.nan

linkedin['company_size'] = linkedin['company_size'].apply(filter_employee)
linkedin['employees'] = linkedin['company_size'].apply(clean_employee)
linkedin['employees'].value_counts()

employees
10.001    307
5.000     168
200       164
50         96
500        80
1.000      79
10.000     60
10         44
Name: count, dtype: int64

In [91]:
def standarize_company_size(row):
    if pd.notnull(row):
        if int(row.replace('.', '')) < 50:
            return 'S'
        elif int(row.replace('.', '')) <= 250:
            return 'M'
        elif int(row.replace('.', '')) > 250:
            return 'L'
        else:
            return np.nan

linkedin ['comp_size'] = linkedin['employees']
linkedin['employees'] = linkedin['company_size']
linkedin['company_size'] = linkedin['comp_size']
linkedin['company_size'] = linkedin['company_size'].apply(standarize_company_size)
linkedin = linkedin.drop(columns=['comp_size'])
linkedin['company_size'].value_counts()

company_size
L    694
M    260
S     44
Name: count, dtype: int64

#### Cleaning 'salary_range' column

In [92]:
linkedin['salary_range'].unique()

array(['204.000 $/año - 294.000 $/año', '100.000 $/año - 150.000 $/año',
       '283.780 $/año - 331.640 $/año', '160.000 $/año - 200.000 $/año',
       nan, '65.000 $/año - 185.000 $/año',
       '100.000 $/año - 500.000 $/año', '145.000 $/año - 180.000 $/año',
       '95.000 $/año - 130.000 $/año', '86.815 $/año - 138.904 $/año',
       '57.000 $/año - 85.000 $/año', '84.899 $/año - 172.663 $/año',
       '167.200 $/año - 250.800 $/año', '17,87 $/h - 23,96 $/h',
       '200.000 $/año - 240.000 $/año', '136.038 $/año - 182.490 $/año',
       '70.000 $/yr', '200.000 $/año - 300.000 $/año',
       '120.000 $/año - 220.000 $/año', '60.000 $/año - 90.000 $/año',
       '95.301 $/año - 138.186 $/año', '125.000 $/año - 185.000 $/año',
       '144.000 $/año - 270.250 $/año', '150.000 $/año - 200.000 $/año',
       '5.250 $/month', '83.890 $/año - 142.610 $/año',
       '120.000 $/año - 180.000 $/año', '170.112 $/año - 237.000 $/año',
       '170.000 $/año - 190.000 $/año', '20 $/hr',
       

In [93]:
hour_pattern = r"(\d+) \$\/hr"
year_pattern = r"(\d+\.?\d*) \$\/yr"
hour_range_pattern = r"(\d+\,?\d*) \$\/h - (\d+\,?\d*) \$\/h"
year_range_pattern = r"(\d+\.?\d*) \$\/año - (\d+\.?\d*) \$\/año"
month_range_pattern = r"(\d+\.?\d*) \$\/mes - (\d+\.?\d*) \$\/mes"

In [94]:
def clean_salary(row, type=None):
    match_h = re.match(hour_pattern, str(row))
    match_y = re.match(year_pattern, str(row))
    match_rh = re.match(hour_range_pattern, str(row))
    match_ry = re.match(year_range_pattern, str(row))
    match_rm = re.match(month_range_pattern, str(row))
    if match_h:
        num_h = match_h.group(1)
        num_w = int(num_h) * 40
        return int(num_w * 48)
    elif match_y:
        sal = int(match_y.group(1).replace('.', ''))
        return sal
    elif match_rh:
        min_ = match_rh.group(1)
        max_ = match_rh.group(2)
        num_h = (float(min_.replace(',', '.')) + float(max_.replace(',', '.'))) / 2
        num_w = num_h * 40
        return int(num_w * 48)
    elif match_ry:
        min_ = match_ry.group(1)
        max_ = match_ry.group(2)
        if '.' not in max_:
            max_ += '000'
        if '.' not in min_:
            min_ += '000'
        num_y = (float(min_.replace('.', '')) + float(max_.replace('.', ''))) / 2
        return int(num_y)
    elif match_rm:
        min_ = match_rm.group(1)
        max_ = match_rm.group(2)
        num_m = (float(min_.replace('.', '')) + float(max_.replace('.', ''))) / 2
        return int(num_m * 12)
    else:
        return np.nan

linkedin['salary'] = linkedin['salary_range'].apply(clean_salary)

In [95]:
linkedin['salary'].unique()

array([249000., 125000., 307710., 180000.,     nan, 300000., 162500.,
       112500., 112859.,  71000., 128781., 209000.,  40156., 220000.,
       159264.,  70000., 250000., 170000.,  75000., 116743., 155000.,
       207125., 175000., 113250., 150000., 203556.,  38400.,  52800.,
        79586., 160000., 141000., 176500., 107500., 174720., 156000.,
       140000., 106000., 108480., 130000., 142500.,  86400., 108750.,
       125250.,  93250., 153600.,  96000., 200000.,  79680., 122500.,
       225000., 172800., 280000., 110000., 122880.,  77500.,  54720.,
       187250.,  80000., 182400., 100000., 550000., 152500., 121750.,
       151750., 103000., 102720.,  78294., 124800.,  62500., 105000.,
        66240.,  97500.,  94080., 137000., 275000., 148800.,  72500.,
       110400., 400000.,  95300.,  49920., 138900., 214500.,  55440.,
       118080.,  99000., 145000., 105600., 196500., 181440., 100800.,
        33600.,  11145.,  87360., 122595., 149500., 152625., 135000.,
       129600.,  746

In [96]:
linkedin.sample(5)

Unnamed: 0,job_id,job_title,company_name,company_state,salary_range,remote_ratio,employment_type,experience_level,company_size,image_link,employees,salary
764,3774071889,Senior Data Engineer,Material Bank®,USA,50 $/h - 54 $/h,Híbrido,Contrato por obra,Algo de responsabilidad,L,https://media.licdn.com/dms/image/D4E0BAQF_HBC...,De 201 a 500 empleados,99840.0
710,3772905311,"Data Research Analyst, gt.school (Remote) - $6...",Crossover,TX,30 $/hr,En remoto,Jornada completa,Algo de responsabilidad,L,https://media.licdn.com/dms/image/C4E0BAQG8bdX...,De 5.001 a 10.000 empleados,57600.0
925,3776822881,Audit Data Architecture Engineer | Multiple Lo...,KPMG US,TX,140.000 $/año - 180.000 $/año,En remoto,Jornada completa,Intermedio,L,https://media.licdn.com/dms/image/D4E0BAQHbnWd...,Más de 10.001 empleados,160000.0
944,3776998906,Artificial Intelligence Engineer,Cygnus Professionals Inc.,USA,,En remoto,Prácticas,Prácticas,L,https://media.licdn.com/dms/image/C4E0BAQHa-ZT...,De 501 a 1.000 empleados,
90,3724403797,Data Analyst,"American Honda Motor Company, Inc.",OH,60.000 $/año - 90.000 $/año,En remoto,Contrato por obra,Sin experiencia,L,https://media.licdn.com/dms/image/C4E0BAQHNId7...,Más de 10.001 empleados,75000.0


#### Cleaning 'job_title' column

In [97]:
linkedin['job_title'].value_counts()

job_title
Machine Learning Engineer                                              90
Data Analyst                                                           65
Data Engineer                                                          65
Senior Data Scientist                                                  52
Data Scientist                                                         46
                                                                       ..
Data Scientist - LLM                                                    1
Actuarial Data Scientist                                                1
AI/ML Data Scientist                                                    1
Associate Data Scientist - Cybersecurity, Intern                        1
Senior Data Scientist, Clinical Analytics (On-Site, Illinois Based)     1
Name: count, Length: 525, dtype: int64

In [98]:
def standarize_job(row):
    title = str(row).lower()
    if 'data engineer' in title:
        return 'Data Engineer'
    elif 'data analyst' in title:
        return 'Data Analyst'
    elif 'data scientist' in title:
        return 'Data Scientist'
    elif 'data' in title and 'junior' not in title:
        if 'engineer' in title:
            return 'Data Engineer'
        elif 'analyst' in title:
            return 'Data Analyst'
    elif ('machine' in title and 'learning' in title) or ('ml' in title and 'engineer' in title):
        return 'Machine Learning Engineer'
    elif 'deep' in title and 'learning' in title and 'engineer' in title:
        return 'Deep Learning Engineer'
    elif 'junior' in title:
        if 'data' in title and 'engineer' in title:
            return 'Data Engineer'
        elif 'data' in title and 'analyst' in title:
            return 'Data Analyst'
    elif 'business' in title:
        if 'intelligence' in title and 'analyst' in title:
            return 'BI Analyst'
        elif 'analyst' in title:
            return 'Business Analyst'
        else:
            return 'BI Engineer'
    elif 'artificial intelligence' in title or 'ai' in title:
        if 'research engineer' in title:
            return 'Research Engineer'
        elif 'ml' in title:
            return 'Machine Learning Engineer'
        elif 'engineer' in title:
            return 'AI Engineer'
        else:
            return title
    else:
        return str(row)


linkedin['original_title'] = linkedin['job_title']
linkedin['job_title'] = linkedin['job_title'].apply(standarize_job)
linkedin['job_title'].unique()

array(['Data Scientist', 'Machine Learning Engineer',
       'CYBER INTEL ANALYST - PEN TESTER', 'Deep Learning Engineer',
       'Senior Algorithms Engineer (Image Processing)', 'Data Engineer',
       'Data Analyst', 'Deep Learning Researcher',
       'Computer Vision Engineer (Multimodal with Large Language Models)',
       None, 'BI Analyst', 'Artificial Learning Engineer',
       'Research Engineer', 'Computer Vision Engineer',
       'RF Computational Engineer 3', 'Project Scheduler',
       'Adaptive Optics Senior Software Engineer',
       'Finance Analyst (Benelux)', 'Deep Learning Research Intern',
       'Analytics Engineer', 'Applied Scientist',
       'Quantitative Researcher', 'Analyst, Analytics',
       'Computer Vision and Optimization Engineer', 'AI Engineer',
       'Software Company DevOps Engineer', 'BI Engineer',
       'summer intern - r&d/artificial intelligence',
       'Remote Cruise Planner - Entry Level',
       'Remote Travel Consultant - Entry Level',
    

    Categories for 'job_title' column

In [99]:
categories = [
    'Data Engineer',
    'Data Analyst',
    'Data Scientist',
    'Machine Learning Engineer',
    'Deep Learning Engineer',
    'BI Analyst',
    'Business Analyst',
    'BI Engineer',
    'Research Engineer',
    'AI Engineer'
]

In [100]:
linkedin.shape

(1000, 13)

In [101]:
linkedin = linkedin[linkedin['job_title'].isin(categories)]

In [102]:
linkedin.shape
linkedin.sample(3)

Unnamed: 0,job_id,job_title,company_name,company_state,salary_range,remote_ratio,employment_type,experience_level,company_size,image_link,employees,salary,original_title
27,3634189427,Data Scientist,Spotter,CA,100.000 $/año - 500.000 $/año,Híbrido,Jornada completa,Intermedio,M,https://media.licdn.com/dms/image/C560BAQEYvkA...,De 51 a 200 empleados,300000.0,Senior Data Scientist - LLM
603,3770165247,Data Analyst,TEKGENCE INC,OH,20 $/h - 23 $/h,Presencial,Contrato por obra,Sin experiencia,L,https://media.licdn.com/dms/image/C560BAQHeElf...,De 501 a 1.000 empleados,41280.0,Data Analyst
194,3751628399,Machine Learning Engineer,VeeAR Projects Inc.,TX,101.000 $/año - 179.000 $/año,Híbrido,Jornada completa,Sin experiencia,L,https://media.licdn.com/dms/image/C560BAQH9cWu...,De 201 a 500 empleados,140000.0,Machine Learning Engineer


In [103]:
linkedin = linkedin[linkedin['employment_type'] != 'Temporal']
linkedin = linkedin[~linkedin.drop(['salary', 'salary_range'], axis=1).isna().all(axis=1)]
linkedin = linkedin[linkedin['remote_ratio'].notna()]
linkedin = linkedin[linkedin['experience_level'].notna()]
linkedin = linkedin[linkedin['company_size'].notna()]
linkedin.isna().sum()

job_id               0
job_title            0
company_name         0
company_state        0
salary_range        74
remote_ratio         0
employment_type      0
experience_level     0
company_size         0
image_link           0
employees            0
salary              79
original_title       0
dtype: int64

In [104]:
linkedin.to_csv('../data/linkedin_standarized.csv', index=False)