#### Imports

In [100]:
import pandas as pd
import numpy as np
import re

#### Read linkedin csv

In [101]:
linkedin = pd.read_csv('../data/linkedin_boolean.csv')
linkedin.sample(3)
linkedin.shape

(1810, 10)

#### Read US cities

In [102]:
uscities = pd.read_csv('../data/uscities/uscities.csv')
uscities.sample(3)

Unnamed: 0,city,city_ascii,state_id,state_name,county_fips,county_name,lat,lng,population,density,source,military,incorporated,timezone,ranking,zips,id
11699,Fisher,Fisher,IL,Illinois,17019,Champaign,40.3157,-88.3503,1797,521.6,shape,False,True,America/Chicago,3,61843,1840012224
23475,La Plant,La Plant,SD,South Dakota,46041,Dewey,45.1353,-100.6648,267,10.9,shape,False,False,America/Denver,3,57652,1840004019
2343,East Massapequa,East Massapequa,NY,New York,36059,Nassau,40.6742,-73.4359,19013,2172.5,shape,False,False,America/New_York,2,11762 11758,1840034047


#### Cleaning 'company_state' column

In [103]:
states_ids = uscities['state_id'].unique()
states_ids

array(['NY', 'CA', 'IL', 'FL', 'TX', 'PA', 'GA', 'DC', 'MA', 'AZ', 'MI',
       'WA', 'MN', 'CO', 'NV', 'MD', 'MO', 'OR', 'PR', 'IN', 'OH', 'VA',
       'NC', 'WI', 'RI', 'TN', 'UT', 'OK', 'CT', 'KY', 'LA', 'NE', 'AL',
       'NM', 'SC', 'IA', 'KS', 'AR', 'ID', 'NJ', 'HI', 'MS', 'AK', 'NH',
       'ND', 'ME', 'SD', 'WV', 'MT', 'DE', 'VT', 'WY'], dtype=object)

The first cleaning for the company state will be replacing those values that already have the state id by the state id.

In [104]:
def fclean_company_state(df, states):
    for i in states:
        df['company_state'] = df['company_state'].apply(lambda row: i if i in str(row) else row)
    return df

linkedin = fclean_company_state(linkedin, states_ids)

This second clean involves the creation of a dictionary with state names as keys and states ids as values by replacing those values that have the state name with the state id.

In [105]:
def sclean_company_state(df, states):
    for key, value in states.items():
        df['company_state'] = df['company_state'].apply(lambda row: value if key in str(row) else row)
    return df

state_names = uscities['state_name'].unique()
states_dict = {}
for i in states_ids:
    filtered_data = uscities[uscities['state_id'] == i]
    state_name = filtered_data['state_name'].iloc[0] if not filtered_data.empty else None
    states_dict[state_name] = i

linkedin = sclean_company_state(linkedin, states_dict)
linkedin['company_state'].unique()

array(['CA', 'MA', 'NV', 'WA', '· Greater Philadelphia ·', 'FL', 'TX',
       'NY', 'MI', '· United States ·', 'VA', 'GA', 'PA', 'NE', 'KS',
       'AL', 'NC', 'IL', 'OR', 'DC', 'KY', 'MD', 'AZ', 'NM', 'LA', 'DE',
       'CT', 'MO', 'TN', 'NJ', 'IN', 'CO', 'MN', 'OK', 'HI', 'OH', 'IA',
       'RI', '· San Francisco Bay Area ·', 'UT', '· Greater Houston ·',
       'WI', 'MS', 'AR', 'SC', '· Nashville Metropolitan Area ·',
       '· Greater Macon ·', '· Cincinnati Metropolitan Area ·',
       '· Greater Boston ·', '· Memphis Metropolitan Area ·', 'ND',
       '· Knoxville Metropolitan Area ·', 'ME', 'AK',
       '· Los Angeles Metropolitan Area ·', '· Greater Cleveland ·',
       '· Atlanta Metropolitan Area ·', 'ID', 'VT',
       '· Greater St. Louis ·', '· Greater Sacramento ·', 'WY'],
      dtype=object)

Seems that another clean is needed by replacing those that do not match with any key of the dictionary created before. They seem to be cities.

In [106]:
state_cities = {}
for i in states_ids:
    filtered_data = uscities[uscities['state_id'] == i]
    state_cities[i] = filtered_data['city'].to_list()
    print(f'{i}: {filtered_data["city"].to_list()}')

NY: ['New York', 'Brooklyn', 'Queens', 'Manhattan', 'Bronx', 'Buffalo', 'Rochester', 'Albany', 'Staten Island', 'Syracuse', 'Poughkeepsie', 'Yonkers', 'Binghamton', 'Utica', 'New Rochelle', 'Saratoga Springs', 'Glens Falls', 'Mount Vernon', 'Schenectady', 'Brentwood', 'White Plains', 'Levittown', 'Troy', 'Niagara Falls', 'Freeport', 'West Babylon', 'Hicksville', 'Coram', 'Valley Stream', 'East Meadow', 'Elmont', 'Central Islip', 'Commack', 'Long Beach', 'New City', 'Huntington Station', 'Baldwin', 'Spring Valley', 'Uniondale', 'Kiryas Joel', 'Franklin Square', 'Rome', 'Centereach', 'Ithaca', 'Bay Shore', 'Port Chester', 'Oceanside', 'North Tonawanda', 'Middletown', 'Jamestown', 'Harrison', 'Glen Cove', 'Shirley', 'West Islip', 'Monsey', 'Lindenhurst', 'Holbrook', 'Auburn', 'Deer Park', 'Elmira', 'Plainview', 'Rockville Centre', 'Peekskill', 'Watertown', 'Dix Hills', 'Medford', 'Kingston', 'Copiague', 'Garden City', 'Massapequa', 'East Patchogue', 'North Bellmore', 'Selden', 'Merrick', 

In [107]:
def check_city(row, key, cities):
    for i in cities:
        if i in str(row):
            return key
    return row

def tclean_company_state(df, states):
    for key, val in states.items():
        df['company_state'] = df['company_state'].apply(lambda row: check_city(row, key, val))
    return df

linkedin = tclean_company_state(linkedin, state_cities)
linkedin['company_state'].unique()

array(['CA', 'MA', 'NV', 'WA', 'PA', 'FL', 'TX', 'NY', 'MI',
       '· United States ·', 'VA', 'GA', 'NE', 'KS', 'AL', 'NC', 'IL',
       'OR', 'DC', 'KY', 'MD', 'AZ', 'NM', 'LA', 'DE', 'CT', 'MO', 'TN',
       'NJ', 'IN', 'CO', 'MN', 'OK', 'HI', 'OH', 'IA', 'RI', 'UT', 'WI',
       'MS', 'AR', 'SC', 'ND', 'ME', 'AK', 'ID', 'VT', 'WY'], dtype=object)

There's still a value that has no state id associated, then will replace it by null.

In [108]:
linkedin['company_state'] = linkedin['company_state'].apply(lambda row: np.nan if 'United States' in str(row) else row)
linkedin['company_state'].unique()

array(['CA', 'MA', 'NV', 'WA', 'PA', 'FL', 'TX', 'NY', 'MI', nan, 'VA',
       'GA', 'NE', 'KS', 'AL', 'NC', 'IL', 'OR', 'DC', 'KY', 'MD', 'AZ',
       'NM', 'LA', 'DE', 'CT', 'MO', 'TN', 'NJ', 'IN', 'CO', 'MN', 'OK',
       'HI', 'OH', 'IA', 'RI', 'UT', 'WI', 'MS', 'AR', 'SC', 'ND', 'ME',
       'AK', 'ID', 'VT', 'WY'], dtype=object)

The idea is that all the scrapped jobs have the company state info filled with non null values so:

A method to get those comany state in null, is to navigate to the company page in linkedin and scrap data to get the location.

It will be further work, so will be done later.

In [109]:
linkedin[linkedin['company_state'].isna()].sample(3)

Unnamed: 0,job_id,job_title,company_name,company_state,salary_range,remote_ratio,employment_type,experience_level,company_size,image_link
796,3770165539,Data Engineer,O'Reilly Auto Parts,,"$95,000/yr - $100,000/yr",Remote,Full-time,Mid-Senior level,"10,001+ employees · Retail",https://media.licdn.com/dms/image/C560BAQHJ49A...
207,3703165004,Senior Machine Learning Engineer,Crunchbase,,"$170,000/yr - $190,000/yr",Remote,Full-time,Mid-Senior level,51-200 employees · Software Development,https://media.licdn.com/dms/image/D560BAQHPfIP...
1481,3788997865,Data Architect,O'Reilly Auto Parts,,"$135,000/yr - $140,000/yr",Remote,Full-time,Mid-Senior level,"10,001+ employees · Retail",https://media.licdn.com/dms/image/C560BAQHJ49A...


#### Cleaning 'remote_ratio' column

In [110]:
def clean_remote_ratio(df, list_):
    print(list_)
    for i in list_:
        df['remote_ratio'] = df['remote_ratio'].apply(lambda row: np.nan if str(row) == 'nan' else str(row))
    return df

remote = linkedin['remote_ratio'].unique()
linkedin = clean_remote_ratio(linkedin, remote)
linkedin['remote_ratio'].value_counts()

['On-site' nan 'Hybrid' 'Remote']


remote_ratio
On-site    593
Hybrid     428
Remote     393
Name: count, dtype: int64

#### Cleaning 'experience_level' column

In [111]:
linkedin.sample(5)
linkedin['experience_level'].value_counts()

experience_level
Mid-Senior level    767
Entry level         540
Associate           111
Internship           62
Director              7
Executive             4
Name: count, dtype: int64

#### Cleaning 'employment_type' column

In [112]:
def clean_employment_type(df, list_):
    for i in list_:
        df['employment_type'] = df['employment_type'].apply(lambda row: i if i in str(row) else str(row))
    return df

employment = linkedin['employment_type'].to_list()
linkedin = clean_employment_type(linkedin, employment)
linkedin['employment_type'].value_counts()

employment_type
Full-time     1582
Contract       154
Internship      47
Part-time       24
Temporary        3
Name: count, dtype: int64

#### Cleaning 'company_size' column

In [113]:
linkedin['company_size'].value_counts()

company_size
10,001+ employees · Entertainment Providers          108
10,001+ employees · IT Services and IT Consulting     90
51-200 employees · Staffing and Recruiting            71
10,001+ employees · Hospitals and Health Care         60
10,001+ employees · Financial Services                59
                                                    ... 
501-1,000 employees · Maritime Transportation          1
501-1,000 employees · Law Practice                     1
11-50 employees · Farming                              1
201-500 employees · Events Services                    1
501-1,000 employees · Machinery Manufacturing          1
Name: count, Length: 376, dtype: int64

In [114]:
linkedin['employees'] = linkedin['company_size'].apply(lambda row: row.split('·')[0].strip())
linkedin['company_services'] = linkedin['company_size'].apply(lambda row: row.split('·')[-1].strip())
linkedin['employees'].value_counts()

employees
10,001+ employees                               719
1,001-5,000 employees                           336
51-200 employees                                231
5,001-10,000 employees                          148
501-1,000 employees                             134
201-500 employees                               124
11-50 employees                                  92
1-10 employees                                   23
Santha kumar Ramasamy is hiring for this job      1
Alexander Furmanski is hiring for this job        1
Felix Kimeu is hiring for this job                1
Name: count, dtype: int64

In [115]:
def filter_employee(row):
    pattern1 = r"(\d+\,?\d* employees)"
    pattern2 = r"(\d+\,?\d*-\d+\,?\d* employees)"

    match1 = re.search(pattern1, str(row))
    match2 = re.search(pattern2, str(row))

    if match2:
        return match2.group(1)
    elif match1:
        return match1.group(1)
    else:
        return np.nan

def clean_employee(row):
    pattern1 = r"(\d+\,?\d*) employees"
    pattern2 = r"(\d+\,?\d*)-(\d+\,?\d*) employees"

    match1 = re.match(pattern1, str(row))
    match2 = re.match(pattern2, str(row))

    if match2:
        return match2.group(2)
    elif match1:
        return match1.group(1)
    else:
        return np.nan

linkedin['employees'] = linkedin['employees'].apply(filter_employee)
linkedin['employees'].value_counts()

employees
1,001-5,000 employees     336
51-200 employees          231
5,001-10,000 employees    148
501-1,000 employees       134
201-500 employees         124
11-50 employees            92
1-10 employees             23
Name: count, dtype: int64

In [116]:
linkedin['employees'] = linkedin['employees'].apply(clean_employee)
linkedin['employees'].value_counts()

employees
5,000     336
200       231
10,000    148
1,000     134
500       124
50         92
10         23
Name: count, dtype: int64

In [117]:
def standarize_company_size(row):
    if pd.notnull(row):
        if int(row.replace(',', '')) < 50:
            return 'S'
        elif int(row.replace(',', '')) <= 250:
            return 'M'
        elif int(row.replace(',', '')) > 250:
            return 'L'
        else:
            return np.nan

linkedin['comp_size'] = linkedin['employees']
linkedin['employees'] = linkedin['company_size'].apply(lambda row: row.split('·')[0].strip())
linkedin['company_size'] = linkedin['comp_size']
linkedin['company_size'] = linkedin['company_size'].apply(standarize_company_size)
linkedin = linkedin.drop(columns=['comp_size'])
linkedin['company_size'].value_counts()

company_size
L    742
M    323
S     23
Name: count, dtype: int64

#### Cleaning 'salary_range' column

In [124]:
linkedin['salary_range'].sample(5)

1410              $60/hr - $70/hr
1538    $100,000/yr - $195,500/yr
1142      $66,400/yr - $99,600/yr
824                           NaN
318                           NaN
Name: salary_range, dtype: object

In [119]:
hour_range_pattern = r"\$(\d+\.?\d*)/hr - \$(\d+\.?\d*)/hr"
year_range_pattern = r"\$(\d+\,?\d+\.?\d*)/yr - \$(\d+\,?\d+\.?\d*)/yr"
month_range_pattern = r"\$(\d+\,?\d+\.?\d*)/month - \$(\d+\,?\d+\.?\d*)/month"
hour_pattern = r"\$(\d+)/hr"
year_pattern = r"\$(\d+\,?\d+\.?\d*)/yr"

In [127]:
def clean_salary(row, type=None):
    match_h = re.match(hour_pattern, str(row))
    match_y = re.match(year_pattern, str(row))
    match_rh = re.match(hour_range_pattern, str(row))
    match_ry = re.match(year_range_pattern, str(row))
    match_rm = re.match(month_range_pattern, str(row))
    
    if match_rh:
        min_ = match_rh.group(1)
        max_ = match_rh.group(2)
        num_h = (float(min_.replace(',', '.')) + float(max_.replace(',', '.'))) / 2
        num_w = num_h * 40
        return float(num_w * 48), float(min_.replace(',', '.'))*40*48, float(max_.replace(',', '.'))*40*48
    if match_h:
        num_h = match_h.group(1)
        num_w = int(num_h) * 40
        return int(num_w * 48), np.nan, np.nan
    elif match_ry:
        min_ = match_ry.group(1)
        max_ = match_ry.group(2)
        if ',' not in max_:
            max_ += '000'
        if ',' not in min_:
            min_ += '000'
        num_y = (float(str(min_).replace(',', '')) + float(str(max_).replace(',', ''))) / 2
        return float(num_y), float(str(min_).replace(',', '')), float(str(max_).replace(',', ''))
    elif match_y:
        sal = float(match_y.group(1).replace(',', ''))
        return sal, np.nan, np.nan
    elif match_rm:
        min_ = match_rm.group(1)
        max_ = match_rm.group(2)
        num_m = (float(min_.replace(',', '')) + float(max_.replace(',', ''))) / 2
        return float(num_m * 12), float(min_.replace(',', ''))*12, float(max_.replace(',', ''))*12
    else:
        return np.nan, np.nan, np.nan

linkedin['salary', 'min', 'max'] = linkedin['salary_range'].apply(clean_salary)

In [132]:
linkedin.sample(5)

Unnamed: 0,job_id,job_title,company_name,company_state,salary_range,remote_ratio,employment_type,experience_level,company_size,image_link,employees,company_services,"(salary, min, max)"
1600,3790449499,Data Engineer,Vestis Corporation,,"$88,000/yr - $111,050/yr",Remote,Full-time,Mid-Senior level,,https://media.licdn.com/dms/image/D560BAQHbhGI...,"10,001+ employees",Facilities Services,"(99525.0, 88000.0, 111050.0)"
14,3463197080,Senior Machine Learning Engineer - Ads Signal,TikTok,WA,"$177,688/yr - $266,000/yr",,Full-time,,,https://media.licdn.com/dms/image/C510BAQGCdTh...,"10,001+ employees",Entertainment Providers,"(221844.0, 177688.0, 266000.0)"
1746,3793688544,"Urgently Required: Big Data Engineer_San Jose,...",SPAR Information Systems LLC,CA,,Hybrid,Full-time,Entry level,L,https://media.licdn.com/dms/image/C560BAQF5Zlu...,"501-1,000 employees",Information Technology & Services,"(nan, nan, nan)"
84,3636799852,Junior Data Engineer II,"Kiss Products, Inc.",NY,$23/hr - $45/hr,,Full-time,Mid-Senior level,L,https://media.licdn.com/dms/image/C4D0BAQFzrgC...,"501-1,000 employees",Personal Care Product Manufacturing,"(65280.0, 44160.0, 86400.0)"
1496,3789318081,Junior Level Marketing Analyst,A10 Associates,MA,"$50,000/yr",Hybrid,Full-time,,S,https://media.licdn.com/dms/image/C4E0BAQGKybB...,1-10 employees,1-10 employees,"(50000.0, nan, nan)"


#### Cleaning 'job_title' column

In [134]:
linkedin['job_title'].unique()

array(['Python Developer', 'Deep Learning Engineer', 'Marketing Analyst',
       'Senior Data Scientist',
       'Senior/Staff Machine Learning Engineer - Prediction & Behavior ML',
       'Manager, Machine Learning Platform',
       'Machine Learning Engineer, TikTok Recommendation',
       'Machine Learning Engineer - Collision Avoidance System',
       'Senior/Staff Software Engineer - Machine Learning',
       'Data Engineer, E-Commerce', 'Data Architect Manager',
       'Machine Learning Engineer - Applied AIGC, TikTok Monetization GenAI',
       'Senior Machine Learning Engineer - Ads Signal',
       'Healthcare Data Analyst - OQPS 1002',
       'Machine Learning Engineer, Risk Data Mining - USDS',
       'Machine Learning Engineer, TikTok Branding Ads', 'Data Engineer',
       'Machine Learning Scientist',
       'Machine Learning Engineer, Ads Core - Targeting',
       'Autonomy Engineer - Deep Learning',
       'Secondary Marketing Analyst II',
       'Senior Machine Learning 

In [None]:
# def standarize_job(row):
#     title = str(row).lower()
#     if 'data engineer' in title:
#         return 'Data Engineer'
#     elif 'data analyst' in title:
#         return 'Data Analyst'
#     elif 'data scientist' in title:
#         return 'Data Scientist'
#     elif 'data' in title and 'junior' not in title:
#         if 'engineer' in title:
#             return 'Data Engineer'
#         elif 'analyst' in title:
#             return 'Data Analyst'
#     elif ('machine' in title and 'learning' in title) or ('ml' in title and 'engineer' in title):
#         return 'Machine Learning Engineer'
#     elif 'deep' in title and 'learning' in title and 'engineer' in title:
#         return 'Deep Learning Engineer'
#     elif 'junior' in title:
#         if 'data' in title and 'engineer' in title:
#             return 'Data Engineer'
#         elif 'data' in title and 'analyst' in title:
#             return 'Data Analyst'
#     elif 'business' in title:
#         if 'intelligence' in title and 'analyst' in title:
#             return 'BI Analyst'
#         elif 'analyst' in title:
#             return 'Business Analyst'
#         else:
#             return 'BI Engineer'
#     elif 'artificial intelligence' in title or 'ai' in title:
#         if 'research engineer' in title:
#             return 'Research Engineer'
#         elif 'ml' in title:
#             return 'Machine Learning Engineer'
#         elif 'engineer' in title:
#             return 'AI Engineer'
#         else:
#             return title
#     else:
#         return str(row)


# linkedin['original_title'] = linkedin['job_title']
# linkedin['job_title'] = linkedin['job_title'].apply(standarize_job)
# linkedin['job_title'].unique()

    Categories for 'job_title' column

In [None]:
categories = [
    'Data Engineer',
    'Data Analyst',
    'Data Scientist',
    'Machine Learning Engineer',
    'Deep Learning Engineer',
    'BI Analyst',
    'Business Analyst',
    'BI Engineer',
    'Research Engineer',
    'AI Engineer'
]

In [None]:
linkedin.shape

In [None]:
linkedin = linkedin[linkedin['job_title'].isin(categories)]

In [None]:
linkedin.shape
linkedin.sample(3)

In [None]:
linkedin = linkedin[linkedin['employment_type'] != 'Temporal']
linkedin = linkedin[~linkedin.drop(['salary', 'salary_range'], axis=1).isna().all(axis=1)]
linkedin = linkedin[linkedin['remote_ratio'].notna()]
linkedin = linkedin[linkedin['experience_level'].notna()]
linkedin = linkedin[linkedin['company_size'].notna()]
linkedin.isna().sum()

In [None]:
linkedin.to_csv('../data/linkedin_standarized.csv', index=False)