In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Loading the DataFrame from CSV

In [2]:
# Read the job-postings CSV, using its first column as the row index.
csv_path = 'test1.csv'
df = pd.read_csv(csv_path, index_col=0)

In [3]:
# Summarise dtypes and non-null counts per column to see which fields are sparsely populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399 entries, 0 to 398
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_title        399 non-null    object
 1   company          399 non-null    object
 2   location         399 non-null    object
 3   compensation     20 non-null     object
 4   desc             225 non-null    object
 5   Seniority level  215 non-null    object
 6   Employment type  225 non-null    object
 7   Job function     215 non-null    object
 8   Industries       214 non-null    object
dtypes: object(9)
memory usage: 31.2+ KB


#### Dropping the unnecessary rows
- Postings without descriptions are basically irrelevant data.
- There may be some Functions and/or Industries that are blank, remove those as well.
- These are all entry level positions. Seniority Level can be dropped, or just fill the NaN cells with 'Entry Level'

In [4]:
# Keep only postings that have a description, a job function and an industry.
required_cols = ['desc', 'Job function', 'Industries']
df.dropna(subset=required_cols, inplace=True)
# Optional follow-up, currently disabled:
# df['Seniority level'].fillna('Entry Level', inplace=True)

In [5]:
# Re-check non-null counts after the row drop to confirm the required columns are fully populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214 entries, 0 to 398
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_title        214 non-null    object
 1   company          214 non-null    object
 2   location         214 non-null    object
 3   compensation     19 non-null     object
 4   desc             214 non-null    object
 5   Seniority level  214 non-null    object
 6   Employment type  214 non-null    object
 7   Job function     214 non-null    object
 8   Industries       214 non-null    object
dtypes: object(9)
memory usage: 16.7+ KB


#### Here are some available Job Functions on LinkedIn for Easier Categorization.
- A new column for the Functions is added, and redundant columns are removed as well.
- Separate the salary ranges into 'lower' and 'upper' columns, filling all NaN with zeroes.
- Rearrange the table

In [6]:
# Master list of job-function categories used to bucket postings.
# Order matters: match_func returns the FIRST entry that matches, so
# reordering this list can change how a posting is categorised.
functions = [
    'Analyst',
    'Accounting/Auditing',
    'Administrative',
    'Arts and Design',
    'Business',
    'Social Services',
    'Consulting',
    'Education',
    'Engineering',
    'Entrepreneurship',
    'Finance',
    'Healthcare Services',
    'Human Resources',
    'Information Technology',
    'Legal',
    'Marketing',
    'Media and Communication',
    'Military',
    'Operations',
    'Product Management',
    'Project Management',
    'Purchasing',
    'Quality Assurance',
    'Real Estate',
    'Research',
    'Sales',
    'Support',
]

In [7]:
# Input would be from each Job Functions column

def match_func(s, categories=None):
    """Map a raw 'Job function' value to the first matching master category.

    The raw value may list several functions, e.g. ``'Sales and Marketing'``
    or ``'Marketing, Public Relations, and Writing/Editing'``. It is split on
    commas and on the standalone word ``' and '``; each non-empty piece is
    then compared case-insensitively against the categories, in order, and
    the first category whose name *contains* the piece is returned.

    Parameters
    ----------
    s : str
        Raw job-function text for one posting. Non-string values (such as
        NaN) are returned unchanged instead of raising.
    categories : sequence of str, optional
        Category names to match against, in priority order. Defaults to the
        module-level ``functions`` list.

    Returns
    -------
    str
        The matched category. When nothing matches, the last non-empty piece
        of ``s`` (stripped) is returned as an uncategorised label; if ``s``
        has no usable pieces at all, ``s.strip()`` is returned.
    """
    if categories is None:
        categories = functions
    if not isinstance(s, str):
        return s

    pieces = []
    for chunk in s.split(','):
        chunk = chunk.strip()
        # An Oxford-comma list leaves a leading "and " on its last item.
        # Only that leading word is removed, so names that merely contain
        # "and " (e.g. "Brand Management") are left intact.
        if chunk.lower().startswith('and '):
            chunk = chunk[4:]
        for part in chunk.split(' and '):
            part = part.strip()
            # Skip empty pieces: '' is a substring of every category and
            # would otherwise always match the first one.
            if part:
                pieces.append(part)

    # Return the first category that contains any piece, scanning pieces in
    # the order they appear in the raw text.
    # NOTE(review): substring matching means a generic piece such as
    # "Management" resolves to "Product Management" -- confirm this is wanted.
    for piece in pieces:
        needle = piece.lower()
        for category in categories:
            if needle in category.lower():
                return category

    # Uncategorisable job functions fall through with their own label.
    return pieces[-1] if pieces else s.strip()
        

# Bucket every posting by passing its raw 'Job function' text through match_func.
df['Industry'] = df['Job function'].apply(match_func)

In [8]:
def _infer_country(location):
    """Derive a country label from a comma-separated location string.

    The last ', '-separated segment is taken as the country. Segments of at
    most two characters (e.g. the 'IL' in 'Chicago, IL') and segments that
    contain 'area' or 'metro' are reported as 'United States'.
    """
    tail = location.split(', ')[-1]
    lowered = tail.lower()
    looks_domestic = len(tail) <= 2 or 'area' in lowered or 'metro' in lowered
    return 'United States' if looks_domestic else tail


df['Country'] = df['location'].apply(_infer_country)

In [9]:
# Mark postings without a salary with 0. Assign the result back instead of
# calling fillna(inplace=True) on the column selection: under pandas
# Copy-on-Write the selection is a separate object, so the in-place form
# leaves `df` unchanged (and emits a chained-assignment warning in pandas 2.x).
df['compensation'] = df['compensation'].fillna(0)

def clean_comp(c, hours_per_year=2000):
    """Parse a compensation string into an annual ``(lower, upper)`` pair.

    Accepted inputs look like ``'$50,000.00/yr - $70,000.00/yr'`` or
    ``'$20.00/hr - $25.00/hr'``. Currency symbols and thousands separators
    are removed, hourly figures are scaled to a yearly amount, and both
    bounds are always returned as ``int`` so the resulting columns are
    numeric.

    Parameters
    ----------
    c : str or number
        Raw compensation value. ``0``, ``''``, whitespace-only strings,
        ``None`` and NaN are all treated as "no salary disclosed".
    hours_per_year : int, optional
        Multiplier used to turn an hourly rate into a yearly figure.
        Defaults to 2000.

    Returns
    -------
    tuple of (int, int)
        ``(lower, upper)`` yearly bounds; ``(0, 0)`` when no salary is
        given. A single figure without a range is used for both bounds.

    Raises
    ------
    ValueError
        If a bound is not numeric after the ``/yr`` or ``/hr`` suffix is
        removed (for example, an unrecognised pay period).
    AttributeError
        If ``c`` is a non-string value other than the missing markers above.
    """
    # Missing markers: None, NaN (the only value not equal to itself), 0, ''.
    if c is None or (isinstance(c, float) and c != c):
        return 0, 0
    if c == 0 or (isinstance(c, str) and not c.strip()):
        return 0, 0

    cleaned = c.replace('$', '').replace(',', '')
    parts = [part.strip() for part in cleaned.split('-')]
    low_text = parts[0]
    # A lone figure (no range) serves as both bounds.
    high_text = parts[1] if len(parts) > 1 else parts[0]

    def _amount(text):
        # Drop the pay-period suffix; float() also copes with '.00' and '.50'.
        return float(text.lower().replace('/yr', '').replace('/hr', ''))

    # Convert hourly figures to yearly; if either bound is hourly, both are scaled.
    is_hourly = any('/hr' in text.lower() for text in (low_text, high_text))
    factor = hours_per_year if is_hourly else 1

    lower = int(round(_amount(low_text) * factor))
    upper = int(round(_amount(high_text) * factor))
    return lower, upper

# print(clean_comp(''))

# Parse each compensation value once, then fan the (lower, upper) pairs out
# into their own columns.
comp_bounds = df['compensation'].apply(clean_comp)
df['lower'] = comp_bounds.apply(lambda pair: pair[0])
df['upper'] = comp_bounds.apply(lambda pair: pair[1])

# The raw columns are no longer needed once the derived ones exist.
redundant_cols = ['Job function', 'compensation', 'Industries', 'Employment type', 'Seniority level']
df.drop(columns=redundant_cols, inplace=True)

#### Some questions for the available data:
1. What function/Industry do recent job postings come from?
2. Which country is the most active in posting [xxxxx] related jobs in the past week?
3. Using those limited sample data with salary range, what would be the average/median salary for [xxxxx]?
    - Which countries are open to disclosing salary ranges in their job postings?
    - Which industries?

In [10]:
# Preview the first rows of the cleaned frame, including the derived Industry, Country, lower and upper columns.
df.head()

Unnamed: 0,job_title,company,location,desc,Industry,Country,lower,upper
0,"Junior Associate, Data & Analytics - CHI",AdTheorent,"Chicago, IL",The AdTheorent Data and Analytics (D&A) team i...,Information Technology,United States,0,0
1,"Junior Associate, Data & Analytics - NYC",AdTheorent,"New York, NY",The AdTheorent Data and Analytics (D&A) team i...,Information Technology,United States,0,0
2,"Junior Associate, Data & Analytics - LA",AdTheorent,"Los Angeles, CA",The AdTheorent Data and Analytics (D&A) team i...,Information Technology,United States,0,0
3,Junior Tax,Shopee,"Jakarta, Jakarta, Indonesia",Requirements: Minimum bachelor's degree in Acc...,Accounting/Auditing,Indonesia,0,0
4,Junior Data Engineer,EY,"Athens, Attiki, Greece","At EY, you’ll have the chance to build a caree...",Information Technology,Greece,0,0
