In [62]:
!pip install -q pandas kagglehub numpy scipy matplotlib scikit-learn ipykernel jupyter pytest nltk 

In [45]:
import kagglehub
import pandas as pd
import os

In [53]:
# Fetch the dataset through kagglehub, then read postings.csv from the path it returns.
path = kagglehub.dataset_download(
    "arshkon/linkedin-job-postings"
)
df = pd.read_csv(
    os.path.join(path, "postings.csv")
)

### EDA

In [51]:
df.shape

(123849, 31)

In [52]:
df["location"].value_counts().head()

location
United States    8125
New York, NY     2756
Chicago, IL      1834
Houston, TX      1762
Dallas, TX       1383
Name: count, dtype: int64

In [35]:
# NOTE(review): despite the `_df` suffix, this binds the (rows, columns) shape tuple of the
# YEARLY subset, not the subset DataFrame itself — confirm which was intended.
yearly_df = df[df["pay_period"] == "YEARLY"].shape

In [107]:
# Postings located in "New York, NY". reset_index() (without drop=True) keeps the old row
# labels as an extra `index` column, so ny_df has one more column than df.
ny_df = df[df["location"] == "New York, NY"].reset_index()

In [37]:
ny_df["pay_period"].value_counts().head()

pay_period
YEARLY     1201
HOURLY      307
MONTHLY       2
WEEKLY        2
Name: count, dtype: int64

In [108]:
ny_df.shape

(2756, 32)

In [109]:
# Postings whose location is exactly "United States".
us_df = df.loc[df["location"].eq("United States")]

In [110]:
us_df.shape

(8125, 31)

### Cleaning

In [93]:
[c for c in ny_df.columns if "salary" in c]

['max_salary', 'med_salary', 'min_salary', 'normalized_salary']

In [99]:
ny_df[ny_df["max_salary"].notna()].shape

(1371, 32)

In [100]:
ny_df[ny_df["min_salary"].notna()].shape

(1371, 32)

In [101]:
ny_df[ny_df["med_salary"].notna()].shape

(141, 32)

In [103]:
ny_df[ny_df["max_salary"].notna() & ny_df["min_salary"].notna()].shape

(1371, 32)

#### Drop rows with missing salary bounds (non-USD rows are removed after annualizing)

In [194]:
# Keep only postings that report both salary bounds. The explicit .copy() makes salary_df
# an independent frame, so later cells can modify it without pandas raising
# SettingWithCopyWarning.
salary_df = df[df["max_salary"].notna() & df["min_salary"].notna()].copy()

In [114]:
# New York postings that list both a maximum and a minimum salary.
salary_ny_df = ny_df.loc[ny_df[["max_salary", "min_salary"]].notna().all(axis=1)]

#### Annualize Salary

In [195]:
salary_df["pay_period"].value_counts()

pay_period
YEARLY      19107
HOURLY      10212
MONTHLY       288
WEEKLY        177
BIWEEKLY        9
Name: count, dtype: int64

In [174]:
salary_df["max_salary"].describe()

count    2.979300e+04
mean     9.193942e+04
std      7.011101e+05
min      1.000000e+00
25%      4.828000e+01
50%      8.000000e+04
75%      1.400000e+05
max      1.200000e+08
Name: max_salary, dtype: float64

In [196]:
# Multipliers that turn a per-period amount into a yearly one
# (2080 is presumably 40 hours/week x 52 weeks — confirm the intended convention).
salary_scale = {
    "HOURLY": 2080,
    "WEEKLY": 52,
    "BIWEEKLY": 26,
    "MONTHLY": 12,
}

# Per-row multiplier; pay periods that are not keys of salary_scale keep a factor of 1.
annual_factor = salary_df["pay_period"].map(salary_scale).fillna(1)
# Only rows whose period was rescaled are relabelled; every other label is left untouched.
rescaled = salary_df["pay_period"].isin(list(salary_scale))

# assign() builds a new frame instead of writing through .loc into a filtered selection,
# which avoids SettingWithCopyWarning. Re-running the cell is harmless: once every
# rescaled row is labelled "YEARLY", the factor is 1 everywhere.
salary_df = salary_df.assign(
    max_salary=salary_df["max_salary"] * annual_factor,
    min_salary=salary_df["min_salary"] * annual_factor,
    pay_period=salary_df["pay_period"].mask(rescaled, "YEARLY"),
)

In [180]:
salary_df["max_salary"].describe()

count    2.979300e+04
mean     2.639483e+05
std      6.096351e+06
min      1.000000e+00
25%      6.572800e+04
50%      1.010000e+05
75%      1.500000e+05
max      5.720000e+08
Name: max_salary, dtype: float64

In [181]:
salary_df["pay_period"].value_counts()

pay_period
YEARLY    29793
Name: count, dtype: int64

In [200]:
# Keep only rows whose currency is 'USD'.
salary_df = salary_df.loc[salary_df["currency"].eq('USD')]

In [205]:
# Narrow the frame to the description text and the two salary bounds.
salary_df = salary_df.loc[:, ["description", "max_salary", "min_salary"]]

In [206]:
salary_df

Unnamed: 0,description,max_salary,min_salary
0,Job descriptionA leading real estate firm in N...,41600.0,35360.0
1,"At Aspen Therapy and Wellness , we are committ...",104000.0,62400.0
2,The National Exemplar is accepting application...,65000.0,45000.0
3,Senior Associate Attorney - Elder Law / Trusts...,175000.0,140000.0
4,Looking for HVAC service tech with experience ...,80000.0,60000.0
...,...,...,...
123837,"Position: Clinical Contracts Analyst, Req#: 63...",93600.0,72800.0
123839,This role handles all the onsite catering and ...,65000.0,50000.0
123843,Position: Quality Engineer I (Complaint Invest...,104000.0,62400.0
123844,Our Walnut Creek office is currently seeking a...,195000.0,120000.0


### Features

In [69]:
import nltk 
import re 
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/alexanderpeterson/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [207]:
# Take the third remaining row by position. Plain [2] on a Series with integer labels is a
# label lookup and raises KeyError if the row labelled 2 was removed by earlier filtering.
description = salary_df["description"].iloc[2]
description

'The National Exemplar is accepting applications for an Assistant Restaurant Manager.\nWe offer highly competitive wages, healthcare, paid time off, complimentary dining privileges and bonus opportunities. \nWe are a serious, professional, long-standing neighborhood restaurant with over 41 years of service. If you are looking for a long-term fit with a best in class organization then you should apply now. \nPlease send a resumes to pardom@nationalexemplar.com. o'

In [216]:
from collections import Counter
from typing import Dict

def tokenize(text) -> Dict[str, int]:
    """Count word occurrences in a piece of free text.

    The text is split into sentences with NLTK. Each sentence is lower-cased,
    every non-word character is replaced by a space, whitespace runs are
    collapsed, and the result is split into words with NLTK.

    Args:
        text: The text to tokenize. Anything that is not a non-blank string
            (for example ``None`` or a float ``NaN``) is treated as empty.

    Returns:
        A ``collections.Counter`` (a ``dict`` subclass) mapping each word to
        the number of times it occurs in ``text``; empty for empty input.
    """
    # A missing description may arrive as None or float NaN rather than a string;
    # return early instead of handing a non-string to NLTK.
    if not isinstance(text, str) or not text.strip():
        return Counter()

    word2count = Counter()
    for sentence in nltk.sent_tokenize(text):
        cleaned = re.sub(r'\W', ' ', sentence.lower())
        cleaned = re.sub(r'\s+', ' ', cleaned)
        word2count.update(nltk.word_tokenize(cleaned))
    return word2count

# tokenize(description)

In [233]:
from collections import Counter
import heapq 

# Accumulate word counts over the first five descriptions only (a small sample,
# not the entire dataset).
word2count = Counter()
# A separate loop name keeps the sample `description` picked earlier from being overwritten.
for posting_text in salary_df["description"].iloc[:5]:
    word2count.update(tokenize(posting_text))

In [235]:
from nltk.corpus import stopwords

# Fetch the stop-word corpus before reading it.
nltk.download('stopwords')

# Remove English stop words from the counts in place; pop() with a default
# skips words that never occurred.
english_stopwords = stopwords.words('english')
for stop_word in english_stopwords:
    word2count.pop(stop_word, None)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexanderpeterson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [241]:
word2count

Counter({'team': 10,
         'work': 10,
         'estate': 8,
         'marketing': 8,
         'time': 6,
         'person': 5,
         'planning': 5,
         'quality': 5,
         'practice': 5,
         'law': 5,
         'job': 4,
         'new': 4,
         'experience': 4,
         'design': 4,
         'working': 4,
         'please': 4,
         'environment': 4,
         'agents': 4,
         'skills': 4,
         'social': 4,
         'committed': 4,
         'full': 4,
         'wellness': 4,
         'looking': 4,
         'professional': 4,
         'treatment': 4,
         'services': 4,
         'service': 4,
         'firm': 3,
         'communicate': 3,
         'receive': 3,
         'requests': 3,
         'gender': 3,
         'paid': 3,
         'preferred': 3,
         'years': 3,
         'therapy': 3,
         'clients': 3,
         'life': 3,
         'supervision': 3,
         'mental': 3,
         'communication': 3,
         'health': 3,
         'attor