In [4]:
import pandas as pd
import re
from collections import Counter

DATASET_PATH = '../dataset'

all_postings = pd.read_csv(f'{DATASET_PATH}/postings.csv')

In [2]:
programming_languages = ["Python", "Java", "JavaScript", "C\\+\\+", "C#", "Ruby", "Swift", "Kotlin",
    "Rust", "PHP", "TypeScript", "Scala", "Perl", "Haskell", "Lua",
    "Dart", "Objective-C", "MATLAB", "HTML", "CSS", "SQL", "NoSQL", "MongoDB", "MySQL", "PostgreSQL",
    "React", "Angular", "Vue", "Node.js", "Express", "Flask", "Django", "Spring", "Ruby on Rails",
    "TensorFlow", "PyTorch", "Keras", "Scikit-learn", "Pandas", "NumPy", "SciPy", "Matplotlib",
    "Kubernetes", "Docker", "Jenkins", "Travis CI", "GitLab CI", "GitLab", "GitHub", "Git",
    "AWS", "Azure", "Google Cloud", "GCP", "Heroku", "DigitalOcean", "Firebase", "Netlify",
    "Linux", "Unix", "Windows", "macOS", "iOS", "Android", "Raspberry Pi", "Arduino", "ESP32",
    "REST", "GraphQL", "gRPC", "SOAP", "WebSockets", "OAuth", "JWT", "OpenID", "SAML",
    "Agile", "Scrum", "Kanban", "XP", "Pair Programming", "TDD", "BDD", "CI/CD",]

# Define additional tech-related keywords
tech_keywords = [
    "data science", "data analytics", 'machine learning', 'deep learning', 'artificial intelligence', "frontend", 'backend', 'git', 'api', 'apis', 'json', 'sdk', 'developer'
]

# Combine programming languages and tech-related keywords
combined_keywords = programming_languages + tech_keywords

# Convert the combined list into a regex pattern ensuring whole-word matches
pattern = r"\b(?:{})\b".format("|".join(combined_keywords))

# Count matches in title and description
all_postings["match_count"] = all_postings["title"].str.count(pattern, flags=re.IGNORECASE).fillna(0) + \
                    all_postings["description"].str.count(pattern, flags=re.IGNORECASE).fillna(0)

# Filter jobs that match at least 3 of the combined keywords
tech_postings = all_postings[all_postings["match_count"] >= 3].drop(columns=["match_count"])

tech_postings.head(20)


Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
26,175485704,GOYT,Software Engineer,Job Description:GOYT is seeking a skilled and ...,,,"Denver, CO",76987056.0,273.0,,...,,1713281000000.0,,0,PART_TIME,,,,80202.0,8031.0
78,2234533717,Ideando Inc,Full Stack Engineer,"Location: Remote\nCompany Overview:SkillFit, a...",,,United States,69611476.0,21.0,,...,,1713493000000.0,,0,FULL_TIME,,,,,
108,3169712432,SysMind,Salesforce Vlocity Developer,Role: Salesforce Vlocity DeveloperLocation: Ne...,,,"Jersey City, NJ",85964.0,146.0,,...,,1713211000000.0,,0,CONTRACT,,,,7302.0,34017.0
116,3245063922,Saxon AI,Data Architect,Request: Data ArchitectLocation: San Francisco...,,,"San Francisco, CA",224935.0,7.0,,...,,1713537000000.0,,0,CONTRACT,,,,94101.0,6075.0
129,3366698309,Webologix Ltd/ INC,Anaplan Developer,Job Title: Anaplan Developer\n\nLocations: US ...,,,United States,14524845.0,2.0,,...,,1713471000000.0,,0,FULL_TIME,,,,,
146,3475933396,USLI,Senior Developer,This individual will work with a high performa...,,,Greater Philadelphia,33421.0,,,...,,1713538000000.0,,0,FULL_TIME,,,,,
163,3533320307,NLB Services,Java architect / Lead Java developer,Position: Java architect / Lead Java developer...,,,"Jersey City, NJ",490432.0,3.0,,...,,1712855000000.0,,0,FULL_TIME,,,,7302.0,34017.0
181,3586167732,StyleAI,Senior Software Engineer,"StyleAI is the AI-powered, all-in-one unified ...",,,San Francisco Bay Area,90662302.0,31.0,,...,,1713397000000.0,,0,FULL_TIME,,,,,
196,3625991523,Xoriant,DDI Engineer,Title: Infoblox/DNS EngineerLocation: 6860 Yos...,,,"Jersey City, NJ",166996.0,24.0,,...,,1713277000000.0,,0,CONTRACT,,,,7302.0,34017.0
237,3700068571,Akshaya Inc,Software Implementation Program Manager,Title: Software Implementation Program Manager...,,,"Cupertino, CA",18838319.0,2.0,,...,,1713562000000.0,,0,CONTRACT,,,,95014.0,6085.0


In [5]:
Counter(tech_postings['location'])

Counter({'United States': 1835,
         'New York, NY': 399,
         'Chicago, IL': 288,
         'Atlanta, GA': 253,
         'Dallas, TX': 234,
         'Austin, TX': 215,
         'Seattle, WA': 176,
         'Charlotte, NC': 169,
         'San Francisco, CA': 160,
         'Boston, MA': 157,
         'Plano, TX': 157,
         'McLean, VA': 146,
         'Houston, TX': 144,
         'Washington, DC': 130,
         'Richmond, VA': 123,
         'Phoenix, AZ': 121,
         'Irving, TX': 112,
         'New York City Metropolitan Area': 111,
         'New York, United States': 111,
         'San Jose, CA': 110,
         'Jersey City, NJ': 100,
         'San Diego, CA': 97,
         'Sunnyvale, CA': 90,
         'Columbus, OH': 86,
         'Denver, CO': 81,
         'Tampa, FL': 81,
         'Los Angeles, CA': 80,
         'Alpharetta, GA': 79,
         'California, United States': 73,
         'Pittsburgh, PA': 70,
         'Philadelphia, PA': 69,
         'Redmond, WA': 68,
      

In [6]:
state_abbreviations = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
]

filtered_postings = tech_postings[tech_postings['location'].str[-2:].isin(state_abbreviations)]
filtered_postings.head()


Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
26,175485704,GOYT,Software Engineer,Job Description:GOYT is seeking a skilled and ...,,,"Denver, CO",76987056.0,273.0,,...,,1713281000000.0,,0,PART_TIME,,,,80202.0,8031.0
108,3169712432,SysMind,Salesforce Vlocity Developer,Role: Salesforce Vlocity DeveloperLocation: Ne...,,,"Jersey City, NJ",85964.0,146.0,,...,,1713211000000.0,,0,CONTRACT,,,,7302.0,34017.0
116,3245063922,Saxon AI,Data Architect,Request: Data ArchitectLocation: San Francisco...,,,"San Francisco, CA",224935.0,7.0,,...,,1713537000000.0,,0,CONTRACT,,,,94101.0,6075.0
163,3533320307,NLB Services,Java architect / Lead Java developer,Position: Java architect / Lead Java developer...,,,"Jersey City, NJ",490432.0,3.0,,...,,1712855000000.0,,0,FULL_TIME,,,,7302.0,34017.0
196,3625991523,Xoriant,DDI Engineer,Title: Infoblox/DNS EngineerLocation: 6860 Yos...,,,"Jersey City, NJ",166996.0,24.0,,...,,1713277000000.0,,0,CONTRACT,,,,7302.0,34017.0
