In [21]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns

In [22]:
# Read the dataset
df = pd.read_csv('../dataset/postings.csv')

In [38]:
import re

# Sort only tech companies

# Define a list of programming languages, removing "Go" and ensuring accurate matches
programming_languages = [
    "Python", "Java", "JavaScript", "C\\+\\+", "C#", "Ruby", "Swift", "Kotlin",
    "Rust", "PHP", "TypeScript", "Scala", "Perl", "Haskell", "Lua",
    "Dart", "Objective-C", "MATLAB",
]

# Define additional tech-related keywords
tech_keywords = [
    "data science", "data analytics", 'machine learning', 'deep learning', 'artificial intelligence', "frontend", 'backend', 'git', 'api', 'apis', 'json', 'sdk', 'developer'
]

# Combine programming languages and tech-related keywords
combined_keywords = programming_languages + tech_keywords

# Convert the combined list into a regex pattern ensuring whole-word matches
pattern = r"\b(?:{})\b".format("|".join(combined_keywords))

# Count matches in title and description
df["match_count"] = df["title"].str.count(pattern, flags=re.IGNORECASE).fillna(0) + \
                    df["description"].str.count(pattern, flags=re.IGNORECASE).fillna(0)

# Filter jobs that match at least 2 of the combined keywords
tech_postings = df[df["match_count"] >= 3].drop(columns=["match_count"])

tech_postings.head(20)

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
26,175485704,GOYT,Software Engineer,Job Description:GOYT is seeking a skilled and ...,,,"Denver, CO",76987056.0,273.0,,...,,1713281000000.0,,0,PART_TIME,,,,80202.0,8031.0
78,2234533717,Ideando Inc,Full Stack Engineer,"Location: Remote\nCompany Overview:SkillFit, a...",,,United States,69611476.0,21.0,,...,,1713493000000.0,,0,FULL_TIME,,,,,
129,3366698309,Webologix Ltd/ INC,Anaplan Developer,Job Title: Anaplan Developer\n\nLocations: US ...,,,United States,14524845.0,2.0,,...,,1713471000000.0,,0,FULL_TIME,,,,,
146,3475933396,USLI,Senior Developer,This individual will work with a high performa...,,,Greater Philadelphia,33421.0,,,...,,1713538000000.0,,0,FULL_TIME,,,,,
163,3533320307,NLB Services,Java architect / Lead Java developer,Position: Java architect / Lead Java developer...,,,"Jersey City, NJ",490432.0,3.0,,...,,1712855000000.0,,0,FULL_TIME,,,,7302.0,34017.0
181,3586167732,StyleAI,Senior Software Engineer,"StyleAI is the AI-powered, all-in-one unified ...",,,San Francisco Bay Area,90662302.0,31.0,,...,,1713397000000.0,,0,FULL_TIME,,,,,
196,3625991523,Xoriant,DDI Engineer,Title: Infoblox/DNS EngineerLocation: 6860 Yos...,,,"Jersey City, NJ",166996.0,24.0,,...,,1713277000000.0,,0,CONTRACT,,,,7302.0,34017.0
260,3722423668,Wanderboat AI,Founding Front-End Engineer (Web),The ideal candidate will be responsible for de...,,,San Francisco Bay Area,99336130.0,15.0,,...,,1713496000000.0,,0,FULL_TIME,,,,,
266,3728459637,HireBus,Principal Backend Engineer,Principal Backend Engineer - Join HireBus and ...,225.0,YEARLY,United States,91326370.0,7.0,,...,,1713471000000.0,,0,FULL_TIME,USD,BASE_SALARY,212.5,,
283,3742692445,ZenithMinds Inc,Sr Data Engineer with Kafka,Data Engineer with Kafka (W2 Only)💯% Remote\nM...,,,"Austin, TX",81941852.0,39.0,,...,,1713209000000.0,,0,FULL_TIME,,,,78701.0,48453.0


In [39]:
# Combine 'description' and 'skills_desc' columns
tech_postings["combined_text"] = tech_postings["description"].fillna("") + " " + tech_postings["skills_desc"].fillna("")

# Function to clean and tokenize text
def tokenize(text):
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    words = text.lower().split()  # Convert to lowercase and split
    return words

# Flatten the list of words from all tech descriptions
all_words = []
for text in tech_postings["combined_text"].dropna():
    all_words.extend(tokenize(text))

# Count occurrences of each word
word_counts = Counter(all_words)

# Get the top 20 most common words (potential skills)
top_20_skills = word_counts.most_common(20)

# Convert to DataFrame for better readability
top_20_df = pd.DataFrame(top_20_skills, columns=["Skill", "Count"])

# Display the top skills
import ace_tools as tools
tools.display_dataframe_to_user(name="Top 20 Tech Skills", dataframe=top_20_df)

ModuleNotFoundError: No module named 'ace_tools'