# Job Market Analysis: Extracting and Analyzing Skills from Listings

In [1]:
import json
import pandas as pd

In [2]:
# Load data from the JSON file and create a DataFrame
file_path = "notebooks/claude_collection/skills_json_from_df_v2.json"

with open(file_path, "r") as file:
    data = json.load(file)

df = pd.DataFrame(data)

In [3]:
# Check information about the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   index            25 non-null     int64 
 1   job_description  25 non-null     object
 2   job_title        25 non-null     object
 3   skills           25 non-null     object
dtypes: int64(1), object(3)
memory usage: 932.0+ bytes


In [4]:
# Display first 5 rows
pd.set_option("display.max_colwidth", 110)
df.head(5)

Unnamed: 0,index,job_description,job_title,skills
0,0,"About the job\n \n \nAbout RevolutPeople deserve more from their money. More visibility, more c...",Head of Product (Crypto Exchange),"{'skills': [{'name': 'Crypto Exchange Product Development', 'reference_text': 'Completely owning and build..."
1,1,About the job\n \n \nASI Reisen ist eine der führenden Reiseplattformen für nachhaltige Aktiv- ...,Head of Product (w/m/d),"{'skills': [{'name': 'Team leadership', 'reference_text': 'Du führst das Tech Team bestehend aus ERP/Web D..."
2,2,"About the job\n \n \nDepartment: ProductEmployment Type: Full TimeLocation: Kraków, PolandRepo...",Head of Product - Voluum,"{'skills': [{'name': 'Product Strategy', 'reference_text': 'develop and articulate a clear and compelling ..."
3,3,About the job\n \n \nDescriptionAbout us:Pragmatic Play is a leading game developer providing p...,Senior Product Owner,"{'skills': [{'name': 'Live Casino product development', 'reference_text': 'Produce successful Live Casino ..."
4,4,About the job\n \n \nProyecto ambicioso|Start up en pleno crecimiento\nAbout Our ClientStart up...,CPO-B2C- Remoto 100%,"{'skills': [{'name': 'Product analysis and improvement', 'reference_text': 'Analisiis y mejora del product..."


## Cleaning Skills Column
To enhance readability and prepare the skills column for analysis, we will execute a series of transformations. Specifically, we will extract the names of skills from the existing structure. The initial format of each entry in the skills column was as follows:
{'skills': [{'name': 'Product Strategy', 'reference_text': 'develop and articulate...'}]}

The cleaning process will involve exclusively retaining a list of skill names, making the column more suitable for straightforward analysis.


In [5]:
# Clean skills column by extracting names
df["skills"] = df["skills"].apply(lambda x: [skill["name"] for skill in x["skills"]])

df.head(5)

Unnamed: 0,index,job_description,job_title,skills
0,0,"About the job\n \n \nAbout RevolutPeople deserve more from their money. More visibility, more c...",Head of Product (Crypto Exchange),"[Crypto Exchange Product Development, Product Roadmapping, People Management, UX/UI Design, Technical Back..."
1,1,About the job\n \n \nASI Reisen ist eine der führenden Reiseplattformen für nachhaltige Aktiv- ...,Head of Product (w/m/d),"[Team leadership, Technical expertise, Team development, Process improvement, Communication skills, Perfor..."
2,2,"About the job\n \n \nDepartment: ProductEmployment Type: Full TimeLocation: Kraków, PolandRepo...",Head of Product - Voluum,"[Product Strategy, Team Leadership, Execution, Market Research, Customer Focus, Cross-Functional Collabora..."
3,3,About the job\n \n \nDescriptionAbout us:Pragmatic Play is a leading game developer providing p...,Senior Product Owner,"[Live Casino product development, Game design and documentation, Product vision and stakeholder coordinati..."
4,4,About the job\n \n \nProyecto ambicioso|Start up en pleno crecimiento\nAbout Our ClientStart up...,CPO-B2C- Remoto 100%,"[Product analysis and improvement, Sales strategy, Collaboration, Innovation, Ideation and conceptualization]"


## Adding Columns for Easier Analysis

After cleaning `skills` to display a list of skills only, we are now introducing new columns to facilitate further analysis.

- **num_phrases**: This column counts the number of phrases within `skills`. Each phrase represents a group of words that were identified as important skills.

- **skills_text**: This one mirrors the content of the `skills` column but with the removal of ',' to represent individual words. It allows us to view each word individually.

- **num_words**:  `num_words` counts the total number of words in the `skills_text` column.

- **unique_words**: This column also counts the words in `skills_text` but without repetition, providing insight into the unique vocabulary present in the skills descriptions.

In [6]:
# Count number of phrases inside skills
df["num_phrases"] = df["skills"].str.len()

# Join skills into full text string
df["skills_text"] = df["skills"].apply(" ".join)

# Split skills into individual words
words = df["skills_text"].str.split(None)

# Count number of words
df["num_words"] = words.str.len()

# Get unique words per row
df["unique_words"] = words.apply(set)

# Count uniques and summarize
df["num_uniques"] = df["unique_words"].apply(len)
word_counts = df.groupby("job_title")["num_uniques"].sum()

display(df.head(5))

Unnamed: 0,index,job_description,job_title,skills,num_phrases,skills_text,num_words,unique_words,num_uniques
0,0,"About the job\n \n \nAbout RevolutPeople deserve more from their money. More visibility, more c...",Head of Product (Crypto Exchange),"[Crypto Exchange Product Development, Product Roadmapping, People Management, UX/UI Design, Technical Back...",6,Crypto Exchange Product Development Product Roadmapping People Management UX/UI Design Technical Backend D...,14,"{Analysis, Management, Technical, Backend, Product, Development, Roadmapping, People, Data, Crypto, Design...",13
1,1,About the job\n \n \nASI Reisen ist eine der führenden Reiseplattformen für nachhaltige Aktiv- ...,Head of Product (w/m/d),"[Team leadership, Technical expertise, Team development, Process improvement, Communication skills, Perfor...",12,Team leadership Technical expertise Team development Process improvement Communication skills Performance ...,25,"{improvement, development, Agile, management, Data-driven, leadership, Performance, Stress, skills, Commun...",21
2,2,"About the job\n \n \nDepartment: ProductEmployment Type: Full TimeLocation: Kraków, PolandRepo...",Head of Product - Voluum,"[Product Strategy, Team Leadership, Execution, Market Research, Customer Focus, Cross-Functional Collabora...",14,Product Strategy Team Leadership Execution Market Research Customer Focus Cross-Functional Collaboration P...,30,"{Execution, Advertising, Proven, Market, Experience, Technology, Product, Strategy, it, Customer, Strategi...",26
3,3,About the job\n \n \nDescriptionAbout us:Pragmatic Play is a leading game developer providing p...,Senior Product Owner,"[Live Casino product development, Game design and documentation, Product vision and stakeholder coordinati...",6,Live Casino product development Game design and documentation Product vision and stakeholder coordination ...,20,"{Casino, stakeholder, Live, product, Product, development, resolution, Game, vision, design, management, i...",17
4,4,About the job\n \n \nProyecto ambicioso|Start up en pleno crecimiento\nAbout Our ClientStart up...,CPO-B2C- Remoto 100%,"[Product analysis and improvement, Sales strategy, Collaboration, Innovation, Ideation and conceptualization]",5,Product analysis and improvement Sales strategy Collaboration Innovation Ideation and conceptualization,11,"{Innovation, analysis, Product, improvement, strategy, Collaboration, Ideation, conceptualization, and, Sa...",10


In [7]:
"""# Generate word cloud
wordcloud = WordCloud(width=900, height=380, colormap="summer").generate(df["skills"])

# Plot word cloud
plt.figure(figsize=(16, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("\nSkills Word Cloud\n", size=18, color="#222222")
plt.axis("off")

# Set the background color of the figure to white
plt.gcf().patch.set_facecolor("white")

# Adjust layout to center the plot
plt.tight_layout()

plt.show()"""

'# Generate word cloud\nwordcloud = WordCloud(width=900, height=380, colormap="summer").generate(df["skills"])\n\n# Plot word cloud\nplt.figure(figsize=(16, 6))\nplt.imshow(wordcloud, interpolation="bilinear")\nplt.title("\nSkills Word Cloud\n", size=18, color="#222222")\nplt.axis("off")\n\n# Set the background color of the figure to white\nplt.gcf().patch.set_facecolor("white")\n\n# Adjust layout to center the plot\nplt.tight_layout()\n\nplt.show()'

In [8]:
from collections import Counter

# Count the frequency of each skill
skill_counts = Counter(skill["name"] for skill in skills)

# Print the most common skills
most_common_skills = skill_counts.most_common()
print(f"Most common skills for {job_title}: {most_common_skills}")

NameError: name 'skills' is not defined

In [None]:
import re

data = """
name='Crypto Exchange Product Management' reference_text='Completely owning and building our Crypto Exchange product, increasing the number of tokens tradable in the application and providing the tools retail users expect from exchanges'
name='Team Leadership' reference_text="Setting your team's goals, success metrics, and roadmap to align with Revolut’s mission and drive maximum impact based on data analysis, market research, and company strategy"
name='Crypto Product Development' reference_text='Working with our Core Crypto team to expand and improve our suite of crypto products, including deposits, withdrawals, and staking, making them more accessible for our retail users'
name='UX Design' reference_text='Working closely with Design and UX Research to define the customer journey and create an amazing user experience'
name='Engineering Collaboration' reference_text='Liaising with Engineering to ensure effective delivery of the product'
name='Stakeholder Collaboration' reference_text='Collaborating with other stakeholders, such as Product Marketing, Legal, and Product Strategy'
"""

# Extract skills and reference texts
skill_data = re.findall(r"name='(.*?)' reference_text='(.*?)'", data)

# Print extracted skills and reference texts
for skill in skill_data:
    print(f"Skill: {skill[0]}\nReference Text: {skill[1]}\n")

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Assuming skill_data contains the extracted skills
all_skills = " ".join(skill[0] for skill in skill_data)

# Generate word cloud
wordcloud = WordCloud(width=900, height=380, colormap="summer").generate(all_skills)

# Plot word cloud
plt.figure(figsize=(16, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("\nSkills Word Cloud\n", size=18, color="#222222")
plt.axis("off")

# Set the background color of the figure to white
plt.gcf().patch.set_facecolor("white")

# Adjust layout to center the plot
plt.tight_layout()

plt.show()

In [None]:
from collections import Counter
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Tokenize the text
doc = nlp(all_skills)

# Perform frequency analysis
word_freq = Counter(token.text for token in doc)

# Display the top N most common words
top_n = 5
print(f"Top {top_n} Most Common Words:")
for word, count in word_freq.most_common(top_n):
    print(f"{word}: {count}")

In [None]:
# Display the top N most common words
top_n = 10
top_words, top_frequencies = zip(*word_freq.most_common(top_n))

# Normalize frequencies to use as sizes for dots
sizes = [freq * 10 for freq in top_frequencies]

# Create a scatter plot
plt.figure(figsize=(16, 4.5))
ax = plt.gca()

# Set the background color of the entire figure
fig = plt.gcf()
fig.patch.set_facecolor("white")

# Plot scatter points
plt.scatter(top_words, top_frequencies, alpha=0.7, edgecolors="w", marker="o")

# Style the plot
plt.style.use("classic")
ax.grid(True)

from matplotlib.colors import LogNorm

# Create a scatter plot with different colors and sizes
scatter = ax.scatter(
    top_words,
    top_frequencies,
    c=top_frequencies,
    cmap="summer",
    s=sizes,
    alpha=0.7,
    edgecolors="w",
    marker="o",
    norm=LogNorm(),
)

# Set plot title with background color
plt.title("\nTop 10 Word Frequency Scatter Plot\n", size=18, color="#222222")

# Label the axes
plt.xlabel("\nWords\n", size=14, color="#222222")
plt.ylabel("\nFrequency\n", size=14, color="#222222")

# Rotate x-axis labels for better readability
plt.xticks(rotation=35, ha="right", color="#333333")
plt.yticks(color="#333333")

# Remove top and right spines
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

# Set the left and bottom spines to grey color
ax.spines["left"].set_color("grey")
ax.spines["bottom"].set_color("grey")

# Display the plot
plt.show()

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# EXAPLES
job_skills = {
    "Data Scientist": ["Python", "Machine Learning", "Data Analysis", "Statistics"],
    "Software Engineer": ["Java", "JavaScript", "Software Development", "Databases"],
    "Product Manager": ["Product Strategy", "Market Research", "Project Management"],
    "UX Designer": ["User Experience", "Wireframing", "UI Design"],
    "Business Analyst": [
        "Business Analysis",
        "Data Modeling",
        "Requirements Gathering",
    ],
}

In [None]:
# Placeholder function to generate visualizations
def generate_visualization(job_type):
    clear_output(wait=True)
    print(f"Visualizations for {job_type}:\n")
    print(f"Skills: {', '.join(job_skills.get(job_type, []))}")

    # Get all skills for the selected job type
    all_skills = " ".join(skill[0] for skill in job_skills)

    # Generate word cloud
    wordcloud = WordCloud(width=900, height=380, colormap="summer").generate(all_skills)

    # Plot word cloud
    plt.figure(figsize=(16, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title("\nSkills Word Cloud\n", size=18, color="#222222")
    plt.axis("off")

    # Set the background color of the figure to white
    plt.gcf().patch.set_facecolor("white")

    # Adjust layout to center the plot
    plt.tight_layout()

    plt.show()

In [None]:
# List of job types
job_types = list(job_skills.keys())

# Dropdown widget for job selection
job_dropdown = widgets.Dropdown(
    options=job_types,
    value=job_types[0],
    description="Select Job Type:",
    disabled=False,
    style={"description_width": "initial", "width": "400px"},
)

selected_job_type = job_dropdown.value
skills_for_selected_job = job_skills.get(selected_job_type, [])
all_skills = " ".join(skills_for_selected_job)


# Function to update visualizations when the dropdown value changes
def on_job_type_change(change):
    generate_visualization(change.new)


# Attach the function to the dropdown's observe event
job_dropdown.observe(on_job_type_change, names="value")

# Display the initial visualization
generate_visualization(job_types[0])

# Display the dropdown widget
display(job_dropdown)

In [None]:
def generate_visualization(job_type):
    clear_output(wait=True)
    print(f"Visualizations for {job_type}:\n")

    # Get skills for the selected job type
    skills_for_selected_job = job_skills.get(job_type, [])
    all_skills = " ".join(skills_for_selected_job)

    # Generate word cloud
    wordcloud = WordCloud(width=900, height=380, colormap="summer").generate(all_skills)

    # Plot word cloud
    plt.figure(figsize=(16, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title("\nSkills Word Cloud\n", size=18, color="#222222")
    plt.axis("off")

    # Set the background color of the figure to white
    plt.gcf().patch.set_facecolor("white")

    # Adjust layout to center the plot
    plt.tight_layout()

    plt.show()

In [None]:
import ipywidgets as widgets
from IPython.display import display
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# EXAMPLES
job_skills = {
    "Data Scientist": ["Python", "Machine Learning", "Data Analysis", "Statistics"],
    "Software Engineer": ["Java", "JavaScript", "Software Development", "Databases"],
    "Product Manager": ["Product Strategy", "Market Research", "Project Management"],
    "UX Designer": ["User Experience", "Wireframing", "UI Design"],
    "Business Analyst": [
        "Business Analysis",
        "Data Modeling",
        "Requirements Gathering",
    ],
}


# Placeholder function to generate visualizations
def generate_visualization(job_type):
    # Get skills for the selected job type
    skills_for_selected_job = job_skills.get(job_type, [])
    all_skills = " ".join(skills_for_selected_job)

    # Generate word cloud
    wordcloud = WordCloud(width=900, height=380, colormap="summer").generate(all_skills)

    # Plot word cloud
    plt.figure(figsize=(16, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title("\nSkills Word Cloud\n", size=18, color="#222222")
    plt.axis("off")

    # Set the background color of the figure to white
    plt.gcf().patch.set_facecolor("white")

    # Adjust layout to center the plot
    plt.tight_layout()

    plt.show()


# List of job types
job_types = list(job_skills.keys())

# Dropdown widget for job selection
job_dropdown = widgets.Dropdown(
    options=job_types,
    value=job_types[0],
    description="Select Job Type:",
    disabled=False,
    style={"description_width": "initial", "width": "400px"},
)


# Function to update visualizations when the dropdown value changes
def on_job_type_change(change):
    generate_visualization(change.new)


# Attach the function to the dropdown's observe event
job_dropdown.observe(on_job_type_change, names="value")

# Display the initial visualization
generate_visualization(job_types[0])

# Display the dropdown widget
display(job_dropdown)