## Importing libraries

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import os
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from sklearn.preprocessing import MultiLabelBinarizer

## Loading data

In [None]:
# Read the job-postings table that every later cell works from.
postings_df = pd.read_csv(filepath_or_buffer='job_postings_updated.csv')

In [None]:
# Number of postings treated as "having salary info": rows without a
# missing value in ANY column.
# NOTE(review): stricter than "ref_salary is present" if other columns can
# also be missing — confirm against the CSV.
postings_df.dropna(axis=0, how='any').shape[0]

In [None]:
# Number of postings whose reference salary is strictly above $100k.
postings_df.loc[postings_df['ref_salary'].gt(100000)].shape[0]

In [None]:
# Summary statistics of the reference-salary column.
postings_df.loc[:, 'ref_salary'].describe()

In [None]:
# Preview the first five postings.
postings_df.head(n=5)

## Position / job level table

### All

In [None]:
# Count postings per (position, job level) pair; margins=True also adds the
# row/column totals. reset_index turns 'position' back into a regular column.
grouped_df = pd.crosstab(
    index=postings_df['position'],
    columns=postings_df['levelMapping'],
    margins=True,
).reset_index()
# Remove the leftover name on the columns axis so the table prints cleanly.
grouped_df = grouped_df.rename_axis(None, axis=1)

In [None]:
# Show the position-by-level count table as the cell output.
grouped_df

### With salary

In [None]:
# Subset used as "postings with salary": rows with no missing value in any
# column.
# NOTE(review): this also drops rows that have a salary but miss some other
# field — confirm that is intended.
postings_df_with_salary = postings_df.dropna(axis=0, how='any')

In [None]:
# Same position-by-level count table (with totals), restricted to the
# complete-row subset postings_df_with_salary.
grouped_df = pd.crosstab(
    index=postings_df_with_salary['position'],
    columns=postings_df_with_salary['levelMapping'],
    margins=True,
).reset_index()
# Remove the leftover name on the columns axis so the table prints cleanly.
grouped_df = grouped_df.rename_axis(None, axis=1)

In [None]:
# Show the position-by-level count table as the cell output.
grouped_df

## Filtered data by job level

In [None]:
# Split the postings into one frame per job level. The levelMapping values
# matched here are 'associate', 'entry_level' and 'mid_senior'.
# NOTE(review): 'associate' feeds beginner_df and 'entry_level' feeds
# intermediate_df — confirm this pairing is intended and not swapped.
beginner_df = postings_df.loc[postings_df['levelMapping'].eq('associate')]
intermediate_df = postings_df.loc[postings_df['levelMapping'].eq('entry_level')]
advanced_df = postings_df.loc[postings_df['levelMapping'].eq('mid_senior')]

## Skill counts

In [None]:
def _as_comma_string(value):
    """Join a list of skills with ', '; return any non-list value unchanged."""
    if isinstance(value, list):
        return ', '.join(value)
    return value


# Bring extracted_skills to a single ', '-separated string per posting, then
# split that string so every posting carries its skills as a list.
postings_df['skills'] = postings_df['extracted_skills'].apply(_as_comma_string)
postings_df['skills_list'] = postings_df['skills'].str.split(', ')

# One row per (posting, skill); title-casing merges spellings that differ
# only in letter case before counting.
skill_df = postings_df.explode('skills_list')
skill_counts = skill_df['skills_list'].str.title().value_counts()
print(skill_counts)

## Top 50 skills plot

In [None]:
# Horizontal bar chart of the (up to) 50 most frequent skills, each bar
# annotated with its count, saved to figures/top50skills.jpeg.
top_skill_counts = skill_counts.iloc[:50].sort_values(ascending=True)

color_map = plt.colormaps.get_cmap('turbo')
# One colour per bar actually drawn, so the gradient still spans the chosen
# range when there are fewer than 50 distinct skills.
colors = color_map(np.linspace(0.3, 0.8, len(top_skill_counts)))

plt.figure(figsize=(8, 18))
bars = top_skill_counts.plot(kind='barh', color=colors)

# Write each count just to the right of its bar, vertically centred on it.
for bar in bars.patches:
    plt.text(
        bar.get_width() + 60,
        bar.get_y() + bar.get_height() / 2,
        f'{int(bar.get_width())}',
        ha='center', va='center', fontsize=12, color='black'
    )
bars.set_ylabel('')
plt.yticks(fontsize=12)

# Extra room on the right so the count labels are not clipped.
plt.xlim(0, skill_counts.max() + 135)

# savefig does not create missing directories, so make sure it exists.
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/top50skills.jpeg', bbox_inches='tight')

plt.show()

## Salary histogram plot

In [None]:

# Histogram of the reference salary over all postings, saved to
# figures/salary.jpeg.
plt.figure(figsize=(10, 6))
hist_ax = sns.histplot(data=postings_df, x='ref_salary')
hist_ax.set_xlabel('Reference salary')
hist_ax.figure.savefig('figures/salary.jpeg', bbox_inches='tight')
plt.show()

## Boxplot of salary for each search string

In [None]:
# Horizontal boxplots of the reference salary, one box per position
# (outliers hidden), saved to figures/boxplots_positions.jpeg.
fig, ax = plt.subplots(figsize=(10, 5))  # wide, short canvas
postings_df.boxplot(column='ref_salary', by='position', vert=False, patch_artist=True, showfliers=False, widths=0.7, ax=ax)

# Blank out the axes title, the y-axis label and the figure suptitle.
ax.set_title('')
ax.set_ylabel('')
fig.suptitle('')

# Enlarge the tick labels on both axes. tick_params is used for the numeric
# x-axis instead of set_xticklabels, which would pin a fixed list of label
# strings onto an automatically located axis (and is superseded by the
# formatter installed below anyway).
ax.tick_params(axis='both', labelsize=13)

# Show salaries in thousands of dollars, e.g. 120000 -> "$120k".
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f'${x/1000:.0f}k'))

# Adjust layout
fig.tight_layout(pad=2.0, w_pad=2.0, h_pad=2.0)

# savefig does not create missing directories, so make sure it exists.
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/boxplots_positions.jpeg', bbox_inches='tight')

plt.show()

## Boxplot of salary for each job level

In [None]:
# Horizontal boxplots of the reference salary, one box per job level
# (outliers hidden), saved to figures/boxplots_levels.jpeg.
fig, ax = plt.subplots(figsize=(10, 5))  # wide, short canvas
postings_df.boxplot(column='ref_salary', by='levelMapping', vert=False, patch_artist=True, showfliers=False, widths=0.7, ax=ax)

# Blank out the axes title, the y-axis label and the figure suptitle.
ax.set_title('')
ax.set_ylabel('')
fig.suptitle('')

# Enlarge the tick labels on both axes. tick_params is used for the numeric
# x-axis instead of set_xticklabels, which would pin a fixed list of label
# strings onto an automatically located axis (and is superseded by the
# formatter installed below anyway).
ax.tick_params(axis='both', labelsize=13)

# Replace the raw level codes with readable labels; any other level keeps
# its original text.
pretty_levels = {'mid_senior': 'mid/ senior', 'entry_level': 'entry level'}
yticks = [tick.get_text() for tick in ax.get_yticklabels()]
ytick_labels = [pretty_levels.get(tick, tick) for tick in yticks]
ax.set_yticklabels(ytick_labels)

# Show salaries in thousands of dollars, e.g. 120000 -> "$120k".
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f'${x/1000:.0f}k'))

# Adjust layout
fig.tight_layout(pad=2.0, w_pad=2.0, h_pad=2.0)

# savefig does not create missing directories, so make sure it exists.
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/boxplots_levels.jpeg', bbox_inches='tight')

plt.show()

## Skills with salary info

In [None]:
# Build a posting x skill 0/1 indicator matrix, append it to postings_df and
# derive, for every skill, its mean reference salary and its posting count.

# Sorted so the column order is the same on every run (a bare set has no
# stable order for strings).
skills = sorted({skill for sublist in postings_df['skills_list'] for skill in sublist})
skills_matrix = pd.DataFrame(0, index=postings_df.index, columns=skills)

# Pair each row's index label with its skills so the right row is flagged
# even if the index is not 0..n-1.
for row_label, row_skills in zip(postings_df.index, postings_df['skills_list']):
    for skill in row_skills:
        skills_matrix.at[row_label, skill] = 1

# Concatenate the skills matrix with the original dataframe.
# NOTE(review): re-running this cell appends the skill columns again —
# reload postings_df first.
postings_df = pd.concat([postings_df, skills_matrix], axis=1)

# Salary contributed by each posting to each skill it mentions (0 where the
# skill is absent, NaN where the posting has no salary).
skill_salary = postings_df[skills].multiply(postings_df['ref_salary'], axis=0)

# Mean salary per skill. The sum above skips postings without a salary, so
# the divisor must count only postings that have one; dividing by every
# posting that mentions the skill would pull the mean down. Skills that
# never appear with a salary come out as NaN.
has_salary = postings_df['ref_salary'].notna()
skill_salary_mean = skill_salary.sum() / postings_df.loc[has_salary, skills].sum()

# Number of postings (with or without salary) mentioning each skill.
skill_count = postings_df[skills].sum()

In [None]:
# One row per skill, with its mean salary and its posting count side by side.
skill_stats = pd.concat(
    {'mean_salary': skill_salary_mean, 'count': skill_count},
    axis=1,
)

In [None]:
# Keep only the skills whose mean salary is positive, ranked from the
# highest mean salary down.
filtered_skill_stats = (
    skill_stats
    .loc[skill_stats['mean_salary'].gt(0)]
    .sort_values(by='mean_salary', ascending=False)
)

# Write the ranked table to disk.
filtered_skill_stats.to_csv('skills_salary_info.csv')

In [None]:
# One row per (posting, skill) pair.
skill_salary_df = postings_df.explode('skills_list')

# The 50 most frequent skills, ordered from least to most frequent.
skill_frequencies = skill_salary_df['skills_list'].value_counts()
top_50_skills = skill_frequencies.nlargest(50).sort_values(ascending=True).index

# Restrict the exploded rows to those 50 skills.
is_top_skill = skill_salary_df['skills_list'].isin(top_50_skills)
filtered_df = skill_salary_df.loc[is_top_skill]

In [None]:
# Show the selected top-50 skill names as the cell output.
top_50_skills

## Boxplot of salary for postings whose job description contains each of the top 50 skills

In [None]:
# Horizontal boxplots of the reference salary, one box per top-50 skill
# (outliers hidden), saved to figures/boxplots_skills.jpeg.
fig, ax = plt.subplots(figsize=(5, 10))
filtered_df.boxplot(
    column='ref_salary',
    by='skills_list',
    vert=False,
    patch_artist=True,
    showfliers=False,
    widths=0.7,
    ax=ax,
)

# Blank out the axes title, the y-axis label and the figure suptitle.
fig.suptitle('')
ax.set_title('')
ax.set_ylabel('')

# Skill names: horizontal, right-aligned, 12pt; same size for the x ticks.
skill_tick_labels = ax.get_yticklabels()
ax.set_yticklabels(skill_tick_labels, rotation=0, ha='right', fontsize=12)
ax.tick_params(axis='x', labelsize=12)

# Show salaries in thousands of dollars, e.g. 120000 -> "$120k".
thousands_formatter = FuncFormatter(lambda x, _: f'${x/1000:.0f}k')
ax.xaxis.set_major_formatter(thousands_formatter)

fig.tight_layout(pad=2.0, w_pad=2.0, h_pad=2.0)

fig.savefig('figures/boxplots_skills.jpeg', bbox_inches='tight')

plt.show()

## Point-Biserial Correlation

In [None]:
# Correlate the presence of each top-50 skill (0/1) with the reference
# salary, over the postings in postings_df_with_salary.

# postings_df_with_salary was taken before the 'skills_list' column was
# added to postings_df, so it does not carry that column. Look the same rows
# up in the current postings_df through their index labels instead.
salary_rows = postings_df.loc[postings_df_with_salary.index]

# Create a binary matrix for skills (one column per top-50 skill).
mlb = MultiLabelBinarizer(classes=top_50_skills)
skills_encoded = mlb.fit_transform(salary_rows['skills_list'])
skills_df = pd.DataFrame(skills_encoded, columns=mlb.classes_)

# Include the salary as the last column; .values aligns by position because
# skills_df has a fresh 0..n-1 index.
skills_df['ref_salary'] = salary_rows['ref_salary'].values

# Calculate the correlation matrix for skills and salary
skill_salary_corr = skills_df.corr()

# Extract the correlation of each skill with salary (dropping the salary's
# correlation with itself).
skill_salary_corr_with_salary = skill_salary_corr['ref_salary'].drop('ref_salary')


In [None]:
import matplotlib.pyplot as plt

# Order the skills from the lowest to the highest correlation with salary.
sorted_corr = skill_salary_corr_with_salary.sort_values(ascending=True)

# Horizontal bar chart of the coefficients, saved to figures/salary_corr.jpeg.
plt.figure(figsize=(10, 8))
sorted_corr.plot.barh()
plt.xlabel('Correlation Coefficient', fontsize=14)
for set_tick_style in (plt.xticks, plt.yticks):
    set_tick_style(fontsize=11)
plt.grid(True)
plt.tight_layout()
plt.savefig('figures/salary_corr.jpeg', bbox_inches='tight')
plt.show()


## Phi correlation

In [None]:
# Skill-by-skill co-occurrence counts: entry (a, b) is the number of
# postings in skills_df that list both skill a and skill b.
# Every column except the last one (ref_salary) is a 0/1 skill indicator.
skill_indicators = skills_df.iloc[:, :-1].to_numpy()
skills_co_occurrence_matrix = skill_indicators.T @ skill_indicators

# Label rows and columns with the skill names.
skills_co_occurrence_df = pd.DataFrame(
    skills_co_occurrence_matrix,
    index=mlb.classes_,
    columns=mlb.classes_,
)

# Correlation between the columns of the co-occurrence table.
# NOTE(review): this correlates co-occurrence counts, which is not the phi
# coefficient between the 0/1 indicators themselves — confirm which one the
# section is meant to show.
skills_co_occurrence_corr = skills_co_occurrence_df.corr()

# Heatmap of that correlation matrix, saved to figures/skill_corr.jpeg.
plt.figure(figsize=(12, 10))

sns.heatmap(data=skills_co_occurrence_corr, cmap='coolwarm', annot=False, fmt=".2f")
plt.xticks(rotation=90, fontsize=11)
plt.yticks(fontsize=11)
plt.savefig('figures/skill_corr.jpeg', bbox_inches='tight')
plt.show()

## Skills clusters