In [None]:
#!pip install curl_cffi
import functions as f
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the two base CSV exports and stack them row-wise into one frame.
# The row labels of each file are kept as they are (no index reset).
df1, df2 = (
    f.load_data(csv_path)
    for csv_path in ('data/combined_job_offers.csv', 'data/combined_job_offers3.csv')
)
df3 = pd.concat([df1, df2], axis=0)

# Load the separately scraped dataset (per the original note: scraped from StepStone).
df_scraped = f.load_data('data/scraped_data.csv')

In [None]:
# Clean the base dataset.
# df3 is rebound to a new frame instead of being mutated with inplace=True, so
# this cell can be re-run: on a second run the dropped columns are already gone
# (tolerated by errors='ignore') and 'link' is already renamed (rename() skips
# labels that do not exist).
# NOTE: errors='ignore' also means a column missing from the input is skipped
# silently rather than raising KeyError.
df3 = (
    df3
    # 1. drop the listed columns
    .drop(
        columns=['repost_date', 'email', 'job_desc', 'number_of_employees', 'num_applicants', 'salary'],
        errors='ignore',
    )
    # 2. rename 'link' to 'source'
    .rename(columns={'link': 'source'})
    # 3. overwrite every value in 'source' with the constant label 'LinkedIn'
    #    (scalar assignment instead of a row-wise apply returning a constant)
    .assign(source='LinkedIn')
)

In [None]:
# De-duplicate the scraped postings (rows sharing title, company and post date
# collapse to their last occurrence), then drop the listed columns.
df_scraped = (
    df_scraped
    .drop_duplicates(subset=['job_title', 'company_name', 'post_date'], keep='last')
    .drop(columns=['number_of_employees', 'num_applicants', 'salary'])
)

# Stack the base dataset and the scraped dataset row-wise.
df_combined = pd.concat([df3, df_scraped], axis=0)

# Row-wise wrangling via the project helpers. 'job_level' is written first so
# that the sector helper receives rows that already carry the new job level.
df_combined['job_level'] = df_combined.apply(f.replace_nan_with_job_level, axis=1)
df_combined['sector'] = df_combined.apply(f.replace_sectors, axis=1)

# Exclude these job types; rows with a missing job_type are kept.
excluded_job_types = ['Volunteer', 'Temporary', 'Internship', 'Contract']
df_combined = df_combined[~df_combined['job_type'].isin(excluded_job_types)]

# Rows whose sector is 'Unknown' are left out of the analysis frame.
filtered_df = df_combined[df_combined['sector'] != 'Unknown']

In [None]:
# Pivot table for HYPOTHESIS 1: non-null job_title count per job level (rows)
# and sector (columns). unstack(fill_value=0) writes 0 for combinations that do
# not occur, so the table stays integer-typed.
sector_job_level = filtered_df.groupby(["job_level", "sector"])["job_title"].count()
pivot_table = sector_job_level.unstack("sector", fill_value=0).astype(int)

# Top 10 sectors by total count. The ranking uses every job level present in
# filtered_df, including levels that are not listed in job_level_order below.
sector_totals = pivot_table.sum(axis=0)
top_10_sectors = sector_totals.nlargest(10).index

# Custom row order for 'job_level'; levels not listed here are dropped by the reindex.
job_level_order = ['Entry Level', 'Senior Level', 'Director Level']

# Keep only the top 10 sectors and put the rows into the custom order.
# fill_value=0: a listed level that is absent from the data becomes a row of
# zeros instead of NaN. A NaN row would turn the table into floats, which the
# heatmap's integer annotation format (fmt='d') cannot render.
pivot_table_sector_job_level = (
    pivot_table[top_10_sectors]
    .reindex(job_level_order, fill_value=0)
    .copy()
)

# Add a 'Total' column (row sums) and then a 'Total' row (column sums, which
# therefore also contains the grand total).
pivot_table_sector_job_level['Total'] = pivot_table_sector_job_level.sum(axis=1)
pivot_table_sector_job_level.loc['Total'] = pivot_table_sector_job_level.sum(axis=0)

# display the pivot table
pivot_table_sector_job_level

In [None]:
# Strip the 'Total' row and column so only the per-level counts are plotted.
pivot_table_sector_job_level_dropped_totals = pivot_table_sector_job_level.drop(
    index='Total', columns='Total'
)

# Stacked bar chart: one bar per job level, one coloured segment per sector.
ax = pivot_table_sector_job_level_dropped_totals.plot.bar(stacked=True, figsize=(10, 6))

ax.set_title('Job Level Distribution by Sector (Top 10 Sectors)', fontsize=16)
ax.set_xlabel('Job Level', fontsize=14)
ax.set_ylabel('Open Positions', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45)
# Anchor the legend's upper-left corner just outside the right edge of the axes.
ax.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left')

fig = ax.get_figure()
fig.tight_layout()
fig.savefig('plots/hypo1_bar.png', transparent=True)
plt.show()

In [None]:
# Heatmap of the same table: job levels as rows, sectors as columns,
# each cell annotated with its integer count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    pivot_table_sector_job_level_dropped_totals,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=True,
    ax=ax,
)

ax.set_title('Job Level Distribution by Sector (Top 10 Sectors)', fontsize=16)
ax.set_xlabel('Sector', fontsize=14)
ax.set_ylabel('Job Level', fontsize=14)
# Keep the row labels horizontal.
plt.setp(ax.get_yticklabels(), rotation=0)

fig.tight_layout()
fig.savefig('plots/hypo1_heat.png', transparent=True)
plt.show()

In [None]:
# Pivot table for HYPOTHESIS 2: non-null job_title count per job type (rows)
# and sector (columns). unstack(fill_value=0) writes 0 for combinations that do
# not occur, so the table stays integer-typed.
sector_job_type = filtered_df.groupby(["job_type", "sector"])["job_title"].count()
pivot_table = sector_job_type.unstack("sector", fill_value=0).astype(int)

# Top 10 sectors by total count. The ranking uses every job type present in
# filtered_df, including types that are not listed in job_type_order below.
sector_totals = pivot_table.sum(axis=0)
top_10_sectors = sector_totals.nlargest(10).index

# Custom row order for 'job_type'; types not listed here are dropped by the reindex.
job_type_order = ['Full-time', 'Part-time']

# Keep only the top 10 sectors and put the rows into the custom order.
# fill_value=0: a listed type that is absent from the data becomes a row of
# zeros instead of NaN. A NaN row would turn the table into floats, which the
# heatmap's integer annotation format (fmt='d') cannot render.
pivot_table_sector_job_type = (
    pivot_table[top_10_sectors]
    .reindex(job_type_order, fill_value=0)
    .copy()
)

# Add a 'Total' column (row sums) and then a 'Total' row (column sums, which
# therefore also contains the grand total).
pivot_table_sector_job_type['Total'] = pivot_table_sector_job_type.sum(axis=1)
pivot_table_sector_job_type.loc['Total'] = pivot_table_sector_job_type.sum(axis=0)

# display the pivot table
pivot_table_sector_job_type

In [None]:
# Strip the 'Total' row and column so only the per-type counts are plotted.
pivot_table_sector_job_type_dropped_totals = pivot_table_sector_job_type.drop(
    index='Total', columns='Total'
)

# Stacked bar chart: one bar per job type, one coloured segment per sector.
ax = pivot_table_sector_job_type_dropped_totals.plot.bar(stacked=True, figsize=(10, 6))

ax.set_title('Job Type Distribution by Sector (Top 10 Sectors)', fontsize=16)
ax.set_xlabel('Job Type', fontsize=14)
ax.set_ylabel('Open Positions', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45)
# Anchor the legend's upper-left corner just outside the right edge of the axes.
ax.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left')

fig = ax.get_figure()
fig.tight_layout()
fig.savefig('plots/hypo2_bar.png', transparent=True)
plt.show()

In [None]:
# Heatmap of the same table: job types as rows, sectors as columns,
# each cell annotated with its integer count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    pivot_table_sector_job_type_dropped_totals,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=True,
    ax=ax,
)

ax.set_title('Job Type Distribution by Sector (Top 10 Sectors)', fontsize=16)
ax.set_xlabel('Sector', fontsize=14)
ax.set_ylabel('Job Type', fontsize=14)
# Keep the row labels horizontal.
plt.setp(ax.get_yticklabels(), rotation=0)

fig.tight_layout()
fig.savefig('plots/hypo2_heat.png', transparent=True)
plt.show()

In [None]:
# Pivot table for HYPOTHESIS 3: non-null job_title count per job_remote value
# (rows) and sector (columns); combinations that do not occur are filled with 0.
sector_job_remote = filtered_df.groupby(["job_remote", "sector"])["job_title"].count()
pivot_table = sector_job_remote.unstack("sector", fill_value=0).astype(int)

# Rank the sectors by their total count and keep the ten largest.
sector_totals = pivot_table.sum(axis=0)
top_10_sectors = sector_totals.nlargest(10).index

# Work on an independent copy restricted to those ten sector columns.
pivot_table_sector_job_remote = pivot_table.loc[:, top_10_sectors].copy()

# Append a 'Total' column (row sums) and then a 'Total' row (column sums, which
# therefore also contains the grand total).
pivot_table_sector_job_remote = pivot_table_sector_job_remote.assign(
    Total=lambda table: table.sum(axis=1)
)
pivot_table_sector_job_remote.loc['Total'] = pivot_table_sector_job_remote.sum(axis=0)

# display the pivot table
pivot_table_sector_job_remote

In [None]:
# Strip the 'Total' row and column so only the per-value counts are plotted.
pivot_table_sector_job_remote_dropped_totals = pivot_table_sector_job_remote.drop(
    index='Total', columns='Total'
)

# Stacked bar chart: one bar per job_remote value, one coloured segment per sector.
ax = pivot_table_sector_job_remote_dropped_totals.plot.bar(stacked=True, figsize=(10, 6))

ax.set_title('Remote Job Distribution by Sector (Top 10 Sectors)', fontsize=16)
ax.set_xlabel('Job Remote', fontsize=14)
ax.set_ylabel('Open Positions', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45)
# Anchor the legend's upper-left corner just outside the right edge of the axes.
ax.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left')

fig = ax.get_figure()
fig.tight_layout()
fig.savefig('plots/hypo3_bar.png', transparent=True)
plt.show()

In [None]:
# Heatmap of the same table: job_remote values as rows, sectors as columns,
# each cell annotated with its integer count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    pivot_table_sector_job_remote_dropped_totals,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=True,
    ax=ax,
)

ax.set_title('Remote Job Distribution by Sector (Top 10 Sectors)', fontsize=16)
ax.set_xlabel('Sector', fontsize=14)
ax.set_ylabel('Job Remote', fontsize=14)
# Keep the row labels horizontal.
plt.setp(ax.get_yticklabels(), rotation=0)

fig.tight_layout()
fig.savefig('plots/hypo3_heat.png', transparent=True)
plt.show()

In [None]:
# Additional chart: the ten companies with the most postings.
# Counts come from df3 (the base dataset only), not from the combined frame.
df6 = df3.groupby(['company_name'])['job_title'].count()
top_10 = df6.sort_values(ascending=False).head(10)

# Pie chart of those ten counts. The printed percentages are shares of the
# top-10 total, because only these ten values are passed to ax.pie.
fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts, autotexts = ax.pie(
    top_10,
    labels=top_10.index,
    autopct='%1.1f%%',
    startangle=90,       # first wedge starts at the top
    pctdistance=0.85,    # radial position of the percentage labels
    labeldistance=1.05,  # company labels sit just outside the pie
)

# Style all percentage labels in one call: black text, font size 10.
plt.setp(autotexts, color='black', fontsize=10)

ax.set_title('Top 10 Companies by Job Titles', pad=20)

fig.tight_layout()
fig.savefig('plots/company_pie.png', transparent=True)
plt.show()