In [2]:
# Step 1: Import Libraries & Load Data
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
# NOTE(review): with no category/module arguments this ignores every warning for the
# rest of the session, which can hide deprecation notices from the libraries imported
# above. Consider narrowing it to a specific category once the noisy warning is known.
warnings.filterwarnings('ignore')

In [3]:
# Load the dataset.
# The CSV location can be overridden with the GAP_SKILLS_CSV environment variable so the
# notebook runs on machines other than the author's; when the variable is unset the
# original local path is used, so existing behaviour is unchanged.
DATA_PATH = Path(os.environ.get(
    'GAP_SKILLS_CSV',
    r'C:\Users\Ashita SHARMA\Desktop\Datascience Project\datasets\Gap skills 4.1.csv',
))
df = pd.read_csv(DATA_PATH)

In [4]:
# 2. Initial Data Inspection
# Print the first five rows, then the frame's structural summary from DataFrame.info().
first_rows = df.head(n=5)
print(first_rows)
df.info()

               Job_Title       Industry Company_Size   Location  \
0  Cybersecurity Analyst  Entertainment        Small      Dubai   
1   Marketing Specialist     Technology        Large  Singapore   
2          AI Researcher     Technology        Large  Singapore   
3          Sales Manager         Retail        Small     Berlin   
4  Cybersecurity Analyst  Entertainment        Small      Tokyo   

  AI_Adoption_Level Automation_Risk     Required_Skills     Salary_USD  \
0            Medium            High        UX/UI Design  111392.165243   
1            Medium            High           Marketing   93792.562466   
2            Medium            High        UX/UI Design  107170.263069   
3               Low            High  Project Management   93027.953758   
4               Low             Low          JavaScript   87752.922171   

  Remote_Friendly Job_Growth_Projection  
0             Yes                Growth  
1              No               Decline  
2             Yes         

In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
numeric_summary = df.describe()
print(numeric_summary)

          Salary_USD
count     500.000000
mean    91222.390974
std     20504.291453
min     31969.526346
25%     78511.514863
50%     91998.195286
75%    103971.282092
max    155209.821614


In [6]:
# Count the missing entries in every column
print(df.isna().sum())

Job_Title                0
Industry                 0
Company_Size             0
Location                 0
AI_Adoption_Level        0
Automation_Risk          0
Required_Skills          0
Salary_USD               0
Remote_Friendly          0
Job_Growth_Projection    0
dtype: int64


In [7]:
# Count rows that exactly repeat an earlier row
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

0


In [8]:

# Drop rows with missing values.
# Reassign the result instead of using inplace=True: the frame that earlier cells
# displayed is not mutated behind the reader's back, and the call stays chainable.
# The name `df` is kept because the later cells read it.
df = df.dropna()

In [9]:
# Remove duplicate rows (the first occurrence of each row is kept).
# Reassign the result instead of using inplace=True so the frame is not mutated in
# place; the name `df` is kept because the later cells read it.
df = df.drop_duplicates()

In [10]:
# Step 7: EDA - Statistics Summary
print("\n--- Statistics Summary ---")
# Transposed so each numeric column is a row and each statistic is a column.
print(df.describe().T)
# Salary_USD is the only numeric column summarised; its mean is about $91k USD.
# Automation_Risk holds categorical labels (e.g. High/Low), so describe() reports no average for it.


--- Statistics Summary ---
            count          mean           std           min           25%  \
Salary_USD  500.0  91222.390974  20504.291453  31969.526346  78511.514863   

                     50%            75%            max  
Salary_USD  91998.195286  103971.282092  155209.821614  


In [11]:
# Step 8: EDA - Univariate Analysis
print("\n--- Univariate Analysis ---")
# Number of jobs under each Job Growth Projection value, as a two-column frame
# (Projection, Count) ready for plotting.
growth_counts = (
    df['Job_Growth_Projection']
    .value_counts()
    .rename_axis('Projection')
    .reset_index(name='Count')
)
# Pie chart: one slice per projection, sized by its job count.
fig = px.pie(
    growth_counts,
    names='Projection',
    values='Count',
    title='Overall Job Growth Projection Distribution',
)
fig.show()


--- Univariate Analysis ---


In [12]:
# Step 9: EDA - Bivariate Analysis
print("\n--- Bivariate Analysis ---")

# Chart 1: Most Lucrative Skills (Average Salary by Skill) (Interactive)
# Horizontal bars, one per skill, answering "Which skills correlate with the highest salaries?"
# Mean salary per skill, sorted ascending by that mean.
avg_salary_by_skill = (
    df.groupby('Required_Skills')['Salary_USD']
    .mean()
    .sort_values()
    .reset_index()
)
fig1 = px.bar(
    avg_salary_by_skill,
    y='Required_Skills',
    x='Salary_USD',
    orientation='h',
    color='Salary_USD',
    color_continuous_scale=px.colors.sequential.Viridis,
    labels={'Required_Skills': 'Skill', 'Salary_USD': 'Average Salary (USD)'},
    title='Average Salary by Required Skill',
)
fig1.show()


--- Bivariate Analysis ---


In [13]:
# Step 10: EDA - Multivariate Analysis
print("\n--- Multivariate Analysis ---")

# Chart 2: Skill Demand by Job Growth Projection (Interactive Treemap)
# The treemap nests skills inside their growth projection, so it shows both the share of
# jobs in each projection and which skills are most common within it.
treemap_levels = [px.Constant("All Jobs"), 'Job_Growth_Projection', 'Required_Skills']
# One fixed colour per projection; '(?)' is the entry used for the root node.
projection_colours = {
    '(?)': 'lightgrey',
    'Growth': 'green',
    'Stable': 'orange',
    'Decline': 'red',
}
fig2 = px.treemap(
    df,
    path=treemap_levels,
    color='Job_Growth_Projection',
    color_discrete_map=projection_colours,
    title='Distribution of Skills by Job Growth Projection',
)
fig2.show()


--- Multivariate Analysis ---


In [18]:
# Chart 3: The "Sweet Spot" - Automation Risk vs. Salary for Growing Jobs (Interactive)
# Only jobs with a 'Growth' projection are kept, then the average salary is compared
# across automation-risk levels for each skill, to highlight well-paid, lower-risk skills.
#
# Why the explicit histogram options:
# - px.histogram aggregates `y` with 'sum' by default, which makes bar height depend on
#   how many jobs fall in a group rather than on what they pay; 'avg' shows salary level.
# - Bars are grouped side by side because stacking averages has no meaningful total.
# - Automation_Risk is a categorical level, not a percentage, so the axis label carries no
#   "(%)" and the categories are put in a logical order.
#   NOTE(review): assumes the levels are Low/Medium/High - confirm against the data.
growth_jobs = df[df['Job_Growth_Projection'] == 'Growth']
fig3 = px.histogram(growth_jobs,
                    x='Automation_Risk',
                    y='Salary_USD',
                    color='Required_Skills',
                    histfunc='avg',
                    barmode='group',
                    category_orders={'Automation_Risk': ['Low', 'Medium', 'High']},
                    title='"Future-Proof" Skills: Risk vs. Salary for Jobs with Growth Projection',
                    hover_data=['Job_Title'],
                    labels={'Automation_Risk': 'Automation Risk Level', 'Salary_USD': 'Salary (USD)'})
fig3.show()

In [15]:
# Chart 4: Heatmap of Salary by Skill and Industry (Interactive)
# Shows the mean salary for every skill/industry pair, restricted to the ten most
# frequent skills so the grid stays readable.
top_10_skills = df['Required_Skills'].value_counts().nlargest(10).index
df_top_skills = df.loc[df['Required_Skills'].isin(top_10_skills)]

# Rows are skills, columns are industries, cells hold the mean Salary_USD.
skill_industry_pivot = pd.pivot_table(
    df_top_skills,
    index='Required_Skills',
    columns='Industry',
    values='Salary_USD',
    aggfunc='mean',
)

fig4 = px.imshow(
    skill_industry_pivot,
    color_continuous_scale='Blues',
    aspect="auto",
    text_auto=".0f",  # annotate each cell with the salary rounded to whole dollars
    title='Heatmap: Average Salary (USD) for Top Skills by Industry',
)
fig4.show()