# Main Skills Analysis

In [9]:
# Import libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys

In [10]:
from scr.skill_analysis import (
    analyze_skill_frequency, 
    analyze_skills_by_title, 
    get_top_skills_by_title,
    get_title_statistics
)
from scr.data_processing import categorize_title

In [11]:
# Read the cleaned job-postings table from the processed-data folder.
# The path is relative, so the notebook must be run from the project root.
df = pd.read_csv(
    filepath_or_buffer='data/processed/cleaned_job_data.csv',
)
# Echo (rows, columns) as a quick sanity check that the load succeeded.
print(f"Loaded data shape: {df.shape}")

Loaded data shape: (523, 8)


In [12]:
# Count how many postings fall into each title category.
# NOTE(review): nothing is actually categorized in this cell -- it only reads
# an existing 'title_category' column, presumably produced upstream (the
# imported categorize_title is never called here). Confirm the column is
# present in the cleaned CSV.
print("Categorizing job titles...")
title_distribution = df['title_category'].value_counts()
# One print call; sep="\n" puts the counts on the line after the heading.
print("\nTitle Category Distribution:", title_distribution, sep="\n")

Categorizing job titles...

Title Category Distribution:
title_category
Senior Other Data Role                  181
Senior Machine Learning Engineer         72
Mid Other Data Role                      64
Senior Data Scientist                    37
Mid Data Scientist                       30
Senior Data Engineer                     26
Mid Data Analyst                         17
Mid Machine Learning Engineer            17
Mid Data Engineer                        16
Senior AI Engineer                       14
Mid AI Engineer                          10
Mid Data Science General                  9
Junior Other Data Role                    6
Senior Analytics Engineer                 6
Mid Research Scientist                    4
Senior Data Analyst                       4
Senior Data Science General               3
Mid Business Intelligence Analyst         2
Senior Business Intelligence Analyst      2
Senior Data Architect                     1
Mid Analytics Engineer                    1
Mid 

In [13]:
# Overall skill frequency across all postings, computed from the 'skills'
# column by the project helper analyze_skill_frequency.
print("\nAnalyzing skill frequency...")
skill_freq = analyze_skill_frequency(df, 'skills')
# Show only the first 15 rows; heading and table are emitted by one print call.
print("Top 15 skills:", skill_freq.head(15), sep="\n")


Analyzing skill frequency...
Top 15 skills:
              skill  frequency  percentage
12  machinelearning        269       51.43
0            python        244       46.65
1               sql        193       36.90
20          pytorch        126       24.09
9             excel        104       19.89
24            spark         98       18.74
31              llm         77       14.72
25              aws         77       14.72
3           airflow         76       14.53
18              etl         59       11.28
10                r         58       11.09
14          tableau         58       11.09
23             java         56       10.71
19       tensorflow         55       10.52
4         snowflake         51        9.75


In [14]:
# Cross-tabulate skills against title categories.
# Judging by the displayed sample, rows are title categories and columns are
# a total_jobs count followed by one column per skill.
print("\nAnalyzing skills by title...")
skills_by_title = analyze_skills_by_title(df, 'title_category', 'skills')
print("Skills by title matrix shape:", skills_by_title.shape)

# Preview the first five rows with the rich (HTML) renderer.
print("\nSample of skills by title:")
display(skills_by_title.head(5))


Analyzing skills by title...
Skills by title matrix shape: (22, 70)

Sample of skills by title:


Unnamed: 0,total_jobs,python,sql,kafka,airflow,snowflake,redshift,datalake,dbt,ci/cd,...,datawarehouse,timeseries,d3.js,spss,matlab,seaborn,xgboost,keras,jupyter,shell
Senior Analytics Engineer,6,6,6,1,6,4,2,4,4,4,...,0,0,0,0,0,0,0,0,0,0
Mid Other Data Role,64,19,16,1,4,4,2,0,2,6,...,0,0,0,0,0,0,0,0,0,0
Senior Other Data Role,181,67,53,18,22,10,4,2,8,12,...,9,2,2,1,0,0,0,0,0,0
Mid AI Engineer,10,7,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Junior Other Data Role,6,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# List the five highest-ranked skills for every title category.
# NOTE(review): this cell has no execution count, and the output saved beneath
# it is a dict repr that this loop would not produce -- re-run the notebook so
# the stored output matches the code.
print("\nTop skills by title category:")
top_skills = get_top_skills_by_title(
    df, 'title_category', 'skills', n_skills=5
)
for title, skills in top_skills.items():
    # Heading line first, then one bullet per skill, emitted in a single print.
    bullet_lines = [f"\n{title}:"]
    bullet_lines.extend(f"  - {skill}" for skill in skills)
    print("\n".join(bullet_lines))


Top skills by title category:


{'Senior Analytics Engineer': ['python', 'sql', 'airflow', 'etl', 'tableau'],
 'Mid Other Data Role': ['excel', 'python', 'sql', 'machinelearning', 'llm'],
 'Senior Other Data Role': ['machinelearning',
  'python',
  'sql',
  'excel',
  'llm'],
 'Mid AI Engineer': ['python', 'pytorch', 'machinelearning', 'llm', 'aws'],
 'Junior Other Data Role': ['excel',
  'spark',
  'machinelearning',
  'python',
  'sql'],
 'Senior AI Engineer': ['llm', 'python', 'machinelearning', 'excel', 'docker'],
 'Senior Data Scientist': ['python', 'machinelearning', 'sql', 'r', 'pytorch'],
 'Senior Data Engineer': ['sql', 'python', 'spark', 'aws', 'airflow'],
 'Mid Data Analyst': ['sql', 'tableau', 'excel', 'looker', 'python'],
 'Senior Machine Learning Engineer': ['machinelearning',
  'pytorch',
  'python',
  'tensorflow',
  'deeplearning'],
 'Mid Data Scientist': ['python', 'sql', 'machinelearning', 'r', 'bigquery'],
 'Mid Machine Learning Engineer': ['machinelearning',
  'pytorch',
  'python',
  'spark',
  

In [16]:
# Per-title summary statistics; the saved output shows columns such as
# title, job_count and avg_skills_per_job.
print("\nTitle statistics:")
title_stats = get_title_statistics(df, 'title_category', 'skills')
# Use the notebook's rich display rather than print(): the plain-text repr
# wraps wide frames onto backslash-continued lines, which splits each row
# across several chunks and makes the table hard to read. This also matches
# how the skills-by-title frame is shown earlier in the notebook.
display(title_stats)



Title statistics:
                                   title  job_count  avg_skills_per_job  \
2                 Senior Other Data Role        181                 4.2   
9       Senior Machine Learning Engineer         72                 5.5   
1                    Mid Other Data Role         64                 3.3   
6                  Senior Data Scientist         37                 5.4   
10                    Mid Data Scientist         30                 4.4   
7                   Senior Data Engineer         26                 8.7   
8                       Mid Data Analyst         17                 4.1   
12         Mid Machine Learning Engineer         17                 6.1   
17                     Mid Data Engineer         16                10.9   
5                     Senior AI Engineer         14                 3.1   
3                        Mid AI Engineer         10                 3.4   
18              Mid Data Science General          9                 5.8   
4     