# Deep Dive into Skill Correlations

In [1]:
# Import libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from scr.correlation_analysis import (
create_skill_cooccurrence_matrix,
calculate_statistical_correlations,
analyze_skill_associations,
get_significant_correlations
)
from scr.skill_analysis import analyze_skill_frequency

In [3]:
# Load analyzed data
df = pd.read_csv('data/processed/cleaned_job_data.csv')
print(f"Loaded data shape: {df.shape}")

Loaded data shape: (523, 8)


In [4]:
# Create co-occurrence matrix
print("Creating skill co-occurrence matrix...")
cooccurrence_matrix = create_skill_cooccurrence_matrix(df, 'skills')
print("Co-occurrence matrix shape:", cooccurrence_matrix.shape)
print("\nTop co-occurrences for 'python':")
print(cooccurrence_matrix['python'].sort_values(ascending=False).head(10))

Creating skill co-occurrence matrix...
Co-occurrence matrix shape: (69, 69)

Top co-occurrences for 'python':
sql                135
machinelearning    134
pytorch             92
spark               71
airflow             67
r                   51
aws                 49
java                42
etl                 40
llm                 39
Name: python, dtype: int64


In [5]:
# Analyze Python associations
print("\nAnalyzing Python skill associations...")
python_associations = analyze_skill_associations(df, 'python', 'skills')
print("Top skills associated with Python:")
print(python_associations.head(10))


Analyzing Python skill associations...
Top skills associated with Python:
             skill  cooccurrence_count  association_ratio  lift_score  \
0              sql                 135              0.553       1.499   
1  machinelearning                 134              0.549       1.068   
2          pytorch                  92              0.377       3.130   
3            spark                  71              0.291       1.553   
4          airflow                  67              0.275       1.890   
5                r                  51              0.209       1.885   
6              aws                  49              0.201       1.364   
7             java                  42              0.172       1.608   
8              etl                  40              0.164       1.453   
9              llm                  39              0.160       1.086   

   skill_frequency  target_skill_frequency  cooccurrence_percentage  
0              193                     244         

In [6]:
# Calculate statistical correlations
print("\nCalculating statistical correlations...")
corr_matrix, pvalue_matrix = calculate_statistical_correlations(df, 'skills')
print("Correlation matrix shape:", corr_matrix.shape)


Calculating statistical correlations...
Correlation matrix shape: (69, 69)


In [7]:
# Get significant correlations
print("\nFinding significant correlations...")
significant_corrs = get_significant_correlations(
corr_matrix,
pvalue_matrix,
correlation_threshold=0.3,
pvalue_threshold=0.05
)
print(f"Found {len(significant_corrs)} significant correlations")
print("\nTop significant correlations:")
print(significant_corrs.head(10))


Finding significant correlations...
Found 182 significant correlations

Top significant correlations:
         skill1      skill2  correlation        p_value  abs_correlation
180  tensorflow     pytorch     0.907184  4.940374e-198         0.907184
133     pytorch  tensorflow     0.907184  4.940374e-198         0.907184
103  matplotlib      plotly     0.893565  2.392679e-183         0.893565
122      plotly  matplotlib     0.893565  2.392679e-183         0.893565
126  postgresql       mysql     0.718641   3.049225e-84         0.718641
109       mysql  postgresql     0.718641   3.049225e-84         0.718641
175     tableau      looker     0.675270   6.893832e-71         0.675270
98       looker     tableau     0.675270   6.893832e-71         0.675270
139       redis     mongodb     0.586345   1.292369e-49         0.586345
107     mongodb       redis     0.586345   1.292369e-49         0.586345


In [8]:
# Analyze multiple target skills
target_skills = ['python', 'sql', 'machine learning', 'aws']
print(f"\nAnalyzing associations for multiple skills: {target_skills}")

associations_results = {}
for skill in target_skills:
    associations = analyze_skill_associations(df, skill, 'skills')
    associations_results[skill] = associations.head(8)
    print(f"\nTop associations for {skill}:")
    print(associations.head(5))


Analyzing associations for multiple skills: ['python', 'sql', 'machine learning', 'aws']

Top associations for python:
             skill  cooccurrence_count  association_ratio  lift_score  \
0              sql                 135              0.553       1.499   
1  machinelearning                 134              0.549       1.068   
2          pytorch                  92              0.377       3.130   
3            spark                  71              0.291       1.553   
4          airflow                  67              0.275       1.890   

   skill_frequency  target_skill_frequency  cooccurrence_percentage  
0              193                     244                55.327869  
1              269                     244                54.918033  
2               63                     244                37.704918  
3               98                     244                29.098361  
4               76                     244                27.459016  

Top associations for

In [None]:
"""# Save correlation results
significant_corrs.to_csv('data/processed/significant_correlations.csv', index=False)
python_associations.to_csv('data/processed/python_associations.csv', index=False)
print("\nCorrelation results saved to data/processed/")"""


Correlation results saved to data/processed/
