In [6]:
# create_master_skill_list.py
import pandas as pd
import ast  # For safely evaluating string representations of lists
import os
from tqdm import tqdm # For a nice progress bar. Install with: pip install tqdm

print("--- Starting: Creating Master Skill List ---")

# This set will store all unique skills found, automatically handling duplicates.
master_skill_set = set()

# A list of files and the columns that contain skill information.
# Format: (filepath, column_name, format_type)
# format_type can be 'list_string', 'comma_string', 'semicolon_string', 'direct'
files_to_process = [
    ('jobs/jobs_clean.csv', 'skills_norm', 'list_string'),
    ('jobs/jobs_clean.csv', 'required_skills', 'comma_string'),
    ('resumes/resume_data.csv', 'skills', 'list_string'),
    ('resumes/resume_data.csv', 'skills', 'list_string'), # Handling potential duplicate paths
    ('resumes/Resume_dataset/resume_data.csv', 'skills', 'list_string'), # Handling potential duplicate paths
    ('resumes/resume_job_description/resume_dataset_1200.csv', 'Skills', 'comma_string'),
    ('jobs/jobDescription/job_dataset.csv', 'Skills', 'semicolon_string'),
    ('jobs/jobDescription/job_dataset.csv', 'Keywords', 'semicolon_string'),
    ('jobs/linkedin_job/linkedin_jobs_analysis.csv', 'skills', 'comma_string'),
    ('jobs/job_market/job_market_unemployment_trends.csv', 'in_demand_skills', 'comma_string'),
    ('online_courses/coursera_specialization/coursera.csv', 'Skills', 'list_string'),
    ('online_courses/1500_courses/alison.csv', 'Skills', 'comma_string')
]

def clean_skill(skill):
    """Standardizes a single skill string."""
    if not isinstance(skill, str):
        return None
    s = skill.lower().strip()
    # Remove extra characters that might be left over from parsing
    s = s.replace("'", "").replace('"', "").replace('[', '').replace(']', '')
    # Filter out junk values
    if len(s) > 1 and len(s) < 50:
        return s
    return None

def process_file(filepath, column, format_type, skill_set):
    """Reads a CSV, extracts skills from the specified column, and adds them to the master set."""
    if not os.path.exists(filepath):
        print(f"  - Skipping: File not found at '{filepath}'")
        return

    try:
        # Use a more robust CSV reader for potential encoding issues
        df = pd.read_csv(filepath, usecols=[column], encoding='utf-8', on_bad_lines='skip')
        df.dropna(subset=[column], inplace=True)
        
        print(f"  - Processing {len(df)} rows from '{filepath}'...")

        for item in tqdm(df[column], desc=f"  -> Extracting from {os.path.basename(filepath)}"):
            skills_to_add = []
            
            # --- Handle different data formats ---
            if format_type == 'list_string':
                try:
                    skills_to_add = ast.literal_eval(item)
                except (ValueError, SyntaxError):
                    # If it's not a perfect list string, treat it as a comma-separated string
                    skills_to_add = str(item).split(',')
            
            elif format_type == 'comma_string':
                skills_to_add = str(item).split(',')
                
            elif format_type == 'semicolon_string':
                skills_to_add = str(item).split(';')

            elif format_type == 'direct':
                skills_to_add = [item]
            
            # --- Clean and add the extracted skills to the master set ---
            if isinstance(skills_to_add, list):
                for skill_str in skills_to_add:
                    cleaned = clean_skill(skill_str)
                    if cleaned:
                        skill_set.add(cleaned)

    except Exception as e:
        print(f"  - WARNING: Could not process '{filepath}'. Error: {e}")

# --- Main Execution ---

# Process all the standard files
for filepath, column, format_type in files_to_process:
    process_file(filepath, column, format_type, master_skill_set)

# Special handling for the relational LinkedIn data
print("\n  - Processing LinkedIn relational data...")
try:
    df_li_skills = pd.read_csv('jobs/linkedIn/jobs/job_skills.csv')
    df_li_map = pd.read_csv('jobs/linkedIn/mappings/skills.csv')
    df_li_merged = df_li_skills.merge(df_li_map, on='skill_abr', how='left')
    df_li_merged.dropna(subset=['skill_name'], inplace=True)
    
    for skill in tqdm(df_li_merged['skill_name'], desc="  -> Extracting from LinkedIn skills"):
        cleaned = clean_skill(skill)
        if cleaned:
            master_skill_set.add(cleaned)
    print("  - Successfully processed LinkedIn relational skills.")
except FileNotFoundError as e:
    print(f"  - Warning: Could not process LinkedIn relational data. File not found: {e.filename}")


# --- Finalize and Save ---
print("\n--- Finalizing Master Skill List ---")

# Convert the set to a sorted list for consistent ordering
final_skill_list = sorted(list(master_skill_set))

# Create a DataFrame
df_master = pd.DataFrame(final_skill_list, columns=['skill'])

# Save the final, clean, unique list of skills to a new CSV file
output_filename = 'master_skill_list_unique.csv'
df_master.to_csv(output_filename, index=False)

print(f"\n✅ Success! Created '{output_filename}' with a total of {len(df_master)} unique skills.")
print("\nThis file is now your definitive dictionary of all skills from your datasets.")
print("\n--- NEXT STEPS ---")
print("1. Manually review 'master_skill_list.csv' for any obvious errors or junk data.")
print("2. Use this file as the basis for regenerating your 'skill_ontology.csv'.")
print("3. Use your raw data files to create the 'master_skill_sets.csv' for co-occurrence analysis.")

--- Starting: Creating Master Skill List ---
  - Processing 20020 rows from 'jobs/jobs_clean.csv'...


  -> Extracting from jobs_clean.csv: 100%|██████████| 20020/20020 [00:00<00:00, 169003.98it/s]


  - Processing 20020 rows from 'jobs/jobs_clean.csv'...


  -> Extracting from jobs_clean.csv: 100%|██████████| 20020/20020 [00:00<00:00, 1141128.85it/s]


  - Processing 9488 rows from 'resumes/resume_data.csv'...


  -> Extracting from resume_data.csv: 100%|██████████| 9488/9488 [00:00<00:00, 38701.65it/s]


  - Processing 9488 rows from 'resumes/resume_data.csv'...


  -> Extracting from resume_data.csv: 100%|██████████| 9488/9488 [00:00<00:00, 37885.66it/s]


  - Processing 9488 rows from 'resumes/Resume_dataset/resume_data.csv'...


  -> Extracting from resume_data.csv: 100%|██████████| 9488/9488 [00:00<00:00, 32116.06it/s]


  - Processing 1200 rows from 'resumes/resume_job_description/resume_dataset_1200.csv'...


  -> Extracting from resume_dataset_1200.csv: 100%|██████████| 1200/1200 [00:00<00:00, 643874.22it/s]


  - Processing 1068 rows from 'jobs/jobDescription/job_dataset.csv'...


  -> Extracting from job_dataset.csv: 100%|██████████| 1068/1068 [00:00<00:00, 250098.64it/s]


  - Processing 1068 rows from 'jobs/jobDescription/job_dataset.csv'...


  -> Extracting from job_dataset.csv: 100%|██████████| 1068/1068 [00:00<00:00, 462759.99it/s]


  - Processing 1000 rows from 'jobs/job_market/job_market_unemployment_trends.csv'...


  -> Extracting from job_market_unemployment_trends.csv: 100%|██████████| 1000/1000 [00:00<00:00, 884128.16it/s]


  - Processing 1990 rows from 'online_courses/coursera_specialization/coursera.csv'...


  -> Extracting from coursera.csv: 100%|██████████| 1990/1990 [00:00<00:00, 117454.44it/s]


  - Processing 5725 rows from 'online_courses/1500_courses/alison.csv'...


  -> Extracting from alison.csv: 100%|██████████| 5725/5725 [00:00<00:00, 487759.30it/s]



  - Processing LinkedIn relational data...


  -> Extracting from LinkedIn skills: 100%|██████████| 213768/213768 [00:00<00:00, 2863711.35it/s]


  - Successfully processed LinkedIn relational skills.

--- Finalizing Master Skill List ---

✅ Success! Created 'master_skill_list_unique.csv' with a total of 16451 unique skills.

This file is now your definitive dictionary of all skills from your datasets.

--- NEXT STEPS ---
1. Manually review 'master_skill_list.csv' for any obvious errors or junk data.
2. Use this file as the basis for regenerating your 'skill_ontology.csv'.
3. Use your raw data files to create the 'master_skill_sets.csv' for co-occurrence analysis.


In [7]:
# generate_unique_skills.py
import pandas as pd
import os
from tqdm import tqdm

# --- Configuration ---
INPUT_FILE = 'master_skill_sets.csv'
OUTPUT_FILE = 'unique_skills.csv'

def main():
    """
    Reads a CSV file where each row contains a set of skills,
    and creates a new CSV file with a single column of all unique skills,
    cleaned and sorted alphabetically.
    """
    print(f"--- Starting: Generating a unique skill list from '{INPUT_FILE}' ---")

    # 1. Check if the input file exists
    if not os.path.exists(INPUT_FILE):
        print(f"ERROR: Input file '{INPUT_FILE}' not found.")
        print("Please run the data fusion script first to create it.")
        return

    # 2. Read the data and collect all skills into a set
    print("Reading and processing skill sets...")
    unique_skills_set = set()

    # Read the CSV in chunks for memory efficiency, especially if the file is very large
    chunk_size = 10000
    for chunk in tqdm(pd.read_csv(INPUT_FILE, header=None, chunksize=chunk_size, low_memory=False), desc="Processing chunks"):
        # The `stack()` method is a very efficient way to convert all columns into a single series
        # It also automatically drops empty/NaN values.
        all_skills_in_chunk = chunk.stack()
        
        # Clean and add to our master set
        for skill in all_skills_in_chunk:
            # Clean up the skill: convert to string, lowercase, and strip whitespace
            cleaned_skill = str(skill).strip().lower()
            if cleaned_skill: # Ensure it's not an empty string
                unique_skills_set.add(cleaned_skill)

    print(f"Found {len(unique_skills_set)} unique skills.")

    # 3. Convert the set to a sorted list
    sorted_unique_skills = sorted(list(unique_skills_set))

    # 4. Create a DataFrame and save to a new CSV
    df_output = pd.DataFrame(sorted_unique_skills, columns=['skill'])
    
    df_output.to_csv(OUTPUT_FILE, index=False)

    print(f"\n✅ Success! Saved the unique skills list to '{OUTPUT_FILE}'.")
    print("\nThis file now serves as the master dictionary for your project.")
    print("You can now proceed with generating the ontology and other analysis files.")


if __name__ == "__main__":
    main()

--- Starting: Generating a unique skill list from 'master_skill_sets.csv' ---
Reading and processing skill sets...


Processing chunks: 16it [00:02,  7.83it/s]

Found 2723 unique skills.

✅ Success! Saved the unique skills list to 'unique_skills.csv'.

This file now serves as the master dictionary for your project.
You can now proceed with generating the ontology and other analysis files.





In [14]:
# generate_knowledge_base.py
import pandas as pd
import google.generativeai as genai
import time
import os
import json
from tqdm import tqdm # A library for progress bars, install with: pip install tqdm

# --- CONFIGURATION ---
# IMPORTANT: Put your Google AI API Key here
API_KEY = "AIzaSyAD30hzssF3ozDEM1RFi97bVHJQNBMQSfY"

# Input file generated by your previous script
INPUT_ONTOLOGY_FILE = 'unique_skills.csv'

# The final, powerful output file
OUTPUT_KB_FILE = 'skill_knowledge_base2.csv'

# How many skills to process in each API call
BATCH_SIZE = 40 # Smaller batch size for this more complex task

# --- SCRIPT LOGIC ---

def initialize_gemini():
    """Initializes the Gemini API and returns the model object."""
    try:
        genai.configure(api_key=API_KEY)
        model = genai.GenerativeModel('gemini-2.5-flash-lite')
        model.generate_content("test", generation_config={"max_output_tokens": 10})
        print("Successfully connected to Google Gemini API.")
        return model
    except Exception as e:
        print(f"Error initializing Gemini API: {e}\nPlease check your API_KEY.")
        return None

def generate_semantic_info(model, skill_batch):
    """Sends a batch of skills to the Gemini API to get synonyms and descriptions."""
    
    skills_to_process = ", ".join([f'"{skill}"' for skill in skill_batch])

    prompt = f"""
    You are a data normalization and enrichment engine for a skills database. ignore completlely any unknown skill that isn't technical skill
    For each of the following skills, perform two tasks:
    1.  Provide a canonical, standardized name for the skill. For example, for "js" or "reactjs", the canonical name should be "javascript" or "react". For "Python", it should be "python".
    2.  Provide a concise, one-sentence description of the skill.
    3.  Provide parent skill
    4.  Provide type of the content of the skill

    Here is the list of skills: {skills_to_process}

    Your output MUST be a valid JSON array of objects, with no other text before or after it. Each object must have three keys: "original_skill", "canonical_name", and "description".

    Example output format:
    [
      {{
        "original_skill": "JS",
        "canonical_name": "javascript",
        "description": "A popular scripting language for web development.",
        "parent_skill":"Web developement",
        "skill_type":"Programming Language"
      }},
      {{
        "original_skill": "amazon web services",
        "canonical_name": "aws",
        "description": "A comprehensive cloud computing platform by Amazon.",
        "parent_skill":"cloud computing",
        "skill_type":"Platform"
      }}
    ]
    """

    try:
        response = model.generate_content(prompt, generation_config={"temperature": 0.0})
        # Clean the response to ensure it's valid JSON
        clean_response = response.text.strip().replace("```json", "").replace("```", "")
        return json.loads(clean_response)
    except (json.JSONDecodeError, Exception) as e:
        print(f"  -- API/JSON Error for a batch. Skipping. Error: {e}")
        return []

def main():
    if API_KEY == "YOUR_API_KEY_HERE":
        print("ERROR: Please update the API_KEY variable in the script.")
        return

    model = initialize_gemini()
    if not model:
        return

    print(f"Reading base skill ontology from '{INPUT_ONTOLOGY_FILE}'...")
    try:
        df_ontology = pd.read_csv(INPUT_ONTOLOGY_FILE)
        df_ontology['Skill'] = df_ontology['Skill'].str.lower()
        # Set index for easier lookup later
        df_ontology.set_index('Skill', inplace=True)
        all_skills = df_ontology.index.tolist()
        print(f"Found {len(all_skills)} skills to process.")
    except FileNotFoundError:
        print(f"ERROR: Cannot find the source file '{INPUT_ONTOLOGY_FILE}'. Please run generate_ontology.py first.")
        return
        
    # This dictionary will store our enriched data, keyed by the canonical name
    knowledge_base = {}

    print(f"Processing {len(all_skills)} skills in batches of {BATCH_SIZE}...")
    
    # Using tqdm for a nice progress bar in the terminal
    for i in tqdm(range(0, len(all_skills), BATCH_SIZE), desc="Generating Skill Knowledge Base"):
        batch = all_skills[i:i + BATCH_SIZE]
        
        # Call the Gemini API for the current batch
        semantic_data = generate_semantic_info(model, batch)
        
        # Process the results from the API
        for item in semantic_data:
            original = item.get('original_skill', '').lower().strip()
            canonical = item.get('canonical_name', '').lower().strip()
            description = item.get('description', 'No description available.')
            skill_type= item.get('skill_type', '').lower().strip()
            parent_skill= item.get('parent_skill', '').lower().strip()

            
            if not canonical: continue # Skip if AI failed to provide a canonical name

            # If this is the first time we see this canonical name, create an entry
            if canonical not in knowledge_base:

                knowledge_base[canonical] = {
                    'skill_type': skill_type,
                    'parent_skill': parent_skill,
                    'description': description,
                    'synonyms': set()
                }

            # Add the original skill name as a synonym
            if original != canonical:
                knowledge_base[canonical]['synonyms'].add(original)

        time.sleep(2) # Be respectful of API rate limits

    print("\nAggregation complete. Formatting final CSV.")
    
    # Convert the dictionary to a list of records for the DataFrame
    final_data = []
    for canonical, data in knowledge_base.items():
        synonyms_str = ", ".join(sorted(list(data['synonyms'])))
        final_data.append({
            'canonical_skill': canonical,
            'skill_type': data['skill_type'],
            'parent_skill': data['parent_skill'],
            'description': data['description'],
            'synonyms': synonyms_str
        })
        
    # Create and save the final DataFrame
    df_kb = pd.DataFrame(final_data)
    df_kb.sort_values('canonical_skill', inplace=True)
    df_kb.to_csv(OUTPUT_KB_FILE, index=False)
    
    print(f"\n--- Success! ---")
    print(f"Created '{OUTPUT_KB_FILE}' with {len(df_kb)} canonical skills.")
    print("This file is now your central source of truth for skill semantics.")
    print("\nPreview:")
    print(df_kb.head())

if __name__ == "__main__":
    main()

E0000 00:00:1758643652.735454 1225148 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Successfully connected to Google Gemini API.
Reading base skill ontology from 'unique_skills.csv'...
Found 2723 skills to process.
Processing 2723 skills in batches of 40...


Generating Skill Knowledge Base: 100%|██████████| 69/69 [14:10<00:00, 12.33s/it]


Aggregation complete. Formatting final CSV.

--- Success! ---
Created 'skill_knowledge_base2.csv' with 1173 canonical skills.
This file is now your central source of truth for skill semantics.

Preview:
  canonical_skill  skill_type              parent_skill  \
0            .net   framework      software development   
2     3d graphics       field         computer graphics   
3     3d modeling   technique               3d graphics   
4     3d printing  technology  manufacturing technology   
5         3ds max    software               3d modeling   

                                         description       synonyms  
0  A free, cross-platform, open-source developer ...  .ney(vb, asp)  
2  The creation and manipulation of three-dimensi...             3d  
3  The process of creating a mathematical represe...                 
4  A process of creating a three-dimensional obje...                 
5  A professional 3D computer graphics program fo...         3dsmax  





In [3]:
import pandas as pd
import glob
import os

# Find all CSV files in the current directory and all subdirectories
csv_files = glob.glob('**/*.csv', recursive=True)

output_txt = 'csv_inspection_results.txt'  # Name of the output file

if not csv_files:
    print("No CSV files found. Please make sure your CSV files are in the working directory or its subdirectories.")
else:
    print(f"Found {len(csv_files)} CSV files. Inspecting each one...\n")

    # Open the output file once
    with open(output_txt, 'w') as f:
        f.write(f"CSV Inspection Report - Found {len(csv_files)} CSV files\n\n")

        for file_path in csv_files:
            print("--------------------------------------------------")
            print(f"File: {file_path}")
            print("--------------------------------------------------")

            try:
                # Read the CSV into a pandas DataFrame
                df = pd.read_csv(file_path, on_bad_lines='skip')

                # Write results to the output file
                f.write(f"File: {file_path}\n")
                f.write(f"Shape: {df.shape}\n\n")
                f.write("Head:\n")
                f.write(df.head().to_string())
                f.write("\n\n")

                print(f"Results for {file_path} written to {output_txt}\n")

            except Exception as e:
                f.write(f"File: {file_path}\n")
                f.write(f"Error: {e}\n\n")
                print(f"Could not read or process file. Error: {e}\n")

    print(f"All results saved to {output_txt}")


Found 64 CSV files. Inspecting each one...

--------------------------------------------------
File: online_courses/courses_usage/online_courses_uses new.csv
--------------------------------------------------
Results for online_courses/courses_usage/online_courses_uses new.csv written to csv_inspection_results.txt

--------------------------------------------------
File: online_courses/1500_courses/udacity.csv
--------------------------------------------------
Results for online_courses/1500_courses/udacity.csv written to csv_inspection_results.txt

--------------------------------------------------
File: online_courses/1500_courses/MIT ocw.csv
--------------------------------------------------
Results for online_courses/1500_courses/MIT ocw.csv written to csv_inspection_results.txt

--------------------------------------------------
File: online_courses/1500_courses/edx.csv
--------------------------------------------------
Results for online_courses/1500_courses/edx.csv written to cs

In [4]:
# data_fusion.py
import pandas as pd
import ast
import re
from collections import Counter

print("--- Starting Data Fusion and Preparation ---")

# ==============================================================================
# STEP 1: EXTRACT SKILL SETS FROM ALL SOURCES
# ==============================================================================
print("\n[Phase 1/3] Extracting skill sets from all data sources...")

all_skill_sets = []
master_skill_set = set()

def clean_and_add_skills(skill_list):
    """Helper function to clean, normalize, and add skills to the master set and list."""
    if not isinstance(skill_list, list):
        return None
    
    cleaned_list = [str(s).strip().lower() for s in skill_list if pd.notna(s) and str(s).strip()]
    
    # Simple rule to filter out junk/long phrases
    final_skills = [s for s in cleaned_list if len(s) < 50 and len(s) > 1]
    
    if final_skills:
        master_skill_set.update(final_skills)
        all_skill_sets.append(final_skills)

# Source 1: resume_data.csv
try:
    df_resume = pd.read_csv('resumes/resume_data.csv', usecols=['skills'])
    df_resume.dropna(subset=['skills'], inplace=True)
    df_resume['skills'].apply(lambda x: clean_and_add_skills(ast.literal_eval(x)))
    print("  - Successfully processed resumes/resume_data.csv")
except Exception as e:
    print(f"  - Warning: Could not process resumes/resume_data.csv. Error: {e}")

# Source 2: jobs_clean.csv
try:
    df_jobs_clean = pd.read_csv('jobs/jobs_clean.csv', usecols=['skills_norm'])
    df_jobs_clean.dropna(subset=['skills_norm'], inplace=True)
    df_jobs_clean['skills_norm'].apply(lambda x: clean_and_add_skills(ast.literal_eval(x)))
    print("  - Successfully processed jobs_clean.csv")
except Exception as e:
    print(f"  - Warning: Could not process jobs_clean.csv. Error: {e}")

# Source 3: LinkedIn Data
try:
    df_li_jobs = pd.read_csv('jobs/linkedIn/jobs/job_skills.csv')
    df_li_map = pd.read_csv('jobs/linkedIn/mappings/skills.csv').rename(columns={'skill_name': 'skill'})
    df_li_merged = df_li_jobs.merge(df_li_map, on='skill_abr', how='left')
    
    # Group by job_id to create skill lists
    linkedin_skill_lists = df_li_merged.groupby('job_id')['skill'].apply(list).tolist()
    for skill_list in linkedin_skill_lists:
        clean_and_add_skills(skill_list)
    print("  - Successfully processed LinkedIn job skills data.")
except Exception as e:
    print(f"  - Warning: Could not process LinkedIn data. Error: {e}")
    
# Source 4: Job Description Text (for keyword spotting)
def extract_skills_from_text(text, known_skills):
    if not isinstance(text, str):
        return []
    text_lower = " " + text.lower() + " "
    return [skill for skill in known_skills if f" {skill} " in text_lower]

print("\n[Phase 2/3] Generating Master Skill Files...")

# ==============================================================================
# STEP 2: CREATE master_skill_list.csv
# ==============================================================================
master_skill_list_sorted = sorted(list(master_skill_set))
df_master_list = pd.DataFrame(master_skill_list_sorted, columns=['skill'])
df_master_list.to_csv('master_skill_list.csv', index=False)
print(f"  - Created master_skill_list.csv with {len(df_master_list)} unique skills.")

# Now that we have a master list, we can extract from raw text descriptions
print("  - Extracting skills from raw text descriptions (this may take a minute)...")
try:
    df_archive = pd.read_csv('jobs/archive_Job/job_title_des.csv', usecols=['Job Description'])
    df_archive.dropna(inplace=True)
    df_archive['skills'] = df_archive['Job Description'].apply(lambda x: extract_skills_from_text(x, master_skill_set))
    for skill_list in df_archive['skills'].tolist():
        if skill_list: # Only add if skills were found
            all_skill_sets.append(skill_list)
    print("    - Processed jobs/archive_Job/job_title_des.csv")
except Exception as e:
    print(f"  - Warning: Could not process jobs/archive_Job/job_title_des.csv. Error: {e}")


# ==============================================================================
# STEP 3: CREATE master_skill_sets.csv
# ==============================================================================
# This file is the foundation for your co-occurrence matrix.
max_len = max(len(s) for s in all_skill_sets) if all_skill_sets else 0
df_master_sets = pd.DataFrame(all_skill_sets) # Let pandas name columns 0, 1, 2...
df_master_sets.to_csv('master_skill_sets.csv', index=False, header=False) # No header needed for the next script
print(f"  - Created master_skill_sets.csv with {len(df_master_sets)} total entries.")


# ==============================================================================
# STEP 4: CREATE role_definitions.csv (by analyzing common job titles)
# ==============================================================================
print("\n[Phase 3/3] Generating Role Definitions...")

try:
    # Use the most descriptive job title columns
    df_jobs1 = pd.read_csv('jobs/jobs_clean.csv', usecols=['title', 'experience_level_raw', 'skills_norm'])
    df_jobs2 = pd.read_csv('jobs/700jobs/jobs_dataset.csv', usecols=['positionName', 'description'])
    df_jobs2.rename(columns={'positionName': 'title'}, inplace=True)
    df_jobs2['experience_level_raw'] = 'Not Specified' # Add column for consistency
    df_jobs2['skills_norm'] = df_jobs2['description'].apply(lambda x: str(extract_skills_from_text(x, master_skill_set)))

    # Combine the datasets
    df_all_jobs = pd.concat([df_jobs1[['title', 'experience_level_raw', 'skills_norm']],
                             df_jobs2[['title', 'experience_level_raw', 'skills_norm']]], 
                            ignore_index=True)
    
    df_all_jobs.dropna(subset=['title', 'skills_norm'], inplace=True)

    # Normalize job titles
    df_all_jobs['title_norm'] = df_all_jobs['title'].str.lower().str.strip()
    
    # Aggregate skills for the top 50 most common job titles
    top_titles = df_all_jobs['title_norm'].value_counts().nlargest(50).index
    
    role_definitions = []
    
    for title in top_titles:
        subset_df = df_all_jobs[df_all_jobs['title_norm'] == title]
        
        # Aggregate all skills for this title
        all_skills_for_title = []
        subset_df['skills_norm'].apply(lambda x: all_skill_sets.append(ast.literal_eval(x)))
        flat_list = [item for sublist in subset_df['skills_norm'].apply(ast.literal_eval).tolist() for item in sublist]
        
        # Find the 5 most common skills for this role
        core_skills = [skill for skill, count in Counter(flat_list).most_common(5)]
        
        # Determine the most common experience level
        experience_level = subset_df['experience_level_raw'].mode().iloc[0] if not subset_df['experience_level_raw'].mode().empty else 'Various'
        
        role_definitions.append({
            'role_title': title.title(),
            'core_skills': ", ".join(core_skills),
            'typical_experience_level': experience_level
        })

    df_roles = pd.DataFrame(role_definitions)
    df_roles.to_csv('role_definitions.csv', index=False)
    print(f"  - Created role_definitions.csv with {len(df_roles)} common job archetypes.")
    print("\n--- DATA FUSION COMPLETE ---")
    print("You should now re-run your analysis pipeline (generate_matrix, generate_centrality, etc.) using 'master_skill_sets.csv' as the input.")

except Exception as e:
    print(f"  - Error creating role definitions: {e}. Skipping this step.")


if __name__ == "__main__":
    main()

--- Starting Data Fusion and Preparation ---

[Phase 1/3] Extracting skill sets from all data sources...
  - Successfully processed resumes/resume_data.csv
  - Successfully processed jobs_clean.csv
  - Successfully processed LinkedIn job skills data.

[Phase 2/3] Generating Master Skill Files...
  - Created master_skill_list.csv with 2723 unique skills.
  - Extracting skills from raw text descriptions (this may take a minute)...
    - Processed jobs/archive_Job/job_title_des.csv
  - Created master_skill_sets.csv with 158517 total entries.

[Phase 3/3] Generating Role Definitions...
  - Created role_definitions.csv with 50 common job archetypes.

--- DATA FUSION COMPLETE ---
You should now re-run your analysis pipeline (generate_matrix, generate_centrality, etc.) using 'master_skill_sets.csv' as the input.


NameError: name 'main' is not defined

In [5]:
import pandas as pd
import ast
from collections import Counter
import warnings

warnings.filterwarnings('ignore')

print("--- Starting Data Enhancement Pipeline ---")


def generate_seniority_profiles():
    """
    Generates a CSV file mapping skills to their seniority profiles (junior, mid, senior).
    """
    print("\n[1/3] Generating Skill Seniority Profiles...")
    try:
        df = pd.read_csv('jobs/jobs_clean.csv', usecols=['experience_level_raw', 'skills_norm'])
        df.dropna(subset=['experience_level_raw', 'skills_norm'], inplace=True)
    except FileNotFoundError:
        print(" - ERROR: 'jobs_clean.csv' not found. Cannot generate seniority profiles.")
        return

    # Normalize the experience levels into three categories
    def map_level(level):
        level = str(level).lower()
        if 'senior' in level or 'lead' in level or 'expert' in level:
            return 'senior'
        if 'junior' in level or 'entry' in level or 'fresher' in level:
            return 'junior'
        if 'mid' in level or 'intermediate' in level:
            return 'mid'
        return 'other'

    df['level'] = df['experience_level_raw'].apply(map_level)
    df = df[df['level'] != 'other']  # We only care about jobs with clear seniority

    # Explode the skills into a long format
    df['skills_norm'] = df['skills_norm'].apply(ast.literal_eval)
    df_exploded = df.explode('skills_norm').rename(columns={'skills_norm': 'skill'})

    # Count skill occurrences for each level
    skill_level_counts = df_exploded.groupby(['skill', 'level']).size().unstack(fill_value=0)

    # Calculate percentages
    total_counts = skill_level_counts.sum(axis=1)
    skill_level_pct = skill_level_counts.div(total_counts, axis=0) * 100

    # Identify the primary level for each skill
    skill_level_pct['primary_level'] = skill_level_pct.idxmax(axis=1)
    skill_level_pct.rename(columns={'junior': 'pct_junior', 'mid': 'pct_mid', 'senior': 'pct_senior'}, inplace=True)

    # Save to CSV
    output_file = 'skill_seniority_profile.csv'
    skill_level_pct.to_csv(output_file)
    print(f"  - Success! Saved seniority profiles for {len(skill_level_pct)} skills to '{output_file}'.")
    print("  - Preview:")
    print(skill_level_pct.head())


def generate_industry_focus():
    """
    Generates a CSV file of the top 20 most frequent skills for each industry.
    """
    print("\n[2/3] Generating Skill Industry Focus...")
    try:
        # Load the necessary LinkedIn relational files
        df_job_skills = pd.read_csv('jobs/linkedIn/jobs/job_skills.csv')
        df_skills_map = pd.read_csv('jobs/linkedIn/mappings/skills.csv').rename(columns={'skill_name': 'skill'})
        df_job_industries = pd.read_csv('jobs/linkedIn/jobs/job_industries.csv')
        df_industries_map = pd.read_csv('jobs/linkedIn/mappings/industries.csv')
    except FileNotFoundError as e:
        print(f" - ERROR: A required LinkedIn file was not found ({e.filename}). Cannot generate industry focus.")
        return

    # Merge to get full names for skills and industries
    skills_full = df_job_skills.merge(df_skills_map, on='skill_abr', how='left')
    industries_full = df_job_industries.merge(df_industries_map, on='industry_id', how='left')

    # Merge skills with industries on job_id
    df_merged = skills_full.merge(industries_full, on='job_id', how='inner')
    df_merged.dropna(subset=['skill', 'industry_name'], inplace=True)

    # Count skills within each industry
    industry_skill_counts = df_merged.groupby(['industry_name', 'skill']).size().reset_index(name='count')

    # Find the top 20 skills for each industry
    top_skills_per_industry = industry_skill_counts.groupby('industry_name').apply(
        lambda x: x.nlargest(20, 'count')['skill'].tolist()
    ).reset_index(name='top_skills')

    # Convert list to a comma-separated string for easier CSV reading
    top_skills_per_industry['top_skills'] = top_skills_per_industry['top_skills'].apply(lambda x: ', '.join(x))

    # Save to CSV
    output_file = 'skill_industry_focus.csv'
    top_skills_per_industry.to_csv(output_file, index=False)
    print(f"  - Success! Saved top skills for {len(top_skills_per_industry)} industries to '{output_file}'.")
    print("  - Preview:")
    print(top_skills_per_industry.head())


def generate_skill_trends():
    """
    Analyzes and saves the 6-month demand trend for each skill.
    """
    print("\n[3/3] Generating Skill Demand Trends...")
    try:
        df = pd.read_csv('jobs/jobs_clean.csv', usecols=['date_posted', 'skills_norm'])
        df.dropna(subset=['date_posted', 'skills_norm'], inplace=True)
    except FileNotFoundError:
        print(" - ERROR: 'jobs_clean.csv' not found. Cannot generate skill trends.")
        return

    # Convert date and create a 'year_month' column for grouping
    df['date_posted'] = pd.to_datetime(df['date_posted'], errors='coerce')
    df.dropna(subset=['date_posted'], inplace=True)
    df['period'] = df['date_posted'].dt.to_period('M')

    # Explode skills
    df['skills_norm'] = df['skills_norm'].apply(ast.literal_eval)
    df_exploded = df.explode('skills_norm').rename(columns={'skills_norm': 'skill'})

    # Count skills per period
    trends = df_exploded.groupby(['period', 'skill']).size().reset_index(name='count')

    # Pivot to get skills as columns and time as rows
    pivot_trends = trends.pivot(index='period', columns='skill', values='count').fillna(0)

    # Calculate percentage change over 6 months
    trend_data = pivot_trends.pct_change(periods=6).iloc[-1].reset_index(name='trend_6m')
    trend_data['trend_6m'] = (trend_data['trend_6m'] * 100).round(2)  # As percentage
    trend_data.rename(columns={'skill': 'Skill', 'trend_6m': 'Demand_Change_6M_Pct'}, inplace=True)

    # Save to CSV
    output_file = 'skill_demand_trends.csv'
    trend_data.to_csv(output_file, index=False)
    print(f"  - Success! Saved demand trends for {len(trend_data)} skills to '{output_file}'.")
    print("  - Preview of trending up skills:")
    print(trend_data.nlargest(5, 'Demand_Change_6M_Pct'))
    print("\n  - Preview of trending down skills:")
    print(trend_data.nsmallest(5, 'Demand_Change_6M_Pct'))


if __name__ == "__main__":
    generate_seniority_profiles()
    generate_industry_focus()
    generate_skill_trends()
    print("\n--- All enhancement files created successfully! ---")
    print("You can now integrate these into your application's logic.")

--- Starting Data Enhancement Pipeline ---

[1/3] Generating Skill Seniority Profiles...
  - Success! Saved seniority profiles for 40 skills to 'skill_seniority_profile.csv'.
  - Preview:
level    pct_junior    pct_mid  pct_senior primary_level
skill                                                   
angular   31.884058  34.492754   33.623188           mid
aws       34.807554  32.464738   32.727707        junior
azure     34.006491  32.311576   33.681933        junior
c#        34.899329  36.241611   28.859060           mid
css       35.636856  35.501355   28.861789        junior

[2/3] Generating Skill Industry Focus...
  - Success! Saved top skills for 388 industries to 'skill_industry_focus.csv'.
  - Preview:
                                      industry_name  \
0  Abrasives and Nonmetallic Minerals Manufacturing   
1                Accessible Architecture and Design   
2                   Accommodation and Food Services   
3                                        Accounting   
4  

In [5]:
import pandas as pd
import networkx as nx
import numpy as np

# --- Configuration ---
# The input file is the co-occurrence matrix you showed me.
INPUT_FILE = 'skill_co_occurrence_matrix.csv' 
OUTPUT_FILE = 'centrality_measures.csv'

print(f"Reading skill relationship data from '{INPUT_FILE}'...")

try:
    # Load the co-occurrence matrix. The first column contains the skill names, so it becomes the index.
    df = pd.read_csv(INPUT_FILE, index_col=0)
except FileNotFoundError:
    print(f"FATAL ERROR: The input file '{INPUT_FILE}' was not found.")
    print("Please make sure this script is in the same directory as your CSV file.")
    exit()

# The networkx library works best with clean column names. Let's ensure they are all strings.
df.columns = df.columns.astype(str)
df.index = df.index.astype(str)

print("Building the skill network graph. This may take a moment...")

# Create an empty graph
G = nx.Graph()

# Add all skills as nodes to the graph
for skill in df.index:
    G.add_node(skill)

# Add edges between skills with their co-occurrence score as the 'weight'
# We iterate through the upper triangle of the matrix to avoid adding duplicate edges
for i in range(len(df.columns)):
    for j in range(i + 1, len(df.columns)):
        skill1 = df.columns[i]
        skill2 = df.columns[j]
        weight = df.loc[skill1, skill2]
        
        # Only add an edge if there is a connection (weight > 0)
        if weight > 0:
            G.add_edge(skill1, skill2, weight=weight)

print(f"Graph created successfully with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
print("-" * 30)
print("Calculating centrality measures...")

# --- Centrality Calculations ---

# 1. Degree Centrality: How many direct connections a skill has.
print("Calculating Degree Centrality...")
degree_centrality = nx.degree_centrality(G)

# 2. Betweenness Centrality: Identifies "bridge" skills.
# NOTE: This is computationally intensive and can take a few minutes on a large graph.
# We use 'weight' to consider the strength of connections. Since networkx treats higher weights
# as 'longer distances', we need to invert our similarity scores.
# We create a new attribute 'distance' which is 1/weight.
for u, v, data in G.edges(data=True):
    if data['weight'] > 0:
        data['distance'] = 1.0 / data['weight']
    else:
        data['distance'] = float('inf')

print("Calculating Betweenness Centrality (this might take a while)...")
betweenness_centrality = nx.betweenness_centrality(G, weight='distance', normalized=True)

# 3. Eigenvector Centrality: Measures influence.
print("Calculating Eigenvector Centrality...")
# This can sometimes fail to converge on complex graphs. We'll add error handling.
try:
    eigenvector_centrality = nx.eigenvector_centrality(G, weight='weight', max_iter=1000)
except nx.PowerIterationFailedConvergence:
    print("Eigenvector centrality did not converge. Filling with 0.")
    eigenvector_centrality = {node: 0.0 for node in G.nodes()}

print("Centrality calculations complete.")
print("-" * 30)

# --- Assemble and Save the Results ---

# Create a DataFrame to hold the results
centrality_df = pd.DataFrame({
    "Skill": list(G.nodes),
    "Degree Centrality": [degree_centrality.get(node, 0) for node in G.nodes()],
    "Betweenness Centrality": [betweenness_centrality.get(node, 0) for node in G.nodes()],
    "Eigenvector Centrality": [eigenvector_centrality.get(node, 0) for node in G.nodes()]
})

# Sort by a primary centrality measure for easier viewing
centrality_df = centrality_df.sort_values(by="Degree Centrality", ascending=False)

# Save the final DataFrame to a new CSV file
centrality_df.to_csv(OUTPUT_FILE, index=False)

print(f"Successfully created '{OUTPUT_FILE}'.")
print("\nHere's a preview of the top 10 most central skills:")
print(centrality_df.head(10))

Reading skill relationship data from 'skill_co_occurrence_matrix.csv'...
Building the skill network graph. This may take a moment...
Graph created successfully with 2895 nodes and 523358 edges.
------------------------------
Calculating centrality measures...
Calculating Degree Centrality...
Calculating Betweenness Centrality (this might take a while)...
Calculating Eigenvector Centrality...
Centrality calculations complete.
------------------------------
Successfully created 'centrality_measures.csv'.

Here's a preview of the top 10 most central skills:
                       Skill  Degree Centrality  Betweenness Centrality  \
1611               Local Job           0.798549                0.085545   
1915                     PHP           0.703179                0.042534   
862               Data Entry           0.691431                0.011403   
2402   Software Architecture           0.681755                0.093371   
1282          Graphic Design           0.668970                0

In [3]:
# create_matrix.py
import itertools
import pandas as pd
import numpy as np
from collections import defaultdict
import csv # Import the csv module

# --- Configuration ---
INPUT_SKILLS_FILE = "csv/skills_no_duplicate_sorted.csv" # Using the cleaned, sorted skills file
OUTPUT_MATRIX_FILE = "correspendentFinalClean2.csv"

print(f"Reading skills from '{INPUT_SKILLS_FILE}'...")

# Initialize a dictionary to store words and connections as sets to prevent duplicates
words = defaultdict(set)

# Define the chunk size for reading the CSV in parts
chunk_size = 10000
i = 0

# Process CSV in chunks
for chunk in pd.read_csv(INPUT_SKILLS_FILE, header=None, chunksize=chunk_size, low_memory=False):
    i += 1
    print(f"Processing chunk {i}...")
    # Iterate over each row in the chunk
    for row in chunk.itertuples(index=False):
        # Filter out NaN values and convert to list of strings
        parts = [str(item) for item in row if pd.notna(item)]
        # Generate pairwise combinations and update connections
        for a, b in itertools.combinations(parts, 2):
            if a and b:
                words[a].add(b)
                words[b].add(a)

print("Finished processing skill pairs.")

# Convert keys to a list and determine the size of the final dataset
keys = list(words.keys())
size = len(keys)
key_to_index = {key: i for i, key in enumerate(keys)} # Create a mapping for faster lookups

print(f"Found {size} unique skills. Building matrix...")

# Initialize a numpy array to track connections
track = np.zeros((size, size))

# Populate the track array based on the accumulated connections
for i, k in enumerate(keys):
    track[i, i] = len(words[k])  # Self-connection represents the degree of the node
    for j in words[k]:
        if j in key_to_index: # Ensure the skill is in our keys list
            j_index = key_to_index[j]
            track[i, j_index] += 1
            # The matrix is symmetric, so we don't need to add to track[j_index, i] again

print("Matrix built. Normalizing values...")

# Normalize each row in track by dividing each element by its diagonal entry
# We need to handle division by zero for skills that have no connections (track[row,row] == 0)
diagonal = track.diagonal().copy() # Make a copy to avoid modifying it while iterating
diagonal[diagonal == 0] = 1 # Avoid division by zero, the result will be 0 anyway

for row in range(track.shape[0]):
    track[row,:] /= diagonal[row]

print("Normalization complete. Saving to CSV...")

# Create a DataFrame from the track array with labels for rows and columns
track_df = pd.DataFrame(track, index=keys, columns=keys)

# Save the DataFrame to a CSV file. Pandas handles the quoting correctly.
track_df.to_csv(OUTPUT_MATRIX_FILE)

print(f"Successfully created '{OUTPUT_MATRIX_FILE}'!")

Reading skills from 'csv/skills_no_duplicate_sorted.csv'...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
Processing chunk 21...
Processing chunk 22...
Processing chunk 23...
Processing chunk 24...
Processing chunk 25...
Processing chunk 26...
Processing chunk 27...
Processing chunk 28...
Processing chunk 29...
Processing chunk 30...
Processing chunk 31...
Processing chunk 32...
Processing chunk 33...
Processing chunk 34...
Processing chunk 35...
Processing chunk 36...
Processing chunk 37...
Processing chunk 38...
Processing chunk 39...
Processing chunk 40...
Processing chunk 41...
Proces

In [None]:
# generate_matrix.py
import pandas as pd
import numpy as np
import itertools
from collections import defaultdict

# --- Configuration ---
# This is the file where each row represents a set of co-occurring skills for a project/job.
INPUT_SKILLS_FILE = "csv/skillsFreelancerFinal.csv" 

# This will be the final, correctly formatted co-occurrence matrix.
OUTPUT_MATRIX_FILE = "skill_co_occurrence_matrix_with_duplicate.csv"

# --- Main Script ---

print(f"Step 1: Reading skills from '{INPUT_SKILLS_FILE}' and building co-occurrence map.")

# Use defaultdict(set) for efficient and duplicate-free storage of skill connections.
skill_connections = defaultdict(set)
all_skills = set()

# We read the CSV line by line to handle varying numbers of skills per row.
with open(INPUT_SKILLS_FILE, 'r', encoding='utf-8') as f:
    # Skip the header row if it exists.
    # If you are SURE there's no header, you can comment out the next line.
    next(f, None) 
    
    for line in f:
        # Split by comma and strip whitespace/quotes from each skill
        skills_in_row = [skill.strip().strip('"') for skill in line.strip().split(',')]
        
        # Filter out any empty strings that might result from trailing commas
        cleaned_skills = [skill for skill in skills_in_row if skill]
        
        # Add all unique skills to our master set
        all_skills.update(cleaned_skills)
        
        # Generate all unique pairs of skills in this row
        for skill1, skill2 in itertools.combinations(cleaned_skills, 2):
            skill_connections[skill1].add(skill2)
            skill_connections[skill2].add(skill1)

print(f"Step 2: Found {len(all_skills)} unique skills. Creating the co-occurrence matrix.")

# Create a sorted list of keys for consistent matrix ordering
keys = sorted(list(all_skills))
size = len(keys)

# Create a mapping of skill_name -> index for much faster lookups
key_to_index = {key: i for i, key in enumerate(keys)}

# Initialize a numpy array to store the counts
co_occurrence_counts = np.zeros((size, size), dtype=int)

# Populate the matrix with co-occurrence counts
for skill, connections in skill_connections.items():
    i = key_to_index[skill]
    # The diagonal will store the total number of connections (degree) for each skill
    co_occurrence_counts[i, i] = len(connections)
    for connected_skill in connections:
        if connected_skill in key_to_index:
            j = key_to_index[connected_skill]
            # We simply count 1 for each co-occurrence
            co_occurrence_counts[i, j] = 1

print("Step 3: Normalizing the matrix to create correlation scores.")

# Create a copy of the matrix for normalization
# We will divide each cell (i, j) by the total occurrences of skill i.
# This gives P(j|i) - the probability of seeing skill j given that you see skill i.
normalized_matrix = np.zeros((size, size), dtype=float)
for i in range(size):
    total_occurrences = co_occurrence_counts[i, i]
    if total_occurrences > 0:
        normalized_matrix[i, :] = co_occurrence_counts[i, :] / total_occurrences
        normalized_matrix[i, i] = 1.0 # The probability of a skill co-occurring with itself is 1

print("Step 4: Saving the final matrix to CSV.")

# Create a pandas DataFrame to save the result with proper headers and index
final_df = pd.DataFrame(normalized_matrix, index=keys, columns=keys)

# Save to CSV. Pandas will automatically handle quoting for skill names with commas.
final_df.to_csv(OUTPUT_MATRIX_FILE)

print(f"\nSuccess! Your skill intelligence matrix has been saved to '{OUTPUT_MATRIX_FILE}'.")
print("This file is the 'brain' of your application.")
print("\nYou can now use this file as input for the 'generate_centrality.py' script and then the final 'cv_analyzer_app.py'.")

Step 1: Reading skills from 'csv/skills_no_duplicate_sorted.csv' and building co-occurrence map.
Step 2: Found 2895 unique skills. Creating the co-occurrence matrix.
Step 3: Normalizing the matrix to create correlation scores.
Step 4: Saving the final matrix to CSV.

Success! Your skill intelligence matrix has been saved to 'skill_co_occurrence_matrix.csv'.
This file is the 'brain' of your application.

You can now use this file as input for the 'generate_centrality.py' script and then the final 'cv_analyzer_app.py'.


In [6]:
import pandas as pd
import networkx as nx
from networkx.algorithms import community
import collections

# --- Configuration ---
# This is your main co-occurrence matrix.
# It should be the non-transposed version for easier handling here.
# If you only have the transposed one, that's okay, this script will handle it.
INPUT_MATRIX_FILE = 'skill_co_occurrence_matrix.csv' # Using the file from your screenshot
OUTPUT_CLUSTERS_FILE = 'skill_clusters.csv'

# This is the most important parameter to tune.
# It defines the percentage of the WEAKEST skill connections to REMOVE before clustering.
# A higher value (e.g., 0.90) will create more distinct, smaller clusters.
# A lower value (e.g., 0.70) will create fewer, larger clusters.
# Let's start high to ensure we break up that single giant cluster.
PRUNING_QUANTILE = 0.95 # This means we will only keep the top 5% of strongest connections.

# --- Main Script ---

print(f"Step 1: Loading the skill co-occurrence matrix from '{INPUT_MATRIX_FILE}'...")
try:
    df = pd.read_csv(INPUT_MATRIX_FILE, index_col=0)
except FileNotFoundError:
    print(f"FATAL ERROR: The input file '{INPUT_MATRIX_FILE}' was not found.")
    print("Please make sure this script is in the same directory as your CSV file.")
    exit()

# Clean up names to be safe
df.columns = df.columns.map(str)
df.index = df.index.map(str)

print("Step 2: Building the skill network graph from the matrix...")
# Create a graph from the pandas DataFrame
G = nx.from_pandas_adjacency(df)
print(f"Initial graph created with {G.number_of_nodes()} skills and {G.number_of_edges()} connections.")

print("\nStep 3: Pruning the graph to keep only the strongest connections...")
# Get all edge weights and determine the threshold for keeping the top connections
all_weights = [data['weight'] for u, v, data in G.edges(data=True)]
if not all_weights:
    print("Error: No weights found in the graph. Cannot prune.")
    exit()

threshold = pd.Series(all_weights).quantile(PRUNING_QUANTILE)
print(f"Calculated weight threshold: {threshold:.4f}. Edges with weight below this will be removed.")

# Create a new, pruned graph
G_pruned = nx.Graph()
# Add all skills first, so we don't lose any skills that might become isolated
G_pruned.add_nodes_from(G) 

# Add only the edges that are above our threshold
for u, v, data in G.edges(data=True):
    if data['weight'] >= threshold:
        G_pruned.add_edge(u, v, weight=data['weight'])

print(f"Pruned graph has {G_pruned.number_of_nodes()} skills and {G_pruned.number_of_edges()} strong connections.")

print("\nStep 4: Detecting skill communities using the Louvain algorithm...")
# Find communities (clusters) in the pruned graph
# The 'seed' makes the result reproducible
communities_sets = community.louvain_communities(G_pruned, weight='weight', seed=42)
communities_sets = sorted(communities_sets, key=len, reverse=True) # Sort by size

print(f"Successfully identified {len(communities_sets)} distinct skill clusters.")

print("\nStep 5: Formatting and saving the clusters to 'skill_clusters.csv'...")
# Create a dictionary to map each skill to its cluster ID
community_dict = {}
for i, cluster in enumerate(communities_sets):
    for skill in cluster:
        community_dict[skill] = i

# Convert to a DataFrame
community_df = pd.DataFrame(community_dict.items(), columns=["Skill", "Cluster_ID"])
community_df = community_df.sort_values(by="Cluster_ID")

# Save the clusters to a new CSV file
community_df.to_csv(OUTPUT_CLUSTERS_FILE, index=False)

print(f"\nSuccess! New '{OUTPUT_CLUSTERS_FILE}' has been created.")
print("\n--- Preview of the 5 Largest Clusters Found ---")

for i in range(min(5, len(communities_sets))):
    cluster_skills = list(communities_sets[i])
    print(f"\nCluster {i} (Size: {len(cluster_skills)} skills):")
    # Print up to the first 10 skills in the cluster for preview
    print(", ".join(cluster_skills[:10]))

Step 1: Loading the skill co-occurrence matrix from 'skill_co_occurrence_matrix.csv'...
Step 2: Building the skill network graph from the matrix...
Initial graph created with 2895 skills and 526253 connections.

Step 3: Pruning the graph to keep only the strongest connections...
Calculated weight threshold: 0.0090. Edges with weight below this will be removed.
Pruned graph has 2895 skills and 26433 strong connections.

Step 4: Detecting skill communities using the Louvain algorithm...
Successfully identified 387 distinct skill clusters.

Step 5: Formatting and saving the clusters to 'skill_clusters.csv'...

Success! New 'skill_clusters.csv' has been created.

--- Preview of the 5 Largest Clusters Found ---

Cluster 0 (Size: 77 skills):
ChatGPT, Conversational AI, YOLO, GPT-4, Computer Vision, GPT-3, LLaMA 2, Internet Security, Dolly, Generative AI

Cluster 1 (Size: 71 skills):
FL Studio, Maya, Logo Design, 3D Animation, Moho, Arts & Crafts, Icon Design, Rotoscoping, Digital Art, Comics