In [None]:
import pandas as pd
import glob
import os

# Inspect every CSV file under the working directory (searched recursively) and
# write a plain-text report containing each file's shape and first rows.
# The list is sorted so the report order is stable across runs and filesystems
# (glob itself returns entries in arbitrary directory order).
csv_files = sorted(glob.glob('**/*.csv', recursive=True))

output_txt = 'csv_inspection_results.txt'  # Name of the output file

if not csv_files:
    print("No CSV files found. Please make sure your CSV files are in the working directory or its subdirectories.")
else:
    print(f"Found {len(csv_files)} CSV files. Inspecting each one...\n")

    # Open the report once for the whole run. UTF-8 is requested explicitly:
    # with the platform default encoding, a non-ASCII cell value in any CSV
    # could raise UnicodeEncodeError while the report is being written.
    with open(output_txt, 'w', encoding='utf-8') as f:
        f.write(f"CSV Inspection Report - Found {len(csv_files)} CSV files\n\n")

        for file_path in csv_files:
            print("--------------------------------------------------")
            print(f"File: {file_path}")
            print("--------------------------------------------------")

            try:
                # Malformed rows are skipped so one bad line does not hide the rest of the file.
                df = pd.read_csv(file_path, on_bad_lines='skip')

                # Build the whole entry in memory first, so a failure while
                # formatting never leaves a half-written entry in the report.
                report_entry = (
                    f"File: {file_path}\n"
                    f"Shape: {df.shape}\n\n"
                    "Head:\n"
                    f"{df.head().to_string()}"
                    "\n\n"
                )
            except Exception as e:
                # Best effort: record the failure and carry on with the next file.
                f.write(f"File: {file_path}\n")
                f.write(f"Error: {e}\n\n")
                print(f"Could not read or process file. Error: {e}\n")
            else:
                f.write(report_entry)
                print(f"Results for {file_path} written to {output_txt}\n")

    print(f"All results saved to {output_txt}")


In [None]:
import pandas as pd
import networkx as nx
import numpy as np

# --- Configuration ---
# Input: a square skill co-occurrence matrix stored as CSV, with skill names in
# both the first column (row labels) and the header row (column labels).
INPUT_FILE = 'skill_co_occurrence_matrix.csv'
# Output: one row per skill with its degree, betweenness and eigenvector centrality.
OUTPUT_FILE = 'centrality_measures.csv'

print(f"Reading skill relationship data from '{INPUT_FILE}'...")

try:
    # Load the co-occurrence matrix. The first column contains the skill names, so it becomes the index.
    df = pd.read_csv(INPUT_FILE, index_col=0)
except FileNotFoundError:
    print(f"FATAL ERROR: The input file '{INPUT_FILE}' was not found.")
    print("Please make sure this script is in the same directory as your CSV file.")
    # Raise SystemExit instead of calling exit(): inside a notebook exit() asks
    # the kernel to shut down (losing all state), and in plain Python it is only
    # a convenience injected by the `site` module. Raising stops this cell/script
    # in both environments and reports a non-zero status.
    raise SystemExit(1) from None

# Normalise all labels to strings so row and column labels compare equal.
df.columns = df.columns.astype(str)
df.index = df.index.astype(str)

# Edge construction looks cells up by label, which is only well defined when
# every label is unique.
if not (df.index.is_unique and df.columns.is_unique):
    raise ValueError(f"Skill names in '{INPUT_FILE}' must be unique in both the rows and the header.")

print("Building the skill network graph. This may take a moment...")

# Create the graph and register every skill as a node, so skills without any
# connection still appear in the results.
G = nx.Graph()
G.add_nodes_from(df.index)

# Add edges between skills with their co-occurrence score as the 'weight'.
# Only the strict upper triangle is used, so each unordered pair is added once
# and the diagonal (a skill with itself) is ignored. Rows are aligned to the
# column order by label in a single step (KeyError if a column has no matching
# row), then positive cells are located with NumPy rather than one pandas
# lookup per pair.
skills = list(df.columns)
weights = df.loc[skills, skills].to_numpy()
rows, cols = np.nonzero(np.triu(weights > 0, k=1))
G.add_weighted_edges_from(
    (skills[r], skills[c], weights[r, c]) for r, c in zip(rows, cols)
)

print(f"Graph created successfully with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
print("-" * 30)
print("Calculating centrality measures...")

# --- Centrality Calculations ---

# 1. Degree Centrality: How many direct connections a skill has.
print("Calculating Degree Centrality...")
degree_centrality = nx.degree_centrality(G)

# 2. Betweenness Centrality: Identifies "bridge" skills.
# NOTE: This is computationally intensive and can take a few minutes on a large graph.
# Shortest-path algorithms treat the edge attribute as a distance (larger means
# further apart), whereas our weights are similarities (larger means closer),
# so each edge gets a 'distance' of 1/weight. Every edge was created with
# weight > 0, so the division is always defined.
for _, _, data in G.edges(data=True):
    data['distance'] = 1.0 / data['weight']

print("Calculating Betweenness Centrality (this might take a while)...")
betweenness_centrality = nx.betweenness_centrality(G, weight='distance', normalized=True)

# 3. Eigenvector Centrality: Measures influence.
print("Calculating Eigenvector Centrality...")
# Power iteration can fail to converge; in that case fall back to zeros so the
# other measures are still saved.
try:
    eigenvector_centrality = nx.eigenvector_centrality(G, weight='weight', max_iter=1000)
except nx.PowerIterationFailedConvergence:
    print("Eigenvector centrality did not converge. Filling with 0.")
    eigenvector_centrality = {node: 0.0 for node in G.nodes()}

print("Centrality calculations complete.")
print("-" * 30)

# --- Assemble and Save the Results ---

# One row per node, in graph node order.
centrality_df = pd.DataFrame({
    "Skill": list(G.nodes),
    "Degree Centrality": [degree_centrality.get(node, 0) for node in G.nodes()],
    "Betweenness Centrality": [betweenness_centrality.get(node, 0) for node in G.nodes()],
    "Eigenvector Centrality": [eigenvector_centrality.get(node, 0) for node in G.nodes()]
})

# Sort by a primary centrality measure for easier viewing
centrality_df = centrality_df.sort_values(by="Degree Centrality", ascending=False)

# Save the final DataFrame to a new CSV file
centrality_df.to_csv(OUTPUT_FILE, index=False)

print(f"Successfully created '{OUTPUT_FILE}'.")
print("\nHere's a preview of the top 10 most central skills:")
print(centrality_df.head(10))

In [None]:
# create_matrix.py
import itertools
import pandas as pd
import numpy as np
from collections import defaultdict
import csv # Import the csv module

# --- Configuration ---
INPUT_SKILLS_FILE = "csv/skills_no_duplicate_sorted.csv"  # headerless CSV, one group of skills per row
OUTPUT_MATRIX_FILE = "correspendentFinalClean2.csv"

print(f"Reading skills from '{INPUT_SKILLS_FILE}'...")

# skill -> set of skills seen in the same row (sets keep the links duplicate-free)
words = defaultdict(set)

# Number of rows pulled from the CSV per chunk
chunk_size = 10000

# Stream the CSV chunk by chunk and record every pair of skills sharing a row
chunk_reader = pd.read_csv(INPUT_SKILLS_FILE, header=None, chunksize=chunk_size, low_memory=False)
for chunk_number, chunk in enumerate(chunk_reader, start=1):
    print(f"Processing chunk {chunk_number}...")
    for record in chunk.itertuples(index=False):
        # Drop missing cells and treat every remaining value as text
        parts = [str(cell) for cell in record if pd.notna(cell)]
        for left, right in itertools.combinations(parts, 2):
            if not (left and right):
                continue
            # Record the link in both directions
            words[left].add(right)
            words[right].add(left)

print("Finished processing skill pairs.")

# Fix the skill order (first-seen order) and index it for constant-time lookups
keys = list(words.keys())
size = len(keys)
key_to_index = {name: position for position, name in enumerate(keys)}

print(f"Found {size} unique skills. Building matrix...")

# Square matrix of link indicators; the diagonal starts as each skill's degree
track = np.zeros((size, size))

for row_idx, skill in enumerate(keys):
    neighbours = words[skill]
    track[row_idx, row_idx] = len(neighbours)
    for neighbour in neighbours:
        col_idx = key_to_index.get(neighbour)
        if col_idx is not None:
            # Only this row is touched; the mirrored cell is filled when the
            # neighbour's own row is processed.
            track[row_idx, col_idx] += 1

print("Matrix built. Normalizing values...")

# Divide every row by its diagonal entry. Work from a copy of the diagonal and
# replace zeros with one so that rows without links are left as all zeros
# instead of triggering a division by zero.
diagonal = track.diagonal().copy()
diagonal[diagonal == 0] = 1
track /= diagonal[:, np.newaxis]

print("Normalization complete. Saving to CSV...")

# Label rows and columns with the skill names and let pandas handle CSV quoting
track_df = pd.DataFrame(track, index=keys, columns=keys)
track_df.to_csv(OUTPUT_MATRIX_FILE)

print(f"Successfully created '{OUTPUT_MATRIX_FILE}'!")

In [None]:
# generate_matrix.py
import pandas as pd
import numpy as np
import itertools
from collections import defaultdict

import csv  # stdlib CSV parser; needed below and not part of this cell's import block

# --- Configuration ---
# This is the file where each row represents a set of co-occurring skills for a project/job.
INPUT_SKILLS_FILE = "csv/skillsFreelancerFinal.csv"

# This will be the final, correctly formatted co-occurrence matrix.
OUTPUT_MATRIX_FILE = "skill_co_occurrence_matrix_with_duplicate.csv"

# --- Main Script ---

print(f"Step 1: Reading skills from '{INPUT_SKILLS_FILE}' and building co-occurrence map.")

# skill -> set of distinct skills that appeared in the same row at least once.
skill_connections = defaultdict(set)
all_skills = set()

# Rows may hold a varying number of skills, so the file is parsed record by
# record with csv.reader rather than loaded as a rectangular table. A real CSV
# parser is required because a quoted skill name may itself contain a comma;
# splitting the line on ',' would cut such a name into pieces.
# newline='' is what the csv module expects from the file object.
with open(INPUT_SKILLS_FILE, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, skipinitialspace=True)

    # Skip the header row if it exists.
    # If you are SURE there's no header, you can comment out the next line.
    next(reader, None)

    for fields in reader:
        # Trim surrounding whitespace and drop empty fields (e.g. from trailing commas).
        cleaned_skills = [skill for skill in (field.strip() for field in fields) if skill]

        # Add all unique skills to our master set
        all_skills.update(cleaned_skills)

        # Link every pair of skills in this row, in both directions.
        for skill1, skill2 in itertools.combinations(cleaned_skills, 2):
            # A skill listed twice in one row would otherwise be linked to
            # itself, inflating its degree and clobbering the diagonal below.
            if skill1 == skill2:
                continue
            skill_connections[skill1].add(skill2)
            skill_connections[skill2].add(skill1)

print(f"Step 2: Found {len(all_skills)} unique skills. Creating the co-occurrence matrix.")

# Create a sorted list of keys for consistent matrix ordering
keys = sorted(all_skills)
size = len(keys)

# Create a mapping of skill_name -> index for much faster lookups
key_to_index = {key: i for i, key in enumerate(keys)}

# Off-diagonal cells are binary link indicators (1 = the two skills shared at
# least one row); the diagonal holds each skill's degree.
co_occurrence_counts = np.zeros((size, size), dtype=int)

for skill, connections in skill_connections.items():
    i = key_to_index[skill]
    for connected_skill in connections:
        co_occurrence_counts[i, key_to_index[connected_skill]] = 1
    # Written last so nothing in the loop above can overwrite the degree.
    co_occurrence_counts[i, i] = len(connections)

print("Step 3: Normalizing the matrix to create correlation scores.")

# Row-normalise by degree: each skill linked to skill i receives 1/degree(i),
# so the off-diagonal entries of a row sum to 1. Note this is based on distinct
# links, not on how often two skills appeared together. Skills that never
# shared a row with another skill keep an all-zero row.
normalized_matrix = np.zeros((size, size), dtype=float)
for i in range(size):
    degree = co_occurrence_counts[i, i]
    if degree > 0:
        normalized_matrix[i, :] = co_occurrence_counts[i, :] / degree
        normalized_matrix[i, i] = 1.0  # A connected skill is scored 1 against itself

print("Step 4: Saving the final matrix to CSV.")

# Create a pandas DataFrame to save the result with proper headers and index
final_df = pd.DataFrame(normalized_matrix, index=keys, columns=keys)

# Save to CSV. Pandas will automatically handle quoting for skill names with commas.
final_df.to_csv(OUTPUT_MATRIX_FILE)

print(f"\nSuccess! Your skill intelligence matrix has been saved to '{OUTPUT_MATRIX_FILE}'.")
print("This file is the 'brain' of your application.")
print("\nYou can now use this file as input for the 'generate_centrality.py' script and then the final 'cv_analyzer_app.py'.")

In [None]:
import pandas as pd
import networkx as nx
from networkx.algorithms import community
import collections

# --- Configuration ---
# Square skill co-occurrence matrix (skill names as row and column labels).
# It is loaded into an undirected graph below.
INPUT_MATRIX_FILE = 'skill_co_occurrence_matrix.csv'
OUTPUT_CLUSTERS_FILE = 'skill_clusters.csv'

# This is the most important parameter to tune.
# It defines the percentage of the WEAKEST skill connections to REMOVE before clustering.
# A higher value (e.g., 0.90) will create more distinct, smaller clusters.
# A lower value (e.g., 0.70) will create fewer, larger clusters.
PRUNING_QUANTILE = 0.95 # This means we will only keep the top 5% of strongest connections.

# --- Main Script ---

print(f"Step 1: Loading the skill co-occurrence matrix from '{INPUT_MATRIX_FILE}'...")
try:
    df = pd.read_csv(INPUT_MATRIX_FILE, index_col=0)
except FileNotFoundError:
    print(f"FATAL ERROR: The input file '{INPUT_MATRIX_FILE}' was not found.")
    print("Please make sure this script is in the same directory as your CSV file.")
    # Raise SystemExit instead of calling exit(): inside a notebook exit() asks
    # the kernel to shut down, and in plain Python it is only a convenience
    # injected by the `site` module. Raising stops this cell/script in both.
    raise SystemExit(1) from None

# Normalise labels to strings so row and column labels match
df.columns = df.columns.map(str)
df.index = df.index.map(str)

print("Step 2: Building the skill network graph from the matrix...")
# Every non-zero cell of the matrix becomes a weighted edge.
G = nx.from_pandas_adjacency(df)
# Non-zero diagonal cells become self-loops. A skill's link to itself says
# nothing about clustering, and leaving those edges in would skew the weight
# quantile used for pruning and carry over into community detection.
G.remove_edges_from(list(nx.selfloop_edges(G)))
print(f"Initial graph created with {G.number_of_nodes()} skills and {G.number_of_edges()} connections.")

print("\nStep 3: Pruning the graph to keep only the strongest connections...")
# Get all edge weights and determine the threshold for keeping the top connections
all_weights = [data['weight'] for u, v, data in G.edges(data=True)]
if not all_weights:
    print("Error: No weights found in the graph. Cannot prune.")
    raise SystemExit(1)

threshold = pd.Series(all_weights).quantile(PRUNING_QUANTILE)
print(f"Calculated weight threshold: {threshold:.4f}. Edges with weight below this will be removed.")

# Create a new, pruned graph
G_pruned = nx.Graph()
# Add all skills first, so we don't lose any skills that might become isolated
G_pruned.add_nodes_from(G)

# Keep only the edges at or above the threshold (ties at the threshold are kept)
for u, v, data in G.edges(data=True):
    if data['weight'] >= threshold:
        G_pruned.add_edge(u, v, weight=data['weight'])

print(f"Pruned graph has {G_pruned.number_of_nodes()} skills and {G_pruned.number_of_edges()} strong connections.")

print("\nStep 4: Detecting skill communities using the Louvain algorithm...")
# Find communities (clusters) in the pruned graph
# The 'seed' makes the result reproducible
communities_sets = community.louvain_communities(G_pruned, weight='weight', seed=42)
communities_sets = sorted(communities_sets, key=len, reverse=True) # Largest cluster gets ID 0

print(f"Successfully identified {len(communities_sets)} distinct skill clusters.")

print(f"\nStep 5: Formatting and saving the clusters to '{OUTPUT_CLUSTERS_FILE}'...")
# Map each skill to the ID (rank by size) of the cluster that contains it
community_dict = {}
for i, cluster in enumerate(communities_sets):
    for skill in cluster:
        community_dict[skill] = i

# Convert to a DataFrame. Sorting on the skill name as well keeps the row order
# identical between runs; set iteration order alone is not stable.
community_df = pd.DataFrame(list(community_dict.items()), columns=["Skill", "Cluster_ID"])
community_df = community_df.sort_values(by=["Cluster_ID", "Skill"])

# Save the clusters to a new CSV file
community_df.to_csv(OUTPUT_CLUSTERS_FILE, index=False)

print(f"\nSuccess! New '{OUTPUT_CLUSTERS_FILE}' has been created.")
print("\n--- Preview of the 5 Largest Clusters Found ---")

for i in range(min(5, len(communities_sets))):
    # Alphabetical order so the preview is the same on every run
    cluster_skills = sorted(communities_sets[i])
    print(f"\nCluster {i} (Size: {len(cluster_skills)} skills):")
    # Print up to the first 10 skills in the cluster for preview
    print(", ".join(cluster_skills[:10]))