In [2]:
# Notebook: Profile Weighted Clusters Using Manually Updated Interpretation Guide
import pandas as pd

# --- Step 1: Load Datasets ---
print("Loading datasets...")

# Input CSV locations, relative to the notebook's working directory.
clustering_results_path = 'data/kmeans_cosine_clustering_results_4_1-1_threshold_0.25_n4.csv'
interpretation_guide_path = 'data/New_Manually_Updated_Correlation_Interpretation_Guide.csv'
clustering_input_path = 'data/clustering_input_final_0.25.csv'

# Read the three files in order: per-state cluster labels, the metric-pair
# interpretation guide, and the per-state metric correlations.
df_clusters, df_interpretation_guide, df_clustering = (
    pd.read_csv(csv_path)
    for csv_path in (clustering_results_path, interpretation_guide_path, clustering_input_path)
)

# --- Step 2: Create Interpretation Dictionary ---
print("Creating interpretation dictionary...")
# Map each unordered metric pair to its two interpretation texts. The pair is
# sorted so (A, B) and (B, A) share one key; if the guide repeats a pair, the
# last row wins.
interpretation_dict = {
    tuple(sorted([metric_a, metric_b])): {
        "Positive": positive_text,
        "Negative": negative_text,
    }
    for metric_a, metric_b, positive_text, negative_text in zip(
        df_interpretation_guide['Metric 1'],
        df_interpretation_guide['Metric 2'],
        df_interpretation_guide['Positive Correlation Interpretation'],
        df_interpretation_guide['Negative Correlation Interpretation'],
    )
}

# --- Step 3: Generate Cluster Profiles ---
# Builds cluster_profiles: cluster label -> newline-joined bullet list, one
# bullet per (state, metric pair) row of df_clustering whose state belongs to
# the cluster and whose metric pair has an entry in interpretation_dict.
print("Generating cluster profiles...")
cluster_profiles = {}
# dropna(): a NaN label never compares equal to itself, so it could only ever
# produce an empty profile row. The groupby in Step 5 also skips NaN labels.
for cluster in df_clusters['Cluster'].dropna().unique():
    cluster_df = df_clusters[df_clusters['Cluster'] == cluster]
    states_in_cluster = cluster_df['State'].tolist()
    cluster_data = df_clustering[df_clustering['State'].isin(states_in_cluster)]

    report = []
    for _, row in cluster_data.iterrows():
        correlation = row['Correlation']
        # A missing or exactly-zero correlation has no direction, so neither
        # the positive nor the negative interpretation applies; previously
        # such rows fell through to the "Negative" text.
        if pd.isna(correlation) or correlation == 0:
            continue

        # Sorted pair so the lookup is symmetric in Metric 1 / Metric 2.
        key = tuple(sorted([row['Metric 1'], row['Metric 2']]))
        entry = interpretation_dict.get(key)
        if entry is None:
            # Metric pair not covered by the interpretation guide.
            continue

        interpretation = entry["Positive"] if correlation > 0 else entry["Negative"]
        report.append(f"- {row['Metric 1']} & {row['Metric 2']}: {interpretation} (Corr: {correlation:.2f})")

    cluster_profiles[cluster] = "\n".join(report)

# --- Step 4: Save Cluster Profiles as a CSV ---
print("Saving cluster profiles...")
# One row per cluster: its label and the newline-joined profile text.
df_cluster_profiles = pd.DataFrame(
    {
        'Cluster': list(cluster_profiles.keys()),
        'Profile': list(cluster_profiles.values()),
    }
)
df_cluster_profiles.to_csv("data/cluster_profiles_weighted_updated_0.25_n4.csv", index=False)

# --- Step 5: Print States in Each Cluster ---
print("\nStates in Each Cluster:")
# Series indexed by cluster label; each value is the list of that cluster's states.
clustered_states = df_clusters.groupby("Cluster")["State"].apply(list)
separator = "-" * 50  # Visual divider between clusters
for cluster_label, member_states in clustered_states.items():
    # Header line, comma-separated state list, then the divider.
    print(f"Cluster {cluster_label}:", ", ".join(member_states), separator, sep="\n")

print("Cluster profiling completed successfully using manually updated interpretations!")

Loading datasets...
Creating interpretation dictionary...
Generating cluster profiles...
Saving cluster profiles...

States in Each Cluster:
Cluster 0.0:
Alabama, Arkansas, Colorado, Connecticut, Delaware, Illinois, Kansas, Maryland, Missouri, Montana, New Hampshire, New Mexico, New York, Pennsylvania, South Carolina, Tennessee, Utah, Virginia, Washington
--------------------------------------------------
Cluster 1.0:
Arizona, Florida, Georgia, Idaho, Indiana, Maine, Minnesota, Nevada, Oregon, Texas, Vermont, Wisconsin
--------------------------------------------------
Cluster 2.0:
Alaska, Iowa, Kentucky, Mississippi, North Carolina, Oklahoma, West Virginia, Wyoming
--------------------------------------------------
Cluster 3.0:
District of Columbia, Massachusetts, Michigan, Nebraska, Rhode Island, South Dakota
--------------------------------------------------
Cluster profiling completed successfully using manually updated interpretations!


In [3]:
# --- Step 6: Identify Missing States ---
# Compares the states present in the original dataset with the states that
# appear in the clustering results and prints those that were dropped.
print("\nIdentifying states missing from the clustering process...")

# Load the original dataset (before filtering)
original_dataset_path = 'data/Multiple_Cause_of_Death,_1999-2014_v1.1.csv'
df_original = pd.read_csv(original_dataset_path)

# Extract state sets. Missing values are dropped first: a NaN entry is a float
# that never equals another NaN, so it would survive the set difference, be
# reported as a "missing state", and make sorted()/", ".join() raise TypeError.
original_states = set(df_original["State"].dropna().unique())  # All states in the original dataset
clustered_states = set(df_clusters["State"].dropna().unique())  # States that were clustered

# States present originally but absent from the clustering results
missing_states = original_states - clustered_states

# Print missing states if any
if missing_states:
    print("States missing from clustering results:")
    print(", ".join(sorted(missing_states)))  # Print in alphabetical order for readability
else:
    print("No states were lost during clustering.")


Identifying states missing from the clustering process...
States missing from clustering results:
California, Hawaii, Louisiana, New Jersey, North Dakota, Ohio
