In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load dataset
file_path = "path/to/PATIENTS_w_AGE.csv"
df = pd.read_csv(file_path)

if "AGE" not in df.columns:
    raise ValueError("The column 'AGE' is missing from the dataset.")

# Extract and clean the age values.
# .copy() makes df an independent frame, so adding the "Age_Bins" column
# below does not trigger pandas' SettingWithCopyWarning on a filtered slice.
df = df[df["AGE"] < 90].copy()
ages = df["AGE"].dropna().astype(int)
if ages.empty:
    raise ValueError("No AGE values below 90 are available for binning.")

# Determine the number of bins (m): square-root rule, scaled down by 16.
num_bins = int(np.ceil(np.sqrt(len(ages))))  # Using sqrt rule for initial bin count
# Clamp to at least one bin: with 225 rows or fewer the floor division
# would give 0 and there would be nothing to bin or plot.
num_bins = max(1, num_bins // 16)

# Apply Equal Frequency Binning.
# q is the number of quantile bins (not the number of edges), and
# retbins=True returns the edges that were actually used after duplicate
# edges are dropped, so the labels, the reported bin count and the
# histogram below all describe the same bins as the "Age_Bins" column.
df["Age_Bins"], bin_edges = pd.qcut(
    ages, q=num_bins, duplicates="drop", retbins=True
)
print(len(bin_edges))
# Ensure bin labels match the number of bins (1 fewer than bin edges)
bin_labels = [f"Bin_{i+1}" for i in range(len(bin_edges) - 1)]
print(len(bin_labels))

# Display bin counts
print(f"Number of Bins Selected: {len(bin_labels)}")
print(df["Age_Bins"].value_counts().sort_index())

# Histogram drawn with the quantile edges, so each bar corresponds to one bin.
plt.figure(figsize=(10, 5))
plt.hist(ages, bins=bin_edges, alpha=0.7, color="skyblue", edgecolor="black")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.title("Equal Frequency Binning Distribution")
plt.grid(axis="y")
plt.savefig("path/to/eq-bin.png")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

file_path = "path/to/PATIENTS_w_AGE.csv"

try:
    df = pd.read_csv(file_path)

    if "AGE" not in df.columns:
        raise ValueError("Column 'AGE' not found in the dataset.")

    # Keep only rows with AGE below 90.
    under_90 = df["AGE"] < 90
    df = df[under_90]

    # K-Means expects a 2-D input, so turn the AGE column into a single-column array.
    age_values = df["AGE"].dropna().values
    ages = age_values.reshape(-1, 1)

    # Candidate cluster counts: 1 through 10.
    k_values = range(1, 11)

    # Within-cluster sum of squares (inertia) for each candidate k.
    wcss = []
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(ages)
        wcss.append(kmeans.inertia_)

    # Elbow plot: WCSS against the number of clusters.
    plt.figure(figsize=(8, 5))
    plt.plot(k_values, wcss, marker='o', linestyle='--', color='b', label="WCSS")
    plt.title("Elbow Method for Optimal K in Age Clustering")
    plt.xlabel("Number of Clusters (k)")
    plt.ylabel("Within-Cluster Sum of Squares (WCSS)")
    plt.xticks(k_values)
    plt.grid(True)
    plt.legend()
    plt.savefig("path/to/cluster-elbow.png")
    plt.show()

# Failures are reported as printed messages rather than raised, so the
# missing-column ValueError above also ends up in the generic handler.
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please check the path.")
except pd.errors.EmptyDataError:
    print("Error: The file is empty or could not be read.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


In [None]:
from sklearn.metrics import silhouette_score

# Candidate cluster counts for the silhouette check: k = 2 .. 6
# (k = 1 is skipped).
optimal_k_values = range(2, 7)

# One silhouette score per candidate k, in the same order as optimal_k_values.
silhouette_scores = []
for k in optimal_k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(ages)
    silhouette_scores.append(silhouette_score(ages, labels))

# Silhouette score against the number of clusters.
plt.figure(figsize=(8, 5))
plt.plot(
    optimal_k_values,
    silhouette_scores,
    marker='o',
    linestyle='--',
    color='r',
    label="Silhouette Score",
)
plt.title("Silhouette Method for Optimal K in Age Clustering")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Silhouette Score")
plt.xticks(optimal_k_values)
plt.grid(True)
plt.legend()
plt.savefig("path/to/cluster-Silhouette.png")
plt.show()

# The k with the highest silhouette score; left as the last expression so
# the notebook displays it.
best_index = int(np.argmax(silhouette_scores))
best_k = optimal_k_values[best_index]
best_k


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

file_path = "path/to/PATIENTS_w_AGE.csv"

df = pd.read_csv(file_path)

if "AGE" not in df.columns:
    raise ValueError("The column 'AGE' is missing from the dataset.")

# Extract age values.
# .copy() makes df an independent frame, so the "Age_Bins" and "KMeans_Bins"
# columns added below do not trigger pandas' SettingWithCopyWarning.
df = df[df["AGE"] < 90].copy()
ages = df["AGE"].dropna().astype(int)

# Compute descriptive statistics
age_min = ages.min()
age_max = ages.max()
age_mean = ages.mean()
age_median = ages.median()
age_mode = ages.mode().values[0]  # first value when several modes tie
percentiles = np.percentile(ages, [10, 25, 50, 75, 90])

print(f"Min Age: {age_min}, Max Age: {age_max}")
print(f"Mean Age: {age_mean:.2f}, Median Age: {age_median}, Mode Age: {age_mode}")
print(f"10th, 25th, 50th, 75th, 90th Percentiles: {percentiles}")

plt.figure(figsize=(10, 5))
plt.hist(ages, bins=30, alpha=0.7, color="skyblue", edgecolor="black")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.title("Age Distribution Histogram")
plt.grid(axis="y")
# Saved under its own name with the same relative placeholder directory as
# the other figures. The previous target was a machine-specific absolute
# path whose file name (eq-bin.png) is the one used for the
# equal-frequency-binning figure.
plt.savefig("path/to/age-hist.png")

# Optimal Grouping 1: Equal Frequency Binning
num_bins = 3  #adjust according to above code cell output 
df["Age_Bins"] = pd.qcut(ages, num_bins, duplicates="drop")

# Display bin counts
print("Equal Frequency Bins:")
print(df["Age_Bins"].value_counts().sort_index())

# Optimal Grouping 2: K-Means Clustering (Dynamic Grouping)
num_clusters = 3 #adjust according to above code cell output 
# KMeans expects a 2-D array, so the ages become a single-column matrix.
ages_reshaped = np.array(ages).reshape(-1, 1)
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
df["KMeans_Bins"] = kmeans.fit_predict(ages_reshaped)

# Display cluster centers
print("K-Means Cluster Centers:")
print(sorted(kmeans.cluster_centers_.flatten()))

# Visualizing K-Means Clusters: every age is drawn on a single horizontal
# line (y = 0) and coloured by its cluster label.
plt.figure(figsize=(10, 5))
plt.scatter(ages, np.zeros_like(ages), c=kmeans.labels_, cmap="viridis", alpha=0.6)
plt.xlabel("Age")
plt.title("K-Means Age Grouping")
plt.savefig("path/to/k-means.png")


In [None]:
import pandas as pd

file_path = "path/to/PATIENTS_w_AGE.csv"
df = pd.read_csv(file_path)

# Stop early when the column the grouping relies on is absent.
has_age_column = "AGE" in df.columns
if not has_age_column:
    raise ValueError("The column 'AGE' is missing from the dataset.")
# K-means cluster centers (presumably copied from the clustering cell's output):
# [34.25287356321835, 72.90306854029248, 78.38907849829354]
# Define the age grouping accordingly.
def assign_age_group(age):
    """Map an age to its group label.

    Returns "group1" for age <= 23, "group2" for 23 < age <= 58 and
    "group3" for 58 < age <= 78. Any other value (an age above 78, or NaN,
    for which every comparison is false) yields None.
    """
    # Upper bounds are checked in ascending order, so each test only needs
    # the upper limit of its own group.
    if age <= 23:
        return "group1"
    if age <= 58:
        return "group2"
    if age <= 78:
        return "group3"
    return None

# Label every row with its age group. Rows whose AGE is not covered by
# assign_age_group (above 78, or missing) receive no label.
df["Lifespan_Period"] = df["AGE"].apply(assign_age_group)

# Save the updated CSV with the new column
output_file = "/path/to/grouped_ages.csv"
df.to_csv(output_file, index=False)

print("Updated CSV saved as:", output_file)
# dropna=False keeps the unlabelled rows in the tally, so they show up in
# the summary instead of being silently left out of the counts.
print(df["Lifespan_Period"].value_counts(dropna=False))


In [None]:
#jaccard similarity calculations
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Candidate (start, end) intervals that every age group is compared against.
possible_intervals = [
    (0, 0),
    (0, 17),
    (12, 55),
]

# Age groups as (start, end) bounds, keyed by group label.
age_groups = dict(
    group1=(0, 23),
    group2=(23, 58),
    group3=(58, 78),
)

# Function to compute Jaccard similarity between two intervals
def jaccard_similarity(interval1, interval2):
    """Return the Jaccard similarity of two (start, end) intervals.

    The score is the length of the overlap divided by the length of the
    union. Intervals that do not overlap score 0, and so does a pair whose
    union has zero length.
    """
    lo1, hi1 = interval1
    lo2, hi2 = interval2

    # Overlap length; a non-positive span means the intervals do not overlap.
    span = min(hi1, hi2) - max(lo1, lo2)
    overlap = span if span > 0 else 0

    # Union length = sum of both lengths minus the part counted twice.
    combined = (hi1 - lo1) + (hi2 - lo2) - overlap
    if combined == 0:
        return 0
    return overlap / combined

# Score every age group against every candidate interval:
# one list of scores per group, ordered like possible_intervals.
jaccard_scores = {}
for group, bounds in age_groups.items():
    jaccard_scores[group] = [
        jaccard_similarity(bounds, interval) for interval in possible_intervals
    ]

# Tabulate: one column per group, one row per candidate interval
# (rows are labelled with the interval's text form).
row_labels = [str(interval) for interval in possible_intervals]
jaccard_df = pd.DataFrame(jaccard_scores, index=row_labels)

# Display results
print("Jaccard Similarity Scores:\n", jaccard_df)
