In [None]:
# ===================================================================================
# DAY 1: PARKINSON'S ANALYZER - V6.0 (AI-DRIVEN, CONTEXT-AWARE & EXPLAINABLE)
# ===================================================================================

# --- Part 1: Imports ---
import pandas as pd
import numpy as np
from dtaidistance import dtw, dtw_visualisation
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt

print("✅ Step 1: Libraries imported.")

# --- Part 2: Data Loading & Cleaning ---
column_names = [
    'subject#', 'age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS', 'Jitter(%)', 'Jitter(Abs)',
    'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP', 'Shimmer', 'Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
    'Shimmer:APQ11', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE'
]
df = pd.read_csv('parkinsons_updrs.data', names=column_names)
# Now we keep age and sex for contextual filtering.
# .copy() makes the subset an independent frame, so the assignments below
# do not write into a view of the full table.
df = df[['subject#', 'age', 'sex', 'test_time', 'total_UPDRS']].copy()
# Coerce EVERY retained column to numeric. Because explicit `names` are passed,
# a header line in the file is read as a data row; coercion turns it into NaN
# so dropna() removes it. Coercing 'subject#' and 'sex' as well keeps them from
# staying as strings, which would make comparisons such as `df['sex'] == 0`
# silently match nothing.
df = df.apply(pd.to_numeric, errors='coerce')
df.dropna(inplace=True)
# Identifiers and the sex flag are integer codes; restore integer dtype after
# the NaN rows (which forced float) are gone.
df[['subject#', 'sex']] = df[['subject#', 'sex']].astype(int)
print(f"✅ Step 2: Data loaded & cleaned. Found {df['subject#'].nunique()} unique patients.")

# --- Part 3: Interpolation and Trajectory Preparation ---
def normalize_and_interpolate(patient_df, points=100):
    """Resample one patient's UPDRS history onto a fixed-length, normalized time grid.

    Args:
        patient_df: DataFrame with 'test_time' and 'total_UPDRS' columns
            (one row per recording; rows need not be sorted or unique in time).
        points: Number of evenly spaced samples to return over normalized
            time [0, 1].

    Returns:
        A 1-D NumPy array of length ``points`` with linearly interpolated
        scores, or ``None`` when fewer than two distinct time stamps are
        available (no trajectory can be drawn).
    """
    if len(patient_df) < 2:
        return None

    # Several recordings may share the same test_time. interp1d divides by the
    # gap between neighbouring x values, so duplicates yield 0/0 -> NaN (the
    # "slope = (y_hi - y_lo) / (x_hi - x_lo)" warning). Collapse them to their
    # mean; groupby also returns the time stamps sorted in ascending order.
    per_time = patient_df.groupby('test_time')['total_UPDRS'].mean().dropna()
    if len(per_time) < 2:
        return None

    times = per_time.index.to_numpy(dtype=float)
    scores = per_time.to_numpy(dtype=float)
    min_time, max_time = times[0], times[-1]
    time_normalized = (times - min_time) / (max_time - min_time)

    interp_func = interp1d(time_normalized, scores, kind='linear', fill_value="extrapolate")
    return interp_func(np.linspace(0, 1, points))

# Create a master dictionary of all valid trajectories.
# Each patient is interpolated exactly once; patients for whom no trajectory
# can be built (None) are skipped.
all_trajectories = {}
for sid, pdf in df.groupby('subject#'):
    trajectory = normalize_and_interpolate(pdf)
    if trajectory is not None:
        all_trajectories[sid] = trajectory

# --- FIX for K-Means NaN Error ---
# Flag trajectories containing NaN or +/-inf (neither is accepted by the scaler
# or K-Means) and remove them from the dictionary itself, so the matching engine,
# which reads all_trajectories, never sees a trajectory the clustering rejected.
nan_rows_mask = np.array(
    [not np.all(np.isfinite(traj)) for traj in all_trajectories.values()], dtype=bool
)
if nan_rows_mask.any():
    print(f"Warning: Found and removed {np.sum(nan_rows_mask)} trajectories with NaN values before clustering.")
    all_trajectories = {
        sid: traj
        for (sid, traj), is_bad in zip(all_trajectories.items(), nan_rows_mask)
        if not is_bad
    }
# --- End of FIX ---

# Derive the id list and the matrix from the cleaned dictionary so that all
# three structures stay aligned row-for-row.
subject_ids = list(all_trajectories.keys())
trajectory_matrix = np.array(list(all_trajectories.values()))

print("✅ Step 3: All patient trajectories standardized.")

# --- UPGRADE 1: K-MEANS CLUSTERING TO DISCOVER COHORTS ---
# Standardize the trajectory matrix column by column before clustering.
scaler = StandardScaler()
scaled_trajectories = scaler.fit_transform(trajectory_matrix)
# Split the patients into two clusters; the fixed seed keeps runs repeatable.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
clusters = kmeans.fit_predict(scaled_trajectories)

# One row per clustered patient: subject id plus raw cluster label.
cohort_df = pd.DataFrame({'subject#': subject_ids, 'cluster': clusters})


def _progression_rate(visits):
    """Return a patient's UPDRS range divided by the observed time span (0 if no time elapsed)."""
    elapsed = visits['test_time'].max() - visits['test_time'].min()
    if elapsed > 0:
        return (visits['total_UPDRS'].max() - visits['total_UPDRS'].min()) / elapsed
    return 0


# Label the clusters: the one with the higher mean rate is called "Fast".
rates_df = df.groupby('subject#').apply(_progression_rate).reset_index(name='rate')
cohort_df = cohort_df.merge(rates_df, on='subject#')
cluster_avg_rates = cohort_df.groupby('cluster')['rate'].mean()
fast_cluster_id = cluster_avg_rates.idxmax()
is_fast_cluster = cohort_df['cluster'] == fast_cluster_id
cohort_df['cohort'] = np.where(
    is_fast_cluster,
    'Fast Progressor (AI-Discovered)',
    'Slow Progressor (AI-Discovered)',
)

# Attach the cohort label to every recording of the main table.
df = df.merge(cohort_df[['subject#', 'cohort']], on='subject#')
fast_progressors_df = df[df['cohort'].str.contains('Fast')]
slow_progressors_df = df[df['cohort'].str.contains('Slow')]
print("✅ Step 4: AI-driven cohorts discovered via K-Means clustering.")

# --- Part 5: The Definitive, CONTEXT-AWARE Comparison Engine ---
def find_best_match(new_patient_series, patient_age=None, patient_sex=None, age_window=5):
    """Find the known patient whose UPDRS trajectory is closest (by DTW) to a new one.

    Args:
        new_patient_series: Sequence of UPDRS scores for the new patient, assumed
            to be equally spaced in time (positions are used as time stamps).
        patient_age: Optional age; when given, only patients whose age lies
            within ``age_window`` of it are considered.
        patient_sex: Optional sex code; when given, only patients with the same
            code are considered.
        age_window: Half-width of the accepted age range.

    Returns:
        ``None`` when the new series cannot be interpolated or no candidate
        trajectory yields a usable distance. Otherwise a dict with keys
        "cohort", "match_id", "confidence", "path", "new_patient_interp" and
        "match_trajectory_interp".
    """
    new_patient_interpolated = normalize_and_interpolate(
        pd.DataFrame({'test_time': range(len(new_patient_series)), 'total_UPDRS': new_patient_series})
    )
    # Fewer than two points cannot form a trajectory; DTW on None would crash.
    if new_patient_interpolated is None:
        return None

    # UPGRADE 3: Contextual Filtering
    # Each mask is built from target_df itself so it always shares the index of
    # the frame it filters (a mask built from the global df would have to be
    # re-aligned once target_df has already been narrowed).
    target_df = df
    if patient_age is not None:
        target_df = target_df[target_df['age'].between(patient_age - age_window, patient_age + age_window)]
    if patient_sex is not None:
        sex_column = target_df['sex']
        # Accept a match either on the raw values or on their numeric form, so
        # that a code stored as the string '0' still matches the integer 0.
        sex_matches = (sex_column == patient_sex) | (
            pd.to_numeric(sex_column, errors='coerce') == pd.to_numeric(patient_sex, errors='coerce')
        )
        target_df = target_df[sex_matches]

    # Group size is measured in patients, not rows: one patient contributes many
    # recordings, so a row count would hide a group of only one or two people.
    if target_df['subject#'].nunique() < 5:
        print("Warning: Comparison group is very small due to filtering. Results may be less reliable.")
        target_df = df # Fallback to all data if filter is too restrictive

    # First cohort label recorded for each candidate patient.
    cohort_by_subject = target_df.drop_duplicates('subject#').set_index('subject#')['cohort']

    min_dist = float('inf')
    best_match_id, best_match_trajectory = None, None

    # Compare only against the filtered group
    for sid in cohort_by_subject.index:
        traj = all_trajectories.get(sid)
        if traj is None:
            continue
        dist = dtw.distance(new_patient_interpolated, traj)
        # A NaN or infinite distance never satisfies this comparison, so
        # unusable trajectories are skipped automatically.
        if dist < min_dist:
            min_dist = dist
            best_match_id = sid
            best_match_trajectory = traj

    if best_match_id is None: return None # No match found

    best_match_cohort = cohort_by_subject.loc[best_match_id]
    # The warping path is only needed for the winner, so compute it once here
    # instead of for every candidate that temporarily held the lead.
    best_match_path = dtw.warping_path(new_patient_interpolated, best_match_trajectory)

    # UPGRADE 2: Improved Confidence Score
    # Compare the match distance to the median distance within its OWN cohort
    cohort_ids = cohort_by_subject.index[cohort_by_subject == best_match_cohort]
    cohort_distances = [
        dtw.distance(best_match_trajectory, all_trajectories[sid])
        for sid in cohort_ids
        if sid != best_match_id and sid in all_trajectories
    ]
    # Drop non-finite distances so a single bad trajectory cannot turn the
    # median (and therefore the confidence) into NaN.
    cohort_distances = [d for d in cohort_distances if np.isfinite(d)]
    median_cohort_dist = np.median(cohort_distances) if cohort_distances else min_dist
    # The small epsilon avoids division by zero when the median distance is 0.
    confidence = max(0, 100 * (1 - min_dist / (median_cohort_dist + 1e-6)))

    return {
        "cohort": best_match_cohort, "match_id": best_match_id, "confidence": confidence, "path": best_match_path,
        "new_patient_interp": new_patient_interpolated, "match_trajectory_interp": best_match_trajectory
    }

# --- Part 6: Test the Final Engine! ---
test_patient_fast = [22, 25, 26, 29, 32]
# Test with a specific patient profile
results = find_best_match(test_patient_fast, patient_age=65, patient_sex=0) # 0 for male, 1 for female

if results:
    print("---------------------------------------------")
    print("🚀 ENGINE TEST COMPLETE (V6.0 - AI DRIVEN) 🚀")
    print(f"Test Patient Data: {test_patient_fast}")
    print(f"   -> Predicted Cohort: {results['cohort']}")
    print(f"   -> Closest Match: Patient #{results['match_id']}")
    print(f"   -> Match Confidence: {results['confidence']:.1f}% (Compared to others in its cohort)")
    print("---------------------------------------------")

    # CRYSTAL BALL: overlay the new patient's trajectory on the best match and
    # highlight the matched trajectory from the cutoff onward as the forecast.
    fig1, ax1 = plt.subplots(figsize=(12, 7))
    cutoff_point = 20
    matched_trajectory = results['match_trajectory_interp']
    # Derive the x axes from the trajectory itself rather than assuming 100
    # samples, so the plot stays valid if the interpolation resolution changes
    # (matplotlib rejects x and y of different lengths).
    n_points = len(matched_trajectory)
    x_full, x_forecast = np.arange(n_points), np.arange(cutoff_point - 1, n_points)
    ax1.plot(results['new_patient_interp'], label='Your Patient\'s Trajectory (Normalized)', color='red', linewidth=4)
    ax1.plot(x_full, matched_trajectory, label=f'Best Match (Patient #{results["match_id"]}) Full Path', color='gray', linestyle=':')
    ax1.plot(x_forecast, matched_trajectory[cutoff_point - 1:], color='green', linewidth=4, linestyle='--', label='Projected Forecast')
    ax1.set_title('Predictive Trajectory Forecast (Context-Aware)', fontsize=16)
    ax1.set_xlabel('Normalized Time')
    ax1.set_ylabel('Interpolated UPDRS Score')
    ax1.legend()
    ax1.grid(True)
    plt.show()

    # XAI PLOT: draw the DTW alignment between the two series, showing which
    # points of the new trajectory were matched to which points of the best match.
    fig2, (ax2_1, ax2_2) = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
    dtw_visualisation.plot_warping(results['new_patient_interp'], results['match_trajectory_interp'], results['path'], fig=fig2, axs=(ax2_1, ax2_2))
    ax2_1.set_title("DTW Warping Path Explained", fontsize=16)
    plt.tight_layout()
    plt.show()
else:
    print("Could not find a suitable match for the test patient based on the specified context.")


✅ Step 1: Libraries imported.
✅ Step 2: Data loaded & cleaned. Found 42 unique patients.


  slope = (y_hi - y_lo) / (x_hi - x_lo)[:, None]


✅ Step 3: All patient trajectories standardized.


ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values