# Assignment 2: Huntington's Disease Dataset Analysis

---

## 1. Import Necessary Libraries

In [None]:
# Core data manipulation and analysis libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
from scipy import stats

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Set display options for better DataFrame output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set plotting style for better visualizations
plt.style.use('default')
sns.set_palette("husl")

## 2. Data Loading and Initial Exploration

In [None]:
# load dataset
df = pd.read_csv('data/Huntington_Disease_Dataset.csv')

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Number of patients: {df.shape[0]}")
print(f"Number of features: {df.shape[1]}")

In [None]:
#display first few rows to see the data structure
print("First 5 rows of the dataset:")
df.head()

In [None]:
#detailed information about the dataset
#to check data types, missing values, and memory usage
print("Dataset Information:")
print("=" * 50)
df.info()

In [None]:
#statistical summary of numerical columns
#to provide insights into the distribution of key clinical variables
print("Statistical Summary of Numerical Features:")
print("=" * 50)
df.describe().transpose()

In [None]:
#identify column types and their relevance for analysis

print("Column Analysis:")
print("=" * 40)
for i, col in enumerate(df.columns):
    dtype = df[col].dtype
    unique_vals = df[col].nunique()
    missing_vals = df[col].isnull().sum()
    print(f"{i+1:2d}. {col:<30} | {str(dtype):<10} | Unique: {unique_vals:4d} | Missing: {missing_vals:3d}")

## 3. Data Preprocessing

### 3.1 Data Cleaning and Feature Selection

In [None]:
#test to remove irrelevant columns for analysis
#patient_ID: unique identifier, not predictive
#random sequences: generated for privacy, not real biological data
#gene info columns: redundant descriptive information
#

columns_to_drop = [
    'Patient_ID',  # Unique identifier - not predictive
    'Random_Protein_Sequence',  # Random sequence for privacy
    'Random_Gene_Sequence',  # Random sequence for privacy  
    'Gene/Factor',  # Redundant with other genetic features
    'Chromosome_Location',  # Static genetic information
    'Function',  # Descriptive, not quantitative
    'Effect',  # Descriptive, not quantitative
    'Category'  # Descriptive, not quantitative
]

# Create cleaned dataset focusing on clinically relevant features
df_clean = df.drop(columns=columns_to_drop)

print(f"Original dataset: {df.shape}")
print(f"Cleaned dataset: {df_clean.shape}")
print(f"Removed {len(columns_to_drop)} irrelevant columns")

print("\nRemaining features:")
for col in df_clean.columns:
    print(f"- {col}")

### 3.2 Handle Duplicates

In [None]:
# Check for duplicate records
# In medical data, duplicates could indicate data entry errors

print("Duplicate Analysis:")
print("=" * 30)

# Check for complete duplicates
duplicate_rows = df_clean.duplicated().sum()
print(f"Complete duplicate rows: {duplicate_rows}")

# Check for duplicates based on key clinical features
key_features = ['Age', 'Sex', 'HTT_CAG_Repeat_Length', 'Disease_Stage']
duplicate_clinical = df_clean.duplicated(subset=key_features).sum()
print(f"Duplicate clinical profiles: {duplicate_clinical}")

if duplicate_rows > 0:
    print(f"\nRemoving {duplicate_rows} duplicate rows...")
    df_clean = df_clean.drop_duplicates()
    print(f"Dataset shape after removing duplicates: {df_clean.shape}")
else:
    print("No duplicate rows found - data quality is good!")

### 3.3 Missing Data Analysis and Imputation

In [None]:
# Visualize missing data patterns
plt.figure(figsize=(16, 10))

# Missing data heatmap
plt.subplot(2, 2, 1)
# Create shortened labels for better readability
short_labels = [
    'Age', 'Sex', 'Family_Hist', 'HTT_CAG', 'Motor_Symp', 
    'Cognitive', 'Chorea', 'Brain_Vol', 'Functional', 
    'Gene_Mut', 'HTT_Expr', 'Protein_Agg', 'Disease_Stage'
]

# Create heatmap with improved formatting
ax1 = sns.heatmap(df_clean.isnull(), 
                  cbar=True, 
                  xticklabels=short_labels,
                  yticklabels=False,
                  cmap='viridis', 
                  cbar_kws={'label': 'Missing Data'})
plt.title('Missing Data Heatmap', fontsize=14, pad=15)
plt.xlabel('Clinical Features', fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10)

# Missing data bar plot
plt.subplot(2, 2, 2)
missing_counts = df_clean.isnull().sum().sort_values(ascending=True)
missing_counts = missing_counts[missing_counts > 0] 
if len(missing_counts) > 0:
    missing_counts.plot(kind='barh', color='coral')
    plt.title('Missing Data Count by Feature')
    plt.xlabel('Number of Missing Values')
else:
    plt.text(0.5, 0.5, 'No Missing Data Found!', ha='center', va='center', fontsize=14)
    plt.title('Missing Data Count by Feature')

# Missing data percentage
plt.subplot(2, 2, 3)
missing_percentages = ((df_clean.isnull().sum() / len(df_clean)) * 100).sort_values(ascending=True)
missing_percentages = missing_percentages[missing_percentages > 0]
if len(missing_percentages) > 0:
    missing_percentages.plot(kind='barh', color='lightblue')
    plt.title('Missing Data Percentage by Feature')
    plt.xlabel('Percentage of Missing Values (%)')
else:
    plt.text(0.5, 0.5, 'No Missing Data Found!', ha='center', va='center', fontsize=14)
    plt.title('Missing Data Percentage by Feature')

# Data completeness overview
plt.subplot(2, 2, 4)
total_cells = len(df_clean) * len(df_clean.columns)
missing_cells = df_clean.isnull().sum().sum()
complete_cells = total_cells - missing_cells

labels = ['Complete', 'Missing']
sizes = [complete_cells, missing_cells]
colors = ['lightgreen', 'lightcoral']

plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
plt.title('Overall Data Completeness')

plt.tight_layout()
plt.show()

print(f"\nData Completeness Summary:")
print(f"Total data points: {total_cells:,}")
print(f"Complete data points: {complete_cells:,} ({(complete_cells/total_cells)*100:.1f}%)")
print(f"Missing data points: {missing_cells:,} ({(missing_cells/total_cells)*100:.1f}%)")

In [None]:
# Comprehensive missing data analysis
print("Missing Data Analysis:")
print("=" * 40)

# Calculate missing data statistics
missing_stats = []
for col in df_clean.columns:
    missing_count = df_clean[col].isnull().sum()
    missing_percent = (missing_count / len(df_clean)) * 100
    missing_stats.append({
        'Column': col,
        'Missing_Count': missing_count,
        'Missing_Percent': round(missing_percent, 2),
        'Data_Type': str(df_clean[col].dtype)
    })

# Create DataFrame for better visualization
missing_df = pd.DataFrame(missing_stats)
missing_df = missing_df.sort_values('Missing_Percent', ascending=False)

print("Missing Data Summary:")
print(missing_df.to_string(index=False))

# Identify columns with significant missing data (>5% missing)
high_missing = missing_df[missing_df['Missing_Percent'] > 5]
print(f"\nColumns with >5% missing data:")
if len(high_missing) > 0:
    for _, row in high_missing.iterrows():
        print(f"- {row['Column']}: {row['Missing_Count']} missing ({row['Missing_Percent']}%)")
else:
    print("- None (excellent data quality!)")

### 3.4 Outlier Detection and Handling

In [None]:
# Outlier detection
# In medical data, outliers could represent: rare but valid extreme cases, data entry errors, measurement equipment issues

def detect_outliers_iqr(data, column):
    """Detect outliers using Interquartile Range (IQR) method"""
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

def detect_outliers_zscore(data, column, threshold=3):
    """Detect outliers using Z-score method"""
    z_scores = np.abs(stats.zscore(data[column].dropna()))
    outliers = data[np.abs(stats.zscore(data[column].dropna())) > threshold]
    return outliers, z_scores

# Identify numerical columns for outlier analysis
numerical_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
print("Numerical columns for outlier detection:")
for i, col in enumerate(numerical_cols, 1):
    print(f"{i:2d}. {col}")

print(f"\nAnalyzing {len(numerical_cols)} numerical features for outliers...")

In [None]:
# Comprehensive outlier analysis
outlier_summary = []

print("Outlier Detection Summary:")
print("=" * 60)
print(f"{'Feature':<25} {'Total':<8} {'IQR_Out':<8} {'Z_Out':<8} {'%_IQR':<8} {'%_Z':<8}")
print("=" * 60)

for col in numerical_cols:
    # Only analyze columns with data
    if df_clean[col].notna().sum() > 0:  
        # IQR method
        iqr_outliers, lower_iqr, upper_iqr = detect_outliers_iqr(df_clean, col)
        
        # Z-score method  
        zscore_outliers, z_scores = detect_outliers_zscore(df_clean, col)
        
        # Calculate percentages
        total_valid = df_clean[col].notna().sum()
        iqr_pct = (len(iqr_outliers) / total_valid) * 100
        z_pct = (len(zscore_outliers) / total_valid) * 100
        
        # Store summary
        outlier_summary.append({
            'Feature': col,
            'Total_Records': total_valid,
            'IQR_Outliers': len(iqr_outliers),
            'Z_Outliers': len(zscore_outliers),
            'IQR_Percentage': round(iqr_pct, 2),
            'Z_Percentage': round(z_pct, 2),
            'Lower_Bound_IQR': round(lower_iqr, 2),
            'Upper_Bound_IQR': round(upper_iqr, 2)
        })
        
        print(f"{col:<25} {total_valid:<8} {len(iqr_outliers):<8} {len(zscore_outliers):<8} {iqr_pct:<8.1f} {z_pct:<8.1f}")

In [None]:
# Convert to DataFrame for better analysis
outlier_df = pd.DataFrame(outlier_summary)
print("\n" + "=" * 60)

# Define clinical significance thresholds for medical data
print("OUTLIER INTERPRETATION GUIDELINES:")
print("=" * 40)
print("• <2%: Excellent data quality - minimal outliers expected")
print("• 2-5%: Good data quality - acceptable outlier range for clinical data")
print("• 5-10%: Moderate concern - may indicate measurement issues or rare cases")
print("• >10%: Significant concern - requires investigation for data entry errors")

print("\nCLINICAL SIGNIFICANCE ASSESSMENT:")
print("=" * 40)

# Categorize features by outlier severity
excellent_quality = outlier_df[outlier_df['IQR_Percentage'] < 2]
good_quality = outlier_df[(outlier_df['IQR_Percentage'] >= 2) & (outlier_df['IQR_Percentage'] < 5)]
moderate_concern = outlier_df[(outlier_df['IQR_Percentage'] >= 5) & (outlier_df['IQR_Percentage'] < 10)]
significant_concern = outlier_df[outlier_df['IQR_Percentage'] >= 10]

if len(excellent_quality) > 0:
    print(f"\nEXCELLENT QUALITY ({len(excellent_quality)} features):")
    for _, row in excellent_quality.iterrows():
        print(f"   • {row['Feature']}: {row['IQR_Outliers']} outliers ({row['IQR_Percentage']}%)")

if len(good_quality) > 0:
    print(f"\nGOOD QUALITY ({len(good_quality)} features):")
    for _, row in good_quality.iterrows():
        print(f"   • {row['Feature']}: {row['IQR_Outliers']} outliers ({row['IQR_Percentage']}%)")

if len(moderate_concern) > 0:
    print(f"\n⚠️ MODERATE CONCERN ({len(moderate_concern)} features):")
    for _, row in moderate_concern.iterrows():
        print(f"   • {row['Feature']}: {row['IQR_Outliers']} outliers ({row['IQR_Percentage']}%)")
    print("   → Consider investigating these features for biological relevance vs. errors")

if len(significant_concern) > 0:
    print(f"\nSIGNIFICANT CONCERN ({len(significant_concern)} features):")
    for _, row in significant_concern.iterrows():
        print(f"   • {row['Feature']}: {row['IQR_Outliers']} outliers ({row['IQR_Percentage']}%)")
    print("   → Recommend detailed investigation for data entry errors or systematic issues")
else:
    print("\nSIGNIFICANT CONCERN: None detected")

print(f"\nOVERALL DATA QUALITY ASSESSMENT:")
print("=" * 35)
total_features = len(outlier_df)
good_features = len(excellent_quality) + len(good_quality)
quality_score = (good_features / total_features) * 100

print(f"• Features with acceptable outlier levels: {good_features}/{total_features} ({quality_score:.1f}%)")
if quality_score >= 80:
    print("• Overall Assessment: EXCELLENT data quality for clinical analysis")
elif quality_score >= 60:
    print("• Overall Assessment: GOOD data quality with minor concerns")
else:
    print("• Overall Assessment: REQUIRES ATTENTION before analysis")

## 4. Exploratory Data Analysis

### 4.1 Disease Stage Distribution

In [None]:
# Analyze disease stage distribution
plt.figure(figsize=(15, 10))

# Disease stage count plot
plt.subplot(2, 2, 1)
disease_counts = df_clean['Disease_Stage'].value_counts()
plt.pie(disease_counts.values, labels=disease_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Distribution of Disease Stages')

# Disease stage bar plot
plt.subplot(2, 2, 2)
sns.countplot(data=df_clean, x='Disease_Stage', order=disease_counts.index)
plt.title('Count of Patients by Disease Stage')
plt.xlabel('Disease Stage')
plt.ylabel('Number of Patients')
plt.xticks(rotation=45)

# Age distribution by disease stage
plt.subplot(2, 2, 3)
sns.boxplot(data=df_clean, x='Disease_Stage', y='Age')
plt.title('Age Distribution by Disease Stage')
plt.xlabel('Disease Stage')
plt.ylabel('Age')
plt.xticks(rotation=45)

# HTT CAG repeat length by disease stage
plt.subplot(2, 2, 4)
sns.boxplot(data=df_clean, x='Disease_Stage', y='HTT_CAG_Repeat_Length')
plt.title('HTT CAG Repeat Length by Disease Stage')
plt.xlabel('Disease Stage')
plt.ylabel('HTT CAG Repeat Length')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# summary statistics
print("Disease Stage Distribution:")
print("=" * 40)
for stage, count in disease_counts.items():
    percentage = (count / len(df_clean)) * 100
    print(f"{stage}: {count:,} patients ({percentage:.1f}%)")

print(f"\nTotal patients analyzed: {len(df_clean):,}")

### 4.2 Clinical Features Analysis

In [None]:
# Analyze key clinical features
plt.figure(figsize=(20, 15))

# Set consistent color palette
colors = ['skyblue', 'lightgreen', 'coral', 'plum', 'gold', 'lightsteelblue', 'lightblue', 'pink']

# 1. Age distribution
plt.subplot(3, 3, 1)
plt.hist(df_clean['Age'], bins=30, alpha=0.7, color=colors[0], edgecolor='black', linewidth=0.5)
plt.title('Age Distribution', fontsize=12, fontweight='bold')
plt.xlabel('Age (years)', fontsize=10)
plt.ylabel('Frequency', fontsize=10)

# 2. HTT CAG Repeat Length distribution
plt.subplot(3, 3, 2)
plt.hist(df_clean['HTT_CAG_Repeat_Length'], bins=30, alpha=0.7, color=colors[1], edgecolor='black', linewidth=0.5)
plt.title('HTT CAG Repeat Length Distribution', fontsize=12, fontweight='bold')
plt.xlabel('CAG Repeats', fontsize=10)
plt.ylabel('Frequency', fontsize=10)

# 3. Chorea Score distribution
plt.subplot(3, 3, 3)
plt.hist(df_clean['Chorea_Score'], bins=30, alpha=0.7, color=colors[2], edgecolor='black', linewidth=0.5)
plt.title('Chorea Score Distribution', fontsize=12, fontweight='bold')
plt.xlabel('Chorea Score', fontsize=10)
plt.ylabel('Frequency', fontsize=10)

# 4. Brain Volume Loss distribution
plt.subplot(3, 3, 4)
plt.hist(df_clean['Brain_Volume_Loss'], bins=30, alpha=0.7, color=colors[3], edgecolor='black', linewidth=0.5)
plt.title('Brain Volume Loss Distribution', fontsize=12, fontweight='bold')
plt.xlabel('Brain Volume Loss (%)', fontsize=10)
plt.ylabel('Frequency', fontsize=10)

# 5. Functional Capacity distribution
plt.subplot(3, 3, 5)
plt.hist(df_clean['Functional_Capacity'], bins=30, alpha=0.7, color=colors[4], edgecolor='black', linewidth=0.5)
plt.title('Functional Capacity Distribution', fontsize=12, fontweight='bold')
plt.xlabel('Functional Capacity', fontsize=10)
plt.ylabel('Frequency', fontsize=10)

# 6. Motor Symptoms by Disease Stage
plt.subplot(3, 3, 6)
motor_crosstab = pd.crosstab(df_clean['Disease_Stage'], df_clean['Motor_Symptoms'])
motor_crosstab.plot(kind='bar', stacked=True, ax=plt.gca(), color=['lightcoral', 'lightgreen'])
plt.title('Motor Symptoms by Disease Stage', fontsize=12, fontweight='bold')
plt.xlabel('Disease Stage', fontsize=10)
plt.ylabel('Count', fontsize=10)
plt.xticks(rotation=45, fontsize=9)
plt.legend(title='Motor Symptoms', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)

# 7. Sex distribution
plt.subplot(3, 3, 7)
sex_counts = df_clean['Sex'].value_counts()
plt.pie(sex_counts.values, labels=sex_counts.index, autopct='%1.1f%%', colors=['lightblue', 'pink'],
        startangle=90, textprops={'fontsize': 10})
plt.title('Sex Distribution', fontsize=12, fontweight='bold')

# 8. Family History distribution
plt.subplot(3, 3, 8)
family_counts = df_clean['Family_History'].value_counts()
plt.pie(family_counts.values, labels=family_counts.index, autopct='%1.1f%%', colors=['lightcoral', 'lightyellow'],
        startangle=90, textprops={'fontsize': 10})
plt.title('Family History Distribution', fontsize=12, fontweight='bold')

# 9. Gene Mutation Type distribution
plt.subplot(3, 3, 9)
mutation_counts = df_clean['Gene_Mutation_Type'].value_counts()
plt.bar(range(len(mutation_counts)), mutation_counts.values, color=colors[5])
plt.title('Gene Mutation Type Distribution', fontsize=12, fontweight='bold')
plt.xlabel('Mutation Type', fontsize=10)
plt.ylabel('Count', fontsize=10)
plt.xticks(range(len(mutation_counts)), mutation_counts.index, rotation=45, fontsize=9)

plt.tight_layout()
plt.show()

### 4.3 Correlation Analysis and Feature Relationships

In [None]:
# Correlation analysis with statistical significance testing
from scipy.stats import pearsonr

plt.figure(figsize=(18, 12))

# Calculate correlation matrix with p-values
numerical_features = df_clean.select_dtypes(include=[np.number]).columns
correlation_matrix = df_clean[numerical_features].corr()

# Calculate p-values for correlations
p_values = np.zeros((len(numerical_features), len(numerical_features)))
for i, col1 in enumerate(numerical_features):
    for j, col2 in enumerate(numerical_features):
        if i != j:
            clean_data1 = df_clean[col1].dropna()
            clean_data2 = df_clean[col2].dropna()
            common_idx = clean_data1.index.intersection(clean_data2.index)
            if len(common_idx) > 3:
                _, p_val = pearsonr(df_clean.loc[common_idx, col1], df_clean.loc[common_idx, col2])
                p_values[i, j] = p_val
            else:
                p_values[i, j] = 1.0
        else:
            p_values[i, j] = 0.0

# Create significance mask (p < 0.05)
significant_mask = p_values < 0.05

# Correlation heatmap
plt.subplot(2, 3, 1)
sns.heatmap(correlation_matrix, annot=True, cmap='RdYlBu_r', center=0, 
            square=True, fmt='.2f', cbar_kws={'label': 'Correlation Coefficient'},
            mask=~significant_mask, annot_kws={'size': 8})
plt.title('Correlation Matrix (p < 0.05)', fontsize=12, pad=10)
plt.xticks(rotation=45, ha='right', fontsize=9)
plt.yticks(rotation=0, fontsize=9)

#HTT CAG Repeats vs Age - genetic load relationship
plt.subplot(2, 3, 2)
plt.scatter(df_clean['Age'], df_clean['HTT_CAG_Repeat_Length'], alpha=0.6, color='darkred')
plt.xlabel('Age (years)', fontsize=10)
plt.ylabel('HTT CAG Repeat Length', fontsize=10)
plt.title('Genetic Load vs Age', fontsize=12)
# trend line
z = np.polyfit(df_clean['Age'], df_clean['HTT_CAG_Repeat_Length'], 1)
p = np.poly1d(z)
plt.plot(df_clean['Age'], p(df_clean['Age']), "r--", alpha=0.8, linewidth=2)

# Chorea Score vs Brain Volume Loss - disease severity relationship
plt.subplot(2, 3, 3)
plt.scatter(df_clean['Chorea_Score'], df_clean['Brain_Volume_Loss'], alpha=0.6, color='navy')
plt.xlabel('Chorea Score', fontsize=10)
plt.ylabel('Brain Volume Loss', fontsize=10)
plt.title('Movement Disorders vs Brain Atrophy', fontsize=12)
# Add trend line
z = np.polyfit(df_clean['Chorea_Score'], df_clean['Brain_Volume_Loss'], 1)
p = np.poly1d(z)
plt.plot(df_clean['Chorea_Score'], p(df_clean['Chorea_Score']), "r--", alpha=0.8, linewidth=2)

# functional Capacity vs HTT CAG Repeats - genetic impact on function
plt.subplot(2, 3, 4)
plt.scatter(df_clean['HTT_CAG_Repeat_Length'], df_clean['Functional_Capacity'], alpha=0.6, color='green')
plt.xlabel('HTT CAG Repeat Length', fontsize=10)
plt.ylabel('Functional Capacity', fontsize=10)
plt.title('Genetic Burden vs Daily Function', fontsize=12)
# Add trend line
z = np.polyfit(df_clean['HTT_CAG_Repeat_Length'], df_clean['Functional_Capacity'], 1)
p = np.poly1d(z)
plt.plot(df_clean['HTT_CAG_Repeat_Length'], p(df_clean['HTT_CAG_Repeat_Length']), "r--", alpha=0.8, linewidth=2)

# Age vs Brain Volume Loss - aging and neurodegeneration
plt.subplot(2, 3, 5)
plt.scatter(df_clean['Age'], df_clean['Brain_Volume_Loss'], alpha=0.6, color='purple')
plt.xlabel('Age (years)', fontsize=10)
plt.ylabel('Brain Volume Loss', fontsize=10)
plt.title('Age-Related Brain Atrophy', fontsize=12)
# trend line
z = np.polyfit(df_clean['Age'], df_clean['Brain_Volume_Loss'], 1)
p = np.poly1d(z)
plt.plot(df_clean['Age'], p(df_clean['Age']), "r--", alpha=0.8, linewidth=2)

# Protein Aggregation vs HTT Expression - molecular pathology
plt.subplot(2, 3, 6)
plt.scatter(df_clean['HTT_Gene_Expression_Level'], df_clean['Protein_Aggregation_Level'], alpha=0.6, color='orange')
plt.xlabel('HTT Gene Expression Level', fontsize=10)
plt.ylabel('Protein Aggregation Level', fontsize=10)
plt.title('Gene Expression vs Protein Pathology', fontsize=12)
#trend line
z = np.polyfit(df_clean['HTT_Gene_Expression_Level'], df_clean['Protein_Aggregation_Level'], 1)
p = np.poly1d(z)
plt.plot(df_clean['HTT_Gene_Expression_Level'], p(df_clean['HTT_Gene_Expression_Level']), "r--", alpha=0.8, linewidth=2)

plt.tight_layout()
plt.show()

## 5. Research Questions and Insights

### 5.1 Key Findings from EDA

In [None]:
#summary of key statistical findings
print("HUNTINGTON'S DISEASE DATA ANALYSIS - KEY INSIGHTS")
print("=" * 60)

# data quality insights
print("DATA QUALITY INSIGHTS:")
print("=" * 30)
total_patients = len(df_clean)
missing_cognitive = df_clean['Cognitive_Decline'].isnull().sum()
missing_pct = (missing_cognitive / total_patients) * 100

print(f"• Total patients analyzed: {total_patients:,}")
print(f"• Missing cognitive assessments: {missing_cognitive:,} ({missing_pct:.1f}%)")
print(f"• Complete clinical profiles: {total_patients - missing_cognitive:,}")

# disease stage distribution insights
print(f"\nDISEASE STAGE DISTRIBUTION:")
print("=" * 30)
stage_dist = df_clean['Disease_Stage'].value_counts()
for stage, count in stage_dist.items():
    pct = (count / total_patients) * 100
    print(f"• {stage}: {count:,} patients ({pct:.1f}%)")

# demographic insights
print(f"\nDEMOGRAPHIC PATTERNS:")
print("=" * 30)
sex_dist = df_clean['Sex'].value_counts()
family_hist = df_clean['Family_History'].value_counts()

for sex, count in sex_dist.items():
    pct = (count / total_patients) * 100
    print(f"• {sex}: {count:,} patients ({pct:.1f}%)")

print(f"• Family History Present: {family_hist.get('Yes', 0):,} patients ({(family_hist.get('Yes', 0)/total_patients)*100:.1f}%)")

# clinical severity patterns
print(f"\nCLINICAL SEVERITY PATTERNS:")
print("=" * 30)
print(f"• Age range: {df_clean['Age'].min()}-{df_clean['Age'].max()} years (mean: {df_clean['Age'].mean():.1f})")
print(f"• HTT CAG repeats: {df_clean['HTT_CAG_Repeat_Length'].min()}-{df_clean['HTT_CAG_Repeat_Length'].max()} (mean: {df_clean['HTT_CAG_Repeat_Length'].mean():.1f})")
print(f"• Chorea severity: {df_clean['Chorea_Score'].min():.1f}-{df_clean['Chorea_Score'].max():.1f} (mean: {df_clean['Chorea_Score'].mean():.1f})")
print(f"• Brain volume loss: {df_clean['Brain_Volume_Loss'].min():.1f}-{df_clean['Brain_Volume_Loss'].max():.1f}% (mean: {df_clean['Brain_Volume_Loss'].mean():.1f}%)")

# correlation insights
print(f"\nCORRELATION INSIGHTS:")
print("=" * 30)
corr_matrix = df_clean[numerical_features].corr()

#strongest correlations with key clinical variables
key_correlations = [
    ('HTT_CAG_Repeat_Length', 'Chorea_Score'),
    ('Age', 'Brain_Volume_Loss'),
    ('Chorea_Score', 'Brain_Volume_Loss'),
    ('HTT_CAG_Repeat_Length', 'Functional_Capacity'),
    ('HTT_Gene_Expression_Level', 'Protein_Aggregation_Level')
]

for var1, var2 in key_correlations:
    correlation = corr_matrix.loc[var1, var2]
    direction = "positive" if correlation > 0 else "negative"
    strength = "strong" if abs(correlation) > 0.7 else "moderate" if abs(correlation) > 0.4 else "weak"
    print(f"• {var1} ↔ {var2}: {correlation:.3f} ({strength} {direction})")

print(f"\n" + "=" * 60)

### 5.2 Potential Research Questions Based on EDA Findings

In [None]:
# Research Questions Based on EDA Findings
print("RESEARCH QUESTIONS FOR HUNTINGTON'S DISEASE DATASET")
print("=" * 65)

print("Based on comprehensive EDA analysis, the following research questions")
print("could be explored using this dataset:\n")

research_questions = [
    {
        "category": "GENETIC PREDICTION & PROGNOSIS",
        "questions": [
            "Can HTT CAG repeat length predict disease onset timing in pre-symptomatic patients?",
            "What is the optimal CAG repeat threshold for early intervention strategies?",
            "How does genetic mutation type influence disease progression patterns?",
            "Can we develop a genetic risk score combining CAG repeats and mutation type?"
        ],
        "eda_support": "HTT CAG repeats show correlations with clinical severity measures and disease stages."
    },
    {
        "category": "DISEASE PROGRESSION MODELING", 
        "questions": [
            "Can we predict functional decline trajectory using baseline clinical measures?",
            "What combination of biomarkers best predicts transition between disease stages?",
            "How does brain volume loss correlate with motor and cognitive symptom severity?",
            "Can we identify rapid vs. slow progressors using early clinical indicators?"
        ],
        "eda_support": "Strong correlations found between age, brain atrophy, and clinical severity scores."
    },
    {
        "category": "DEMOGRAPHIC & CLINICAL PATTERNS",
        "questions": [
            "Do sex differences influence disease presentation and progression patterns?",
            "How does family history impact age of symptom onset and severity?",
            "What are the optimal clinical assessment schedules for different patient subgroups?",
            "Can demographic factors improve personalized treatment planning?"
        ],
        "eda_support": "Demographic distributions show balanced representation across sex and family history."
    },
    {
        "category": "BIOMARKER DEVELOPMENT",
        "questions": [
            "Can protein aggregation levels serve as early biomarkers for disease progression?",
            "What is the relationship between HTT gene expression and clinical outcomes?",
            "Can we develop composite biomarker scores for clinical trials?",
            "Which molecular markers correlate best with functional capacity changes?"
        ],
        "eda_support": "Molecular features (gene expression, protein aggregation) show measurable relationships."
    },
    {
        "category": "CLINICAL TRIAL DESIGN",
        "questions": [
            "What sample sizes are needed for detecting treatment effects in different disease stages?",
            "Which clinical endpoints show the most sensitivity to change over time?",
            "Can we stratify patients for clinical trials based on progression risk profiles?",
            "What are optimal inclusion/exclusion criteria for therapeutic studies?"
        ],
        "eda_support": "Large dataset (48k+ patients) with balanced stage distribution enables power calculations."
    }
]

for i, rq in enumerate(research_questions, 1):
    print(f"{rq['category']}")
    print("-" * len(rq['category']))
    
    for j, question in enumerate(rq['questions'], 1):
        print(f"   {j}. {question}")
    
    print(f"EDA Support: {rq['eda_support']}\n")

print("RECOMMENDED PRIORITY RESEARCH AREAS:")
print("=" * 45)
print("1. GENETIC PREDICTION MODELS - High clinical impact for patient counseling")
print("2. BIOMARKER VALIDATION - Critical for drug development pipeline") 
print("3. PROGRESSION MODELING - Essential for clinical trial design")
print("4. PERSONALIZED MEDICINE - Future of HD patient care")

print(f"\nDATASET STRENGTHS FOR RESEARCH:")
print("=" * 35)
print("- Large sample size (48,536 patients) provides statistical power")
print("- Comprehensive clinical features enable multi-modal analysis")  
print("- Balanced disease stage representation supports longitudinal insights")
print("- Genetic and molecular data enable precision medicine approaches")
print("- Missing data patterns are manageable (only 25% cognitive assessments)")

print(f"\nDATASET LIMITATIONS TO CONSIDER:")
print("=" * 40)
print("- Cross-sectional data limits longitudinal progression analysis")
print("- Missing cognitive assessments may bias severity estimates")
print("- Synthetic molecular sequences limit translational applications")
print("- Geographic/ethnic diversity not captured in current features")

print("\n" + "=" * 65)
print("CONCLUSION: This dataset provides a solid foundation for multiple")
print("   research directions in Huntington's disease, with particular strength")
print("   in genetic prediction and clinical progression modeling.")
print("=" * 65)