# Assignment 2: Huntington's Disease Dataset Analysis

---

## 1. Import Necessary Libraries

In [None]:
# Core data manipulation and analysis libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
from scipy import stats

# Silence every Python warning for the rest of the session so cell output stays compact.
# NOTE(review): 'ignore' with no category hides all warnings, including
# deprecation and numerical (e.g. divide-by-zero) warnings from the libraries
# used below -- consider narrowing the filter while developing.
import warnings
warnings.filterwarnings('ignore')

# pandas display limits: None removes the cap on displayed columns;
# at most 100 rows are displayed before the output is truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Plot appearance: apply matplotlib's 'default' style first, then set the
# seaborn "husl" colour palette. Order matters -- applying the style after
# the palette would overwrite the colour cycle.
plt.style.use('default')
sns.set_palette("husl")

## 2. Data Loading and Initial Exploration

In [None]:
# Read the Huntington's disease CSV into the working DataFrame
df = pd.read_csv('data/Huntington_Disease_Dataset.csv')

# Report the table dimensions (rows are reported as patients, columns as features)
n_patients, n_features = df.shape
print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Number of patients: {n_patients}")
print(f"Number of features: {n_features}")

In [None]:
# Preview the top of the table to get a feel for its structure
print("First 5 rows of the dataset:")
df.head(n=5)

In [None]:
# Per-column dtypes, non-null counts and overall memory usage
print("Dataset Information:", "=" * 50, sep="\n")
df.info()

In [None]:
# Descriptive statistics of the numerical columns, one feature per row,
# to show how the key clinical variables are distributed
print("Statistical Summary of Numerical Features:", "=" * 50, sep="\n")
df.describe().T

In [None]:
# One line per column: position, name, dtype, distinct-value count and null count

print("Column Analysis:", "=" * 40, sep="\n")
for i, col in enumerate(df.columns, start=1):
    series = df[col]
    print(
        f"{i:2d}. {col:<30} | {str(series.dtype):<10} | "
        f"Unique: {series.nunique():4d} | Missing: {series.isnull().sum():3d}"
    )

## 3. Data Preprocessing

### 3.1 Data Cleaning and Feature Selection

In [None]:
# Drop the columns that are not used in the analysis. Rationale per column:
#   Patient_ID               - unique identifier, not predictive
#   Random_Protein_Sequence  - random sequence generated for privacy
#   Random_Gene_Sequence     - random sequence generated for privacy
#   Gene/Factor              - redundant with other genetic features
#   Chromosome_Location      - static genetic information
#   Function, Effect, Category - descriptive rather than quantitative
columns_to_drop = [
    'Patient_ID',
    'Random_Protein_Sequence',
    'Random_Gene_Sequence',
    'Gene/Factor',
    'Chromosome_Location',
    'Function',
    'Effect',
    'Category',
]

# Working copy restricted to the clinically relevant features
df_clean = df.drop(columns_to_drop, axis='columns')

n_removed = len(columns_to_drop)
print(f"Original dataset: {df.shape}")
print(f"Cleaned dataset: {df_clean.shape}")
print(f"Removed {n_removed} irrelevant columns")

# Iterating a DataFrame yields its column labels
print("\nRemaining features:")
for col in df_clean:
    print(f"- {col}")

### 3.2 Handle Duplicates

In [None]:
# Look for repeated records; in medical data these may point to data entry errors

print("Duplicate Analysis:", "=" * 30, sep="\n")

# Rows that are identical across every remaining column
duplicate_rows = df_clean.duplicated().sum()
print(f"Complete duplicate rows: {duplicate_rows}")

# Rows sharing the same key clinical profile (reported only, never removed)
key_features = ['Age', 'Sex', 'HTT_CAG_Repeat_Length', 'Disease_Stage']
duplicate_clinical = df_clean.duplicated(subset=key_features).sum()
print(f"Duplicate clinical profiles: {duplicate_clinical}")

# Only fully identical rows are dropped
if duplicate_rows == 0:
    print("No duplicate rows found - data quality is good!")
else:
    print(f"\nRemoving {duplicate_rows} duplicate rows...")
    df_clean = df_clean.drop_duplicates()
    print(f"Dataset shape after removing duplicates: {df_clean.shape}")

### 3.3 Missing Data Analysis and Imputation

In [None]:
# Four-panel overview of missing data: heatmap, counts, percentages, completeness


def _barh_or_placeholder(series, color, title, xlabel):
    """Draw *series* as horizontal bars on the current axes, or a notice if it is empty."""
    if series.empty:
        plt.text(0.5, 0.5, 'No Missing Data Found!', ha='center', va='center', fontsize=14)
    else:
        series.plot(kind='barh', color=color)
        plt.xlabel(xlabel)
    plt.title(title)


null_mask = df_clean.isnull()
null_per_column = null_mask.sum()

plt.figure(figsize=(12, 8))

# Panel 1: cell-level map of where values are missing
plt.subplot(2, 2, 1)
sns.heatmap(
    null_mask,
    cbar=True,
    xticklabels=True,
    yticklabels=False,
    cmap='viridis',
    cbar_kws={'label': 'Missing Data'},
)
plt.title('Missing Data Heatmap')
plt.xlabel('Features')
plt.xticks(rotation=45)

# Panel 2: absolute number of missing values, restricted to affected columns
plt.subplot(2, 2, 2)
missing_counts = null_per_column.sort_values(ascending=True)
missing_counts = missing_counts[missing_counts > 0]
_barh_or_placeholder(
    missing_counts,
    color='coral',
    title='Missing Data Count by Feature',
    xlabel='Number of Missing Values',
)

# Panel 3: the same information expressed as a share of all rows
plt.subplot(2, 2, 3)
missing_percentages = ((null_per_column / len(df_clean)) * 100).sort_values(ascending=True)
missing_percentages = missing_percentages[missing_percentages > 0]
_barh_or_placeholder(
    missing_percentages,
    color='lightblue',
    title='Missing Data Percentage by Feature',
    xlabel='Percentage of Missing Values (%)',
)

# Panel 4: complete vs. missing cells over the whole table
plt.subplot(2, 2, 4)
total_cells = df_clean.size  # rows * columns
missing_cells = null_per_column.sum()
complete_cells = total_cells - missing_cells

labels = ['Complete', 'Missing']
sizes = [complete_cells, missing_cells]
colors = ['lightgreen', 'lightcoral']

plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
plt.title('Overall Data Completeness')

plt.tight_layout()
plt.show()

# Text summary of the completeness figures shown in the pie chart
complete_pct = (complete_cells / total_cells) * 100
missing_pct = (missing_cells / total_cells) * 100
print("\nData Completeness Summary:")
print(f"Total data points: {total_cells:,}")
print(f"Complete data points: {complete_cells:,} ({complete_pct:.1f}%)")
print(f"Missing data points: {missing_cells:,} ({missing_pct:.1f}%)")

In [None]:
# Tabulate missing values per column and flag columns above the 5% mark
print("Missing Data Analysis:", "=" * 40, sep="\n")

# One record per column: null count, null share (in %), and dtype
n_rows = len(df_clean)
missing_stats = [
    {
        'Column': col,
        'Missing_Count': df_clean[col].isnull().sum(),
        'Missing_Percent': round((df_clean[col].isnull().sum() / n_rows) * 100, 2),
        'Data_Type': str(df_clean[col].dtype),
    }
    for col in df_clean.columns
]

# Most affected columns first
missing_df = pd.DataFrame(missing_stats).sort_values('Missing_Percent', ascending=False)

print("Missing Data Summary:")
print(missing_df.to_string(index=False))

# Columns where more than 5% of the values are missing
high_missing = missing_df[missing_df['Missing_Percent'] > 5]
print("\nColumns with >5% missing data:")
if high_missing.empty:
    print("- None (excellent data quality!)")
else:
    for _, row in high_missing.iterrows():
        print(f"- {row['Column']}: {row['Missing_Count']} missing ({row['Missing_Percent']}%)")

### 3.4 Outlier Detection and Handling

In [None]:
# Outlier detection
# In medical data, outliers could represent: rare but valid extreme cases, data entry errors, measurement equipment issues

def detect_outliers_iqr(data, column):
    """Flag rows whose value in ``column`` lies outside the 1.5 * IQR fences.

    Args:
        data: DataFrame holding the column to analyse.
        column: Name of the column in ``data``.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)``: the rows of ``data``
        strictly below the lower fence or strictly above the upper fence,
        followed by the two fence values.
    """
    values = data[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)

    # Fences sit 1.5 interquartile ranges beyond the quartiles
    whisker = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - whisker
    upper_bound = third_quartile + whisker

    outside_fences = (values < lower_bound) | (values > upper_bound)
    return data[outside_fences], lower_bound, upper_bound

def detect_outliers_zscore(data, column, threshold=3):
    """Detect outliers using the Z-score method.

    Z-scores are computed once, over the non-null values of ``data[column]``
    only, and the resulting flags are mapped back onto the rows those values
    came from. A column containing missing values therefore no longer yields
    a mask shorter than ``data``; rows with a missing value are never
    reported as outliers.

    Args:
        data: DataFrame holding the column to analyse.
        column: Name of a numeric column in ``data``.
        threshold: Absolute z-score above which a row counts as an outlier.

    Returns:
        A tuple ``(outliers, z_scores)``: the rows of ``data`` whose absolute
        z-score exceeds ``threshold``, and a NumPy array holding the absolute
        z-scores of the non-null values in row order.
    """
    present = data[column].notna().to_numpy()
    values = data[column].dropna().to_numpy(dtype=float)

    # Nothing to score: return an empty frame with the same columns
    if values.size == 0:
        return data.iloc[0:0], values

    z_scores = np.abs(stats.zscore(values))

    # Scatter the per-value flags back to full-length row positions so the
    # mask always lines up with ``data`` (positional, so index labels and
    # gaps left by earlier row removal do not matter)
    is_outlier = np.zeros(len(data), dtype=bool)
    is_outlier[present] = z_scores > threshold

    return data[is_outlier], z_scores

# Collect the numeric columns; these are the candidates for outlier screening
numerical_cols = list(df_clean.select_dtypes(include=np.number).columns)

print("Numerical columns for outlier detection:")
for i, col in enumerate(numerical_cols, start=1):
    print(f"{i:2d}. {col}")

n_numerical = len(numerical_cols)
print(f"\nAnalyzing {n_numerical} numerical features for outliers...")

In [None]:
# Per-feature outlier table comparing the IQR rule with the z-score rule.
# The column list is declared up front so that ``outlier_df`` always has these
# columns, even when no numerical feature yields a summary row; otherwise the
# 'IQR_Percentage' filter below would raise KeyError on an empty frame.
summary_columns = [
    'Feature',
    'Total_Records',
    'IQR_Outliers',
    'Z_Outliers',
    'IQR_Percentage',
    'Z_Percentage',
    'Lower_Bound_IQR',
    'Upper_Bound_IQR',
]
outlier_summary = []

print("Outlier Detection Summary:")
print("=" * 60)
print(f"{'Feature':<25} {'Total':<8} {'IQR_Out':<8} {'Z_Out':<8} {'%_IQR':<8} {'%_Z':<8}")
print("=" * 60)

for col in numerical_cols:
    # Number of non-null values; also the denominator for the percentages
    total_valid = df_clean[col].notna().sum()

    # Skip columns that contain no data at all
    if total_valid == 0:
        continue

    # IQR method
    iqr_outliers, lower_iqr, upper_iqr = detect_outliers_iqr(df_clean, col)

    # Z-score method
    zscore_outliers, z_scores = detect_outliers_zscore(df_clean, col)

    # Share of valid records flagged by each method
    iqr_pct = (len(iqr_outliers) / total_valid) * 100
    z_pct = (len(zscore_outliers) / total_valid) * 100

    # Store summary
    outlier_summary.append({
        'Feature': col,
        'Total_Records': total_valid,
        'IQR_Outliers': len(iqr_outliers),
        'Z_Outliers': len(zscore_outliers),
        'IQR_Percentage': round(iqr_pct, 2),
        'Z_Percentage': round(z_pct, 2),
        'Lower_Bound_IQR': round(lower_iqr, 2),
        'Upper_Bound_IQR': round(upper_iqr, 2)
    })

    print(f"{col:<25} {total_valid:<8} {len(iqr_outliers):<8} {len(zscore_outliers):<8} {iqr_pct:<8.1f} {z_pct:<8.1f}")

# Convert to DataFrame for better analysis
outlier_df = pd.DataFrame(outlier_summary, columns=summary_columns)
print("\n" + "=" * 60)
print("Features with significant outliers (>5% of data):")
significant_outliers = outlier_df[outlier_df['IQR_Percentage'] > 5]
if len(significant_outliers) > 0:
    for _, row in significant_outliers.iterrows():
        print(f"- {row['Feature']}: {row['IQR_Outliers']} outliers ({row['IQR_Percentage']}%)")
else:
    print("- No features have >5% outliers (good data quality!)")

## 4. Exploratory Data Analysis

### 4.1 Disease Stage Distribution

In [None]:
# Four-panel look at the disease stages: share, counts, and two per-stage box plots
disease_counts = df_clean['Disease_Stage'].value_counts()

plt.figure(figsize=(15, 10))

# Panel 1: share of each disease stage
plt.subplot(2, 2, 1)
plt.pie(
    disease_counts.to_numpy(),
    labels=disease_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('Distribution of Disease Stages')

# Panel 2: patient counts per stage, ordered by frequency
plt.subplot(2, 2, 2)
sns.countplot(data=df_clean, x='Disease_Stage', order=disease_counts.index)
plt.title('Count of Patients by Disease Stage')
plt.xlabel('Disease Stage')
plt.ylabel('Number of Patients')
plt.xticks(rotation=45)

# Panels 3 and 4: distribution of a numeric feature within each stage
boxplot_specs = [
    (3, 'Age', 'Age Distribution by Disease Stage', 'Age'),
    (4, 'HTT_CAG_Repeat_Length', 'HTT CAG Repeat Length by Disease Stage', 'HTT CAG Repeat Length'),
]
for panel, feature, panel_title, y_label in boxplot_specs:
    plt.subplot(2, 2, panel)
    sns.boxplot(data=df_clean, x='Disease_Stage', y=feature)
    plt.title(panel_title)
    plt.xlabel('Disease Stage')
    plt.ylabel(y_label)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Text summary: count and share of all rows for each stage
total_patients = len(df_clean)
print("Disease Stage Distribution:", "=" * 40, sep="\n")
for stage, count in disease_counts.items():
    percentage = (count / total_patients) * 100
    print(f"{stage}: {count:,} patients ({percentage:.1f}%)")

print(f"\nTotal patients analyzed: {total_patients:,}")