# Assignment 2: Huntington's Disease Dataset Analysis

---

## 1. Import Necessary Libraries

In [None]:
# Core data manipulation and analysis libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
from scipy import stats

# Silence every warning category so cell output is not cluttered.
# NOTE: this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# DataFrame rendering: no cap on displayed columns, at most 100 displayed rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot appearance: reset matplotlib to its default style first, then apply the
# seaborn "husl" palette (the order matters because the style call resets rcParams)
plt.style.use('default')
sns.set_palette("husl")

## 2. Data Loading and Initial Exploration

In [None]:
# Read the CSV into a DataFrame; the path is relative to the working directory
df = pd.read_csv('data/Huntington_Disease_Dataset.csv')

# Report the table dimensions; the row count is presented as the number of
# patients and the column count as the number of features
n_rows, n_cols = df.shape

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Number of patients: {n_rows}")
print(f"Number of features: {n_cols}")

In [None]:
# Preview the top of the table to get a feel for the data layout.
# The trailing expression is what the notebook renders as the cell output.
print("First 5 rows of the dataset:")
df.head(5)

In [None]:
# Schema overview: per-column dtype and non-null count, plus memory usage
print("Dataset Information:", "=" * 50, sep="\n")
df.info()

In [None]:
# Descriptive statistics for the numerical columns, transposed so that each
# feature occupies one row and each statistic one column
print("Statistical Summary of Numerical Features:", "=" * 50, sep="\n")
df.describe().T

In [None]:
# Per-column inventory: data type, number of distinct values and number of
# missing values, printed as one aligned line per column

print("Column Analysis:")
print("=" * 40)
for position, column_name in enumerate(df.columns, start=1):
    series = df[column_name]
    type_label = str(series.dtype)
    distinct_count = series.nunique()      # NaN is not counted as a distinct value
    missing_count = series.isnull().sum()
    print(f"{position:2d}. {column_name:<30} | {type_label:<10} | Unique: {distinct_count:4d} | Missing: {missing_count:3d}")

## 3. Data Preprocessing

### 3.1 Data Cleaning and Feature Selection

In [None]:
# Feature selection: discard columns the analysis does not use.
# Rationale, as recorded by the notebook author:
#   - Patient_ID is a unique identifier and carries no predictive signal
#   - the Random_* sequence columns are generated for privacy, not real biology
#   - the gene information columns are redundant or purely descriptive

columns_to_drop = [
    'Patient_ID',               # identifier only
    'Random_Protein_Sequence',  # random sequence (privacy)
    'Random_Gene_Sequence',     # random sequence (privacy)
    'Gene/Factor',              # redundant with other genetic features
    'Chromosome_Location',      # static genetic information
    'Function',                 # descriptive text, not quantitative
    'Effect',                   # descriptive text, not quantitative
    'Category',                 # descriptive text, not quantitative
]

# `df` itself is left untouched; `df_clean` is a new frame without those columns.
# A KeyError is raised if any listed column is absent from `df`.
df_clean = df.drop(columns_to_drop, axis=1)

print(f"Original dataset: {df.shape}")
print(f"Cleaned dataset: {df_clean.shape}")
print(f"Removed {len(columns_to_drop)} irrelevant columns")

print("\nRemaining features:")
for kept_column in df_clean.columns:
    print(f"- {kept_column}")

### 3.2 Handle Duplicates

In [None]:
# Duplicate screening: repeated records in medical data may point to
# data-entry errors, so count them and drop exact repeats.

print("Duplicate Analysis:")
print("=" * 30)

# Rows that match an earlier row in every remaining column
duplicate_rows = df_clean.duplicated().sum()
print(f"Complete duplicate rows: {duplicate_rows}")

# Rows that share the same key clinical profile with an earlier row.
# This count is informational only; such rows are not removed.
key_features = ['Age', 'Sex', 'HTT_CAG_Repeat_Length', 'Disease_Stage']
duplicate_clinical = df_clean.duplicated(subset=key_features).sum()
print(f"Duplicate clinical profiles: {duplicate_clinical}")

if duplicate_rows == 0:
    print("No duplicate rows found - data quality is good!")
else:
    print(f"\nRemoving {duplicate_rows} duplicate rows...")
    # Keep the first occurrence of each repeated row; the index is not reset
    df_clean = df_clean.drop_duplicates(keep="first")
    print(f"Dataset shape after removing duplicates: {df_clean.shape}")