In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Banner, then Step 1: read the patient table from a CSV in the working
# directory and report its size.
print("=== Alzheimer's Disease Prediction ===\n")

print("Step 1: Loading data...")
data = pd.read_csv('alzheimers_disease_data.csv')
# Both figures describe the raw table: every column of the CSV is counted
# here, before any columns are dropped in Step 3.
patient_count, column_count = data.shape
print(f"Total patients: {patient_count}")
print(f"Total features: {column_count}\n")

# Step 2: print a plain-text overview of the raw table: preview, shape,
# schema, summary statistics, missing values, and the Diagnosis and Age
# distributions.
print("Step 2: Basic Data Exploration...")
print("\nFirst 5 rows:")
print(data.head())

print("\nDataset shape:")
n_rows, n_cols = data.shape
print(f"Rows: {n_rows}")
print(f"Columns: {n_cols}")

print("\nColumn names:")
print(data.columns.tolist())

print("\nData types:")
print(data.dtypes)

print("\nBasic statistics:")
print(data.describe())

print("\nMissing values per column:")
# Count nulls once and reuse the per-column Series for the grand total
# instead of scanning the whole frame a second time.
missing_per_column = data.isnull().sum()
print(missing_per_column)
print(f"Total missing values: {missing_per_column.sum()}")

print("\nDiagnosis distribution:")
diagnosis = data['Diagnosis']
print(diagnosis.value_counts())
# Vectorised boolean sums; the Python built-in sum() would iterate the Series
# one element at a time.
print(f"Patients with Alzheimer's (1): {(diagnosis == 1).sum()}")
print(f"Patients without Alzheimer's (0): {(diagnosis == 0).sum()}")

print("\nAge statistics:")
age = data['Age']
print(f"Average age: {age.mean():.1f}")
print(f"Minimum age: {age.min()}")
print(f"Maximum age: {age.max()}")

# Step 3: drop the 'PatientID' and 'DoctorInCharge' columns (presumably
# identifiers with no predictive value -- confirm against the dataset docs),
# then separate the 'Diagnosis' target from the remaining feature columns.
print("\n" + "="*50)
print("Step 3: Preparing data...")
data = data.drop(columns=['PatientID', 'DoctorInCharge'])

y = data['Diagnosis']
X = data.drop(columns='Diagnosis')
print(f"Features being used: {X.shape[1]}")

# Step 4: hold out 20% of the patients for evaluation.
print("\n" + "="*50)
print("Step 4: Splitting data...")
# stratify=y keeps the Diagnosis class ratio the same in the train and test
# partitions, so the held-out metrics are not skewed by an uneven draw of
# either class. random_state pins the shuffle so reruns give the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training data: {len(X_train)} patients")
print(f"Testing data: {len(X_test)} patients")

# Step 5: fit a decision tree (depth capped at 5, fixed seed) on the training
# partition.
print("\n" + "="*50)
print("Step 5: Training the model...")
# fit() returns the fitted estimator itself, so construction and training can
# be chained into one expression.
model = DecisionTreeClassifier(random_state=42, max_depth=5).fit(X_train, y_train)
print("Model trained successfully!")

# Steps 6-7: score the fitted tree on the held-out patients, then break the
# result down with a confusion matrix.
print("\n" + "="*50)
print("Step 6: Testing the model...")
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy * 100:.1f}%")

print("\n" + "="*50)
print("Step 7: Detailed results...")
# Pin the label order so the matrix is always 2x2 with index 0 = class 0 and
# index 1 = class 1. Without `labels`, a test split (or prediction set) that
# contains only one class yields a 1x1 matrix and the class-1 lookups below
# would raise IndexError.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
# Rows are true labels and columns are predicted labels, so the flattened
# order is: true negatives, false positives, false negatives, true positives.
true_neg, false_pos, false_neg, true_pos = cm.ravel()
print("\nConfusion Matrix:")
print(cm)
print(f"\nCorrect predictions (No Alzheimer's): {true_neg}")
print(f"Correct predictions (Alzheimer's): {true_pos}")
print(f"Incorrect predictions: {false_pos + false_neg}")
print(f"Total predictions: {len(y_test)}")

# Step 8: pair every feature column with the fitted tree's importance score
# and list the ten highest-scoring features.
print("\n" + "="*50)
print("Step 8: Most important features...")
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)
print("Top 10 features:")
top_features = feature_importance.head(10)
print(top_features.to_string(index=False))

# Step 9: run the fitted tree on one hand-written patient record and report
# the predicted class together with the probability assigned to it.
print("\n" + "="*50)
print("Step 9: Testing with a sample patient...")
# One value per feature column of X; the keys must match the training column
# names exactly.
sample_patient = {
    'Age': 72,
    'Gender': 1,
    'Ethnicity': 0,
    'EducationLevel': 2,
    'BMI': 27.5,
    'Smoking': 0,
    'AlcoholConsumption': 1.0,
    'PhysicalActivity': 4.0,
    'DietQuality': 6.0,
    'SleepQuality': 5.0,
    'FamilyHistoryAlzheimers': 1,
    'CardiovascularDisease': 0,
    'Diabetes': 0,
    'Depression': 0,
    'HeadInjury': 0,
    'Hypertension': 1,
    'SystolicBP': 145,
    'DiastolicBP': 88,
    'CholesterolTotal': 215.0,
    'CholesterolLDL': 140.0,
    'CholesterolHDL': 48.0,
    'CholesterolTriglycerides': 170.0,
    'MMSE': 24.0,
    'FunctionalAssessment': 7.0,
    'MemoryComplaints': 1,
    'BehavioralProblems': 0,
    'ADL': 6.0,
    'Confusion': 0,
    'Disorientation': 0,
    'PersonalityChanges': 0,
    'DifficultyCompletingTasks': 1,
    'Forgetfulness': 1
}

sample_df = pd.DataFrame([sample_patient])
# Fail early with a readable message if the record lacks a training feature.
missing_features = [col for col in X.columns if col not in sample_df.columns]
if missing_features:
    raise ValueError(f"sample_patient is missing features: {missing_features}")
# Put the columns in the exact order used for fit(), so the prediction no
# longer depends on the key order of the dict literal above. Keys that are
# not training features are ignored by this selection.
sample_df = sample_df[X.columns]

prediction = model.predict(sample_df)[0]
probability = model.predict_proba(sample_df)[0]

result = "Alzheimer's Disease" if prediction == 1 else "No Alzheimer's"
# The largest class probability is reported as the confidence of the call.
confidence = probability.max() * 100

print(f"\nSample Patient (Age {sample_patient['Age']}):")
print(f"Prediction: {result}")
print(f"Confidence: {confidence:.1f}%")

# Closing banner: repeat the held-out accuracy between two divider lines.
divider = "="*50
print("\n" + divider)
print(f"Final Model Accuracy: {accuracy * 100:.1f}%")
print(divider)

=== Alzheimer's Disease Prediction ===

Step 1: Loading data...
Total patients: 2149
Total features: 35

Step 2: Basic Data Exploration...

First 5 rows:
   PatientID  Age  Gender  Ethnicity  EducationLevel        BMI  Smoking  \
0       4751   73       0          0               2  22.927749        0   
1       4752   89       0          0               0  26.827681        0   
2       4753   73       0          3               1  17.795882        0   
3       4754   74       1          0               1  33.800817        1   
4       4755   89       0          0               0  20.716974        0   

   AlcoholConsumption  PhysicalActivity  DietQuality  ...  MemoryComplaints  \
0           13.297218          6.327112     1.347214  ...                 0   
1            4.542524          7.619885     0.518767  ...                 0   
2           19.555085          7.844988     1.826335  ...                 0   
3           12.209266          8.428001     7.435604  ...                