In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")

sns.set_theme(context='notebook', palette='muted', style='darkgrid')


df = pd.read_csv('../alzheimer_done.csv')

X = df.drop(columns = ['Diagnosis'])
y = df['Diagnosis']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, shuffle = True)


In [9]:
df

Unnamed: 0,MemoryComplaints,BehavioralProblems,ADL,MMSE,FunctionalAssessment,Disorientation,PersonalityChanges,Smoking,HeadInjury,CholesterolTotal,Diagnosis
0,0,0,1.725883,21.463532,6.518877,0,0,0,0,242.366840,0
1,0,0,2.592424,20.613267,7.118696,0,0,0,0,231.162595,0
2,0,0,7.119548,7.356249,5.895077,1,0,0,0,284.181858,0
3,0,1,6.481226,13.991127,8.965106,0,0,1,0,159.582240,0
4,0,0,0.014691,13.517609,6.045039,0,1,0,0,237.602184,0
...,...,...,...,...,...,...,...,...,...,...,...
2144,0,0,4.492838,1.201190,0.238667,0,0,0,0,280.476824,1
2145,0,1,9.204952,6.458060,8.687480,0,0,0,0,186.384436,1
2146,0,0,5.036334,17.011003,1.972137,0,0,0,0,237.024558,1
2147,0,0,3.785399,4.030491,5.173891,0,0,0,0,242.197192,1


In [10]:
rf_model_entropy = RandomForestClassifier(n_estimators=124, max_depth=19, min_samples_split=2, min_samples_leaf=2, max_features='log2', criterion='entropy')
rf_model_entropy.fit(X_train, y_train)
y_pred_entropy = rf_model_entropy.predict(X_test)

rf_model_gini = RandomForestClassifier(n_estimators=124, max_depth=19, min_samples_split=2, min_samples_leaf=2, max_features='log2', criterion='entropy')
rf_model_gini.fit(X_train, y_train)
y_pred_gini = rf_model_gini.predict(X_test)
# Tạo classification report
# report = classification_report(y_test, y_pred)
# # In report ra màn hình
# print("Classification Report for Random Forest:")
# print(report)
acc_skl_entropy = accuracy_score(y_test, y_pred_entropy)
acc_skl_gini = accuracy_score(y_test, y_pred_gini)
print(f"Random Forest Accuracy Entropy: {acc_skl_entropy}")
print(f"Random Forest Accuracy Gini: {acc_skl_gini}")

Random Forest Accuracy Entropy: 0.9651162790697675
Random Forest Accuracy Gini: 0.9651162790697675


In [11]:
import joblib

# Save the trained model
joblib.dump(rf_model_gini, '../model/random_forest_lib.pkl')

['../model/random_forest_lib.pkl']

In [12]:
X_train = X_train.to_numpy() if isinstance(X_train, pd.DataFrame) else X_train
X_test = X_test.to_numpy() if isinstance(X_test, pd.DataFrame) else X_test
y_train = y_train.to_numpy() if isinstance(y_train, pd.Series) else y_train
y_test = y_test.to_numpy() if isinstance(y_test, pd.Series) else y_test


In [13]:
import numpy as np

class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class DecisionTreeClassifier:
    def __init__(self, max_depth=None, min_samples_split=2, criterion="gini"):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion

    def fit(self, X, y):
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        num_classes = np.unique(y).size

        if (depth >= self.max_depth or num_classes == 1 or n_samples < self.min_samples_split):
            return Node(value=self._majority_class(y))

        best_impurity = float('inf')
        best_feature, best_threshold = None, None

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            # Optional: Speed up by sampling a subset of thresholds
            if len(thresholds) > 20:
                thresholds = np.random.choice(thresholds, 20, replace=False)

            for threshold in thresholds:
                left_idx = X[:, feature] <= threshold
                right_idx = ~left_idx

                if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
                    continue

                impurity = self._calculate_impurity(y[left_idx], y[right_idx])
                if impurity < best_impurity:
                    best_impurity = impurity
                    best_feature = feature
                    best_threshold = threshold

        if best_feature is None:
            return Node(value=self._majority_class(y))

        left_idx = X[:, best_feature] <= best_threshold
        right_idx = ~left_idx

        left = self._grow_tree(X[left_idx], y[left_idx], depth + 1)
        right = self._grow_tree(X[right_idx], y[right_idx], depth + 1)
        return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _calculate_impurity(self, left_y, right_y):
        total = len(left_y) + len(right_y)

        def gini(y):
            _, counts = np.unique(y, return_counts=True)
            p = counts / counts.sum()
            return 1 - np.sum(p**2)

        def entropy(y):
            _, counts = np.unique(y, return_counts=True)
            p = counts / counts.sum()
            return -np.sum(p * np.log2(p + 1e-9))

        if self.criterion == "gini":
            return (len(left_y) / total) * gini(left_y) + (len(right_y) / total) * gini(right_y)
        elif self.criterion == "entropy":
            return (len(left_y) / total) * entropy(left_y) + (len(right_y) / total) * entropy(right_y)
        else:
            raise ValueError("Unknown criterion: {}".format(self.criterion))

    def _majority_class(self, y):
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value


class RandomForestClassifier:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, criterion="gini"):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_estimators):
            indices = np.random.choice(len(X), len(X), replace=True)
            X_sample, y_sample = X[indices], y[indices]
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                criterion=self.criterion
            )
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        preds = np.array([tree.predict(X) for tree in self.trees])
        # Majority vote along axis=0
        return np.apply_along_axis(lambda x: np.bincount(x.astype(int)).argmax(), axis=0, arr=preds)

In [15]:
import pickle
from sklearn.metrics import accuracy_score, classification_report

# Cấu hình chung
params = {
    'n_estimators': 21,
    'max_depth': 13,
    'min_samples_split': 2
}

# So sánh Gini và Entropy
results = {}
for criterion in ['gini', 'entropy']:
    print(f"\n🔍 Đang huấn luyện với tiêu chí: {criterion.upper()}")
    clf = RandomForestClassifier(**params, criterion=criterion)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=4)
    
    results[criterion] = {
        'accuracy': acc,
        'report': report,
        'model': clf
    }

    print(f"✅ Accuracy ({criterion}): {acc:.4f}")
    print(f"📄 Classification Report ({criterion}):\n{report}")

# So sánh kết quả
print("\n📊 So sánh tổng quan:")
print(f"{'Tiêu chí':<10} | {'Accuracy':<8}")
print("-" * 25)
for k, v in results.items():
    print(f"{k:<10} | {v['accuracy']:.4f}")

# Kết luận & lưu mô hình tốt nhất
gini_acc = results['gini']['accuracy']
entropy_acc = results['entropy']['accuracy']

if gini_acc > entropy_acc:
    best_model = results['gini']['model']
    best_name = 'gini'
    print(f"\n✅ Gini cho kết quả tốt hơn với độ chính xác: {gini_acc:.4f}")
elif entropy_acc > gini_acc:
    best_model = results['entropy']['model']
    best_name = 'entropy'
    print(f"\n✅ Entropy cho kết quả tốt hơn với độ chính xác: {entropy_acc:.4f}")
else:
    best_model = results['gini']['model']
    best_name = 'gini'
    print(f"\n⚖️  Cả hai đều tương đương. Chọn Gini làm mô hình mặc định.")

# Lưu mô hình
model_path = f"../model/best_model.pkl"
import joblib

joblib.dump(best_model, model_path)


print(f"\n💾 Mô hình tốt nhất đã được lưu vào: {model_path}")



🔍 Đang huấn luyện với tiêu chí: GINI
✅ Accuracy (gini): 0.9535
📄 Classification Report (gini):
              precision    recall  f1-score   support

           0     0.9491    0.9775    0.9631       267
           1     0.9613    0.9141    0.9371       163

    accuracy                         0.9535       430
   macro avg     0.9552    0.9458    0.9501       430
weighted avg     0.9537    0.9535    0.9532       430


🔍 Đang huấn luyện với tiêu chí: ENTROPY
✅ Accuracy (entropy): 0.9581
📄 Classification Report (entropy):
              precision    recall  f1-score   support

           0     0.9560    0.9775    0.9667       267
           1     0.9618    0.9264    0.9437       163

    accuracy                         0.9581       430
   macro avg     0.9589    0.9520    0.9552       430
weighted avg     0.9582    0.9581    0.9580       430


📊 So sánh tổng quan:
Tiêu chí   | Accuracy
-------------------------
gini       | 0.9535
entropy    | 0.9581

✅ Entropy cho kết quả tốt hơn với 

In [19]:
from joblib import load
model = load('../model/random_forest_scr.pkl')
