In [None]:
# Install imbalanced-learn into the kernel's environment; it provides the
# `imblearn` package that a later cell imports SMOTE from.
%pip install imbalanced-learn

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from collections import Counter
import matplotlib.pyplot as plt

# Notebook-wide random seed constant, kept in one place so stochastic steps
# can share it.
RANDOM_STATE = 55

In [None]:
# Read the dataset from the CSV that sits one directory above the notebook.
csv_path = '../filled.csv'
df = pd.read_csv(csv_path)

In [None]:
# Show the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Remove the 'id' and 'Name' columns, then preview the remaining ones.
df = df.drop(columns=['id', 'Name'])
df.head(n=5)

In [None]:
# Number of missing values per column.
df.isna().sum(axis=0)

In [None]:
# Keep only rows in which every column has a value.
df = df.dropna(axis=0, how='any')

In [None]:
# Separate the 'Depression' target from the remaining feature columns.
y = df['Depression']
x = df.drop(columns=['Depression'])

In [None]:
# Group feature columns by dtype: int64/float64 columns are treated as
# numeric, object columns as strings.
# NOTE(review): columns of any other dtype (bool, category, int32, ...) land
# in neither group - confirm that none exist in this dataset.
numeric_part = x.select_dtypes(include=['int64', 'float64'])
string_part = x.select_dtypes(include=['object'])
num_columns = numeric_part.columns
str_columns = string_part.columns

for column_group in (num_columns, str_columns):
    print(column_group)

In [None]:
# Numeric columns are standardised; string columns are mapped to ordinal codes.
num_pipline = Pipeline(steps=[('scaler', preprocessing.StandardScaler())])
str_pipline = Pipeline(steps=[('ordinal', preprocessing.OrdinalEncoder())])

# Route each column group through its own pipeline. The transformer is built
# with default settings for everything other than the two groups below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipline, num_columns),
        ('str', str_pipline, str_columns),
    ]
)

In [None]:
# Fit the preprocessor on the whole feature frame and keep the transformed
# matrix.
x_processed = preprocessor.fit_transform(X=x)

In [None]:
# Display the transformed feature matrix as the cell output.
x_processed

In [None]:
# Split the RAW features first so that held-out rows never contribute to the
# statistics the preprocessor learns (scaler mean/std, encoder categories).
# The split is stratified on the target so both parts keep the same class
# ratio, and it uses the notebook-wide seed.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# A category that only occurs in the test rows would make the default
# OrdinalEncoder raise at transform time; encode such values as -1 instead.
preprocessor.set_params(
    str__ordinal__handle_unknown='use_encoded_value',
    str__ordinal__unknown_value=-1,
)

# Learn the preprocessing on the training rows only, then apply it unchanged
# to the test rows.
x_train = preprocessor.fit_transform(x_train_raw)
x_test = preprocessor.transform(x_test_raw)


In [None]:
# Display names for the two target values.
class_name = {0: 'Health', 1: 'Depression'}

# Tally the rows per label, in order of first appearance in `y`.
count_labels = Counter(y)
cls_name = [class_name[label] for label in count_labels]
cls_num = [count_labels[label] for label in count_labels]

# Bar chart of the tallies on a tall, narrow figure.
fig, ax = plt.subplots(figsize=(4, 8))
ax.bar(cls_name, cls_num, color='skyblue')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_title('Class Distribution')
ax.set_xticks(cls_name)

# Print each count just above its bar.
for bar_label, bar_height in zip(cls_name, cls_num):
    ax.text(bar_label, bar_height + 0.1, str(bar_height), ha='center', va='bottom')
plt.show()

In [None]:
# Dimensions of the held-out feature matrix.
x_test.shape

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a single decision tree with default hyperparameters.
# The tree is seeded so that re-running the cell reproduces the same report.
model = DecisionTreeClassifier(random_state=RANDOM_STATE)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))

In [None]:
# Random forest with default hyperparameters, seeded so that bootstrap and
# feature sampling - and therefore the report - are reproducible.
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over random-forest base learners (200 boosting rounds, learning
# rate 0.5). Seeding the booster makes the run reproducible; AdaBoost passes
# seeds derived from its own `random_state` to each base estimator.
basemodel = RandomForestClassifier()
model = AdaBoostClassifier(
    basemodel, n_estimators=200, learning_rate=0.5, random_state=RANDOM_STATE
)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))

# Stop here: do not run the part below

In [None]:
# Candidate values for the one-at-a-time random-forest sweeps below.
n_estimators_list = [10, 50, 100, 500]
# Powers of two from 2 to 64, plus None as the final candidate.
max_depth_list = [2 ** exponent for exponent in range(1, 7)] + [None]
min_samples_split_list = [
    2, 10, 30, 50,
    100, 200, 300, 700,
]

In [None]:
# Sweep `min_samples_split`, storing one text classification report per value.
# Every forest shares the same seed so that differences between reports come
# from the hyperparameter rather than from run-to-run randomness.
# NOTE(review): the reports are computed on the test split; choosing a value
# from them biases the final test estimate - consider a validation split or
# cross-validation.
min_sample_split_reports = {}
for msp in min_samples_split_list:
    model = RandomForestClassifier(min_samples_split=msp, random_state=RANDOM_STATE)
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    min_sample_split_reports[msp] = classification_report(y_test, y_predict)

In [None]:
# Print each swept value followed by its report, one per line.
for setting, report in min_sample_split_reports.items():
    print(setting, report, sep='\n')

In [None]:
# Sweep `max_depth`, storing one text classification report per value.
# Every forest shares the same seed so that differences between reports come
# from the hyperparameter rather than from run-to-run randomness.
# NOTE(review): the reports are computed on the test split; choosing a value
# from them biases the final test estimate - consider a validation split or
# cross-validation.
max_depth_reports = {}
for md in max_depth_list:
    model = RandomForestClassifier(max_depth=md, random_state=RANDOM_STATE)
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    max_depth_reports[md] = classification_report(y_test, y_predict)

In [None]:
# Print each swept value followed by its report, one per line.
for setting, report in max_depth_reports.items():
    print(setting, report, sep='\n')

In [None]:
# Sweep `n_estimators`, storing one text classification report per value.
# Every forest shares the same seed so that differences between reports come
# from the hyperparameter rather than from run-to-run randomness.
# NOTE(review): the reports are computed on the test split; choosing a value
# from them biases the final test estimate - consider a validation split or
# cross-validation.
n_estimators_reports = {}
for ne in n_estimators_list:
    model = RandomForestClassifier(n_estimators=ne, random_state=RANDOM_STATE)
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    n_estimators_reports[ne] = classification_report(y_test, y_predict)

In [None]:
# Print each swept value followed by its report, one per line.
for setting, report in n_estimators_reports.items():
    print(setting, report, sep='\n')

In [None]:
# Forest with hand-picked hyperparameters and balanced class weights, seeded
# so the fitted model is reproducible.
model = RandomForestClassifier(
    n_estimators=500,
    min_samples_split=10,
    max_depth=64,
    class_weight='balanced',
    random_state=RANDOM_STATE,
)
model.fit(x_train, y_train)

In [None]:
# Predict on the held-out split with the current `model` and print the
# per-class metrics.
y_predict = model.predict(x_test)
report_text = classification_report(y_test, y_predict)
print(report_text)

In [None]:
# Default forest with balanced class weights, seeded so the report is
# reproducible.
model_cw = RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE)
model_cw.fit(x_train, y_train)
y_predict = model_cw.predict(x_test)
print(classification_report(y_test, y_predict))

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
# NOTE(review): the string columns reach SMOTE as ordinal codes, so synthetic
# rows may get fractional category codes - SMOTENC may be a better fit; confirm.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(x_train, y_train)

# Tally the resampled labels, in order of first appearance.
count_labels = Counter(y_resampled)
cls_name = [class_name[label] for label in count_labels]
cls_num = [count_labels[label] for label in count_labels]

# Bar chart of the tallies on a tall, narrow figure.
fig, ax = plt.subplots(figsize=(4, 8))
ax.bar(cls_name, cls_num, color='skyblue')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_title('Class Distribution')
ax.set_xticks(cls_name)

# Print each count just above its bar.
for bar_label, bar_height in zip(cls_name, cls_num):
    ax.text(bar_label, bar_height + 0.1, str(bar_height), ha='center', va='bottom')
plt.show()

In [None]:
# Forest trained on the SMOTE-resampled training data and evaluated on the
# untouched test split; seeded so the report is reproducible.
model_smote = RandomForestClassifier(
    n_estimators=500,
    min_samples_split=30,
    max_depth=16,
    random_state=RANDOM_STATE,
)
model_smote.fit(X_resampled, y_resampled)
y_predict = model_smote.predict(x_test)
print(classification_report(y_test, y_predict))


In [None]:
# Sweep `min_samples_split` on the SMOTE-resampled training data, storing one
# text classification report per value.
# Every forest shares the same seed so that differences between reports come
# from the hyperparameter rather than from run-to-run randomness.
# NOTE(review): the reports are computed on the test split; choosing a value
# from them biases the final test estimate - consider a validation split or
# cross-validation.
min_sample_split_reports = {}
for msp in min_samples_split_list:
    model = RandomForestClassifier(min_samples_split=msp, random_state=RANDOM_STATE)
    model.fit(X_resampled, y_resampled)
    y_predict = model.predict(x_test)
    min_sample_split_reports[msp] = classification_report(y_test, y_predict)

In [None]:
# Print each swept value followed by its report, one per line.
for setting, report in min_sample_split_reports.items():
    print(setting, report, sep='\n')

In [None]:
# Sweep `max_depth` on the SMOTE-resampled training data, storing one text
# classification report per value.
# Every forest shares the same seed so that differences between reports come
# from the hyperparameter rather than from run-to-run randomness.
# NOTE(review): the reports are computed on the test split; choosing a value
# from them biases the final test estimate - consider a validation split or
# cross-validation.
max_depth_reports = {}
for md in max_depth_list:
    model = RandomForestClassifier(max_depth=md, random_state=RANDOM_STATE)
    model.fit(X_resampled, y_resampled)
    y_predict = model.predict(x_test)
    max_depth_reports[md] = classification_report(y_test, y_predict)

In [None]:
# Print each swept value followed by its report, one per line.
for setting, report in max_depth_reports.items():
    print(setting, report, sep='\n')

In [None]:
# Sweep `n_estimators` on the SMOTE-resampled training data, storing one text
# classification report per value.
# Every forest shares the same seed so that differences between reports come
# from the hyperparameter rather than from run-to-run randomness.
# NOTE(review): the reports are computed on the test split; choosing a value
# from them biases the final test estimate - consider a validation split or
# cross-validation.
n_estimators_reports = {}
for ne in n_estimators_list:
    model = RandomForestClassifier(n_estimators=ne, random_state=RANDOM_STATE)
    model.fit(X_resampled, y_resampled)
    y_predict = model.predict(x_test)
    n_estimators_reports[ne] = classification_report(y_test, y_predict)

In [None]:
# Print each swept value followed by its report, one per line.
for setting, report in n_estimators_reports.items():
    print(setting, report, sep='\n')