In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shared preprocessing: one-hot encode the categorical columns (categories
# unseen at fit time are ignored) and standardize the numeric columns.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),
    ('num', StandardScaler(), numeric_train),
])

# Baseline: decision tree with default hyperparameters and a fixed seed.
dt_classifier = DecisionTreeClassifier(random_state=2002)
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', dt_classifier),
])
dt_pipeline.fit(x_train, y_train)

# Score the fitted pipeline on the training and validation splits.
dt_train_score = dt_pipeline.score(x_train, y_train)
dt_val_score = dt_pipeline.score(x_val, y_val)
print('train score:', dt_train_score)
print('val score', dt_val_score)


train score: 1.0
val score 0.8993055555555556


In [None]:
from sklearn.model_selection import  cross_val_score

# Compare logistic regression and a decision tree with 5-fold cross-validation
# on the training split. Both pipelines reuse the same preprocessing step;
# cross_val_score clones the pipeline for every fold, so sharing the
# `preprocessor` object between them is safe.
log_reg_cv = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)
dt_cv = DecisionTreeClassifier(random_state=2002)

pipeline_cv_log_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', log_reg_cv)
])

pipeline_cv_dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', dt_cv)
])

# Use the same number of folds for both models so the means are comparable.
lr_scores = cross_val_score(pipeline_cv_log_reg, x_train, y_train, cv=5)
dt_scores = cross_val_score(pipeline_cv_dt, x_train, y_train, cv=5)

# Report the mean fold score for each model under its own label.
print('Logistic Regression CV scores:', lr_scores.mean())
print('Decision Tree CV scores:', dt_scores.mean())

Logistic Regression CV scores: 0.9116267520233693
Logistic Regression CV scores: 0.8936856977273946


In [None]:
# NOTE(review): `nm` is not used anywhere in this cell; it is kept only in
# case another cell relies on it — confirm and remove if not.
nm = NearMiss()

# Preprocessing: one-hot encode the categorical columns and standardize the
# numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),
        ('num', StandardScaler(), numeric_train),
    ]
)

# Logistic regression on degree-2 polynomial features of the preprocessed
# data, with class weights balanced to counter the class imbalance.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42, solver='liblinear', class_weight='balanced'))
])

pipeline.fit(x_train, y_train)

train_score = pipeline.score(x_train, y_train)
val_score = pipeline.score(x_val, y_val)

print("Training Score:", train_score)
print("Validation Score:", val_score)

# Evaluate on the test split. Predict once and reuse the result for every
# metric instead of running the whole pipeline on x_test a second time.
y_pred = pipeline.predict(x_test)
print(classification_report(y_test, y_pred))
print('recall score: ', recall_score(y_test, y_pred))
print('precision score: ', precision_score(y_test, y_pred))



Training Score: 0.8464031483303431
Validation Score: 0.8513888888888889
              precision    recall  f1-score   support

       False       1.00      0.84      0.91      2189
        True       0.37      0.99      0.54       211

    accuracy                           0.85      2400
   macro avg       0.68      0.91      0.73      2400
weighted avg       0.94      0.85      0.88      2400

recall score:  0.985781990521327
precision score:  0.37076648841354726


In [None]:
from sklearn.preprocessing import MinMaxScaler
# Manual (non-Pipeline) workflow: preprocess, resample the training split,
# expand features, fit a weighted logistic regression and evaluate on test.

# 1. Preprocess: one-hot encode categorical columns, min-max scale numeric ones.
#    The transformer is fitted on the training split only and then applied to
#    the validation and test splits.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),
        ('num', MinMaxScaler(), numeric_train)
    ]
)

x_train_processed = preprocessor.fit_transform(x_train)
x_val_processed = preprocessor.transform(x_val)
x_test_processed = preprocessor.transform(x_test)

# 2. Oversample the training data with SMOTE
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train_processed, y_train)

# 3. Undersample the (already oversampled) training data with NearMiss
# NOTE(review): SMOTE with default settings presumably already balances the
# classes, so NearMiss may have little or nothing left to remove — confirm
# that chaining both resamplers is intended.
nearmiss = NearMiss()
x_train_resampled, y_train_resampled = nearmiss.fit_resample(x_train_resampled, y_train_resampled)

# 4. Add polynomial features
# NOTE(review): with degree=1 and include_bias=False this step presumably
# passes the features through unchanged — confirm whether a higher degree was
# intended.
poly = PolynomialFeatures(degree=1, include_bias=False)
x_train_poly = poly.fit_transform(x_train_resampled)
# The validation features are transformed here but not used later in this cell.
x_val_poly = poly.transform(x_val_processed)
x_test_poly = poly.transform(x_test_processed)



# 5. Train logistic regression; the explicit class weights give class 1 a
#    slightly larger weight (4) than class 0 (3).
model = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear', class_weight={0:3,1:4})
model.fit(x_train_poly, y_train_resampled)

# 6. Predict on the test split and evaluate
y_pred = model.predict(x_test_poly)

print(classification_report(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('Precision Score:', precision_score(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.99      0.83      0.90      2189
        True       0.34      0.94      0.50       211

    accuracy                           0.83      2400
   macro avg       0.67      0.88      0.70      2400
weighted avg       0.94      0.83      0.87      2400

Recall Score: 0.9383886255924171
Precision Score: 0.3407917383820998


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest; it currently holds a single
# candidate, so the search amounts to a 3-fold evaluation of that one setting.
param_grid = {
    'max_depth': [10],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
}

# Base estimator: random forest with equal class weights and a fixed seed.
rf_estimator = RandomForestClassifier(class_weight={1: 1, 0: 1}, random_state=42)

# Select by recall, using all available cores, on the resampled training data.
grid_search = GridSearchCV(
    rf_estimator,
    param_grid,
    scoring='recall',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(x_train_poly, y_train_resampled)

# Evaluate the refitted best estimator on the test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test_poly)

print("Best Parameters:", grid_search.best_params_)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
              precision    recall  f1-score   support

       False       0.99      0.87      0.93      2189
        True       0.41      0.95      0.57       211

    accuracy                           0.88      2400
   macro avg       0.70      0.91      0.75      2400
weighted avg       0.94      0.88      0.90      2400

