# preprocessing

In [26]:
import pickle

import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import NearMiss
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             precision_recall_curve, precision_score,
                             recall_score)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, OneHotEncoder,
                                   PolynomialFeatures, StandardScaler)

In [15]:
file = '../data/NASA Near-Earth Objects-Train.csv'
df = pd.read_csv(file,index_col=0)


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4568 entries, 2001981 to 54075323
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   absolute_magnitude_h               4568 non-null   float64
 1   is_potentially_hazardous_asteroid  4568 non-null   bool   
 2   is_sentry_object                   4568 non-null   bool   
 3   kilometers_estimated_diameter_min  4568 non-null   float64
 4   kilometers_estimated_diameter_max  4568 non-null   float64
 5   orbit_class_type                   4568 non-null   object 
 6   perihelion_distance                4568 non-null   float64
 7   aphelion_distance                  4568 non-null   float64
 8   first_observation_date             4568 non-null   object 
 9   last_observation_date              4568 non-null   object 
 10  orbit_class_description            4568 non-null   object 
 11  first_observation_year             4568 non-null   

In [17]:
cate_train = ['orbit_class_type','is_sentry_object','is_collidable']
numeric_train = ['absolute_magnitude_h','kilometers_estimated_diameter_min','kilometers_estimated_diameter_max','perihelion_distance','first_observation_year','last_observation_year']

In [18]:
x = df[cate_train + numeric_train]
y = df['is_potentially_hazardous_asteroid']

In [19]:
# train test split

# 70% train , 15% validation, 15% test

x_temp , x_test, y_temp, y_test = train_test_split(x,y,test_size=0.1,random_state=0)

x_train, x_val, y_train, y_val = train_test_split(x_temp,y_temp,test_size=0.2,random_state=0)


print ("lượng dữ liệu dùng để train: ", x_train.shape[0])
print ("lượng dữ liệu dùng để validation: ", x_val.shape[0])
print ("lượng dữ liệu dùng để test: ", x_test.shape[0])


lượng dữ liệu dùng để train:  3288
lượng dữ liệu dùng để validation:  823
lượng dữ liệu dùng để test:  457


- Sử dụng Pipeline để xây dựng mô hình
- DÙng GridSearchCV để tìm ra mô hình tốt nhất
- tạo mảng các mô hình cần thử (RandomForestClassifier, LogisticRegression)
- PolynomialFeatures để tạo thêm các feature mới theo đa thức 1,2
- Sampling data dùng SMOTE 
  <h3>Lưu mô hình lại vào file .pkl</h3>

In [23]:

dim = [1, 2]
res = []
best_models = {}  # Dictionary to store the best models

# Models to evaluate
Model = [
    LogisticRegression(random_state=42),
    RandomForestClassifier(random_state=42)
]

for model in Model:
    for d in dim:
        print("d = ", d)

        # Preprocessors for numeric and categorical features
        numeric_transformer = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=d))
        ])

        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Combine preprocessors
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_train),  # Replace `numeric_train` with actual columns
                ('cat', categorical_transformer, cate_train)  # Replace `cate_train` with actual columns
            ])

        # Use imblearn's Pipeline to handle SMOTE
        pipeline = ImbPipeline(
            steps=[
                ('preprocessor', preprocessor),  # Preprocessing pipeline
                ('smote', SMOTE(random_state=42)),  # Handle imbalanced data
                ('classifier', model)  # Current model
            ])

        # Define parameter grid
        if model.__class__.__name__ == 'LogisticRegression':
            param_grid = {
                'classifier__C': [0.1, 1, 10],
                'classifier__class_weight': ['balanced', {0: 1, 1: 2}]
            }
        elif model.__class__.__name__ == 'RandomForestClassifier':
            param_grid = {
                'classifier__n_estimators': [100, 200, 300],
                'classifier__max_depth': [10, 15, 20],
                'classifier__min_samples_split': [2, 5, 10],
                'classifier__min_samples_leaf': [1, 2, 4],
                'classifier__class_weight': ['balanced', {0: 1, 1: 2}]
            }

        # Perform grid search
        grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(x_train, y_train)

        # Save the best model for this iteration
        best_model_key = f"{model.__class__.__name__}_degree_{d}"
        best_models[best_model_key] = grid_search.best_estimator_

        # Save best model to a file
        with open(f"{best_model_key}.pkl", 'wb') as f:
            pickle.dump(grid_search.best_estimator_, f)

        # Predictions and metrics
        y_pred = grid_search.predict(x_val)
        res.append({
            'model': model.__class__.__name__,
            'degree': d,
            'accuracy': accuracy_score(y_val, y_pred),
            'precision': precision_score(y_val, y_pred),
            'recall': recall_score(y_val, y_pred)
        })

# Print results
for r in res:
    print(r)
    print('__________________________-')


d =  1
d =  2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

d =  1
d =  2
{'model': 'LogisticRegression', 'degree': 1, 'accuracy': 0.8116646415552855, 'precision': 0.7883211678832117, 'recall': 0.826530612244898}
__________________________-
{'model': 'LogisticRegression', 'degree': 2, 'accuracy': 0.8444714459295262, 'precision': 0.851063829787234, 'recall': 0.8163265306122449}
__________________________-
{'model': 'RandomForestClassifier', 'degree': 1, 'accuracy': 0.8602673147023087, 'precision': 0.8673740053050398, 'recall': 0.8341836734693877}
__________________________-
{'model': 'RandomForestClassifier', 'degree': 2, 'accuracy': 0.8554070473876063, 'precision': 0.8719346049046321, 'recall': 0.8163265306122449}
__________________________-


In [7]:


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),  # Xử lý cột danh mục
        ('num', StandardScaler(), numeric_train)  # Chuẩn hóa cột số
    ]
)

# 1. Tạo Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Xử lý cột số và cột phân loại
    ('smote', SMOTE(random_state=42)),  # Xử lý dữ liệu mất cân bằng
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),  # Tạo đặc trưng đa thức
    ('classifier', RandomForestClassifier(random_state=42))  # Random Forest
])

# 2. Cấu Hình GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200, 300],  # Số lượng cây
    'classifier__max_depth': [10, 15, 20],  # Giới hạn độ sâu
    'classifier__min_samples_split': [2, 5, 10],  # Mẫu tối thiểu để tách
    'classifier__min_samples_leaf': [1, 2, 4],  # Mẫu tối thiểu tại lá
    'classifier__class_weight': ['balanced', {0: 1, 1: 2}]  # Cân bằng trọng số
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='recall',  # Đánh giá dựa trên recall
    cv=3,
    n_jobs=-1,
    verbose=2
)

# 3. Huấn Luyện và Tìm Tham Số Tối Ưu
grid_search.fit(x_train, y_train)

# 4. In Kết Quả Tốt Nhất
print("Best Parameters:", grid_search.best_params_)

# 5. Dự Đoán và Đánh Giá
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

from sklearn.metrics import classification_report, recall_score, precision_score
print(classification_report(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('Precision Score:', precision_score(y_test, y_pred))


Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters: {'classifier__class_weight': {0: 1, 1: 2}, 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
              precision    recall  f1-score   support

       False       0.91      0.74      0.82       253
        True       0.74      0.91      0.82       204

    accuracy                           0.82       457
   macro avg       0.83      0.83      0.82       457
weighted avg       0.83      0.82      0.82       457

Recall Score: 0.9117647058823529
Precision Score: 0.7380952380952381


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),  # Xử lý cột danh mục
        ('num', StandardScaler(), numeric_train)  # Chuẩn hóa cột số
    ]
)

# 1. Tạo Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Xử lý cột số và cột phân loại
    ('smote', SMOTE(random_state=42)),  # Xử lý dữ liệu mất cân bằng
    ('poly', PolynomialFeatures(degree=1, include_bias=False)),  # Tạo đặc trưng đa thức
    ('classifier', RandomForestClassifier(random_state=42))  # Random Forest
])

# 2. Cấu Hình GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200, 300],  # Số lượng cây
    'classifier__max_depth': [10, 15, 20],  # Giới hạn độ sâu
    'classifier__min_samples_split': [2, 5, 10],  # Mẫu tối thiểu để tách
    'classifier__min_samples_leaf': [1, 2, 4],  # Mẫu tối thiểu tại lá
    'classifier__class_weight': ['balanced', {0: 1, 1: 2}]  # Cân bằng trọng số
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='recall',  # Đánh giá dựa trên recall
    cv=3,
    n_jobs=-1,
    verbose=2
)

# 3. Huấn Luyện và Tìm Tham Số Tối Ưu
grid_search.fit(x_train, y_train)

# 4. In Kết Quả Tốt Nhất
print("Best Parameters:", grid_search.best_params_)

# 5. Dự Đoán và Đánh Giá
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

from sklearn.metrics import classification_report, recall_score, precision_score
print(classification_report(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('Precision Score:', precision_score(y_test, y_pred))


Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters: {'classifier__class_weight': {0: 1, 1: 2}, 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 300}
              precision    recall  f1-score   support

       False       0.93      0.71      0.80       253
        True       0.72      0.93      0.81       204

    accuracy                           0.81       457
   macro avg       0.82      0.82      0.81       457
weighted avg       0.83      0.81      0.81       457

Recall Score: 0.9313725490196079
Precision Score: 0.7196969696969697


In [33]:
from sklearn.preprocessing import MinMaxScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),
        ('num', MinMaxScaler(), numeric_train)
    ]
)

x_train_processed = preprocessor.fit_transform(x_train)
x_val_processed = preprocessor.transform(x_val)
x_test_processed = preprocessor.transform(x_test)

# 2. Oversampling với SMOTE
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train_processed, y_train)

# 3. Undersampling với NearMiss
nearmiss = NearMiss()
x_train_resampled, y_train_resampled = nearmiss.fit_resample(x_train_resampled, y_train_resampled)

# 4. Thêm Polynomial Features
poly = PolynomialFeatures(degree=1, include_bias=False)
x_train_poly = poly.fit_transform(x_train_resampled)
x_val_poly = poly.transform(x_val_processed)
x_test_poly = poly.transform(x_test_processed)



# 5. Huấn luyện Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear', class_weight={0:3,1:4})
model.fit(x_train_poly, y_train_resampled)

# 6. Dự đoán và đánh giá
y_pred = model.predict(x_test_poly)

print(classification_report(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('Precision Score:', precision_score(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.85      0.68      0.76       253
        True       0.69      0.85      0.76       204

    accuracy                           0.76       457
   macro avg       0.77      0.77      0.76       457
weighted avg       0.78      0.76      0.76       457

Recall Score: 0.8529411764705882
Precision Score: 0.6850393700787402
