# preprocessing

In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import NearMiss
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             precision_recall_curve, precision_score,
                             recall_score)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, OneHotEncoder,
                                   PolynomialFeatures, StandardScaler)

In [2]:
# Load the cleaned near-Earth-object CSV; its first column is used as the row index.
file = '../data/NASA Near-Earth Objects-CleanbyThang.csv'
df = pd.read_csv(filepath_or_buffer=file, index_col=0)


In [3]:
# Print column names, non-null counts and dtypes as a quick sanity check of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23999 entries, 2001981 to 54073367
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   absolute_magnitude_h               23999 non-null  float64
 1   is_potentially_hazardous_asteroid  23999 non-null  bool   
 2   is_sentry_object                   23999 non-null  bool   
 3   kilometers_estimated_diameter_min  23999 non-null  float64
 4   kilometers_estimated_diameter_max  23999 non-null  float64
 5   orbit_class_type                   23999 non-null  object 
 6   perihelion_distance                23999 non-null  float64
 7   aphelion_distance                  23999 non-null  float64
 8   first_observation_date             23999 non-null  object 
 9   last_observation_date              23999 non-null  object 
 10  orbit_class_description            23999 non-null  object 
 11  first_observation_year             23999 non-null 

In [4]:
# Columns fed to the models, grouped by how they are preprocessed.
# Categorical columns are one-hot encoded by the preprocessing steps.
cate_train = [
    'orbit_class_type',
    'is_sentry_object',
    'is_collidable',
]
# Numeric columns are rescaled by the preprocessing steps.
numeric_train = [
    'absolute_magnitude_h',
    'kilometers_estimated_diameter_min',
    'kilometers_estimated_diameter_max',
    'perihelion_distance',
    'first_observation_year',
    'last_observation_year',
]

In [5]:
# Feature matrix: categorical columns first, then numeric columns.
x = df[[*cate_train, *numeric_train]]
# Target: the hazardous-asteroid flag.
y = df.loc[:, 'is_potentially_hazardous_asteroid']

In [6]:
# Train / validation / test split.
#
# Two-stage split: 10% of the rows are held out for test first, then 20% of
# the remaining 90% becomes validation. That is roughly 72% train,
# 18% validation and 10% test.
# NOTE(review): this is not a 70/15/15 split; adjust the test_size values if
# that is what is wanted.
# NOTE(review): neither call passes stratify=..., so the class ratio may
# differ between subsets - confirm whether that is intended.

x_temp , x_test, y_temp, y_test = train_test_split(x,y,test_size=0.1,random_state=0)

x_train, x_val, y_train, y_val = train_test_split(x_temp,y_temp,test_size=0.2,random_state=0)


# Report the number of rows in each subset (the messages are in Vietnamese).
print ("lượng dữ liệu dùng để train: ", x_train.shape[0])
print ("lượng dữ liệu dùng để validation: ", x_val.shape[0])
print ("lượng dữ liệu dùng để test: ", x_test.shape[0])

lượng dữ liệu dùng để train:  17279
lượng dữ liệu dùng để validation:  4320
lượng dữ liệu dùng để test:  2400


In [11]:
# Random forest baseline.
# Categorical columns are one-hot encoded (categories unseen during fit are
# ignored at transform time) and numeric columns are standardized.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train)
numeric_step = ('num', StandardScaler(), numeric_train)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)
pipeline.fit(x_train, y_train)

# Accuracy on the training and validation subsets.
train_score = pipeline.score(x_train, y_train)
val_score = pipeline.score(x_val, y_val)
print("train_score: ", train_score)
print("Validation Score:", val_score)

# Accuracy and per-class report on the test subset.
y_pred = pipeline.predict(x_test)
test_score = accuracy_score(y_test, y_pred)
print("Test Score:", test_score)
print(classification_report(y_test, y_pred))

train_score:  1.0
Validation Score: 0.9138888888888889
Test Score: 0.9183333333333333
              precision    recall  f1-score   support

       False       0.94      0.97      0.96      2189
        True       0.56      0.36      0.43       211

    accuracy                           0.92      2400
   macro avg       0.75      0.66      0.69      2400
weighted avg       0.91      0.92      0.91      2400



In [8]:
# Logistic regression baseline: one-hot encoded categorical columns plus
# standardized numeric columns, default solver settings.
encoders = [
    ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),
    ('num', StandardScaler(), numeric_train),
]
preprocessor = ColumnTransformer(transformers=encoders)

pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42)),
    ]
)
pipeline.fit(x_train, y_train)

# Accuracy on the training and validation subsets.
train_score = pipeline.score(x_train, y_train)
val_score = pipeline.score(x_val, y_val)
print("train_score: ", train_score)
print("Validation Score:", val_score)

# Accuracy, precision and recall on the test subset.
y_pred = pipeline.predict(x_test)
test_score = accuracy_score(y_test, y_pred)
print("Test Score:", test_score)
print("precision_score: ", precision_score(y_test, y_pred))
print("recall_score: ", recall_score(y_test, y_pred))
    

train_score:  0.9114532090977487
Validation Score: 0.9074074074074074
Test Score: 0.9116666666666666
precision_score:  0.4888888888888889
recall_score:  0.10426540284360189


In [9]:
# NOTE(review): cross_val_score is imported here but not used in this cell.
from sklearn.model_selection import cross_val_score

# Logistic regression on degree-2 polynomial features.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),  # one-hot encode categorical columns
        ('num', StandardScaler(), numeric_train),  # standardize numeric columns
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Expand the preprocessed features to degree 2, without a constant column.
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')),
])

pipeline.fit(x_train, y_train)

# Accuracy on the training and validation subsets.
train_score = pipeline.score(x_train, y_train)
val_score = pipeline.score(x_val, y_val)

print("Training Score:", train_score)
print("Validation Score:", val_score)

# Predict on the test subset once and reuse the result for every metric below,
# instead of running the whole pipeline a second time.
y_pred = pipeline.predict(x_test)
print(classification_report(y_test, y_pred))
print('recall score: ', recall_score(y_test, y_pred))
print('precision score: ', precision_score(y_test, y_pred))



Training Score: 0.913652410440419
Validation Score: 0.9162037037037037
              precision    recall  f1-score   support

       False       0.93      0.98      0.95      2189
        True       0.55      0.22      0.32       211

    accuracy                           0.92      2400
   macro avg       0.74      0.60      0.64      2400
weighted avg       0.90      0.92      0.90      2400

recall score:  0.22274881516587677
precision score:  0.5465116279069767


In [10]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on one-hot encoded categorical columns and standardized
# numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),
        ('num', StandardScaler(), numeric_train),
    ]
)

dt_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(random_state=2002)),
    ]
)
dt_pipeline.fit(x_train, y_train)

# Accuracy on the training and validation subsets.
dt_train_accuracy = dt_pipeline.score(x_train, y_train)
dt_val_accuracy = dt_pipeline.score(x_val, y_val)
print('train score:', dt_train_accuracy)
print('val score', dt_val_accuracy)


train score: 1.0
val score 0.8993055555555556


# Cross validation

In [11]:
from sklearn.model_selection import cross_val_score

# Compare logistic regression and a decision tree with 5-fold
# cross-validation on the training subset.
log_reg_cv = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)
dt_cv = DecisionTreeClassifier(random_state=2002)

# Both pipelines reuse the `preprocessor` object that already exists when
# this cell runs.
pipeline_cv_log_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', log_reg_cv)
])

pipeline_cv_dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', dt_cv)
])

# Pass the same explicit fold count to both calls so the two means are
# directly comparable.
lr_scores = cross_val_score(pipeline_cv_log_reg, x_train, y_train, cv=5)
dt_scores = cross_val_score(pipeline_cv_dt, x_train, y_train, cv=5)

# Mean score across folds; each line is labelled with the model it belongs to.
print('Logistic Regression CV scores:', lr_scores.mean())
print('Decision Tree CV scores:', dt_scores.mean())

Logistic Regression CV scores: 0.9116267520233693
Logistic Regression CV scores: 0.8936856977273946


# sampling data

In [23]:
# NOTE(review): this sampler is created but never applied in this cell; the
# class imbalance is handled by class_weight='balanced' below instead.
nm = NearMiss()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),  # one-hot encode categorical columns
        ('num', StandardScaler(), numeric_train),  # standardize numeric columns
    ]
)

# Logistic regression on degree-2 polynomial features, with class weights
# set to 'balanced'.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Expand the preprocessed features to degree 2, without a constant column.
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42, solver='liblinear', class_weight='balanced')),
])

pipeline.fit(x_train, y_train)

# Accuracy on the training and validation subsets.
train_score = pipeline.score(x_train, y_train)
val_score = pipeline.score(x_val, y_val)

print("Training Score:", train_score)
print("Validation Score:", val_score)

# Predict on the test subset once and reuse the result for every metric below,
# instead of running the whole pipeline a second time.
y_pred = pipeline.predict(x_test)
print(classification_report(y_test, y_pred))
print('recall score: ', recall_score(y_test, y_pred))
print('precision score: ', precision_score(y_test, y_pred))



Training Score: 0.8464031483303431
Validation Score: 0.8513888888888889
              precision    recall  f1-score   support

       False       1.00      0.84      0.91      2189
        True       0.37      0.99      0.54       211

    accuracy                           0.85      2400
   macro avg       0.68      0.91      0.73      2400
weighted avg       0.94      0.85      0.88      2400

recall score:  0.985781990521327
precision score:  0.37076648841354726


In [29]:
from sklearn.preprocessing import MinMaxScaler
# 1. Preprocess: one-hot encode categorical columns and min-max scale numeric
#    columns. The transformer is fitted on the training subset only and then
#    applied to the validation and test subsets.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),
        ('num', MinMaxScaler(), numeric_train)
    ]
)

x_train_processed = preprocessor.fit_transform(x_train)
x_val_processed = preprocessor.transform(x_val)
x_test_processed = preprocessor.transform(x_test)

# 2. Oversampling with SMOTE (training data only)
# NOTE(review): SMOTE runs on the one-hot encoded matrix, so synthetic rows
# may receive fractional category indicators - confirm this is acceptable.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train_processed, y_train)

# 3. Undersampling with NearMiss
# NOTE(review): with default settings SMOTE presumably leaves the classes
# already balanced, in which case this step removes nothing - confirm.
nearmiss = NearMiss()
x_train_resampled, y_train_resampled = nearmiss.fit_resample(x_train_resampled, y_train_resampled)

# 4. Add polynomial features
# NOTE(review): degree=1 with include_bias=False appears to pass the features
# through unchanged - confirm whether a higher degree was intended.
poly = PolynomialFeatures(degree=1, include_bias=False)
x_train_poly = poly.fit_transform(x_train_resampled)
# x_val_poly is computed here but not used further down in this cell.
x_val_poly = poly.transform(x_val_processed)
x_test_poly = poly.transform(x_test_processed)



# 5. Train logistic regression on the resampled training data
# NOTE(review): the class_weight keys are the integers 0 and 1 while the
# target column is boolean - presumably they map to False/True; confirm.
model = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear', class_weight={0:3,1:4})
model.fit(x_train_poly, y_train_resampled)

# 6. Predict on the (non-resampled) test subset and evaluate
y_pred = model.predict(x_test_poly)

print(classification_report(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('Precision Score:', precision_score(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.99      0.83      0.90      2189
        True       0.34      0.94      0.50       211

    accuracy                           0.83      2400
   macro avg       0.67      0.88      0.70      2400
weighted avg       0.94      0.83      0.87      2400

Recall Score: 0.9383886255924171
Precision Score: 0.3407917383820998


In [27]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest; it currently holds a single
# candidate combination.
param_grid = dict(
    max_depth=[10],
    min_samples_split=[2],
    min_samples_leaf=[1],
)

# 3-fold grid search that selects on recall, run on the resampled training
# features that already exist when this cell runs.
forest = RandomForestClassifier(class_weight={1:1,0:1}, random_state=42)
grid_search = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    scoring='recall',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(x_train_poly, y_train_resampled)

# Evaluate the refitted best estimator on the test features.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test_poly)

print("Best Parameters:", grid_search.best_params_)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
              precision    recall  f1-score   support

       False       0.99      0.87      0.93      2189
        True       0.41      0.95      0.57       211

    accuracy                           0.88      2400
   macro avg       0.70      0.91      0.75      2400
weighted avg       0.94      0.88      0.90      2400

