# Preprocessing and baseline model

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Relative path to the cleaned NASA Near-Earth Objects CSV.
file = '../data/NASA Near-Earth Objects-CleanbyThang.csv'

# The first CSV column becomes the DataFrame index instead of a regular column.
df = pd.read_csv(filepath_or_buffer=file, index_col=0)


In [7]:
# Print the index range, column dtypes, and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23999 entries, 2001981 to 54073367
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   absolute_magnitude_h               23999 non-null  float64
 1   is_potentially_hazardous_asteroid  23999 non-null  bool   
 2   is_sentry_object                   23999 non-null  bool   
 3   kilometers_estimated_diameter_min  23999 non-null  float64
 4   kilometers_estimated_diameter_max  23999 non-null  float64
 5   orbit_class_type                   23999 non-null  object 
 6   perihelion_distance                23999 non-null  float64
 7   aphelion_distance                  23999 non-null  float64
 8   first_observation_date             23999 non-null  object 
 9   last_observation_date              23999 non-null  object 
 10  orbit_class_description            23999 non-null  object 
 11  first_observation_year             23999 non-null 

In [4]:
# Columns treated as categorical features (one-hot encoded by the preprocessor).
cate_train = [
    'orbit_class_type',
    'is_sentry_object',
    'is_collidable',
]

# Columns treated as numeric features (standardized by the preprocessor).
numeric_train = [
    'absolute_magnitude_h',
    'kilometers_estimated_diameter_min',
    'kilometers_estimated_diameter_max',
    'perihelion_distance',
    'first_observation_year',
    'last_observation_year',
]

In [9]:
# Feature matrix: the categorical columns first, followed by the numeric ones.
x = df[[*cate_train, *numeric_train]]

# Target column the classifier is trained to predict.
y = df['is_potentially_hazardous_asteroid']

In [23]:
# Train / validation / test split.
#
# The first call holds out 20% of the rows (test_size=0.2); the second call
# halves that hold-out (test_size=0.5). The resulting proportions are therefore
# 80% train, 10% test, 10% validation.
# NOTE(review): if a 70/15/15 split is wanted instead, change the first
# test_size to 0.3 and re-run the cells below so the reported scores are refreshed.
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=0)

# train_test_split returns (first part, second part) for each input, so the
# first half of the hold-out is used as the test set and the second half as the
# validation set.
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=0)

In [24]:
# Per-column preprocessing:
#   'cat' - one-hot encode the categorical columns; handle_unknown='ignore'
#           avoids an error on categories that were not seen during fit.
#   'num' - standardize the numeric columns.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), cate_train),
    ('num', StandardScaler(), numeric_train),
])

# Chain preprocessing and the classifier so both are fitted on the training
# data only and applied identically at prediction time.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])
pipeline.fit(x_train, y_train)

# Accuracy on the training and validation sets (Pipeline.score delegates to
# the classifier's default scorer).
train_score = pipeline.score(x_train, y_train)
val_score = pipeline.score(x_val, y_val)
print("train_score: ", train_score)
print("Validation Score:", val_score)

# Compute accuracy on the test set.
y_pred = pipeline.predict(x_test)
test_score = accuracy_score(y_test, y_pred)
print("Test Score:", test_score)

train_score:  0.9999479139538517
Validation Score: 0.92375
Test Score: 0.9129166666666667
