In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [12]:
train_path = r"D:\tune\train_data\train.json"
df = pd.read_json(train_path, lines=True)

In [13]:
def categorize_age(age):
    if pd.isna(age):
        return np.nan
    elif age < 25:
        return 'Less than 25'
    elif age <= 45:
        return '25 - 45'
    else:
        return 'Greater than 45'

df['age_cat'] = df['age'].apply(categorize_age)
df.drop(columns=['age'], inplace=True)

In [14]:
X = df.drop('two_year_recid', axis=1)
y = df['two_year_recid']

# Identify categorical and numerical columns
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_features = X.select_dtypes(include=[np.number]).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), num_features),
        ('cat', Pipeline([
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
            ('imputer', SimpleImputer(strategy='most_frequent'))
        ]), cat_features)
    ]
)

In [15]:
X_processed = preprocessor.fit_transform(X)

In [10]:
# Lấy tên cột cho dữ liệu đã encode
cat_encoded_columns = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(cat_features)

# Kết hợp tên cột số và cột phân loại đã encode
processed_columns = num_features + cat_encoded_columns.tolist()

# Tạo DataFrame mới
X_processed_df = pd.DataFrame(X_processed, columns=processed_columns)

# Hiển thị DataFrame
print(X_processed_df.head(10))

         age  juv_fel_count  juv_misd_count  juv_other_count  priors_count  \
0  34.609719       0.067195        0.000000         0.104943      0.000000   
1  25.000000       2.000000        3.000000         0.104943      8.000000   
2  42.000000       0.000000        0.000000         0.000000      0.000000   
3  24.000000       0.000000        0.072434         0.000000      3.403398   
4  31.000000       0.000000        1.000000         0.000000      3.000000   
5  34.609719       0.000000        0.000000         0.104943      4.000000   
6  71.000000       0.000000        0.000000         0.104943     12.000000   
7  66.000000       0.000000        0.000000         0.000000      3.403398   
8  27.000000       0.000000        0.000000         0.000000      6.000000   
9  29.000000       0.000000        0.000000         0.000000      1.000000   

   sex_Female  sex_Male  sex_None  race_African-American  race_Asian  \
0         0.0       1.0       0.0                    0.0         0.0 