In [1]:
# -*- coding: utf-8 -*-
"""
Pipeline cuối cùng, áp dụng triết lý "Less is More".
Tập trung vào các đặc trưng cốt lõi và hiệu quả nhất.
"""

import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor

# Tải dữ liệu
try:
    train_df_raw = pd.read_csv('train.csv')
    test_df_raw = pd.read_csv('test.csv')
except FileNotFoundError:
    print("Lỗi: Không tìm thấy file train.csv hoặc test.csv.")
    exit()

def create_final_features(train_df, test_df):
    y_target = train_df['Survived']
    combined_df = pd.concat([train_df.drop('Survived', axis=1), test_df], ignore_index=True)
    
    # --- Điền giá trị thiếu ---
    combined_df['Embarked'].fillna(combined_df['Embarked'].mode()[0], inplace=True)
    combined_df['Fare'].fillna(combined_df['Fare'].median(), inplace=True)
    
    # --- Dự đoán Age ---
    df_for_age_pred = combined_df.copy()
    df_for_age_pred['Title_temp'] = df_for_age_pred['Name'].str.extract(' ([A-Za-z]+)\\.', expand=False)
    # ... (logic dự đoán Age giữ nguyên)
    df_for_age_pred['Sex_temp'] = df_for_age_pred['Sex'].map({'male': 0, 'female': 1})
    age_features = ['Pclass', 'Sex_temp', 'SibSp', 'Parch', 'Fare']
    age_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    age_regressor.fit(df_for_age_pred[df_for_age_pred['Age'].notna()][age_features], df_for_age_pred[df_for_age_pred['Age'].notna()]['Age'])
    predicted_age = age_regressor.predict(df_for_age_pred[df_for_age_pred['Age'].isna()][age_features])
    combined_df.loc[combined_df['Age'].isna(), 'Age'] = predicted_age
    
    # --- Feature Engineering Cốt lõi ---
    combined_df['FamilySize'] = combined_df['SibSp'] + combined_df['Parch'] + 1
    combined_df['IsAlone'] = (combined_df['FamilySize'] == 1).astype(int)
    combined_df['Title'] = combined_df['Name'].str.extract(' ([A-Za-z]+)\\.', expand=False)
    combined_df['Title'] = combined_df['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare').replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    combined_df['Deck'] = combined_df['Cabin'].str[0].fillna('U')
    
    # --- Xóa và Mã hóa ---
    combined_s = combined_df.drop(columns=['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'])
    categorical_cols = ['Sex', 'Embarked', 'Title', 'Deck']
    combined_final = pd.get_dummies(combined_s, columns=categorical_cols, drop_first=True, dtype=int)
    
    # Tách lại
    train_final = combined_final.iloc[:len(train_df)]
    test_final = combined_final.iloc[len(train_df):]
    train_final['Survived'] = y_target
    
    return train_final, test_final

# Thực thi và lưu
print("Đang tạo bộ dữ liệu cuối cùng theo phương pháp đơn giản hóa...")
train_final_df, test_final_df = create_final_features(train_df_raw, test_df_raw)

save_dir = "Data_clean_v4"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

train_final_df.to_csv(f'{save_dir}/train_cleaned.csv', index=False)
test_final_df.to_csv(f'{save_dir}/test_cleaned.csv', index=False)

print(f"Đã lưu dữ liệu vào '{save_dir}'. Số đặc trưng: {len(train_final_df.columns) - 2}")

Đang tạo bộ dữ liệu cuối cùng theo phương pháp đơn giản hóa...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Embarked'].fillna(combined_df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Fare'].fillna(combined_df['Fare'].median(), inplace=True)


Đã lưu dữ liệu vào 'Data_clean_v4'. Số đặc trưng: 20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_final['Survived'] = y_target
