In [None]:
# 特征工程怎么做
# https://gemini.google.com/app/f091005ad8d4a8e7
# https://zhuanlan.zhihu.com/p/1929223292528620363
import category_encoders as ce
from sklearn.model_selection import KFold
# Out-of-fold target encoding (5-fold): every row is encoded by an encoder that
# was fitted on the other four folds only, so a row's own label never leaks
# into its `city_te` feature.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
df['city_te'] = float('nan')
city_te_pos = df.columns.get_loc('city_te')
for tr_idx, val_idx in kf.split(df):
    # KFold yields *positional* indices, so rows must be selected with .iloc;
    # .loc would only be correct for a default 0..n-1 index.
    train_fold = df.iloc[tr_idx]
    val_fold = df.iloc[val_idx]
    # A fresh encoder per fold keeps the folds independent of each other.
    enc = ce.TargetEncoder(cols=['city'])
    enc.fit(train_fold[['city']], train_fold['churn'])
    # transform() returns a DataFrame; write its single column positionally so
    # no index/column alignment can turn the values into NaN.
    df.iloc[val_idx, city_te_pos] = enc.transform(val_fold[['city']])['city'].to_numpy()

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import gensim
import numpy as np
print(dir(gensim.models))
# TF-IDF vectorisation: unigrams + bigrams, capped at the 1000 most frequent terms.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(df['comment'])

# Word2Vec: learn 100-dimensional word vectors from the whitespace-tokenised comments.
sentences = [comment.split() for comment in df['comment']]
w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=3)


def _mean_comment_vector(comment):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of ``comment``.

    Tokens dropped by ``min_count`` are not in the vocabulary and are skipped
    (looking them up would raise ``KeyError``). A comment with no usable token
    maps to the zero vector.
    """
    known_tokens = [tok for tok in comment.split() if tok in w2v.wv.key_to_index]
    if not known_tokens:
        return np.zeros(w2v.wv.vector_size)
    return np.mean([w2v.wv.get_vector(tok) for tok in known_tokens], axis=0)


df['vector'] = df['comment'].apply(_mean_comment_vector)

['AtireBM25Model', 'AuthorTopicModel', 'BackMappingTranslationMatrix', 'CoherenceModel', 'Doc2Vec', 'EnsembleLda', 'FAST_VERSION', 'FastText', 'HdpModel', 'KeyedVectors', 'LdaModel', 'LdaMulticore', 'LdaSeqModel', 'LogEntropyModel', 'LsiModel', 'LuceneBM25Model', 'Nmf', 'NormModel', 'OkapiBM25Model', 'Phrases', 'RpModel', 'TfidfModel', 'TranslationMatrix', 'VocabTransform', 'Word2Vec', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_fasttext_bin', 'atmodel', 'basemodel', 'bm25model', 'callbacks', 'coherencemodel', 'doc2vec', 'doc2vec_corpusfile', 'doc2vec_inner', 'ensemblelda', 'fasttext', 'fasttext_corpusfile', 'fasttext_inner', 'hdpmodel', 'interfaces', 'keyedvectors', 'ldamodel', 'ldamulticore', 'ldaseqmodel', 'logentropy_model', 'lsimodel', 'nmf', 'nmf_pgd', 'normmodel', 'phrases', 'rpmodel', 'tfidfmodel', 'translation_matrix', 'utils', 'word2vec', 'word2vec_corpusfile', 'word2vec_inner']


In [None]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LassoCV
# Filter method: keep the features with the highest mutual information with y.
# mutual_info_classif is randomised, so the seed is pinned to make the selection
# reproducible; k is capped so the cell also works when X has fewer than 10 columns.
mi_score = partial(mutual_info_classif, random_state=42)
X_new = SelectKBest(mi_score, k=min(10, X.shape[1])).fit_transform(X, y)
# Embedded method: L1 regularisation drives the coefficients of unhelpful
# features to exactly zero; the surviving columns are the selected ones.
# NOTE(review): LassoCV is a regressor, while the filter above treats y as a
# class label - confirm that a linear fit on y is what is wanted here.
model = LassoCV(cv=5)
model.fit(X, y)
keep = X.columns[model.coef_ != 0]

In [None]:
# Automated feature engineering with Deep Feature Synthesis.
import featuretools as ft
es = ft.EntitySet(id='customer_data')
# add_dataframe's first positional parameter is the DataFrame itself, so the
# name and the frame are passed by keyword to avoid swapping them.
es = es.add_dataframe(dataframe_name="customers", dataframe=df, index="customer_id")
feat_matrix, feat_defs = ft.dfs(entityset=es, target_dataframe_name="customers", max_depth=2)


In [4]:
import numpy as np
from sklearn.model_selection import KFold

# Show which of ten sample positions land in the training / validation part of
# each fold when KFold shuffles with a fixed seed.
original_idx = np.arange(10)
print(f"原始数据索引: {original_idx}\n")

kf = KFold(n_splits=5, shuffle=True, random_state=42)
print("使用 KFold 进行 5 折交叉验证:")
for fold_no, (train_idx, val_idx) in enumerate(kf.split(original_idx), start=1):
    header = f"--- 第 {fold_no} 折 ---"
    print(header)
    print(f"  训练集索引: {train_idx}")
    print(f"  验证集索引: {val_idx}")

原始数据索引: [0 1 2 3 4 5 6 7 8 9]

使用 KFold 进行 5 折交叉验证:
--- 第 1 折 ---
  训练集索引: [0 2 3 4 5 6 7 9]
  验证集索引: [1 8]
--- 第 2 折 ---
  训练集索引: [1 2 3 4 6 7 8 9]
  验证集索引: [0 5]
--- 第 3 折 ---
  训练集索引: [0 1 3 4 5 6 8 9]
  验证集索引: [2 7]
--- 第 4 折 ---
  训练集索引: [0 1 2 3 5 6 7 8]
  验证集索引: [4 9]
--- 第 5 折 ---
  训练集索引: [0 1 2 4 5 7 8 9]
  验证集索引: [3 6]


In [None]:
# https://gemini.google.com/app/3f22d0a18dda3fea
import logging

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Send INFO-and-above records through the root logger using a timestamped
# "time - level - message" layout, and create this module's logger.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

class Preprocessor:
    """Tabular preprocessor that is fitted on training data and replayed on test data.

    ``preprocess`` chains the private steps with ``DataFrame.pipe``:
    inf -> NaN, numerical imputation (median), categorical imputation
    (constant ``"missing"``), categorical encoding and standard scaling.
    Columns listed in ``exclude_cols`` (IDs and the target) bypass every step
    and are re-attached unchanged at the end.

    Each fitted step remembers the exact column list it was fitted on, so a
    test frame is always transformed with the same columns in the same order.
    """

    def __init__(self, id_cols=None, target_col=None):
        """
        Create an unfitted preprocessor.

        Args:
            id_cols (list): ID column names to exclude from feature processing.
                Defaults to ``["SK_ID_CURR", "SK_ID_BUREAU", "SK_ID_PREV"]``.
            target_col (str): Target column name; also excluded from processing.
        """
        self.num_imputer = SimpleImputer(strategy="median")
        self.cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
        # {'col_name': fitted single-step Pipeline wrapping an Ordinal/OneHot encoder}
        self.categorical_transformers = {}
        self.standard_scaler = StandardScaler()

        # Columns that must never be imputed, encoded or scaled.
        self.exclude_cols = set(id_cols if id_cols is not None else ["SK_ID_CURR", "SK_ID_BUREAU", "SK_ID_PREV"])
        if target_col:
            self.exclude_cols.add(target_col)
        self.target_col = target_col

        # {'step name': column list the step's transformer was fitted on}
        self._fitted_columns = {}

    def _feature_columns(self, df, include):
        """Return the columns of dtype ``include`` that are not in ``exclude_cols``."""
        return [col for col in df.select_dtypes(include=include).columns
                if col not in self.exclude_cols]

    @staticmethod
    def _add_missing_columns(df, columns, dtype):
        """Add every column of ``columns`` that ``df`` lacks, filled with NaN of ``dtype``.

        A fitted sklearn transformer needs all of its training columns; adding
        the absent ones as missing values lets the imputers fill them with the
        statistics learned on the training data.
        """
        absent = [col for col in columns if col not in df.columns]
        if absent:
            logger.warning(f"当前数据缺少训练时使用的列，已用缺失值补齐: {absent}")
            for col in absent:
                df[col] = pd.Series(np.nan, index=df.index, dtype=dtype)
        return df

    def _resolve_columns(self, df, step, include, fill_dtype, is_train):
        """Return ``(df, columns)`` for ``step``.

        On training data the columns are detected by dtype and remembered. On
        test data the remembered list is reused and absent columns are added.
        If the step was never fitted, dtype detection is used so that the
        sklearn transformer itself reports that it is not fitted.
        """
        if is_train:
            self._fitted_columns[step] = self._feature_columns(df, include)
        elif step in self._fitted_columns:
            df = self._add_missing_columns(df, self._fitted_columns[step], fill_dtype)
        else:
            return df, self._feature_columns(df, include)
        return df, self._fitted_columns[step]

    def _handle_infinities(self, df):
        """Return ``df`` with +inf / -inf replaced by NaN so the imputers can fill them."""
        logger.info("处理无穷大值...")
        return df.replace([np.inf, -np.inf], np.nan)

    def _impute_numerical(self, df, is_train):
        """Fill missing values of the numerical feature columns with the training median."""
        df, num_cols = self._resolve_columns(df, 'num_impute', np.number, float, is_train)

        if not num_cols:
            logger.info("没有数值特征需要填充。")
            return df

        logger.info(f"处理数值特征缺失值: {len(num_cols)}列...")
        if is_train:
            df[num_cols] = self.num_imputer.fit_transform(df[num_cols])
        else:
            df[num_cols] = self.num_imputer.transform(df[num_cols])
        return df

    def _impute_categorical(self, df, is_train):
        """Fill missing values of the object-dtype feature columns with ``"missing"``."""
        df, cat_cols = self._resolve_columns(df, 'cat_impute', 'object', object, is_train)

        if not cat_cols:
            logger.info("没有分类特征需要填充。")
            return df

        logger.info(f"处理分类特征缺失值: {len(cat_cols)}列...")
        if is_train:
            df[cat_cols] = self.cat_imputer.fit_transform(df[cat_cols])
        else:
            df[cat_cols] = self.cat_imputer.transform(df[cat_cols])
        return df

    @staticmethod
    def _replace_with_one_hot(df, col, encoder_pipeline, encoded_array):
        """Drop ``col`` from ``df`` and append its one-hot indicator columns."""
        feature_names = encoder_pipeline.named_steps['onehot_encoder'].get_feature_names_out([col])
        temp_df = pd.DataFrame(np.asarray(encoded_array), columns=feature_names, index=df.index)
        return pd.concat([df.drop(columns=[col]), temp_df], axis=1)

    def _fit_encoder(self, df, col):
        """Fit, store and apply the encoder for ``col``; return the updated frame.

        Columns with at most two distinct values are ordinal-encoded in place
        (unknown categories become -1); all others are one-hot encoded
        (unknown categories become an all-zero row).
        """
        if df[col].nunique() <= 2:
            encoder_pipeline = Pipeline(steps=[
                ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
            ])
            # The encoder returns an (n, 1) array; flatten it for the column assignment.
            df[col] = np.asarray(encoder_pipeline.fit_transform(df[[col]])).ravel()
        else:
            encoder_pipeline = Pipeline(steps=[
                ('onehot_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
            ])
            encoded_array = encoder_pipeline.fit_transform(df[[col]])
            df = self._replace_with_one_hot(df, col, encoder_pipeline, encoded_array)
        self.categorical_transformers[col] = encoder_pipeline
        return df

    def _apply_encoder(self, df, col):
        """Apply the encoder fitted for ``col`` and return the updated frame."""
        encoder_pipeline = self.categorical_transformers[col]

        # The step name tells the two encoder kinds apart; indexing a missing
        # step name would raise KeyError, so membership is tested instead.
        if 'ordinal_encoder' in encoder_pipeline.named_steps:
            try:
                df[col] = np.asarray(encoder_pipeline.transform(df[[col]])).ravel()
            except ValueError as e:
                logger.error(f"OrdinalEncoder转换列 '{col}' 时出错: {e}. 确保handle_unknown正确设置.")
                df[col] = -1  # same code the encoder uses for unknown categories
            return df

        try:
            encoded_array = encoder_pipeline.transform(df[[col]])
        except ValueError as e:
            logger.error(f"OneHotEncoder转换列 '{col}' 时出错: {e}. 确保handle_unknown='ignore'正确设置.")
            # Keep the column layout identical to training: emit all-zero
            # indicators instead of leaving the raw string column behind.
            n_indicators = len(encoder_pipeline.named_steps['onehot_encoder'].get_feature_names_out([col]))
            encoded_array = np.zeros((len(df), n_indicators))
        return self._replace_with_one_hot(df, col, encoder_pipeline, encoded_array)

    def _encode_categorical(self, df, is_train):
        """Encode the object-dtype feature columns with OrdinalEncoder or OneHotEncoder.

        On training data one encoder per column is fitted and stored in
        ``categorical_transformers``. On test data exactly those stored
        encoders are replayed, in the order they were fitted.
        """
        logger.info("编码分类变量...")

        df_encoded = df.copy()  # never modify the caller's frame
        cat_cols = self._feature_columns(df_encoded, 'object')

        if is_train:
            # A refit must not keep encoders from an earlier training frame.
            self.categorical_transformers = {}
            if not cat_cols:
                logger.info("没有分类特征需要编码。")
                return df_encoded
            for col in cat_cols:
                df_encoded = self._fit_encoder(df_encoded, col)
            return df_encoded

        # Object columns that only exist in the test data cannot be encoded.
        for col in cat_cols:
            if col not in self.categorical_transformers:
                logger.warning(f"列 '{col}' 在训练集中未被拟合，测试集跳过编码。")

        if not self.categorical_transformers:
            logger.info("没有分类特征需要编码。")
            return df_encoded

        for col in list(self.categorical_transformers):
            if col not in df_encoded.columns:
                logger.warning(f"列 '{col}' 在当前数据中不存在，跳过编码。")
                continue
            df_encoded = self._apply_encoder(df_encoded, col)
        return df_encoded

    def _scale_numerical(self, df, is_train):
        """Standardise every numerical feature column.

        This runs after encoding, so ordinal codes and one-hot indicators are
        numerical at this point and are scaled as well.
        """
        df, num_cols = self._resolve_columns(df, 'scale', np.number, float, is_train)

        if not num_cols:
            logger.info("没有数值特征需要标准化。")
            return df

        logger.info(f"标准化数值特征: {len(num_cols)}列...")
        if is_train:
            df[num_cols] = self.standard_scaler.fit_transform(df[num_cols])
        else:
            df[num_cols] = self.standard_scaler.transform(df[num_cols])
        return df

    def preprocess(self, df, is_train=True):
        """
        Preprocess a data frame.

        Args:
            df (DataFrame): The frame to preprocess; it is not modified.
            is_train (bool): ``True`` fits every step on ``df`` before applying
                it; ``False`` only applies the previously fitted steps.

        Returns:
            DataFrame: The processed feature columns followed by the untouched
            ID / target columns.
        """
        logger.info(f"开始预处理{'训练' if is_train else '测试'}数据")

        # Set aside the ID / target columns (in their original order) and
        # re-attach them once all transformations are done.
        preserved_cols = [col for col in df.columns if col in self.exclude_cols]
        preserved_cols_data = df[preserved_cols].copy()
        df_working = df.drop(columns=preserved_cols).copy()

        # Chain the preprocessing steps with pipe().
        processed_df = (df_working
                        .pipe(self._handle_infinities)
                        .pipe(self._impute_numerical, is_train=is_train)
                        .pipe(self._impute_categorical, is_train=is_train)
                        .pipe(self._encode_categorical, is_train=is_train)
                        .pipe(self._scale_numerical, is_train=is_train)
                       )

        final_df = pd.concat([processed_df, preserved_cols_data], axis=1)

        logger.info("预处理完成。")
        return final_df

# --- Example usage ---
if __name__ == "__main__":
    # Build a small training frame with NaN / inf values and mixed column types.
    data_train = {
        'SK_ID_CURR': [1, 2, 3, 4, 5, 6],
        'Feature_Num1': [10.0, 20.0, np.nan, 40.0, 50.0, 60.0],
        'Feature_Num2': [1.0, np.inf, 3.0, 4.0, -np.inf, 6.0],
        'Feature_Cat1': ['A', 'B', 'A', 'C', 'B', 'A'], # Three categories -> one-hot encoded
        'Feature_Cat2': ['Yes', 'No', 'Yes', 'No', np.nan, 'Yes'], # Binary categorical with a missing value
        'Feature_Cat3': ['X', 'Y', 'X', 'Z', 'Y', 'X'], # Three categories -> one-hot encoded
        'Some_Other_Col': ['val1', 'val2', 'val3', 'val4', 'val5', 'val6'], # Object dtype, so it is encoded like any other categorical
        'TARGET': [0, 1, 0, 1, 0, 1]
    }
    df_train = pd.DataFrame(data_train)

    data_test = {
        'SK_ID_CURR': [7, 8, 9, 10],
        'Feature_Num1': [70.0, np.nan, 90.0, 100.0],
        'Feature_Num2': [7.0, 8.0, np.inf, 10.0],
        'Feature_Cat1': ['B', 'D', 'C', 'A'], # 'D' is new category
        'Feature_Cat2': ['No', 'Yes', 'No', 'Yes'],
        'Feature_Cat3': ['Y', 'A', 'Z', 'Y'], # 'A' is new category
        'Some_Other_Col': ['val7', 'val8', 'val9', 'val10'],
        'TARGET': [0, 0, 1, 1]
    }
    df_test = pd.DataFrame(data_test)

    # Initialize preprocessor
    preprocessor = Preprocessor(target_col='TARGET', id_cols=['SK_ID_CURR'])

    # Train data preprocessing
    print("\n--- 训练数据预处理 ---")
    df_train_processed = preprocessor.preprocess(df_train, is_train=True)
    print(df_train_processed.head())
    print(f"训练集处理后形状: {df_train_processed.shape}")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly instead of being wrapped in print().
    df_train_processed.info(verbose=True, show_counts=True)


    # Test data preprocessing
    print("\n--- 测试数据预处理 ---")
    df_test_processed = preprocessor.preprocess(df_test, is_train=False)
    print(df_test_processed.head())
    print(f"测试集处理后形状: {df_test_processed.shape}")
    df_test_processed.info(verbose=True, show_counts=True)

    # Verify column consistency (excluding id/target columns)
    train_cols_for_check = [col for col in df_train_processed.columns if col not in preprocessor.exclude_cols]
    test_cols_for_check = [col for col in df_test_processed.columns if col not in preprocessor.exclude_cols]
    print(f"\n训练集和测试集处理后的**特征**列名是否一致: {set(train_cols_for_check) == set(test_cols_for_check)}")
    # Note: Column order might differ, so check set equality or sort for list equality

In [None]:
# https://grok.com/chat/b1655899-c5e5-469e-9dcc-dcb4fb931ab0
import logging
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Send INFO-and-above records through the root logger using a
# "time : level : message" layout, and create this module's logger.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

class InfinityHandler(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns +inf / -inf entries into NaN."""

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return a new object equal to ``X`` with every infinite value replaced by NaN."""
        logger.info("Handling infinite values")
        # replace() without inplace already returns a fresh object, so the
        # caller's data is left untouched.
        cleaned = X.replace([np.inf, -np.inf], np.nan)
        return cleaned

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical columns: integer codes for (at most) binary columns, one-hot otherwise.

    The per-column decision and the one-hot categories are fixed in ``fit``,
    so ``transform`` always emits the same columns in the same order, whatever
    categories the transformed data happens to contain. Missing values are
    treated as the category ``'missing'``.

    Attributes set by ``fit``:
        label_encoders (dict): column -> fitted ``LabelEncoder`` of every
            integer-coded column.
        one_hot_columns (list): names of all generated indicator columns.
        output_columns_ (list): names of all output columns, in output order.
    """

    def __init__(self):
        self.label_encoders = {}
        self.one_hot_columns = None

    @staticmethod
    def _as_frame(X):
        """Return ``X`` as a new DataFrame whose missing values are ``'missing'``.

        Arrays (for example the output of a preceding sklearn step) carry no
        column names and get the positional names ``x0``, ``x1``, ...
        """
        if isinstance(X, pd.DataFrame):
            frame = X
        else:
            values = np.asarray(X, dtype=object)
            frame = pd.DataFrame(values, columns=[f"x{i}" for i in range(values.shape[1])])
        return frame.where(frame.notna(), 'missing')

    def fit(self, X, y=None):
        """Choose the encoding of every column of ``X`` and learn its categories."""
        frame = self._as_frame(X)
        self.input_columns_ = list(frame.columns)
        self.label_encoders = {}
        self.one_hot_categories_ = {}
        self.one_hot_columns = []
        self.output_columns_ = []
        for col in self.input_columns_:
            values = frame[col]
            if values.nunique() <= 2:  # Label encoding for binary
                self.label_encoders[col] = LabelEncoder().fit(values)
                self.output_columns_.append(col)
            else:  # One-hot encoding for multi-class
                categories = list(pd.Categorical(values).categories)
                self.one_hot_categories_[col] = categories
                indicator_names = [f"{col}_{category}" for category in categories]
                self.one_hot_columns.extend(indicator_names)
                self.output_columns_.extend(indicator_names)
        return self

    def transform(self, X):
        """Return a DataFrame holding exactly the columns ``output_columns_``.

        Integer-coded columns: a value unseen in ``fit`` gets the code of
        ``'missing'`` when that class was learned, otherwise the next free
        code. One-hot columns: an unseen value yields an all-zero row.
        """
        logger.info("Encoding categorical variables")
        frame = self._as_frame(X)
        encoded = {}
        for col in self.input_columns_:
            values = frame[col]
            if col in self.label_encoders:
                # Look codes up in a plain dict so the fitted LabelEncoder is
                # never modified while transforming.
                codes = {label: code for code, label in enumerate(self.label_encoders[col].classes_)}
                unseen_code = codes.get('missing', len(codes))
                encoded[col] = values.map(lambda v: codes.get(v, unseen_code)).astype('int64')
            else:
                for category in self.one_hot_categories_[col]:
                    encoded[f"{col}_{category}"] = (values == category).astype('float64')
        return pd.DataFrame(encoded, index=frame.index, columns=self.output_columns_)

    def get_feature_names_out(self, input_features=None):
        """Return the output column names learned in ``fit`` (sklearn convention)."""
        return np.asarray(self.output_columns_, dtype=object)


class Preprocessor:
    """Fit a scikit-learn preprocessing pipeline on training data and replay it on test data.

    Numerical features: inf -> NaN, median imputation, standard scaling.
    Categorical (object) features: ``CategoricalEncoder``.
    All other columns, including those in ``exclude_cols``, are returned unchanged.
    """

    def __init__(self):
        self.exclude_cols = ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV']
        self.pipeline = None
        self.num_cols = None
        self.cat_cols = None

    def _get_column_types(self, df):
        """Identify numerical and categorical columns, excluding specified columns."""
        self.num_cols = [col for col in df.select_dtypes(include=['int64', 'float64']).columns
                        if col not in self.exclude_cols]
        self.cat_cols = [col for col in df.select_dtypes(include=['object']).columns
                        if col not in self.exclude_cols]
        return self.num_cols, self.cat_cols

    @staticmethod
    def _build_pipeline(num_cols, cat_cols):
        """Return an unfitted ColumnTransformer for the given feature columns."""
        num_pipeline = Pipeline([
            ('inf_handler', InfinityHandler()),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        # CategoricalEncoder fills missing values with 'missing' itself. A
        # SimpleImputer in front of it would hand over a bare array and so
        # strip the column names the one-hot column names are built from.
        cat_pipeline = Pipeline([
            ('encoder', CategoricalEncoder())
        ])
        # Only feature columns go through the transformer. ID / target columns
        # are re-attached afterwards, so a test frame without a target works.
        return ColumnTransformer([
            ('num', num_pipeline, num_cols),
            ('cat', cat_pipeline, cat_cols)
        ])

    def preprocess(self, df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
        """
        Preprocess data using a scikit-learn pipeline.

        :param df: Input DataFrame (not modified)
        :param is_train: Whether this is the training set; ``True`` (re)fits the
            pipeline, ``False`` applies the pipeline fitted earlier
        :return: Preprocessed DataFrame: numerical features, encoded categorical
            features, then every remaining input column unchanged
        :raises ValueError: If ``is_train`` is False and no pipeline was fitted
        """
        logger.info(f"Starting preprocessing for {'training' if is_train else 'test'} data, shape: {df.shape}")
        df_copy = df.copy()

        if is_train:
            num_cols, cat_cols = self._get_column_types(df_copy)
            self.pipeline = self._build_pipeline(num_cols, cat_cols)
            result = self.pipeline.fit_transform(df_copy)
        else:
            if self.pipeline is None:
                raise ValueError("Pipeline not fitted. Run on training data first.")
            # Reuse the column lists the pipeline was fitted with; re-detecting
            # them from the test frame could disagree with the fitted pipeline.
            num_cols, cat_cols = self.num_cols, self.cat_cols
            result = self.pipeline.transform(df_copy)

        # ColumnTransformer stacks its outputs in transformer order: num, then cat.
        feature_names = list(num_cols)
        if cat_cols:
            cat_encoder = self.pipeline.named_transformers_['cat'].named_steps['encoder']
            feature_names.extend(cat_encoder.output_columns_)
        features = pd.DataFrame(result, columns=feature_names, index=df_copy.index)

        # Everything that is not a feature input keeps its original values and dtype.
        feature_inputs = set(num_cols) | set(cat_cols)
        untouched_cols = [col for col in df_copy.columns if col not in feature_inputs]
        df_processed = pd.concat([features, df_copy[untouched_cols]], axis=1)

        logger.info(f"Preprocessing complete, final shape: {df_processed.shape}")
        return df_processed

if __name__ == "__main__":
    # Sample data: a small train / test pair with NaN, +/-inf and categories
    # in the test set that do not occur in the training set.
    data_train = {
        'SK_ID_CURR': [1, 2, 3, 4, 5, 6],
        'Feature_Num1': [10.0, 20.0, np.nan, 40.0, 50.0, 60.0],
        'Feature_Num2': [1.0, np.inf, 3.0, 4.0, -np.inf, 6.0],
        'Feature_Cat1': ['A', 'B', 'A', 'C', 'B', 'A'],
        'Feature_Cat2': ['Yes', 'No', 'Yes', 'No', np.nan, 'Yes'],
        'Feature_Cat3': ['X', 'Y', 'X', 'Z', 'Y', 'X'],
        'Some_Other_Col': ['val1', 'val2', 'val3', 'val4', 'val5', 'val6'],
        'TARGET': [0, 1, 0, 1, 0, 1]
    }
    df_train = pd.DataFrame(data_train)

    data_test = {
        'SK_ID_CURR': [7, 8, 9, 10],
        'Feature_Num1': [70.0, np.nan, 90.0, 100.0],
        'Feature_Num2': [7.0, 8.0, np.inf, 10.0],
        'Feature_Cat1': ['B', 'D', 'C', 'A'],
        'Feature_Cat2': ['No', 'Yes', 'No', 'Yes'],
        'Feature_Cat3': ['Y', 'A', 'Z', 'Y'],
        'Some_Other_Col': ['val7', 'val8', 'val9', 'val10'],
        'TARGET': [0, 0, 1, 1]
    }
    df_test = pd.DataFrame(data_test)

    preprocessor = Preprocessor()
    print("\n--- Training Data Preprocessing ---")
    df_train_processed = preprocessor.preprocess(df_train, is_train=True)
    print(df_train_processed.head())
    print(f"Training set shape: {df_train_processed.shape}")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly instead of being wrapped in print().
    df_train_processed.info(verbose=True, show_counts=True)

    print("\n--- Test Data Preprocessing ---")
    df_test_processed = preprocessor.preprocess(df_test, is_train=False)
    print(df_test_processed.head())
    print(f"Test set shape: {df_test_processed.shape}")
    df_test_processed.info(verbose=True, show_counts=True)