In [50]:
# package imports
import pandas as pd
import numpy as np 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import matplotlib.pyplot as plt
%matplotlib inline

# sklearn packages
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [52]:
# Load the Titanic training data and discard the identifier-like columns in one
# chained expression, so the cell yields the same frame every time it is run.
titanic = pd.read_csv('train.csv').drop(columns=['PassengerId', 'Name', 'Ticket'])
titanic.info()

# Column groups used by the feature-engineering steps.
# NOTE(review): 'Cabin' is listed as numeric, but info() reports it as an
# object column with many missing values - confirm it is converted before
# being fed to a numeric transformer.
num_features = ['Age', 'SibSp', 'Parch', 'Fare', 'Cabin']
cate_features = ['Pclass', 'Sex']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


# 特征工程

## 离散值类型处理

In [55]:
# One-hot encode the categorical columns into a dense array.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new spelling first and fall back for old releases.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    ohe = OneHotEncoder(sparse=False)
hs_train_transformed = ohe.fit_transform(titanic[cate_features])
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` whenever the installed version provides it.
ohe.get_feature_names_out() if hasattr(ohe, 'get_feature_names_out') else ohe.get_feature_names()

array(['x0_1', 'x0_2', 'x0_3', 'x1_female', 'x1_male'], dtype=object)

# 模型验证

## 交叉验证

### KFold

In [None]:
from sklearn.model_selection import KFold

# Toy data: four samples with two features each, and one label per sample.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])

# Four splits over four samples: each fold holds out a single sample.
kf = KFold(n_splits=4)
kf.get_n_splits(X)

print(kf)

for i, (train_index, test_index) in enumerate(kf.split(X)):
    # Bare expression: presumably echoed as the fold number because the
    # notebook sets InteractiveShell.ast_node_interactivity = "all".
    i
    print("TRAIN:", train_index, "TEST:", test_index)
    # Slice features and labels with the index arrays of the current fold.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

# Transform组合
https://scikit-learn.org/stable/modules/compose.html

## Pipeline
A `Pipeline` chains multiple estimators into a single estimator.

In [6]:
# Chain dimensionality reduction and a classifier into a single estimator.
# make_pipeline names the steps after their classes ('pca' and 'svc').
pipe = make_pipeline(
    PCA(n_components=5),
    SVC(),
)
pipe

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=5,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

### 对pipeline进行网格搜索
https://scikit-learn.org/stable/auto_examples/compose/plot_compare_reduction.html#sphx-glr-auto-examples-compose-plot-compare-reduction-py

In [10]:
from sklearn.model_selection import GridSearchCV

# Grid keys use the '<step name>__<parameter>' form to reach inside the pipeline.
param_grid = {
    "pca__n_components": [2, 5, 10],
    "svc__C": [0.1, 10, 100],
}
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid)
grid_search

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=5, random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('svc',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_s

In [12]:
from sklearn.linear_model import LogisticRegression

# A bare step name as key swaps out the whole step: 'pca' is either skipped
# ('passthrough') or replaced by a PCA instance, and 'svc' is replaced by one
# of two classifiers. 'svc__C' then sets C on whichever classifier is in place.
param_grid = {
    "pca": ["passthrough", PCA(5), PCA(10)],
    "svc": [SVC(), LogisticRegression()],
    "svc__C": [0.1, 10, 100],
}
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid)
grid_search

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=5, random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('svc',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            pro...
                                     shrinking=True, tol=0.001, 

### ColumnTransformer 组合transformer

In [18]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer

# Small demo frame: one categorical column and one numeric column.
data = {
    'name': ['mw', 'xy', 'mw', 'mt', 'mw', 'xy'],
    'year': [2000, 2001, 2002, 2003, 2004, 2005],
}
df1 = pd.DataFrame(data)

# One-hot encode the column at position 0 ('name'); all other columns are dropped.
preprocessor = make_column_transformer(
    (OneHotEncoder(), [0]),
    remainder="drop",
)
# NOTE: df1 is rebound here from the input DataFrame to the transformed output.
df1 = preprocessor.fit_transform(df1)
df1

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

### FeatureUnion

In [45]:
# Small demo frame with two numeric columns.
data = dict(
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
df1 = pd.DataFrame(data)
df1
# Optional: uncomment to replace the larger 'pop' values with missing values.
# df1.loc[df1['pop']>2, 'pop'] = None
# df1

def set_child_type(df):
    """Return a copy of ``df`` with a constant ``is_child`` column equal to 3.

    The input frame is left untouched. This function is used as a member of
    ``make_union`` below, where every transformer is handed the same input;
    adding the column in place would leak it into the other transformers
    (and into the caller's frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by ``is_child``.
    """
    return df.assign(is_child=3)

def set_Cabin_type(df):
    """Return a copy of ``df`` with a constant ``is_pop`` column equal to 4.

    The input frame is left untouched. This function is used as a member of
    ``make_union`` below, where every transformer is handed the same input;
    adding the column in place would leak it into the other transformers
    (and into the caller's frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by ``is_pop``.
    """
    return df.assign(is_pop=4)

# FeatureUnion: each transformer is applied to the same input and the
# individual outputs are concatenated column-wise.
pipeline = make_union(
    FunctionTransformer(func=set_child_type),
    FunctionTransformer(func=set_Cabin_type),
    SimpleImputer(strategy="median"),  # fill missing values with the column median
    StandardScaler(),  # standardize each column
)
# NOTE: df1 is rebound here from the input DataFrame to the transformed output.
df1 = pipeline.fit_transform(df1)

Unnamed: 0,year,pop
0,2000,1.5
1,2001,1.7
2,2002,3.6
3,2001,2.4
4,2002,2.9
5,2003,3.2


## 自定义Transformer

### FunctionTransformer

In [34]:
# Small demo frame with two numeric columns.
data = dict(
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
df1 = pd.DataFrame(data)
df1
# Optional: uncomment to replace the larger 'pop' values with missing values.
# df1.loc[df1['pop']>2, 'pop'] = None
# df1

def set_child_type(df):
    """Return a copy of ``df`` with a 0/1 ``is_child`` flag derived from ``pop``.

    ``is_child`` is 1 where ``pop < 3`` and 0 otherwise; a missing ``pop``
    compares as False and therefore yields 0. The input frame is not modified,
    so the function can be re-run or reused without side effects on the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``pop`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by ``is_child``.
    """
    below_threshold = df['pop'] < 3
    return df.assign(is_child=below_threshold.astype('int64'))

def set_Cabin_type(df):
    """Return a copy of ``df`` with an ``is_pop`` flag and missing ``pop`` set to 0.

    ``is_pop`` is 1 where ``pop < 3`` and 0 otherwise. The flag is computed
    from the original values, before missing entries are filled, so a missing
    ``pop`` yields ``is_pop == 0``. Missing ``pop`` values are then replaced
    by 0. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``pop`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``pop`` filled in place and ``is_pop`` appended.
    """
    # Evaluate the flag first: after filling, former NaNs would satisfy `< 3`.
    below_threshold = (df['pop'] < 3).astype('int64')
    return df.assign(is_pop=below_threshold, pop=df['pop'].fillna(0))

# Sequential pipeline: the output of each step is the input of the next one.
pipeline = make_pipeline(
    FunctionTransformer(func=set_child_type),
    FunctionTransformer(func=set_Cabin_type),
    SimpleImputer(strategy="median"),  # fill missing values with the column median
    StandardScaler(),  # standardize each column
)

# Shape of the transformed data (rows, columns).
pipeline.fit_transform(df1).shape


Unnamed: 0,year,pop
0,2000,1.5
1,2001,1.7
2,2002,3.6
3,2001,2.4
4,2002,2.9
5,2003,3.2


(6, 4)