In [314]:
# package imports
import pandas as pd
import numpy as np 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics

from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.model_selection import train_test_split

# EDA（Exploratory Data Analysis）

In [315]:
titanic = pd.read_csv('train.csv')
titanic.drop(columns = ['PassengerId','Name','Ticket'], inplace = True)


In [316]:
titanic.shape
titanic.info()

(891, 9)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [317]:
num_features = ['Age', 'SibSp','Parch','Fare', 'Cabin']
cate_features = ['Pclass', 'Sex','Embarked']

for feature in cate_features:
    feature
    titanic[feature].value_counts()

'Pclass'

3    491
1    216
2    184
Name: Pclass, dtype: int64

'Sex'

male      577
female    314
Name: Sex, dtype: int64

'Embarked'

S    644
C    168
Q     77
Name: Embarked, dtype: int64

## 相关系数

In [318]:
corr_matrix = titanic.corr()
corr_matrix['Survived'].sort_values()

Pclass     -0.338481
Age        -0.077221
SibSp      -0.035322
Parch       0.081629
Fare        0.257307
Survived    1.000000
Name: Survived, dtype: float64

# 特征工程

## 去掉缺失值过多的feature

In [319]:
# titanic.drop(columns='Cabin', inplace=True)
# titanic.info()

## 用模型填补缺失值（Age）

In [320]:
from sklearn.ensemble import RandomForestRegressor
def set_missing_ages(df):

    # 把已有的数值型特征取出来丢进Random Forest Regressor中
    age_df = df[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]

    # 乘客分成已知年龄和未知年龄两部分
    known_age = age_df[age_df['Age'].notna()]
    unknown_age = age_df[age_df['Age'].isna()]

    # y即目标年龄
    y = known_age.iloc[:, 0]

    # X即特征属性值
    X = known_age.iloc[:, 1:]

    # fit到RandomForestRegressor之中
    rfr = RandomForestRegressor()
    rfr.fit(X, y)

    # 用得到的模型进行未知年龄结果预测
    predictedAges = rfr.predict(unknown_age.iloc[:, 1:])
#     print predictedAges
    # 用得到的预测结果填补原缺失数据
    df.loc[df['Age'].isna(), 'Age' ] = predictedAges 

    return df, rfr

## Age<10 =is_cild

In [321]:
def set_child_type(df):
    df['is_child'] = 0
    df.loc[df['is_child']<11, 'is_child'] = 1
    return df

## 用有无缺失值去重新定义（Cabin）

In [322]:
def set_Cabin_type(df):
    df.loc[df['Cabin'].notna(), 'Cabin'] = 1
    df.loc[df['Cabin'].isna(), 'Cabin'] = 0
    return df

## 处理numeric、categorical数据

In [323]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

num_pipeline = make_pipeline(
    FunctionTransformer(set_child_type),
    FunctionTransformer(set_Cabin_type),
    SimpleImputer(strategy="median"), #缺失值用中位数代替
    StandardScaler()#标准化
)
cate_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='unknown'), #离散型数据用unknown补全
    OneHotEncoder(sparse=False)
)

In [324]:
from sklearn.compose import make_column_transformer

data_pipeline = make_column_transformer(
    (num_pipeline, num_features),
    (cate_pipeline, cate_features),
    remainder='passthrough'
    )
# train_prepared = full_pipeline.fit_transform()
# type(train_prepared)
# train_prepared.shape

# train_prepared = pd.DataFrame(train_prepared, columns=X_train.columns)
# train_prepared.info()

## 特征选择。比如：查看特征工程后的相关系数。待定

In [325]:
# corr_matrix = titanic.corr()
# corr_matrix['Survived'].sort_values()

# 模型选择

## 划分数据集

In [326]:
X_titanic = titanic.drop('Survived', axis = 1)
label = titanic['Survived']


# 将数据集按照`3:7`的比例，切分为训练集和测试集
X_train, X_test, Y_train, Y_test = train_test_split(X_titanic, label, test_size = 0.3, random_state = 42)
type(X_test)

pandas.core.frame.DataFrame

Todo

In [327]:
# 用模型填补Age缺失值（思考怎么加入Pipline）
X_train, rfr = set_missing_ages(X_train)
tmp_df = X_test[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[X_test['Age'].isna()]
X = null_age.iloc[:, 1:]
predictedAges = rfr.predict(X)
X_test.loc[X_test['Age'].isna(), 'Age'] = predictedAges

#加入pipeline出错，先手动处理
# X_train = set_child_type(X_train)
# X_train = set_Cabin_type(X_train)

# X_test = set_child_type(X_test)
# X_test = set_Cabin_type(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


test

## 组合pipeline（模型+数据处理）

In [328]:
def full_pipeline(model):
    return make_pipeline(data_pipeline, model)

## 模型训练

In [329]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

### RandomForestClassifier

In [330]:
model = RandomForestClassifier(random_state=42)

### VotingClassifier

In [277]:

# model = RandomForestClassifier(random_state=42)
# model = LogisticRegression(random_state=42)
# model = GradientBoostingClassifier(random_state=42)
# model = VotingClassifier(
#     estimators=[
#         ('mlp_clf', MLPClassifier()),
# #         ('log_clf', LogisticRegression()),
#         ('ab_clf', AdaBoostClassifier()),
# #         ('svm_clf', SVC(probability=True)),
#         ('rf_clf', RandomForestClassifier(n_estimators=100)),
#         ('gbdt_clf', GradientBoostingClassifier())
#     ], voting='soft')

# gg.get_params('votingclassifier')
# for m in model.estimators_:
#     m
#     X_test = data_pipeline.transform(X_test)
#     m.score(X_test, Y_test)


{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                     transformer_weights=None,
                     transformers=[('pipeline-1',
                                    Pipeline(memory=None,
                                             steps=[('functiontransformer-1',
                                                     FunctionTransformer(accept_sparse=False,
                                                                         check_inverse=True,
                                                                         func=<function set_child_type at 0x1a1c79add0>,
                                                                         inv_kw_args=None,
                                                                         inverse_func=None,
                                                                         kw_args=None,
                                                          

0.8022388059701493

### Stacking

In [312]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingClassifier

# models = [
#     MLPClassifier(),
#     AdaBoostClassifier(),
#     SVC(probability=True),
#     LogisticRegression(C=0.1,max_iter=100),
#     RandomForestClassifier(n_estimators=100,max_depth=6,oob_score=True),
#     GradientBoostingClassifier(learning_rate=0.3,max_depth=6,n_estimators=100)
# ]


models=[
    ('mlp_clf', MLPClassifier()),
    ('ab_clf', AdaBoostClassifier()),
    ('log_clf', LogisticRegression()),
    ('svm_clf', SVC(probability=True)),
    ('rf_clf', RandomForestClassifier()),
    ('gbdt_clf', GradientBoostingClassifier())
]


model = StackingClassifier(estimators=models, final_estimator=RandomForestClassifier())


## 模型验证

In [331]:
gg = full_pipeline(model)
gg.fit(X_train, Y_train) 

gg.score(X_test, Y_test)
Y_pred = gg.predict(X_test)
print("Accuracy : %.4g" % metrics.accuracy_score(Y_test, Y_pred))

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('functiontransformer-1',
                                                                   FunctionTransformer(accept_sparse=False,
                                                                                       check_inverse=True,
                                                                                       func=<function set_child_type at 0x1a1ec37440>,
                                                                                       inv_kw_args=None,...
                 RandomForestClassifier(bootstrap=True, ccp_alp

0.7873134328358209

Accuracy : 0.7873


# 保存结果

In [239]:
test_data = pd.read_csv('test.csv')
predictions = gg.predict(test_data)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")



Your submission was successfully saved!


## 记录结果
1. stacking：0.79425
2. voting：0.78947

In [332]:
model.coef

AttributeError: 'RandomForestClassifier' object has no attribute 'coef'

In [241]:
data_pipeline.get_feature_names()

AttributeError: Transformer pipeline-1 (type Pipeline) does not provide get_feature_names.