In [411]:
import pandas as pd

# Load the labelled training set and the unlabelled test set.
data_train = pd.read_csv('./input/train.csv')
data_test = pd.read_csv('./input/test.csv')

# Keep the test ids for the submission file; preprocessing drops the column.
PassengerId = data_test['PassengerId']

# Split the target off the training features.
y = data_train.pop('Survived')

In [412]:
def feature_preprocessing(data):
    """Return a cleaned, partially one-hot-encoded copy of a passenger frame.

    The input frame is left untouched; all work happens on a copy:

    * ``Sex`` becomes 1 for ``'male'`` and 0 for any other value.
    * Missing ``Embarked`` values are filled with the most frequent value and
      missing ``Fare`` values with the mean fare of this frame.
    * ``Pclass`` and ``Embarked`` are replaced by ``Pclass_*`` and
      ``Embarked_*`` indicator columns appended on the right.
    * ``Cabin``, ``Ticket`` and ``PassengerId`` are dropped; every other
      column (including ``Name``) is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``Sex``, ``Embarked``,
        ``Fare``, ``Pclass``, ``Cabin``, ``Ticket`` and ``PassengerId``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    data = data.copy()

    # Encode sex as a binary flag.
    data['Sex'] = data['Sex'].map(lambda x: 1 if x == 'male' else 0)

    # Fill missing ports with the most frequent one. mode() is empty when the
    # whole column is missing, in which case there is nothing to fill with.
    embarked_mode = data['Embarked'].mode()
    if not embarked_mode.empty:
        data['Embarked'] = data['Embarked'].fillna(embarked_mode.values[0])

    # Fill missing ticket prices with the mean fare.
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

    Pclass_dummies = pd.get_dummies(data['Pclass'], prefix='Pclass')
    Embarked_dummies = pd.get_dummies(data['Embarked'], prefix='Embarked')

    # Drop the columns that are unused or have just been one-hot encoded.
    data = data.drop(
        columns=['Cabin', 'Ticket', 'Pclass', 'Embarked', 'PassengerId'])

    return pd.concat([data, Pclass_dummies, Embarked_dummies], axis=1)


# Apply the same preprocessing to the training and the test frame.
data_train, data_test = map(feature_preprocessing, (data_train, data_test))

In [413]:
from sklearn.ensemble import RandomForestRegressor


def predict_age_by_rfr(data):
    """Impute missing ``Age`` values in place with a random-forest regressor.

    The regressor is trained on the rows of ``data`` whose age is known,
    using every column except ``Name``, ``Age`` and (if present)
    ``Survived`` as features. Its predictions are written into the rows
    whose age is missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``Age`` column. It is modified in place.

    Returns
    -------
    None
        If no age is missing, or no age is known, the frame is left
        untouched.
    """
    missing_age = data['Age'].isnull()

    # Nothing to impute, or nothing to learn from: return early instead of
    # handing the regressor a zero-row array to fit or predict on.
    if not missing_age.any() or missing_age.all():
        return

    # The target, the free-text name and the survival label (when present)
    # are not usable as features.
    excluded = ('Survived', 'Name', 'Age')
    cols = [col for col in data.columns if col not in excluded]

    known_age = ~missing_age
    X_train_for_age = data.loc[known_age, cols].values
    y_train_for_age = data.loc[known_age, 'Age'].values

    rfr = RandomForestRegressor(random_state=666, n_estimators=100)
    rfr.fit(X_train_for_age, y_train_for_age)

    data.loc[missing_age, 'Age'] = rfr.predict(
        data.loc[missing_age, cols].values)


# Impute the missing ages of each frame in place, training set first.
for passengers in (data_train, data_test):
    predict_age_by_rfr(passengers)

In [414]:
def feature_extract_Title(data):
    """Replace ``Name`` with one-hot encoded title-group columns.

    The title is the text between the first comma and the following period
    of ``Name`` (e.g. ``'Braund, Mr. Owen Harris'`` gives ``'Mr'``). Titles
    are mapped onto a small set of groups and encoded as ``Title_<group>``
    indicator columns. A title that is not in the mapping gets no group, so
    all of its indicator columns are zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Name`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Name`` and with the ``Title_*`` columns
        appended on the right.
    """
    title_groups = {
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Don": "Royalty",
        "Rev": "Officer",
        "Dr": "Officer",
        "Mme": "Mrs",
        "Ms": "Mrs",
        "Major": "Officer",
        "Lady": "Royalty",
        "Sir": "Royalty",
        "Mlle": "Miss",
        "Col": "Officer",
        "Capt": "Officer",
        "the Countess": "Royalty",
        "Jonkheer": "Royalty",
        "Dona": 'Mrs'
    }

    # Keep the titles in a standalone Series instead of a temporary column,
    # so the caller's frame is not modified as a side effect.
    titles = data['Name'].map(
        lambda x: x.split(',')[1].split('.')[0].strip())
    titles = titles.map(title_groups)

    Title_dummies = pd.get_dummies(titles, prefix='Title')

    # The raw name has served its purpose once the title is extracted.
    return pd.concat([data.drop(columns=['Name']), Title_dummies], axis=1)


# Encode titles on train and test together so both share the same Title_*
# columns, then split the rows back apart by position.
combined_data = feature_extract_Title(
    pd.concat([data_train, data_test], axis=0, sort=True))
data_train, data_test = (
    combined_data.iloc[:data_train.shape[0], :],
    combined_data.iloc[data_train.shape[0]:, :],
)

In [416]:
# Sanity check: (rows, columns) of the training features after title encoding.
data_train.shape

(891, 17)

In [417]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_train, y, test_size=0.2, random_state=666)

In [421]:
# 各种模型一起用
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Candidate classifiers. AdaBoost and gradient boosting get the same seed as
# the decision tree and the random forest, so the score table is reproducible
# from one run to the next.
dt = DecisionTreeClassifier(random_state=666)
lr = LogisticRegression(solver='newton-cg')
rfc = RandomForestClassifier(random_state=666, n_estimators=100)
svc = SVC(kernel='rbf', gamma='scale')
ada = AdaBoostClassifier(random_state=666)
gbc = GradientBoostingClassifier(random_state=666)
xgb = XGBClassifier()

# (label, estimator) pairs; the label becomes the row name in the score table.
models = [
    ('DecisionTreeClassifier', dt),
    ('LogisticRegression', lr),
    ('RandomForestClassifier', rfc),
    ('svc', svc),
    ('AdaBoostClassifier', ada),
    ('GradientBoostingClassifier', gbc),
    ('XBClassifier', xgb),
]


def predict_by_different_models(models):
    """Fit every model on the training split and collect its scores.

    Uses the notebook-level ``X_train``/``y_train`` for fitting and scoring,
    and ``X_test``/``y_test`` for the held-out score.

    Parameters
    ----------
    models : iterable of (name, estimator) pairs
        Each estimator must provide ``fit`` and ``score``. It is fitted in
        place.

    Returns
    -------
    dict
        ``{name: {'train_score': ..., 'test_score': ...}}``.
    """
    scores = {}
    for entry in models:
        model_name, clf = entry[0], entry[1]
        clf.fit(X_train, y_train)
        scores[model_name] = {
            'train_score': clf.score(X_train, y_train),
            'test_score': clf.score(X_test, y_test),
        }
    return scores


# One row per model; show the train score before the held-out score.
res = pd.DataFrame(predict_by_different_models(models)).T
res.reindex(columns=['train_score', 'test_score'])

Unnamed: 0,train_score,test_score
DecisionTreeClassifier,0.983146,0.787709
LogisticRegression,0.842697,0.793296
RandomForestClassifier,0.983146,0.832402
svc,0.76264,0.681564
AdaBoostClassifier,0.84691,0.793296
GradientBoostingClassifier,0.901685,0.821229
XBClassifier,0.886236,0.826816


## 特征选择

In [450]:
# numpy is otherwise imported only in the notebook's final cell, so without
# this import a fresh "Restart & Run All" fails here with a NameError.
import numpy as np

# XGBoost feature importances in ascending order.
np.sort(xgb.feature_importances_)

array([0.        , 0.        , 0.        , 0.00688468, 0.01032702,
       0.01376936, 0.0172117 , 0.02237521, 0.02581756, 0.03098107,
       0.03442341, 0.03614458, 0.04302926, 0.04302926, 0.07917384,
       0.30636832, 0.33046472], dtype=float32)

In [454]:
# Keep only the features whose random-forest importance is at least 0.03.
# NOTE(review): the cell above inspects the xgb importances and the grid
# search below tunes XGBoost, yet this mask comes from rfc - confirm that
# selecting with the random forest is intended.
importance_feature = X_train.columns[rfc.feature_importances_ >= 0.03]
importance_feature

Index(['Age', 'Fare', 'Pclass_3', 'Sex', 'SibSp', 'Title_Miss', 'Title_Mr',
       'Title_Mrs'],
      dtype='object')

In [455]:
%%time
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 5 depths x 5 ensemble sizes x 5 learning rates.
params = {
    'max_depth': [2, 3, 4, 5, 6],
    'n_estimators': [100, 300, 500, 700, 900],
    'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]
}

xgbc = XGBClassifier()

# Exhaustive 5-fold cross-validated search over the grid, run in parallel.
gs = GridSearchCV(
    estimator=xgbc, param_grid=params, cv=5, n_jobs=-1, verbose=1)

gs.fit(X_train[importance_feature], y_train)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   16.6s


CPU times: user 3.39 s, sys: 197 ms, total: 3.59 s
Wall time: 23.4 s


[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:   23.3s finished


In [456]:
# Parameter combination selected by the grid search.
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}

In [457]:
# Mean cross-validated score of the selected parameter combination.
gs.best_score_

0.8356741573033708

In [459]:
# Score of the best estimator on the held-out split, restricted to the
# selected features.
gs.best_estimator_.score(X_test[importance_feature], y_test)

0.7988826815642458

In [460]:
# Refit on all labelled rows with the grid-search winner. The parameters are
# read from `gs` rather than copied by hand, so this cell cannot drift out of
# sync when the grid, the data or the selected features change.
xgbc_best = XGBClassifier(**gs.best_params_)
xgbc_best.fit(data_train[importance_feature], y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [463]:
# Predictions of the tuned XGBoost model on the competition test set.
# NOTE(review): the next cell reassigns y_pred from a logistic regression
# before anything is written to disk, so these predictions are never saved -
# confirm which model is meant to be submitted.
y_pred = xgbc_best.predict(data_test[importance_feature])

In [465]:
from sklearn.linear_model import LogisticRegression

# Fit on the complete labelled feature frame. data_train has the same columns
# as data_test, which is what the prediction below requires, and unlike an
# ad-hoc `X` it is defined by the cells above, so the cell runs on a fresh
# kernel.
lr = LogisticRegression()
lr.fit(data_train, y)
y_pred = lr.predict(data_test)



In [466]:
# Assemble the submission table and write it without the row index.
result = pd.DataFrame(dict(PassengerId=PassengerId, Survived=y_pred))
result.to_csv('lr_submission.csv', index=False)

In [468]:
import numpy as np

# Scratch check of clipping: values below 3 become 3, values above 8 become 8.
x = np.array([1, 2, 3, 5, 6, 7, 8, 9])

x.clip(3, 8)

array([3, 3, 3, 5, 6, 7, 8, 8])