In [1]:
# Load the libraries used throughout the notebook
import pandas as pd
import numpy as np

# Hide warnings that are not needed
# NOTE(review): filterwarnings('ignore') without a category silences every
# warning, including deprecation notices from the libraries used below --
# consider narrowing the filter to specific categories or modules.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the training and test CSV files (paths are relative to the notebook)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

# Keep both frames together so preprocessing can be applied to them as a group
full_data = [train, test]

In [3]:
# Inspect the first rows of train (head() shows 5 rows by default)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Inspect the first rows of test (head() shows 5 rows by default)
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Print the (rows, columns) shape of train and test
for caption, frame in (('train.shape:', train), ('test.shape:', test)):
    print(caption, frame.shape)

train.shape: (891, 12)
test.shape: (418, 11)


In [6]:
# Show summary statistics of train (describe() reports the numeric columns by default)
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
# Show summary statistics of test (describe() reports the numeric columns by default)
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [8]:
# Count the missing values in each column of train
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
# Count the missing values in each column of test
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [10]:
# Count how often each Embarked value occurs in train (missing values are not counted).
# Uses the Series method because the top-level pd.value_counts function is
# deprecated in recent pandas releases; the result is the same.
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [11]:
# Fill the two missing Embarked values in train with the most frequent value, 'S'
embarked_missing = train['Embarked'].isna()
train.loc[embarked_missing, 'Embarked'] = 'S'

In [12]:
# Fill missing Age values in train and test, each with its own median.
# NOTE(review): test rows are imputed with the test set's median rather than
# the training median -- confirm this is intended.
for frame in (train, test):
    frame['Age'] = frame['Age'].fillna(frame['Age'].median())

In [13]:
# Fill the missing Fare value in test with the median Fare of test
test_fare_median = test['Fare'].median()
test['Fare'] = test['Fare'].fillna(test_fare_median)

In [14]:
# Convert Sex to integer codes (female -> 0, male -> 1) in both frames
sex_codes = {'female': 0, 'male': 1}
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)

In [15]:
# Convert Embarked to integer codes (S -> 0, C -> 1, Q -> 2) in both frames
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
for frame in (train, test):
    frame['Embarked'] = frame['Embarked'].map(embarked_codes).astype(int)

In [16]:
# Add the feature FamilySize (family members aboard, including the passenger):
# SibSp (siblings / spouses) + Parch (parents / children) + 1
for frame in (train, test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

In [17]:
# Show the first 10 rows of train after the preprocessing steps above
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1,2
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,0,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,0,1
5,6,0,3,"Moran, Mr. James",1,28.0,0,0,330877,8.4583,,2,1
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,0,1
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,,0,5
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27.0,0,2,347742,11.1333,,0,3
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14.0,1,0,237736,30.0708,,1,2


In [18]:
# Show the first 10 rows of test after the preprocessing steps above
test.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,2,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,0,2
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,2,1
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,0,3
5,897,3,"Svensson, Mr. Johan Cervin",1,14.0,0,0,7538,9.225,,0,1
6,898,3,"Connolly, Miss. Kate",0,30.0,0,0,330972,7.6292,,2,1
7,899,2,"Caldwell, Mr. Albert Francis",1,26.0,1,1,248738,29.0,,0,3
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",0,18.0,0,0,2657,7.2292,,1,1
9,901,3,"Davies, Mr. John Samuel",1,21.0,2,0,A/4 48871,24.15,,0,3


In [19]:
# Import GradientBoostingClassifier, scikit-learn's gradient boosting classifier
from sklearn.ensemble import GradientBoostingClassifier

# Columns fed to the model, in this order, for both training and prediction
feature_columns = ['Pclass', 'Age', 'Sex', 'Fare', 'FamilySize', 'Embarked']

# Extract the explanatory variables and the target (Survived) from train
train_features = train[feature_columns].values
train_target = train['Survived'].values

# Build the classifier and fit it; random_state is fixed for reproducibility
clf = GradientBoostingClassifier(n_estimators=55, random_state=9)
model = clf.fit(train_features, train_target)

# Take the same columns from test and predict
test_features = test[feature_columns].values
test_target = model.predict(test_features)

# Write the predictions to CSV, indexed by PassengerId
PassengerId = test['PassengerId'].to_numpy().astype(int)
solution_df = pd.DataFrame(data=test_target, index=PassengerId, columns=['Survived'])
solution_df.to_csv('../output/submission.csv', index_label=['PassengerId'])

  from numpy.core.umath_tests import inner1d


In [20]:
# Write the trained model to a pkl file
import pickle

# The context manager closes the file handle as soon as the dump finishes, so
# the pickle is flushed to disk and no open handle lingers in the kernel.
with open('../output/classifier.pkl', 'wb') as model_file:
    pickle.dump(model, model_file, protocol=4)

In [25]:
# Print the feature vector of the test row at position 8
print(test_features[8, :])

[ 3.     18.      0.      7.2292  1.      1.    ]


In [33]:
# Hand-built feature rows, in the column order used for training:
# Pclass, Age, Sex, Fare, FamilySize, Embarked
# NOTE(review): the first row matches test row 0 in the preview above except
# for Embarked (1 here, 2 in the preview) -- confirm this is intended.
sample_data_features = [
    [3, 34.5, 1, 7.8292, 1, 1],
    [3, 18, 0, 7.2292, 1, 1],
]

# Predict Survived for the sample rows with the fitted model and print the labels
sample_data_target = model.predict(sample_data_features)
print(sample_data_target)

[0 1]
