In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [59]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
ex_df = pd.read_csv("data/gender_submission.csv")

## 前処理のメモ
- PassengerId
- Survived: 生きているかどうか 0 or 1


- Age: 年齢 欠損値あり
  - Pclass, Sex, Parch, SibSp からランダムフォレストで推定
- SibSp: 兄弟姉妹の数
- Parch: 親子の数
- Fare: 運賃 欠損値あり
  - Pclass, Sex, Parch, SibSp から平均値で補完

### Nanとそれ以外で分ける
- Cabin: 客室番号 欠損値あり

### 不要な列
- Name: 名前
- Ticket: チケットの番号

### ワンホットエンコーディング
- Pclass: チケットのクラス 1, 2, 3
- Sex: 性別
- Embarked: 乗船場所 S, C, Q 欠損値あり
  - 欠損値の2人（全て）は生き残っていた
  - 欠損値の時は生存率が高かったCで補完することにする

In [60]:
# train_df, test_dfを結合
test_df["Survived"] = np.nan
df = pd.concat([train_df, test_df], ignore_index=True)
df.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [61]:
# CabinはNaNの場合は0, それ以外は1
df["Cabin"] = df["Cabin"].notnull().astype(int)

# Embarkedは欠損値の2人はCで補完する
df["Embarked"] = df["Embarked"].fillna("C")

# Fareは欠損値をPclass, Sex, Parch, SibSpの平均値で補完する
df['Fare'] = df['Fare'].fillna(df.groupby(['Pclass', 'Sex', 'Parch', 'SibSp'])['Fare'].transform('mean'))

df.isnull().sum()


PassengerId      0
Survived       418
Pclass           0
Name             0
Sex              0
Age            263
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [62]:
# Age を Pclass, Sex, Parch, SibSp からランダムフォレストで推定
from sklearn.ensemble import RandomForestRegressor


age_df = df[["Age", "Pclass", "Sex", "Parch", "SibSp"]]
age_df = pd.get_dummies(age_df, columns=["Pclass", "Sex"])

# 学習データとテストデータに分離
known_age = age_df[age_df.Age.notnull()].values
unknown_age = age_df[age_df.Age.isnull()].values

# 学習データをX, yに分離
X_train = known_age[:, 1:]
y_train = known_age[:, 0]

# ランダムフォレストで推定モデルを構築
rfr = RandomForestRegressor(random_state=0, n_estimators=100, n_jobs=-1)
rfr.fit(X_train, y_train)

# 補完
predictedAges = rfr.predict(unknown_age[:, 1::])
df.loc[(df.Age.isnull()), "Age"] = predictedAges

In [63]:
# # 年齢別生存曲線と死亡曲線
# facet = sns.FacetGrid(df[0:890], hue="Survived", aspect=2)
# facet.map(sns.kdeplot, "Age", shade=True)
# facet.set(xlim=(0, df.loc[0:, "Age"].max()))
# facet.add_legend()
# plt.show()

In [64]:
# 不要な列を削除
df.drop(["Name", "Ticket"], axis=1, inplace=True)

# ワンホットエンコーディング
df = pd.get_dummies(df, columns=["Pclass", "Sex", "Embarked"])

In [65]:
df.head()
# 前処理終了

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Cabin,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0.0,22.0,1,0,7.25,0,False,False,True,False,True,False,False,True
1,2,1.0,38.0,1,0,71.2833,1,True,False,False,True,False,True,False,False
2,3,1.0,26.0,0,0,7.925,0,False,False,True,True,False,False,False,True
3,4,1.0,35.0,1,0,53.1,1,True,False,False,True,False,False,False,True
4,5,0.0,35.0,0,0,8.05,0,False,False,True,False,True,False,False,True


In [66]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# 各モデルのインスタンスを作成
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=0),
    "DecisionTree": DecisionTreeClassifier(random_state=0),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=0),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=0),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=0),
    "SVC": SVC(probability=True, random_state=0),
    "KNeighbors": KNeighborsClassifier(),
    "GaussianNB": GaussianNB(),
}

# データの分割
known_data = df[df.Survived.notnull()]
X_known = known_data.drop("Survived", axis=1).values
y_known = known_data["Survived"].values
X_train, X_valid, y_train, y_valid = train_test_split(X_known, y_known, random_state=0)

# モデルの精度比較
for name, model in models.items():
    scores = cross_val_score(model, X_known, y_known, cv=5, scoring="accuracy")
    print(f"{name}: {scores.mean():.4f} (std: {scores.std():.4f})")

"""
LogisticRegression: 0.8047 (std: 0.0122)
DecisionTree: 0.7555 (std: 0.0922)
RandomForest: 0.8138 (std: 0.0304)
GradientBoosting: 0.7757 (std: 0.0793)
AdaBoost: 0.7689 (std: 0.0681)
SVC: 0.6386 (std: 0.0128)
KNeighbors: 0.5398 (std: 0.0924)
GaussianNB: 0.7722 (std: 0.0318)
"""

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression: 0.8036 (std: 0.0125)
DecisionTree: 0.7555 (std: 0.0922)
RandomForest: 0.8193 (std: 0.0217)
GradientBoosting: 0.7757 (std: 0.0793)
AdaBoost: 0.7689 (std: 0.0681)
SVC: 0.6386 (std: 0.0128)
KNeighbors: 0.5398 (std: 0.0924)
GaussianNB: 0.7722 (std: 0.0318)


'\nLogisticRegression: 0.8047 (std: 0.0122)\nDecisionTree: 0.7555 (std: 0.0922)\nRandomForest: 0.8138 (std: 0.0304)\nGradientBoosting: 0.7757 (std: 0.0793)\nAdaBoost: 0.7689 (std: 0.0681)\nSVC: 0.6386 (std: 0.0128)\nKNeighbors: 0.5398 (std: 0.0924)\nGaussianNB: 0.7722 (std: 0.0318)\n'

In [67]:
# RandomForestを使用して提出用データの作成
model = models["RandomForest"]
model.fit(X_train, y_train)

score = model.score(X_valid, y_valid)
print("Validation Accuracy: {:.4f}".format(score))


# 提出用データの作成
unknown_data = df[df.Survived.isnull()]
X_unknown = unknown_data.drop("Survived", axis=1).values
submit = test_df[["PassengerId"]]
submit["Survived"] = model.predict(X_unknown).astype(int)
submit.to_csv("submit2.csv", index=False)

Validation Accuracy: 0.8027


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit["Survived"] = model.predict(X_unknown).astype(int)
