# パッケージのインポート

In [None]:
import numpy as np
import pandas as pd
import pandas_profiling
from sklearn.linear_model import LogisticRegression

# データの読み込み

In [None]:
# Read the three Titanic CSV files into DataFrames in one pass.
train, test, gender_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/titanic/train.csv',
        '../input/titanic/test.csv',
        '../input/titanic/gender_submission.csv',
    )
)

# データの確認（提出用データの形状、学習データ、テストデータ）

In [None]:
# Preview the first rows of the sample submission file.
gender_submission.head()

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Stack train on top of test so later preprocessing is applied to both at once.
# Row labels are kept as-is, so index values repeat across the two parts.
data = pd.concat(objs=[train, test], axis=0, sort=False)
# Show the last rows of the combined frame.
data.tail(n=5)

In [None]:
# Row counts of train, test and the combined frame.
print(f"{len(train)} {len(test)} {len(data)}")
# Count missing values per column.
data.isna().sum()

# 特徴量エンジニアリング

In [None]:
# Sex: male -> 0, female -> 1.
# The result is assigned back instead of calling an in-place method on the
# `data[...]` selection: that selection can be a temporary copy, in which
# case the in-place change never reaches `data`.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1}).astype(int)

# Embarked: fill missing values with 'S', then encode S -> 0, C -> 1, Q -> 2.
data['Embarked'] = (
    data['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
)

# Fare: fill missing values with the column mean (NaN is skipped by .mean()).
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

# Age: fill each missing value with its own random integer drawn from
# [mean - std, mean + std). Previously a single unseeded draw was used for
# every missing row; a seeded generator keeps the imputation reproducible.
age_avg = data['Age'].mean()
age_std = data['Age'].std()
age_missing = data['Age'].isna().to_numpy()
age_rng = np.random.default_rng(0)
age_values = data['Age'].to_numpy(dtype=float, copy=True)
age_values[age_missing] = age_rng.integers(
    int(age_avg - age_std),
    int(age_avg + age_std),
    size=int(age_missing.sum()),
)
# Assign a plain array (not a Series) so the repeated index labels left by
# the train/test concatenation cannot interfere with alignment.
data['Age'] = age_values

# 学習に使用しないカラムを除去

In [None]:
# Columns that are not used as model features.
delete_columns = [
    'Name',
    'PassengerId',
    'SibSp',
    'Parch',
    'Ticket',
    'Cabin',
]
# Remove them from the combined frame in place.
data.drop(columns=delete_columns, inplace=True)

# 学習データとテストデータに分割

In [None]:
# Undo the earlier concatenation by position: the first len(train) rows are
# the training part, the remainder is the test part. The right-hand side is
# evaluated before either name is rebound, so both slices use the same length.
train, test = data.iloc[:len(train)], data.iloc[len(train):]
# Features are every remaining column except the target 'Survived'.
X_train = train.drop(columns='Survived')
X_test = test.drop(columns='Survived')
y_train = train['Survived']

# 機械学習

In [None]:
# Fit an L2-penalised logistic regression with the 'sag' solver and a fixed
# seed, then predict labels for the test features.
# NOTE(review): scikit-learn documents that 'sag' converges quickly only when
# features share a similar scale; no scaling is visible in this section —
# confirm that fit() does not emit a ConvergenceWarning.
clf = LogisticRegression(penalty='l2', solver='sag', random_state=0).fit(
    X_train, y_train
)
y_predict = clf.predict(X_test)

# 提出用ファイル作成

In [None]:
# Use the sample submission as a template, replace its 'Survived' column with
# the predictions cast to int, and write the result without the row index.
sub = pd.read_csv('../input/titanic/gender_submission.csv').assign(
    Survived=[int(label) for label in y_predict]
)
sub.to_csv('submission.csv', index=False)