# CONSTANTS

In [1]:
raw_data_path = '../input/titanic'
transformed_data_path = '../output/titanic'
trained_model_path = '../output/titanic'
predicted_data_path = '../output/titanic'

# 1. transform: 特徴量エンジニアリング

In [2]:
import numpy as np
import pandas as pd

In [3]:
train = pd.read_csv(raw_data_path + '/train.csv')
test = pd.read_csv(raw_data_path + '/test.csv')
gender_submission = pd.read_csv(raw_data_path + '/gender_submission.csv')

In [4]:
data = pd.concat([train, test], sort=False)

In [5]:
data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)

In [6]:
data['Embarked'].fillna(('S'), inplace=True)
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

In [7]:
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)

In [8]:
age_avg = data['Age'].mean()
age_std = data['Age'].std()

data['Age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)

In [9]:
delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

In [10]:
train = data[:len(train)]
test = data[len(train):]

In [11]:
y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)

In [12]:
y_train.to_csv(transformed_data_path + "/y_train.csv")
X_train.to_csv(transformed_data_path + "/X_train.csv")
X_test.to_csv(transformed_data_path + "/X_test.csv")

# 2. trainer: 機械学習アルゴリズムの学習

In [13]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import pickle

In [14]:
y_train = pd.read_csv(transformed_data_path + "/y_train.csv", index_col=0)
X_train = pd.read_csv(transformed_data_path + "/X_train.csv", index_col=0)

In [15]:
model = LogisticRegression(penalty='l2', solver='sag', random_state=0)
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(random_state=0, solver='sag')

In [16]:
pickle.dump(model, open(trained_model_path + 'model_titanic.sav', 'wb'))

# 3. predictor: 機械学習アルゴリズムの予測

In [17]:
!pip install kaggle

Collecting kaggle
  Using cached kaggle-1.5.12-py3-none-any.whl
Installing collected packages: kaggle
Successfully installed kaggle-1.5.12


In [18]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import pickle
from kaggle.api.kaggle_api_extended import KaggleApi

In [19]:
X_test = pd.read_csv(transformed_data_path + "/X_test.csv", index_col=0)

In [20]:
loaded_model = pickle.load(open(trained_model_path + 'model_titanic.sav', 'rb'))

In [21]:
y_pred = loaded_model.predict(X_test)

In [22]:
# submit
sub = pd.read_csv(raw_data_path + '/gender_submission.csv')
sub['Survived'] = list(map(int, y_pred))
sub.to_csv(predicted_data_path + 'submission.csv', index=False)

In [23]:
api = KaggleApi()
api.authenticate()
#api.competition_submit(file_name=predicted_data_path + 'submission.csv', message='update', competition='titanic')