In [0]:
from google.colab import drive
import keras
from keras import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler,LabelEncoder

# Apply seaborn's default theme so every matplotlib figure drawn later in the
# notebook picks up the same styling.
sns.set()

In [37]:
# Mount Google Drive at /gdrive; the CSV paths used further down all live under
# '/gdrive/My Drive/Titanic/'.
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
# Let pandas render up to 100 rows before it truncates a displayed object.
pd.set_option('display.max_rows', 100)

# **데이터 전처리**

In [0]:
# Read the train and test CSVs from the mounted Drive folder.
train, test = map(
    pd.read_csv,
    ('/gdrive/My Drive/Titanic/train.csv', '/gdrive/My Drive/Titanic/test.csv'),
)

# Both frames are kept in one list so later cells can apply the same step to each.
train_test_data = [train, test]

In [0]:
# Encode Sex numerically: male -> 0, female -> 1.
#
# The whole column is mapped in one pass instead of overwriting individual
# string cells with integers. Cell-wise writes leave a mixed object-dtype column
# and may be rejected outright by pandas versions that store text in a dedicated
# string dtype. Values not listed in the table are passed through unchanged, so
# re-running this cell on already-encoded data is a no-op.
sex_codes = {'male': 0, 'female': 1}
for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(lambda sex: sex_codes.get(sex, sex))

In [0]:
# Stack train and test row-wise (original row labels are kept) so that
# statistics can be taken over every passenger at once.
all_data = pd.concat(objs=train_test_data, axis=0)

In [0]:
# Give every distinct ticket string an ordinal id: its position in the sorted
# list of all tickets seen in train + test.
Ticket_data = all_data["Ticket"].value_counts().sort_index()
Ticket_list = Ticket_data.index.tolist()

# Build the ticket -> position lookup once. The previous version called
# Ticket_list.index(...) and scanned a full-frame boolean mask for every ticket
# in every dataset, which is quadratic in the number of tickets.
ticket_to_group = {ticket: position for position, ticket in enumerate(Ticket_list)}
for dataset in train_test_data:
    # astype(float) keeps the float column dtype the mask-based assignment produced.
    dataset['Ticket_Group'] = dataset['Ticket'].map(ticket_to_group).astype(float)

In [0]:
# Missing embarkation ports are filled with 'S', then the column is forced to str.
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S').astype(str)

In [0]:
# Replace missing fares with a fixed value.
# NOTE(review): the derivation of this constant is not shown in the notebook — confirm.
fare_fill_value = 7.180641176470588
for dataset in train_test_data:
    fare = dataset['Fare']
    dataset['Fare'] = fare.mask(fare.isna(), fare_fill_value)

In [0]:
# Record which rows had no Age before imputation: 1.0 when Age was missing,
# 0.0 otherwise (stored as a float column).
for dataset in train_test_data:
    dataset['Age_Nan'] = dataset['Age'].isnull().astype(float)

In [0]:
# Passengers without a recorded cabin get the placeholder 'U'; the column is
# then forced to str.
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].fillna('U').astype(str)

In [0]:
# Keep only the first character of the cabin string as CabinAlpha.
for dataset in train_test_data:
    dataset['CabinAlpha'] = dataset['Cabin'].str.slice(stop=1)

In [0]:
# Turn the cabin letter into an integer code.
# NOTE(review): the code assigned to each letter is hand-picked; the rationale
# for this particular ordering is not shown in the notebook — confirm.
#
# A single table-driven mapping replaces nine separate mask assignments that
# wrote integers into a string column one letter at a time. Those writes leave a
# mixed object-dtype column and may be rejected by pandas versions that use a
# dedicated string dtype. Values not in the table (including already-encoded
# integers) pass through unchanged, so the cell can be re-run safely.
cabin_alpha_codes = {
    'T': 0,
    'U': 1,
    'A': 2,
    'G': 3,
    'C': 4,
    'F': 5,
    'B': 6,
    'E': 7,
    'D': 8,
}
for dataset in train_test_data:
    dataset['CabinAlpha'] = dataset['CabinAlpha'].map(
        lambda letter: cabin_alpha_codes.get(letter, letter)
    )

In [0]:
def _title_from_name(full_name):
    """Take the segment after the first comma (up to any second comma), keep
    what precedes its first period, and strip surrounding whitespace."""
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()


# Store the extracted piece of each Name in the Fname column.
for dataset in train_test_data:
    dataset['Fname'] = dataset['Name'].map(_title_from_name)

In [0]:
# Manual override of the title for the training row labelled 796.
# NOTE(review): the reason for this single-row correction is not shown here — confirm.
train.at[796, "Fname"] = "Mrs"

In [0]:
# Collapse the rarer titles in the training set onto Mr / Mrs / Miss / Master.
# Titles that already are one of those four need no entry.
# NOTE(review): 'Mme' is grouped with 'Miss', exactly as in the original
# mapping — confirm that this is intended.
train_title_groups = {
    'Capt': 'Mr',
    'Col': 'Mr',
    'Don': 'Mr',
    'Dr': 'Mr',
    'Jonkheer': 'Mr',
    'Lady': 'Mrs',
    'Major': 'Mr',
    'Mlle': 'Miss',
    'Mme': 'Miss',
    'Ms': 'Miss',
    'Rev': 'Mr',
    'Sir': 'Mr',
    'the Countess': 'Mrs',
}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# train['Fname'] operates on a column selection, and with pandas Copy-on-Write
# that leaves `train` itself unchanged.
train['Fname'] = train['Fname'].replace(train_title_groups)

In [0]:
# Collapse the rarer titles in the test set onto Mr / Mrs / Miss / Master.
# Titles that already are one of those four need no entry.
test_title_groups = {
    'Col': 'Mr',
    'Dona': 'Mrs',
    'Dr': 'Mr',
    'Ms': 'Miss',
    'Rev': 'Mr',
}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# test['Fname'] operates on a column selection, and with pandas Copy-on-Write
# that leaves `test` itself unchanged.
test['Fname'] = test['Fname'].replace(test_title_groups)

In [0]:
# Fill missing ages with a fixed value that depends on the passenger's title.
# NOTE(review): presumably per-title average ages; their derivation is not shown
# in the notebook — confirm.
age_fill_by_title = {
    'Master': 5.482642,
    'Miss': 21.834533,
    'Mr': 32.784298,
    'Mrs': 37.114943,
}
for dataset in train_test_data:
    for title, fill_age in age_fill_by_title.items():
        missing_for_title = dataset.Age.isnull() & (dataset.Fname == title)
        dataset.loc[missing_for_title, 'Age'] = fill_age

In [0]:
# Encode the grouped title as an integer: Mr -> 0, Master -> 1, Miss -> 2, Mrs -> 3.
#
# The column is mapped in one pass rather than overwriting string cells with
# integers mask by mask. Cell-wise writes leave a mixed object-dtype column and
# may be rejected by pandas versions that use a dedicated string dtype. Values
# not in the table (including already-encoded integers) pass through unchanged,
# so re-running the cell is harmless.
title_codes = {'Mr': 0, 'Master': 1, 'Miss': 2, 'Mrs': 3}
for dataset in train_test_data:
    dataset['Fname'] = dataset['Fname'].map(lambda title: title_codes.get(title, title))

In [0]:
# Bucket Age into bands and give each band a score in AgePoint.
# Each entry is (exclusive lower bound, inclusive upper bound, score).
# NOTE(review): the scores are not monotonic in age; how they were chosen is not
# shown in the notebook — confirm.
age_bands = [
    (-np.inf, 17.0, 7),
    (17.0, 21.835, 2),
    (21.835, 25.0, 3),
    (25.0, 30.0, 4),
    (30.0, 32.784, 6),
    (32.784, 36.0, 0),
    (36.0, 45.0, 5),
    (45.0, np.inf, 1),
]
for dataset in train_test_data:
    for lower, upper, point in age_bands:
        in_band = (dataset['Age'] > lower) & (dataset['Age'] <= upper)
        dataset.loc[in_band, 'AgePoint'] = point

In [56]:
# Sanity check: number of missing values left in each training column.
train.isna().sum()

PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Cabin           0
Embarked        0
Ticket_Group    0
Age_Nan         0
CabinAlpha      0
Fname           0
AgePoint        0
dtype: int64

# **표준화 (Standardization)**

In [0]:
# Standardize every model feature to zero mean / unit variance.
#
# The scaler is fit on the training set ONLY and the same transformation is then
# applied to both train and test. Fitting a fresh StandardScaler on the test set
# (as was done before) standardizes it with its own mean and variance, so equal
# raw values end up as different feature values in train and test.
scaled_columns = [
    'AgePoint', 'Fname', 'CabinAlpha', 'Age_Nan', 'Fare',
    'Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket_Group',
]
# astype(float) makes the numeric conversion explicit for integer-coded columns
# that may still be stored with object dtype.
feature_scaler = StandardScaler().fit(train[scaled_columns].astype(float))
for dataset in train_test_data:
    dataset[scaled_columns] = feature_scaler.transform(
        dataset[scaled_columns].astype(float)
    )

# **불필요한 열 삭제**

In [0]:
# Drop the raw columns that are no longer needed. The frames are modified in
# place, so the objects held in train_test_data see the change too.
# PassengerId is kept on the test frame because the submission cell reads it.
unused_columns = ['Cabin', 'Embarked', 'Ticket', 'Name', 'Age']
train.drop(columns=unused_columns + ['PassengerId'], inplace=True)
test.drop(columns=unused_columns, inplace=True)

In [0]:
# Separate the feature matrix from the label column.
train_data = train.drop(columns='Survived')
target = train['Survived']

In [60]:
# Print the shapes of the test frame, the label vector and the training features.
for frame in (test, target, train_data):
    print(frame.shape)

(418, 11)
(891,)
(891, 10)


# **TEST**

In [61]:
# Train an SVC on the full training set and write the predictions for the test
# set as a submission CSV.
# NOTE(review): k_fold is created but not used anywhere in this cell.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
clf = SVC()
clf.fit(train_data, target)

# Select the test features by the training column names instead of relying on
# the two frames having identical column order after separate drops. This also
# leaves PassengerId (and any other non-feature column) out of the model input.
test_data = test[train_data.columns].copy()
prediction = clf.predict(test_data)
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": prediction,
})

# Write the file, then read it back and preview it to confirm what was saved.
submission_path = '/gdrive/My Drive/Titanic/submission_Cabin.csv'
submission.to_csv(submission_path, index=False)
submission = pd.read_csv(submission_path)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
