In [1]:
#必要なライブラリを読み込む
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load train1.csv into a DataFrame.
df = pd.read_csv('train1.csv')

In [3]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 前処理

In [4]:
# Dictionary that maps each sex label to an integer code.
sex_mapping = {'male': 0, 'female': 1}

# Apply the mapping. Series.map() silently produces NaN for any label that is
# not a key of the mapping -- which also happens if this cell is re-run after
# 'Sex' has already been converted to 0/1 -- so verify the result before
# overwriting the column.
sex_encoded = df['Sex'].map(sex_mapping)
assert sex_encoded.notna().all(), 'Sex contains values that are not in sex_mapping'
df['Sex'] = sex_encoded

# One-hot encode the port of embarkation and swap the original 'Embarked'
# column for the resulting indicator columns (appended on the right).
embarked_dm = pd.get_dummies(df['Embarked'])
df = pd.concat([df.drop('Embarked', axis=1), embarked_dm], axis=1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,0,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,0,1


In [5]:
# Count the missing values in each column.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
C                0
Q                0
S                0
dtype: int64

Cabinは891件中687件（約77%）が欠損しているため、列ごと削除してよい。

Ageの欠損値の処理は、次の分析で試してみる。

数値化が難しいName・Ticketは削除する。
⇨ ただし、Nameから家族単位のラベルを作れば、「何人家族だと生き残りやすい」といった傾向が見つかるかもしれないので、別の分析で試してみたい。

In [6]:
# Remove the free-text columns (Name, Ticket) and the Cabin column, which is
# missing for 687 of the 891 rows.
unused_columns = ['Name', 'Ticket', 'Cabin']
df = df.drop(unused_columns, axis=1)

In [7]:
# Preview the data after dropping Name, Ticket and Cabin.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,1,0,3,0,22.0,1,0,7.25,0,0,1
1,2,1,1,1,38.0,1,0,71.2833,1,0,0
2,3,1,3,1,26.0,0,0,7.925,0,0,1
3,4,1,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,0,35.0,0,0,8.05,0,0,1


In [8]:
# Pairwise correlation coefficients between the remaining columns; the
# 'Survived' row/column is what drives the feature selection.
df.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.036847,-0.057527,-0.001652,0.012658,-0.001205,-0.033606,0.022148
Survived,-0.005007,1.0,-0.338481,0.543351,-0.077221,-0.035322,0.081629,0.257307,0.16824,0.00365,-0.15566
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.369226,0.083081,0.018443,-0.5495,-0.243292,0.221009,0.08172
Sex,-0.042939,0.543351,-0.1319,1.0,-0.093254,0.114631,0.245489,0.182333,0.082853,0.074115,-0.125722
Age,0.036847,-0.077221,-0.369226,-0.093254,1.0,-0.308247,-0.189119,0.096067,0.036261,-0.022405,-0.032523
SibSp,-0.057527,-0.035322,0.083081,0.114631,-0.308247,1.0,0.414838,0.159651,-0.059528,-0.026354,0.070941
Parch,-0.001652,0.081629,0.018443,0.245489,-0.189119,0.414838,1.0,0.216225,-0.011069,-0.081228,0.063036
Fare,0.012658,0.257307,-0.5495,0.182333,0.096067,0.159651,0.216225,1.0,0.269335,-0.117216,-0.166603
C,-0.001205,0.16824,-0.243292,0.082853,0.036261,-0.059528,-0.011069,0.269335,1.0,-0.148258,-0.778359
Q,-0.033606,0.00365,0.221009,0.074115,-0.022405,-0.026354,-0.081228,-0.117216,-0.148258,1.0,-0.496624


In [9]:
# Keep only the columns whose absolute correlation with 'Survived' is at least
# CORR_THRESHOLD and drop everything else. The selection is computed instead of
# hand-copied from the correlation table, so the threshold is explicit and the
# cell can be re-run safely (a second run simply finds nothing left to drop).
# On this data the dropped columns are PassengerId, Age, SibSp, Parch and Q.
CORR_THRESHOLD = 0.1
survived_corr = df.corr()['Survived'].abs()
weak_columns = survived_corr[survived_corr < CORR_THRESHOLD].index.tolist()
df = df.drop(weak_columns, axis=1)

In [10]:
# Preview the data after the correlation-based column selection.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Fare,C,S
0,0,3,0,7.25,0,1
1,1,1,1,71.2833,1,0
2,1,3,1,7.925,0,1
3,1,1,1,53.1,0,1
4,0,3,0,8.05,0,1


In [11]:
# Target vector: pull out the 'Survived' column and copy it into a NumPy array.
df_y = df.loc[:, 'Survived']
y_1 = np.array(df_y)
y_1

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1,

In [12]:
# Feature matrix: the five selected predictor columns as a float NumPy array.
# dtype=float is requested explicitly because recent pandas versions return
# boolean indicator columns from get_dummies(); mixing those with numeric
# columns would otherwise produce an object-dtype array.
feature_columns = ['Pclass', 'Sex', 'Fare', 'C', 'S']
df_X = df[feature_columns]
X_1 = np.array(df_X, dtype=float)
X_1

array([[  3.    ,   0.    ,   7.25  ,   0.    ,   1.    ],
       [  1.    ,   1.    ,  71.2833,   1.    ,   0.    ],
       [  3.    ,   1.    ,   7.925 ,   0.    ,   1.    ],
       ..., 
       [  3.    ,   1.    ,  23.45  ,   0.    ,   1.    ],
       [  1.    ,   0.    ,  30.    ,   1.    ,   0.    ],
       [  3.    ,   0.    ,   7.75  ,   0.    ,   0.    ]])

In [13]:
# Standardize the features: learn the scaling parameters from X_1 and apply
# them to X_1 in a single step.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_1_std = scaler.fit_transform(X_1)

In [14]:
# Hold-out split: 30% of the rows are reserved for testing, with a fixed seed
# so the partition is reproducible.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features first. The scaler must only ever see training
# rows; fitting it on all rows before splitting lets test-set statistics leak
# into the training inputs and makes the reported accuracy optimistic.
X_1_train_raw, X_1_test_raw, y_1_train, y_1_test = train_test_split(
    X_1, y_1, test_size=0.3, random_state=0)

# Fit the scaler on the training rows only, then apply that same
# transformation to both partitions.
scaler = StandardScaler().fit(X_1_train_raw)
X_1_train = scaler.transform(X_1_train_raw)
X_1_test = scaler.transform(X_1_test_raw)

学習を行う

In [15]:
# Train a support vector classifier with a linear kernel on the training split.
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc.fit(X=X_1_train, y=y_1_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

正答率をみる

In [16]:
# Output the accuracy of the trained classifier on the held-out test split.
svc.score(X_1_test, y_1_test)

0.78731343283582089