In [468]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [469]:
# Read the training and test CSVs into DataFrames, then preview the training rows.
train_csv_path = '../input/titanic/train.csv'
test_csv_path = '../input/titanic/test.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [470]:
# For each column of train_df, report whether it contains at least one missing value
train_df.isna().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [471]:
# For each column of test_df, report whether it contains at least one missing value
test_df.isna().any()

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool

In [472]:
# Percentage of missing entries in 'Age', 'Cabin' and 'Embarked' for train_df
train_missing_counts = train_df[['Age', 'Cabin', 'Embarked']].isna().sum()
train_missing_counts * 100 / len(train_df)

Age         19.865320
Cabin       77.104377
Embarked     0.224467
dtype: float64

In [473]:
# Percentage of missing entries in 'Age', 'Cabin' and 'Embarked' for test_df
test_missing_counts = test_df[['Age', 'Cabin', 'Embarked']].isna().sum()
test_missing_counts * 100 / len(test_df)

Age         20.574163
Cabin       78.229665
Embarked     0.000000
dtype: float64

Age is likely to be related to survival, so we keep it and fill in its missing values (about 20% of the rows). Roughly 77% of the 'Cabin' entries are missing, which is too many to impute reliably, so we will drop that column. Next, we check whether 'Embarked' is related to the survival rate.

In [474]:
# Number of training rows for every ('Embarked', 'Survived') combination
embarked_survived_groups = train_df.groupby(['Embarked', 'Survived'])
embarked_survived_groups.size()

Embarked  Survived
C         0            75
          1            93
Q         0            47
          1            30
S         0           427
          1           217
dtype: int64

In [475]:
# Remove the columns that will not be used as model features, then preview the result
train_unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_df = train_df.drop(columns=train_unused_columns)
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [476]:
# Remove the same non-feature columns from the test set, then preview the result
test_unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
test_df = test_df.drop(columns=test_unused_columns)
test_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [477]:
#Handle missing data in 'Age', 'Embarked' and 'Fare'
#1. Fill in missing 'Age' with random integer ages drawn uniformly from
#   [mean - std, mean + std) of the ages observed in the same frame.
# A dedicated, seeded generator makes the imputed ages (and every result derived
# from them) reproducible across kernel restarts without touching NumPy's global
# random state.
age_rng = np.random.RandomState(42)
all_df = [train_df, test_df]
for data in all_df:
    mean = data["Age"].mean()
    std = data["Age"].std()
    # Number of missing ages in this frame (a count, not a boolean mask).
    is_null = int(data["Age"].isnull().sum())
    # randint expects integer bounds; truncate explicitly instead of relying on
    # an implicit float-to-int conversion.
    rand_age = age_rng.randint(int(mean - std), int(mean + std), size=is_null)

    # Work on a copy so the assignment below never writes through a view.
    age = data["Age"].copy()
    age[age.isnull()] = rand_age
    data["Age"] = age.astype(int)

In [478]:
# For train_df, missing 'Embarked' values will be filled with the mode;
# list how often each value occurs to find it
embarked_counts = train_df['Embarked'].value_counts()
embarked_counts

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [479]:
# Replace missing 'Embarked' entries in train_df with the value 'S'
embarked_fill_value = 'S'
train_df["Embarked"] = train_df["Embarked"].fillna(embarked_fill_value)

In [480]:
#For test_df, fill in missing 'Fare' using the median fare of the training set
# The previous approach drew a random integer from [mean - std, mean + std).
# That draw was unseeded (different result on every run), and whenever the
# standard deviation exceeds the mean the interval starts below zero, so a
# negative fare could be imputed. The median is deterministic, always a value
# within the observed range, and robust to a few very large fares.
# The statistic is taken from train_df (which has no missing 'Fare'), in line with
# the scaler, which is also fitted on the training data only.
test_df["Fare"] = test_df["Fare"].fillna(train_df["Fare"].median())

In [481]:
#Encode categorical data
# Build the category list once per column from the training set and apply that
# same mapping to both frames. Encoding each frame independently only yields
# matching codes when both frames happen to contain exactly the same labels;
# with a shared dtype a given label always maps to the same integer.
# A label that appears only in test_df is encoded as -1 (pandas' code for
# "not a known category").
for column in ['Sex', 'Embarked']:
    known_labels = sorted(train_df[column].dropna().unique())
    shared_dtype = pd.api.types.CategoricalDtype(categories=known_labels)
    train_df[column] = train_df[column].astype(shared_dtype).cat.codes
    test_df[column] = test_df[column].astype(shared_dtype).cat.codes

In [482]:
# Separate the target column from the training features; every remaining
# column of test_df is used as a feature.
y_train = train_df["Survived"]
X_train = train_df.drop(columns="Survived")
X_test = test_df

In [483]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [484]:
#Train model using SVM
from sklearn.svm import SVC
# Fit a support vector classifier with default settings, then predict the test set.
model = SVC().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [485]:
#Check model accuracy
# NOTE: this is the accuracy on the training data itself, so it is an optimistic
# figure rather than an estimate of performance on unseen passengers.
from sklearn.metrics import accuracy_score
# Use the target defined earlier as `y_train` (the name `Y_train` was never
# assigned), and keep the result in its own variable so the fitted `model` is not
# overwritten by a number.
train_accuracy = round(accuracy_score(y_train, model.predict(X_train)) * 100, 2)
train_accuracy

83.95

In [486]:
#Create submission file
# 'PassengerId' was dropped from test_df earlier, so it is read again from the
# original CSV and paired with the predictions.
passenger_ids = pd.read_csv('../input/titanic/test.csv')["PassengerId"]
submission = pd.DataFrame({"PassengerId": passenger_ids, "Survived": y_pred})
submission.to_csv('./submission.csv', index=False)