In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.svm import SVC

### Prepare full training set data

In [2]:
# Root folder of the project data; input and output paths are built from it
file_path = "../data"

# Read the full training set and preview its first rows
train_csv = f'{file_path}/input/train.csv'
train_full = pd.read_csv(train_csv)
train_full.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Drop columns that are not used as features:
#   - Name, Ticket, PassengerId: unnecessary for the model
#   - Cabin: too many missing values
# errors='ignore' keeps this cell re-runnable: train_full is overwritten in place,
# so on a second run these columns are already gone and a plain drop would raise KeyError.
train_full = train_full.drop(
    columns=['Name', 'Ticket', 'PassengerId', 'Cabin'],
    errors='ignore',
)

# Drop rows with a missing Embarked (we cannot impute it) or a missing target.
# dropna is a no-op when nothing is missing, so no explicit isnull() check is needed.
train_full = train_full.dropna(subset=['Embarked', 'Survived'])

### Prepare test data to look like the final train data

In [4]:
# Load the raw test data (re-read on every run, so this cell can be re-executed safely)
test_full = pd.read_csv(f'{file_path}/input/test.csv')
print(test_full.shape)

# Keep the passenger ids for the submission file, then drop the unnecessary columns
ids = test_full['PassengerId']
test_full = test_full.drop(columns=['Name', 'Ticket', 'PassengerId'])

# Record the categorical and numerical column names
# (captured here, while Cabin is still part of the frame)
object_cols = [col for col in test_full.columns if test_full[col].dtype == 'object']
numerical_cols = [col for col in test_full.columns if test_full[col].dtype in ['int64', 'float64']]

# Print the fraction of missing values for each variable (null counts computed in one pass)
n_rows = len(test_full)
for var, total_na in test_full.isnull().sum().items():
    print(f'{var}: {total_na/n_rows}')

# Drop Cabin as it has too many missing values
test_full = test_full.drop(columns='Cabin')

(418, 11)
Pclass: 0.0
Sex: 0.0
Age: 0.20574162679425836
SibSp: 0.0
Parch: 0.0
Fare: 0.0023923444976076554
Cabin: 0.7822966507177034
Embarked: 0.0


In [5]:
# Fill missing Age and Fare values with the column mean.
# For each column the imputer is (re)fit on the training data only, and the
# training-set mean is then used to fill the gaps in the test data as well.
imp = SimpleImputer(strategy='mean')

for col in ['Age', 'Fare']:
    train_full[col] = imp.fit_transform(train_full[[col]])
    test_full[col] = imp.transform(test_full[[col]])

In [6]:
# One-hot encode the categorical variables in train and test
train_full = pd.get_dummies(train_full)
test_full = pd.get_dummies(test_full)

# get_dummies only creates columns for the categories present in the frame it is given,
# so encoding train and test separately can yield different column sets or orders.
# Align the test frame to the training feature columns (everything except the target):
# dummy columns missing from test are added as all-zero, columns unseen in train are dropped.
feature_cols = [col for col in train_full.columns if col != 'Survived']
test_full = test_full.reindex(columns=feature_cols, fill_value=0)

In [7]:
# Split the prepared training frame into the target (y) and the feature matrix (X)
y = train_full['Survived']
X = train_full.drop(columns='Survived')

# Learn the scaling parameters from the training features only, then apply the
# same transformation to the numerical columns of both train and test.
# NOTE: test_full is overwritten in place, so re-running this cell on its own
# would scale the already-scaled test data again.
scaler = StandardScaler()
scaler.fit(X[numerical_cols])
X[numerical_cols] = scaler.transform(X[numerical_cols])
test_full[numerical_cols] = scaler.transform(test_full[numerical_cols])

### Generate predictions for the test set

In [8]:
# Train an RBF-kernel SVM on the training features and predict the test set
svm = SVC(C=1.0, kernel='rbf', gamma=0.1, random_state=1000)
svm_model = svm.fit(X, y)
svm_preds = svm_model.predict(test_full)

# Pair each saved passenger id with its prediction and write the submission file
output = pd.DataFrame({'PassengerId': ids, 'Survived': svm_preds})
submission_csv = f'{file_path}/output/submission.csv'
output.to_csv(submission_csv, index=False)