In [2]:
#Core data libraries
import numpy as np
import pandas as pd

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import model_selection

#Visualization
import matplotlib.pyplot as plt

In [None]:
#load the raw train and test CSV files; read_csv reference: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
data_org, data_test_org = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
#data info: preview the first rows, then summarise the columns
print(data_org.head())
#DataFrame.info() writes its report to stdout itself and returns None, so it is
#called directly; wrapping it in print() only appended a stray "None" line
data_org.info()

In [None]:
#work on deep copies so the frames loaded from disk stay untouched
data_train, data_test = (frame.copy(deep = True) for frame in (data_org, data_test_org))

In [None]:
#make list to clean both datasets at once
#the list stores references to data_train and data_test themselves (no copies are made),
#so changes applied in place to its elements are applied to those two frames
data_cleaner = [data_train, data_test]

In [None]:
#clean data
#every step below modifies the frames held in data_cleaner in place, so the
#results are visible through data_train / data_test afterwards
stat_min = 10 #while small is arbitrary, we'll use the common minimum in statistics: http://nicholasjjackson.com/2012/03/08/sample-size-is-10-a-magic-number/
drop_feature = ['PassengerId','Name','Ticket','Cabin']
for dataset in data_cleaner:
    #missing value in Fare and Embarked (Age later)
    #complete Embarked with mode
    #the filled column is assigned back to the frame: calling fillna(inplace=True) on
    #dataset['Embarked'] is chained assignment and is not guaranteed to modify the
    #frame itself (it never does under pandas Copy-on-Write)
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    #complete missing Fare with median
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

    #new feature Title from name: the text between the first ", " and the next "."
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
    #true/false series indexed by title: True when the title occurs fewer than stat_min times
    title_names = (dataset['Title'].value_counts() < stat_min)
    #collapse the rare titles into a single 'Misc' bucket (computed per dataset)
    dataset['Title'] = dataset['Title'].apply(lambda x: 'Misc' if title_names.loc[x] else x)

    #new feature Family size: siblings/spouses + parents/children + the passenger
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    #new feature IsAlone
    dataset['IsAlone'] = 1 #initialize to yes/1 is alone
    #update to no/0 if family size is greater than 1; a single .loc call on the frame
    #replaces the chained dataset['IsAlone'].loc[...] = 0 form
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # create dummy variables for Pclass column, & drop 3rd class as it has the lowest average of survived passengers
    #assumes all three classes occur in the dataset; otherwise the rename below raises
    dummy_pclass = pd.get_dummies(dataset['Pclass'])
    dummy_pclass.columns = ['Class_1','Class_2','Class_3']
    dataset['Class_1'] = dummy_pclass['Class_1']
    dataset['Class_2'] = dummy_pclass['Class_2']

    #drop the encoded Pclass column together with the features that are not used
    dataset.drop(['Pclass'] + drop_feature, axis=1, inplace=True)

In [None]:
#one-hot encode the remaining categorical columns of both frames
train_dummy = pd.get_dummies(data_train)
test_dummy = pd.get_dummies(data_test)
#get_dummies only creates columns for the categories present in the frame it is given,
#so the two frames can end up with different columns; align the test frame to the
#training feature columns (everything except the Survived target): categories missing
#from the test data become all-zero columns, categories unseen in training are dropped
test_dummy = test_dummy.reindex(columns = train_dummy.columns.drop('Survived'), fill_value = 0)

In [None]:
#complete Age with RF: fit a random forest on the training rows whose age is known and
#use it to predict the missing ages in both the train and the test frame
from sklearn.ensemble import RandomForestRegressor

#True where an age is present (NaN > 0 evaluates to False, so missing ages are excluded)
train_data_age = train_dummy['Age']>0
test_data_age = test_dummy['Age']>0

#predictors for the age model: every column except Age itself and the Survived target
train_age_features = train_dummy.drop(['Age','Survived'], axis=1)
train_age_x = train_age_features.loc[train_data_age]
train_age_y = train_dummy.loc[train_data_age, 'Age']
train_missing_age_x = train_age_features.loc[~train_data_age]
test_missing_age_x = test_dummy.drop(['Age'], axis=1).loc[~test_data_age]

# Fitting RandomForest to the dataset
regressor = RandomForestRegressor(random_state = 10, warm_start = True, 
                                  n_estimators = 26,
                                  max_depth = 6, 
                                  max_features = 'sqrt')
regressor.fit(train_age_x, train_age_y)

#write the predictions with a single .loc call on the frame; the chained form
#frame['Age'][mask] = ... is not guaranteed to modify the frame.
#predict() is skipped when there is nothing to fill, since it rejects empty input
if len(train_missing_age_x) > 0:
    train_dummy.loc[~train_data_age, 'Age'] = regressor.predict(train_missing_age_x)
if len(test_missing_age_x) > 0:
    test_dummy.loc[~test_data_age, 'Age'] = regressor.predict(test_missing_age_x)


In [None]:
# if age is 12 or under, set sex to child (neither male nor female)
for frame in (train_dummy, test_dummy):
    is_child = frame['Age'] <= 12
    for sex_column in ('Sex_female', 'Sex_male'):
        #single .loc assignment on the frame instead of the chained frame[col][mask] = 0,
        #which is not guaranteed to modify the frame; the zero is built from the column's
        #own dtype so a boolean dummy column receives False rather than an int
        frame.loc[is_child, sex_column] = frame[sex_column].dtype.type(0)

In [None]:
#split data to train and cv (80% / 20%)
#random_state is fixed so the split, and therefore the reported accuracy, is the same on
#every run; the other estimators in this notebook are seeded as well
train_x, cv_x, train_y, cv_y = model_selection.train_test_split(
    train_dummy.drop(['Survived'], axis=1),
    train_dummy['Survived'],
    test_size = 0.2,
    random_state = 0)

In [None]:
#Feature scaling
from sklearn.preprocessing import StandardScaler
#fit the scaler on the training split only, then apply that same scaling to both splits
sc_X = StandardScaler().fit(train_x)
train_x, cv_x = sc_X.transform(train_x), sc_X.transform(cv_x)

In [None]:
# Train an RBF-kernel support vector classifier on the scaled training split
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', C = 1, random_state = 0)
classifier.fit(X = train_x, y = train_y)

In [None]:
#predict labels for the cv features with the fitted classifier
y_pred = classifier.predict(cv_x)

In [None]:
#accuracy: fraction of cv rows whose predicted label matches the true label
from sklearn.metrics import accuracy_score
acc = accuracy_score(cv_y, y_pred)
#report the score; previously it was computed but never shown
print('CV accuracy: {:.4f}'.format(acc))

In [None]:
#test_x Feature scaling
#reuse the scaler that was fitted on the training split: transform() applies the training
#statistics, whereas fit_transform() would refit on the test data and feed the classifier
#features on a different scale from the one it was trained on
#NOTE(review): assumes test_dummy has the same feature columns, in the same order, as the
#training features - confirm
test_x = sc_X.transform(test_dummy)
#predict test data
y_pred_test = classifier.predict(test_x)

In [None]:
#output: pair each original test PassengerId with its predicted label and save as CSV
b = np.column_stack((data_test_org['PassengerId'], y_pred_test))
ans = pd.DataFrame(b, columns = ['PassengerId','Survived'])
ans.to_csv('titanic_svm_01.csv', header = True, index = False)