In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn import preprocessing #to transform the feature labels
from sklearn.feature_extraction import DictVectorizer #to turn categorial variables into numeric arrays
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer
from sklearn.ensemble import VotingClassifier

In [2]:
train_set = pd.read_csv("Lab5_train.csv")
test_set = pd.read_csv("Lab5_test.csv")
train_set.head()

Unnamed: 0,PassengerID,Survived,Pclass,Name,Sex,Age,SibSp,ParCh,Ticket,Fare,Cabin,Embarked
0,1302,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C
1,369,0,2,"Chapman, Mr. John Henry",male,37.0,1,0,SC/AH 29037,26.0,,S
2,1128,0,3,"Petroff, Mr. Nedelio",male,19.0,0,0,349212,7.8958,,S
3,491,0,2,"Mack, Mrs. (Mary)",female,57.0,0,0,S.O./P.P. 3,10.5,E77,S
4,1087,0,3,"Olsson, Miss. Elina",female,31.0,0,0,350407,7.8542,,S


Checking majority in train set

In [3]:
len(train_set[train_set["Survived"] == 1])/len(train_set) * 100

38.68852459016394

In [4]:
len(train_set[train_set["Survived"] == 0])/len(train_set) * 100

61.31147540983607

So, approx 61% didnt survive and 39% survived.

Extract Features: Using Pclass, Sex, Age, SubSp, parCh to train SVM
Adding additional features Fare and Embarked

In [5]:
train_set_features = train_set[['Pclass', 'Sex', 'Age', 'SibSp', 'ParCh','Survived']]
test_set_features = test_set[['Pclass', 'Sex', 'Age', 'SibSp', 'ParCh']]

In [6]:
train_set_features['Age'].fillna(train_set_features['Age'].mean(), inplace=True)
test_set_features['Age'].fillna(test_set_features['Age'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [7]:
target = train_set_features.Survived

Encoding categorical features

In [8]:
vec = DictVectorizer()
train_set_non_class = vec.fit_transform(train_set_features[['Pclass', 'Sex', 'SibSp', 'ParCh']].to_dict('records')).toarray()
test_set_non_class = vec.fit_transform(test_set_features[['Pclass', 'Sex', 'SibSp', 'ParCh']].to_dict('records')).toarray()

Scale feature values

In [9]:
min_max_scaler = preprocessing.MinMaxScaler()
train_set_age_scaled = min_max_scaler.fit_transform(train_set_features.Age)
test_set_age_scaled = min_max_scaler.fit_transform(test_set_features.Age)



In [10]:
train_set_age_scaled_non_class = np.concatenate((train_set_non_class, np.reshape(train_set_age_scaled,(915,1))), axis=1)
test_set_age_scaled_non_class = np.concatenate((test_set_non_class, np.reshape(test_set_age_scaled,(394,1))), axis=1)

Train machine learning classifier

In [11]:
#Use cross validation to improve score
C_range = range(-5, 16)
gamma_range = range(-15, 4)
best_score = 0
best_C = 0
best_gamma = 0
best_model = svm.SVC(kernel='rbf')
for C in C_range:
    for gamma in gamma_range:
        clf = svm.SVC(kernel='rbf', C=2**C, gamma=2**gamma)
        scores = cross_val_score(clf, train_set_age_scaled_non_class, target, cv=5)
        score = scores.mean()
        if best_score < score:
            best_score = score
            best_C = C
            best_gamma = gamma
            best_model = clf
print(best_score, best_C, best_gamma)

0.811977604244 11 -6


In [14]:
clf1 = svm.SVC(kernel='rbf', C=2**best_C, gamma=2**best_gamma) # best c is 1, best gamma is -6
clf2 = MLPClassifier(solver='lbfgs', activation='logistic')
clf3 = DecisionTreeClassifier()


In [20]:
eclf = VotingClassifier(estimators=[('svc', clf1), ('mlp', clf2), ('dc', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['SVM', 'Neural Network', 'Decision Tree', 'Ensemble']):
    scores = cross_val_score(clf, train_set_age_scaled_non_class, target, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.81 (+/- 0.02) [SVM]
Accuracy: 0.79 (+/- 0.02) [Neural Network]
Accuracy: 0.78 (+/- 0.01) [Decision Tree]
Accuracy: 0.80 (+/- 0.02) [Ensemble]


In [21]:
eclf.fit(train_set_age_scaled_non_class, target)
target_predicted = eclf.predict(test_set_age_scaled_non_class)

Write to csv

In [22]:
output = pd.DataFrame({"PassengerId": test_set.PassengerID.values, 
              "Survived": target_predicted})
output.to_csv("submission_ensemble.csv", index=False)