In [0]:
import numpy as np 
import pandas as pd 

Reading data from csv file

In [0]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_labels = pd.read_csv("gender_submission.csv")
targets = test_labels['Survived']

test['Survived'] = targets

In [16]:
# Total missing values for each feature in train
print(train.isnull().sum())
print(len(train))


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
891


In [17]:
# Total missing values for each feature in test
print(test.isnull().sum())
print(len(test))

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Survived         0
dtype: int64
418


a) Delete data with missing values

In [0]:
new_train = train.dropna()

new_test = test.dropna()

labels = new_train['Survived']
test_labels = new_test['Survived']

#Remove labels and non important features for classification
x = new_train.drop(columns=['Survived', 'Name', 'SibSp', 'Ticket', 'Cabin'])
x_test = new_test.drop(columns=['Survived', 'Name', 'SibSp', 'Ticket', 'Cabin'])


In [19]:
# Check if any missing values remains
print(x.isnull().values.any())
print(x_test.isnull().values.any())

False
False


Data preprocessing

In [0]:
x['Sex'] = x['Sex'].map({'male': 1, 'female': 0})
x['Embarked'] = x['Embarked'].map({'Q': 2, 'S': 1, 'C': 0})


x_test['Sex'] = x_test['Sex'].map({'male': 1, 'female': 0})
x_test['Embarked'] = x_test['Embarked'].map({'Q': 2, 'S': 1, 'C': 0})


In [21]:
from sklearn import svm

c = svm.SVC()
c.fit(x, labels)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [22]:
preds = c.predict(x_test)
print(accuracy_score(test_labels, preds))

0.5057471264367817


b) Estimate missing values

In [0]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_labels = pd.read_csv("gender_submission.csv")
targets = test_labels['Survived']

test['Survived'] = targets

In [42]:
# Total missing values for each feature in train
print(train.isnull().sum())
print(len(train))


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
891


In [43]:
# Total missing values for each feature in test
print(test.isnull().sum())
print(len(test))

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Survived         0
dtype: int64
418


In [4]:
age_avg = train['Age'].mean()
train['Age'][np.isnan(train['Age'])] = age_avg

age_avg_test = test['Age'].mean()
test['Age'][np.isnan(test['Age'])] = age_avg_test

print(age_avg, age_avg_test)

29.69911764705882 30.272590361445783


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Embarked feature also has some missing values, so we can fill them with the most fequent value of Embarked which is S

In [5]:
print(train.groupby(['Embarked'])['PassengerId'].count())

train["Embarked"] = train["Embarked"].fillna("S")

Embarked
C    168
Q     77
S    644
Name: PassengerId, dtype: int64


In [0]:
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

In [53]:
print(train.groupby(['Cabin'])['PassengerId'].count())

# train["Embarked"] = train["Embarked"].fillna("S")

Cabin
A10    1
A14    1
A16    1
A19    1
A20    1
      ..
F33    3
F38    1
F4     2
G6     4
T      1
Name: PassengerId, Length: 147, dtype: int64


In [0]:
import re

train['Cabin'] = train['Cabin'].fillna("D")
train['tmp'] = train['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
train['tmp'] = train['tmp'].map({"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8})
train['tmp'] = train['tmp'].fillna(4)


train = train.drop(['Cabin'], axis=1)

test['Cabin'] = test['Cabin'].fillna("D")
test['tmp'] = test['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
test['tmp'] = test['tmp'].map({"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8})
test['tmp'] = test['tmp'].fillna(4)


test = test.drop(['Cabin'], axis=1)



In [36]:
# No missing values remain

print(test.isnull().sum())
print(train.isnull().sum())

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Survived       0
tmp            0
dtype: int64
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
tmp            0
dtype: int64


Data preprocessing

In [0]:
labels = train['Survived']
test_labels = test['Survived']

#Remove labels and non important features for classification
x = train.drop(columns=['Survived', 'Name', 'SibSp', 'Ticket'])
x_test = test.drop(columns=['Survived', 'Name', 'SibSp', 'Ticket'])


x['Sex'] = x['Sex'].map({'male': 1, 'female': 0})
x['Embarked'] = x['Embarked'].map({'Q': 2, 'S': 1, 'C': 0})


x_test['Sex'] = x_test['Sex'].map({'male': 1, 'female': 0})
x_test['Embarked'] = x_test['Embarked'].map({'Q': 2, 'S': 1, 'C': 0})


In [9]:
from sklearn import svm

c = svm.SVC()
c.fit(x, labels)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [10]:
from sklearn.metrics import accuracy_score

preds = c.predict(x_test)
print(accuracy_score(test_labels, preds))


0.6483253588516746
