In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, Imputer
from sklearn.neighbors import KNeighborsClassifier
from fancyimpute import KNN

In [2]:
test = pd.read_csv('include/test.csv')
train = pd.read_csv('include/train.csv').dropna()
print(train.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   183.000000  183.000000  183.000000  183.000000  183.000000   
mean    455.366120    0.672131    1.191257   35.674426    0.464481   
std     247.052476    0.470725    0.515187   15.643866    0.644159   
min       2.000000    0.000000    1.000000    0.920000    0.000000   
25%     263.500000    0.000000    1.000000   24.000000    0.000000   
50%     457.000000    1.000000    1.000000   36.000000    0.000000   
75%     676.000000    1.000000    1.000000   47.500000    1.000000   
max     890.000000    1.000000    3.000000   80.000000    3.000000   

            Parch        Fare  
count  183.000000  183.000000  
mean     0.475410   78.682469  
std      0.754617   76.347843  
min      0.000000    0.000000  
25%      0.000000   29.700000  
50%      0.000000   57.000000  
75%      1.000000   90.000000  
max      4.000000  512.329200  


In [3]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str(big_string).find(substring) != -1:
            return substring
    return ''

In [4]:
train_reformated = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
train_reformated['Deck'] = train['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
train_reformated['Family_Size'] = train_reformated['SibSp'] + train_reformated['Parch']
train_reformated['Fare_Per_Person'] = train_reformated['Fare'] /(train_reformated['Family_Size'] + 1)

train_reformated['Sex'] =  LabelEncoder().fit_transform(train_reformated['Sex'])
train_reformated['Embarked'] = LabelEncoder().fit_transform(train_reformated['Embarked'])
train_reformated['Deck'] = LabelEncoder().fit_transform(train_reformated['Deck'])
train_reformated.drop(['Fare', 'Embarked', 'Family_Size', 'Parch', 'SibSp', 'Deck'], axis=1, inplace=True)
print(train_reformated.head())

    Survived  Pclass  Sex   Age  Fare_Per_Person
1          1       1    0  38.0        35.641650
3          1       1    0  35.0        26.550000
6          0       1    1  54.0        51.862500
10         1       3    0   4.0         5.566667
11         1       1    0  58.0        26.550000


In [5]:
clf = KNeighborsClassifier()
clf.fit(train_reformated.loc[:, train_reformated.columns != 'Survived'], train_reformated['Survived'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [6]:
test_passengerId = test['PassengerId']
test_reformated = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
test_reformated['Deck'] = test['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
test_reformated['Family_Size'] = test_reformated['SibSp'] + test_reformated['Parch']
test_reformated['Fare_Per_Person'] = test_reformated['Fare'] /(test_reformated['Family_Size'] + 1)

test_reformated['Sex'] =  LabelEncoder().fit_transform(test_reformated['Sex'])
test_reformated['Embarked'] = LabelEncoder().fit_transform(test_reformated['Embarked'])
test_reformated['Deck'] = LabelEncoder().fit_transform(test_reformated['Deck'])
test_reformated.drop(['SibSp', 'Parch', 'Fare', 'Embarked', 'Family_Size', 'Deck'], axis=1, inplace=True)
print(test_reformated.head())
print(test_reformated.isnull().sum().sum() / test_reformated.shape[0])

   Pclass  Sex   Age  Fare_Per_Person
0       3    1  34.5         7.829200
1       3    0  47.0         3.500000
2       2    1  62.0         9.687500
3       3    1  27.0         8.662500
4       3    0  22.0         4.095833
0.20813397129186603


In [7]:
print(test_reformated.columns[test_reformated.isnull().any()].tolist())
test_reformated_filled = KNN(k=3).complete(test_reformated)

test_predictions = clf.predict(test_reformated_filled).reshape(-1, 1)

combined = np.column_stack((test_passengerId, test_predictions))
df = pd.DataFrame(combined, columns=["PassengerId", "Survived"])

['Age', 'Fare_Per_Person']
Imputing row 1/418 with 0 missing, elapsed time: 0.030
Imputing row 101/418 with 0 missing, elapsed time: 0.078
Imputing row 201/418 with 1 missing, elapsed time: 0.079
Imputing row 301/418 with 0 missing, elapsed time: 0.081
Imputing row 401/418 with 0 missing, elapsed time: 0.082


In [8]:
df.to_csv('csv/answers17.csv',index=False)

In [9]:
print(test.isnull().sum())


PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
