In [71]:
import numpy as np
from data import Instance, DataSet
from miscellaneous import initialize_data, plot_graph, plot_points
from neural_network import NeuralNetwork
from activation_function import *
import matplotlib.pyplot as plt
from keras.datasets import mnist
import pandas as pd
import copy
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

#### Carregando dados e Ajustando labels

In [72]:
# Carregando dados
train = pd.read_csv('../dataset/train.csv')
test = pd.read_csv('../dataset/test.csv')

# Ajustando nomes
train_labels = [i.lower() for i in train.columns.values]
test_labels = [i.lower() for i in test.columns.values]

rename_dict_train = { i: j for i,j in zip(train.columns.values, train_labels) }
rename_dict_test = { i: j for i,j in zip(test.columns.values, test_labels) }

train.rename(columns = rename_dict_train, inplace = True)
test.rename(columns = rename_dict_test, inplace = True)

#### Selecionando atributos relevantes

In [73]:
train = train.filter(['pclass', 'sex', 'age', 'fare', 'cabin', 'embarked', 'survived'], axis = 1)
test = test.filter(['pclass', 'sex', 'age', 'fare', 'cabin', 'embarked', 'survived'], axis = 1)

##### Age

In [74]:
train['age'].fillna(train['age'].median(), inplace = True)
test['age'].fillna(test['age'].median(), inplace = True)

##### Sex

In [75]:
binarizer = LabelBinarizer()
train['sex'] = binarizer.fit_transform(train['sex'])
test['sex'] = binarizer.fit_transform(test['sex'])

##### Fare

In [76]:
train['fare'].fillna(train['fare'].median(), inplace = True)
test['fare'].fillna(test['fare'].median(), inplace = True)

##### Embarked

In [77]:
train['embarked'].fillna(str(train['embarked'].mode()), inplace = True)
test['embarked'].fillna(str(test['embarked'].mode()), inplace = True)

encoder = LabelEncoder()
train['embarked'] = encoder.fit_transform(train['embarked'])
test['embarked'] = encoder.fit_transform(test['embarked'])

##### Cabin

In [78]:
train.loc[train['cabin'].notnull(), 'cabin'] = train['cabin'].str[0]
train['cabin'].fillna('H', inplace = True)

test.loc[test['cabin'].notnull(), 'cabin'] = test['cabin'].str[0]
test['cabin'].fillna('H', inplace = True)

train['cabin'] = encoder.fit_transform(train['cabin'])
test['cabin'] = encoder.fit_transform(test['cabin'])

##### Remoção de Outliers

In [79]:
for c1, c2 in zip(train, test):
    train[c1][~(np.abs(train[c1]-train[c1].mean())>(3*train[c1].std()))]
    test[c2][~(np.abs(test[c2]-test[c2].mean())>(3*test[c2].std()))]

##### Normalização

In [80]:
min_max_s = MinMaxScaler(feature_range = (0,1))

for c1, c2 in zip(train, test):
    train[c1] = min_max_s.fit_transform(train[c1].reshape(-1, 1))
    test[c2] = min_max_s.fit_transform(test[c2].reshape(-1, 1))
    
print(train)

     pclass  sex       age      fare  cabin  embarked  survived
0       1.0  1.0  0.271174  0.014151  0.875  1.000000         0
1       0.0  0.0  0.472229  0.139136  0.250  0.333333         1
2       1.0  0.0  0.321438  0.015469  0.875  1.000000         1
3       0.0  0.0  0.434531  0.103644  0.250  1.000000         1
4       1.0  1.0  0.434531  0.015713  0.875  1.000000         0
5       1.0  1.0  0.346569  0.016510  0.875  0.666667         0
6       0.0  1.0  0.673285  0.101229  0.500  1.000000         0
7       1.0  1.0  0.019854  0.041136  0.875  1.000000         0
8       1.0  0.0  0.334004  0.021731  0.875  1.000000         1
9       0.5  0.0  0.170646  0.058694  0.875  0.333333         1
10      1.0  0.0  0.044986  0.032596  0.750  1.000000         1
11      0.0  0.0  0.723549  0.051822  0.250  1.000000         1
12      1.0  1.0  0.246042  0.015713  0.875  1.000000         0
13      1.0  1.0  0.484795  0.061045  0.875  1.000000         0
14      1.0  0.0  0.170646  0.015330  0.

  after removing the cwd from sys.path.
  """


In [14]:
x_train = train.filter(['pclass', 'sex', 'age', 'fare', 'cabin', 'embarked'], axis = 1)
x_test = test.filter(['pclass', 'sex', 'age', 'fare', 'cabin', 'embarked'], axis = 1)

y_train = train.filter(['survived'], axis=1)
y_test = test.filter(['survived'], axis=1)