In [1]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import GaussianNB 
from sklearn.neural_network import MLPClassifier

In [6]:
# Read the three input tables, in order: training rows, rows to predict,
# and the per-passenger labels used further down for scoring.
train, test, test_y = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [7]:
# Keep only label rows with a PassengerId above 891, then reduce them to the
# single 'Survived' column that the scoring cells compare predictions against.
# NOTE(review): gender_submission.csv is presumably a sample submission rather
# than true outcomes — confirm before reading the scores below as accuracy.
test_y = test_y[test_y['PassengerId'].gt(891)]
test_columns = ['Survived']
test_y_new = test_y.loc[:, test_columns]
test_y_new.count()

Survived    418
dtype: int64

In [8]:
# Impute missing values with statistics taken from the training split only.
# Filling the test split from its own mean (and the missing fare with an
# arbitrary 0) would make "missing" mean different things in the two frames.
age_fill = train['Age'].mean()
fare_fill = train['Fare'].median()

train['Age'] = train['Age'].fillna(age_fill)
test['Age'] = test['Age'].fillna(age_fill)
test['Fare'] = test['Fare'].fillna(fare_fill)

# Non-null counts per column, to confirm Age and Fare are now complete.
test.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            418
SibSp          418
Parch          418
Ticket         418
Fare           418
Cabin           91
Embarked       418
dtype: int64

In [9]:
# Both splits are reduced to the same raw input columns; the target is kept
# separately as a Series.
# NOTE(review): PassengerId is carried into the frames the models are fitted
# on; it is presumably only an identifier — confirm whether it should be
# excluded from fitting.
train_columns = ['PassengerId', 'Sex', 'Pclass', 'Age', 'Fare', 'Embarked', 'Parch', 'SibSp']
test_columns = list(train_columns)

train_y_new = train['Survived']
train_x_new = train.loc[:, train_columns]
test_x_new = test.loc[:, test_columns]

test_x_new.count()

PassengerId    418
Sex            418
Pclass         418
Age            418
Fare           418
dtype: int64

In [10]:
# One-hot encode the categorical columns, dropping the first level of each.
train_dummies = pd.get_dummies(train_x_new, drop_first=True)

# Encoding the two splits independently gives no guarantee that they end up
# with the same columns (a category present in only one split would add or
# remove a column). Force the test frame onto the training layout: columns
# missing from the test split are added as all-zero, extras are dropped, and
# the column order follows the training frame.
test_dummies = (
    pd.get_dummies(test_x_new, drop_first=True)
    .reindex(columns=train_dummies.columns, fill_value=0)
)
test_dummies.head(3)

Unnamed: 0,PassengerId,Pclass,Age,Fare,Sex_male
0,892,3,34.5,7.8292,1
1,893,3,47.0,7.0,0
2,894,2,62.0,9.6875,1


In [11]:
# male_fare is 1 for rows that are male (Sex_male == 1) with Fare below 20,
# and 0 everywhere else. The same rule is applied to both splits.
train_dummies['male_fare'] = (
    train_dummies['Fare'].lt(20) & train_dummies['Sex_male'].eq(1)
).astype('int64')
test_dummies['male_fare'] = (
    test_dummies['Fare'].lt(20) & test_dummies['Sex_male'].eq(1)
).astype('int64')
test_dummies.count()

PassengerId    418
Pclass         418
Age            418
Fare           418
Sex_male       418
male_fare      418
dtype: int64

In [12]:
# male_age is 1 for rows that are male (Sex_male == 1) with Age above 6,
# and 0 everywhere else. The same rule is applied to both splits.
train_dummies['male_age'] = (
    train_dummies['Age'].gt(6) & train_dummies['Sex_male'].eq(1)
).astype('int64')
test_dummies['male_age'] = (
    test_dummies['Age'].gt(6) & test_dummies['Sex_male'].eq(1)
).astype('int64')

# Only the last expression of a cell is displayed, so the former mid-cell
# `test_dummies.count()` was evaluated and thrown away; it has been removed.
train_dummies['male_age'].value_counts()

1    553
0    338
Name: male_age, dtype: int64

In [13]:
# Disabled experiment: a 'male_Pclass' flag derived from Pclass and 'male_age'.
# NOTE(review): the train-side mask below tests `Pclass <= 2.5`, while the
# test-side mask tests `Age > 6`, so the two frames would receive different
# definitions of the flag — reconcile the conditions before re-enabling.
# train_dummies['male_Pclass'] = 0
# train_dummies.loc[(train_dummies['Pclass'] <= 2.5) & (train_dummies['male_age'] == 0), 'male_Pclass'] = 1
# test_dummies['male_Pclass'] = 0
# test_dummies.loc[(test_dummies['Age'] > 6) & (test_dummies['male_age'] == 0), 'male_Pclass'] = 1
# test_dummies.count()

## Split Model

## Decision Tree

In [14]:
# Shallow decision tree: at most two levels, at least 20 samples per leaf,
# and a node is only split when it holds at least 20% of the samples.
clf_dec = DecisionTreeClassifier(
    max_depth=2,
    min_samples_leaf=20,
    min_samples_split=0.2,
    # Fixed seed: candidate features are permuted at every split, so without
    # it ties between equally good splits can resolve differently per run.
    random_state=42,
)
clf_dec.fit(train_dummies, train_y_new)

# Accuracy, then F1, of the test predictions against test_y_new.
print(clf_dec.score(test_dummies, test_y_new))
print(f1_score(test_y_new, clf_dec.predict(test_dummies)))

0.8229665071770335
0.6837606837606838


In [15]:
# Table of the tree's feature importances, labelled by the column names seen
# during fit and ordered from least to most important.
pd.Series(
    data=clf_dec.feature_importances_,
    index=clf_dec.feature_names_in_,
).rename("Importance").sort_values().to_frame()

Unnamed: 0,Importance
PassengerId,0.0
Age,0.0
Fare,0.0
Sex_male,0.0
male_fare,0.0
Pclass,0.253817
male_age,0.746183


In [16]:
# Disabled: draw the fitted decision tree on a 40x40 figure, labelling nodes
# with the training column names (proportions shown, nodes colour-filled).
# plt.figure(figsize=(40,40))
# plot_tree(clf_dec, feature_names=train_dummies.columns, proportion=True, filled=True)
# plt.show()

## Logistic Regression

In [17]:
# Logistic regression with default settings; fit() returns the estimator
# itself, so construction and training are chained.
clf_ln = LogisticRegression().fit(train_dummies, train_y_new)

# Accuracy first, F1 second, each on its own line.
print(
    clf_ln.score(test_dummies, test_y_new),
    f1_score(test_y_new, clf_ln.predict(test_dummies)),
    sep="\n",
)

0.9760765550239234
0.967948717948718


## Naive Bayes

In [18]:
# Gaussian naive Bayes with default settings, trained on the same frame as
# the other models.
clf_gnb = GaussianNB().fit(train_dummies, train_y_new)

# Accuracy first, F1 second, each on its own line.
print(
    clf_gnb.score(test_dummies, test_y_new),
    f1_score(test_y_new, clf_gnb.predict(test_dummies)),
    sep="\n",
)

0.9593301435406698
0.9470404984423676


## Neural Network

In [19]:
# Multi-layer perceptron with default architecture, capped at 100 iterations,
# a 1e-5 improvement tolerance, and per-iteration loss logging.
# random_state is pinned because weight initialisation and batch shuffling are
# random; without it the scores and the submission built from this model
# change on every run.
# NOTE(review): the inputs are fed in unscaled; MLPs are usually sensitive to
# feature scale — consider standardising the features before fitting.
clf_neural = MLPClassifier(
    max_iter=100,
    verbose=True,
    tol=1e-5,
    random_state=42,
)
clf_neural.fit(train_dummies, train_y_new)


Iteration 1, loss = 7.64651827
Iteration 2, loss = 1.98807717
Iteration 3, loss = 2.48293435
Iteration 4, loss = 1.43799862
Iteration 5, loss = 1.51868138
Iteration 6, loss = 0.98524689
Iteration 7, loss = 0.83234046
Iteration 8, loss = 0.79128906
Iteration 9, loss = 0.75611541
Iteration 10, loss = 0.70945105
Iteration 11, loss = 0.66577053
Iteration 12, loss = 0.61255138
Iteration 13, loss = 0.61352561
Iteration 14, loss = 0.60107198
Iteration 15, loss = 0.59239422
Iteration 16, loss = 0.59012124
Iteration 17, loss = 0.58399745
Iteration 18, loss = 0.57946154
Iteration 19, loss = 0.58026988
Iteration 20, loss = 0.57335002
Iteration 21, loss = 0.56300024
Iteration 22, loss = 0.56574256
Iteration 23, loss = 0.56115668
Iteration 24, loss = 0.55090933
Iteration 25, loss = 0.54830663
Iteration 26, loss = 0.54232400
Iteration 27, loss = 0.53633222
Iteration 28, loss = 0.53682554
Iteration 29, loss = 0.53140772
Iteration 30, loss = 0.53492513
Iteration 31, loss = 0.52835203
Iteration 32, los

In [20]:
# Accuracy first, F1 second, each on its own line.
print(
    clf_neural.score(test_dummies, test_y_new),
    f1_score(test_y_new, clf_neural.predict(test_dummies)),
    sep="\n",
)

0.7631578947368421
0.6024096385542168


## Final Transformations

In [21]:
# Build the submission frame from the identifier column alone.
# The previous version dropped a hard-coded list of feature columns, which
# went stale once Parch, SibSp and the Embarked dummies were added to the
# features: those columns were left in the submission. Selecting PassengerId
# explicitly keeps the output at exactly two columns whatever the feature set.
df_test = test_dummies[['PassengerId']].copy()
df_test["Survived"] = clf_neural.predict(test_dummies)
df_test.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [None]:
# Persist the prediction table as CSV, leaving out the DataFrame index.
df_test.to_csv(path_or_buf="./submission.csv", index=False)

In [None]:
# Disabled sanity check: read the written submission file back and preview
# its first rows.
# testando = pd.read_csv('./submission.csv')
# testando.head()