# Projeto 1

Alunos:

- Lucas Amin
- Rafael Toyomoto
- William Henrique

## Imports

In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab

In [3]:
from sklearn import tree
from sklearn.model_selection import train_test_split

## Functions

In [4]:
def plot_correlation_map( df ):
    corr = df.corr()
    _ , ax = plt.subplots(figsize = (12,10))
    cmap = sns.diverging_palette(220,10, as_cmap=True)
    _ = sns.heatmap(
        corr, 
        cmap = cmap,
        square=True, 
        cbar_kws={'shrink':.9}, 
        ax=ax, 
        annot=True, 
        annot_kws={'fontsize':12})

In [5]:
def plot_categories( df , cat , target , **kwargs ):
    row = kwargs.get('row', None)
    col = kwargs.get('col', None)
    facet = sns.FacetGrid(df,row = row, col=col)
    facet.map(sns.barplot, cat, target)
    facet.add_legend()

In [6]:
def plot_distribution( df , var , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , hue=target , aspect=4 , row = row , col = col )
    facet.map( sns.kdeplot , var , shade= True )
    facet.set( xlim=( 0 , df[ var ].max() ) )
    facet.add_legend()

## Dataset
Carregando o dataset do titanic e observando os dados

In [7]:
#Adaptado de https://www.kaggle.com/sachinkulkarni/titanic/an-interactive-data-science-tutorial

df_train = pd.read_csv('train.csv')
df_test  = pd.read_csv('test.csv')
df_full  = pd.concat([df_train, df_test], sort=True, ignore_index=True)

In [8]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Avaliar integridade

### Dataset de treino

In [10]:
df_train.info()
df_train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
df_train.describe(include=["O"])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Butler, Mr. Reginald Fenton",male,347082,G6,S
freq,1,577,7,4,644


**Considerações**

- É possível observar problemas com os campos "Age", "Cabin" e "Embarked" quanto a valores nulos
- A média de sobrevivência indica que a maioria das pessoas morreram
- Idade mínima ser 0,42 indica problema nos valores contidos nesse campo (?)
- Fare igual a 0 indica possível problema nesse campo (alguem entrou de graça)

### Dataset de teste

In [12]:
df_test.info()
df_test.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [13]:
df_test.describe(include=["O"])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,418,418,418,91,418
unique,418,2,363,76,3
top,"Olsson, Mr. Oscar Wilhelm",male,PC 17608,B57 B59 B63 B66,S
freq,1,266,5,3,270


### Avaliar inconsistências
Ver registros com features nulas

In [14]:
print("Missing data counts in Training Data : ")
print(df_train.isnull().sum())

print("Missing data counts in Test Data : ")
print(df_test.isnull().sum())

print("Percentage of data missing Training Data: ")
print(df_train.isnull().sum()/df_train.shape[0])

print("Percentage of data missing Test Data: ")
print(df_test.isnull().sum()/df_test.shape[0])

Missing data counts in Training Data : 
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Missing data counts in Test Data : 
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
Percentage of data missing Training Data: 
PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64
Percentage of data missing Test Data: 
PassengerId    0.000000
Pclass         0.000000
Name           0.000000
Sex 

Avaliar se os ID dos passageiros estão certos

In [15]:
print("Train", df_train["PassengerId"].unique().size, "=", df_train.shape[0])
print("Test", df_test["PassengerId"].unique().size, "=", df_test.shape[0])

Train 891 = 891
Test 418 = 418


Ver labels das **features** categóricas

In [16]:
for label in ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]:
    print(label, df_train[label].unique())
#C = Cherbourg
#Q = Queenstown
#S = Southampton

Pclass [3 1 2]
Sex ['male' 'female']
SibSp [1 0 3 4 2 5 8]
Parch [0 1 2 5 3 4 6]
Embarked ['S' 'C' 'Q' nan]


## Tratando os dados

In [17]:
train = df_train.copy()
test  = df_test.copy()

### Análise dos dados faltantes em grande volume

Cerca de 77% dos registros no **dataset de treino** e 78% no **dataset de teste** estão sem a informação de **Cabin**, sendo inevitável que esse atributo deverá ser ignorado.

Já para o atributo **Age**, em ambos **dataset** observamos uma taxa de valores faltando de 20%, aproximadamente, sendo possível tentar completar esses registros corrompidos.

In [18]:
train = train.drop(['Cabin'], axis=1)
test  = test.drop(['Cabin'], axis=1)

### Tratar o atributo Ticket

Ticket é apenas um nome, vamos ignorar o atributo que não vai ser útil para o treinamento

In [40]:
train = train.drop(['Ticket'], axis=1)
test  = test.drop(['Ticket'], axis=1)

### Tratar o atributo Fare

In [19]:
df_test[df_test["Fare"].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


Para o atributo **Fare** vamos completar com a média

In [20]:
test['Fare'] = df_test['Fare'].fillna(value=df_test['Fare'].mean(),axis=0)

### Tratar o atributo Embarked

In [21]:
df_train[df_train["Embarked"].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


Para o atributo **Embarked** vamos completar com o valor mais comum

In [22]:
train['Embarked'] = df_train['Embarked'].fillna(value='S',axis=0)

### Tratar atributos Parch e SibSp

In [43]:
train['FamilySize'] = train['Parch'] + train['SibSp'] + 1
test['FamilySize']  = test['Parch'] + test['SibSp'] + 1

train = train.drop(['Parch', 'SibSp'], axis=1)
test  = test.drop(['Parch', 'SibSp'], axis=1)

### Tratar os nomes
Tentar extrair "título" dos nomes das pessoas.

In [23]:
# Extrair todos os titulos do conjunto de treinamento
def extract_titles(df):
    pos = df.columns.get_loc('Name')
    titles = set({})
    for row in df.values:
        title = row[pos].split(',')[1].split('.')[0] + '.'.strip()
        titles.add(title)

    return titles

# Adicionar uma coluna nova de titulos para o conjunto de treinamento
def add_titles_to_df(df) :
    titles = extract_titles(df)
    pos = df.columns.get_loc('Name')
    title_list = []
    for row in df.values:
        for title in titles:
            if title in row[pos]:
                title_list.append(title)
                break
    df['Title'] = title_list
    return df

# Adicionar uma coluna com os titulos no conjunto de treinamentos
print(train)
add_titles_to_df(train)
add_titles_to_df(test)

train.head()

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
5              6         0       3   
6              7         0       1   
7              8         0       3   
8              9         1       3   
9             10         1       2   
10            11         1       3   
11            12         1       1   
12            13         0       3   
13            14         0       3   
14            15         0       3   
15            16         1       2   
16            17         0       3   
17            18         1       2   
18            19         0       3   
19            20         1       3   
20            21         0       2   
21            22         1       2   
22            23         1       3   
23            24         1       1   
24            25         0       3   
25          

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr.
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs.
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss.
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Mrs.
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr.


Foi verificado que existem títulos com poucas entradas, o que pode indicar ruídos

In [24]:
train['Title'].value_counts()

 Mr.              517
 Miss.            182
 Mrs.             125
 Master.           40
 Dr.                7
 Rev.               6
 Mlle.              2
 Major.             2
 Col.               2
 Don.               1
 the Countess.      1
 Lady.              1
 Capt.              1
 Jonkheer.          1
 Mme.               1
 Ms.                1
 Sir.               1
Name: Title, dtype: int64

Combinando os titulos incomuns com os comuns

In [25]:
male_titles = [' Col.',' Major.',' Capt.',' Jonkheer.',' Don.',' Sir.']
female_titles = [' Lady.',' Mme.',' the Countess.',' Dona.',' Mlle.']

def replace_uncommon_titles(df,new_title,title_list):
    pos = df.columns.get_loc('Title')
    for title in title_list:
        for i in range(0,df.shape[0]):
            if df.iloc[i,pos] == title:
                print(title)
                df.iloc[i,pos] = new_title                
    return df

train = replace_uncommon_titles(train,' Mr.',male_titles)
train = replace_uncommon_titles(train," Miss.",female_titles)
test = replace_uncommon_titles(test," Mr.",male_titles)
test = replace_uncommon_titles(test," Miss.",female_titles)

# Checando os dados de cada titulo
titles = list(train['Title'].value_counts().index)
for title in titles:
    print("Title train:: ", title)
    print(train[train['Title'] == title].describe()["Age"])
    print("Title test:: ", title)
    print(test[test['Title'] == title].describe()["Age"])

 Col.
 Col.
 Major.
 Major.
 Capt.
 Jonkheer.
 Don.
 Sir.
 Lady.
 Mme.
 the Countess.
 Mlle.
 Mlle.
 Col.
 Col.
 Dona.
Title train::   Mr.
count    406.000000
mean      32.740148
std       12.930425
min       11.000000
25%       23.000000
50%       30.000000
75%       39.750000
max       80.000000
Name: Age, dtype: float64
Title test::   Mr.
count    185.000000
mean      32.194595
std       11.891720
min       14.000000
25%       23.000000
50%       29.000000
75%       40.000000
max       67.000000
Name: Age, dtype: float64
Title train::   Miss.
count    151.000000
mean      22.066225
std       12.983121
min        0.750000
25%       14.750000
50%       22.000000
75%       30.000000
max       63.000000
Name: Age, dtype: float64
Title test::   Miss.
count    65.000000
mean     22.039846
std      10.593381
min       0.170000
25%      18.000000
50%      22.000000
75%      30.000000
max      45.000000
Name: Age, dtype: float64
Title train::   Mrs.
count    108.000000
mean      35.898148
st

### Tratando a idade (Age)
Utilizando os titulos para aproximar as idades faltantes

In [26]:
age_mean = train.groupby("Title").mean()['Age']

def fill_age_na(df,age_mean):
    rows_with_age_missing = df[df['Age'].isnull()]
    pos = df.columns.get_loc("Age")
    for title in age_mean.index:
        passengerIds = rows_with_age_missing[rows_with_age_missing['Title'] == title]["PassengerId"]
        for Id in passengerIds:
            df.iloc[df[df['PassengerId'] == Id].index.values,pos] = age_mean[title]
    return df

train = fill_age_na(train,age_mean)
test = fill_age_na(test,age_mean)

train[train['Age'].isnull()]
test[test['Age'].isnull()]
train.isnull().sum()
test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Title          0
dtype: int64

## Tratando atributos categóricos

Para o sexo, vamos apenas atribuir valor 1 para 'male' e 0 para 'female'

In [54]:
train["Sex_Code"] = np.where(train["Sex"] == 'male', 1, 0)
test["Sex_Code"]  = np.where(test["Sex"] == 'male', 1, 0)

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Embarked,Title,FamilySize,Sex_Code
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,7.25,S,Mr.,2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,C,Mrs.,2,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,7.925,S,Miss.,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,S,Mrs.,2,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,8.05,S,Mr.,1,1


In [39]:
#Cria uma nova variável para cada valor único de "Embarked" (no caso, Embarked_C  Embarked_Q  Embarked_S)
embarked = pd.get_dummies(train.Embarked, prefix='Embarked')

embarked.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [29]:
#Cria uma nova variável para cada valor único de "Pclass"
pclass = pd.get_dummies(df_full.Pclass , prefix='Pclass' )

pclass.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [31]:
#As distinções refletiam o status social e podem ser utilziados para prever a probabilidade de sobrevivência
title = pd.get_dummies(df_full.Pclass , prefix='Pclass' )

title.head()

Unnamed: 0,Master,Miss,Mr,Mrs,Officer,Royalty
0,0,0,1,0,0,0
1,0,0,0,1,0,0
2,0,1,0,0,0,0
3,0,0,0,1,0,0
4,0,0,1,0,0,0


In [32]:
#Extrai a categoria da cabine a partir do número
cabin = pd.DataFrame()

#Substitui dados faltantes por "U" (Uknown)
cabin['Cabin'] = df_full.Cabin.fillna( 'U' )

#Mapeia cada valor de cabine com a letra
cabin['Cabin'] = cabin['Cabin'].map(lambda c : c[0])

#Cria uma variável para cada categoria
cabin = pd.get_dummies(cabin['Cabin'] , prefix = 'Cabin')

cabin.head()

Unnamed: 0,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1


In [33]:
#Extrai a classe de cada ticket a partir do seu número
#Caso não tenha prefixo, retorna XXX
def cleanTicket( ticket ):
    ticket = ticket.replace('.', '')
    ticket = ticket.replace('/', '')
    ticket = ticket.split()
    ticket = map( lambda t : t.strip() , ticket )
    ticket = list(filter( lambda t : not t.isdigit() , ticket))
    if len( ticket ) > 0:
        return ticket[0]
    else: 
        return 'XXX'

ticket = pd.DataFrame()

#Cria uma nova variável para cada caso
ticket['Ticket'] = df_full['Ticket'].map(cleanTicket)
ticket = pd.get_dummies(ticket['Ticket'] , prefix='Ticket')

ticket.shape
ticket.head()

Unnamed: 0,Ticket_A,Ticket_A4,Ticket_A5,Ticket_AQ3,Ticket_AQ4,Ticket_AS,Ticket_C,Ticket_CA,Ticket_CASOTON,Ticket_FC,...,Ticket_SOTONO2,Ticket_SOTONOQ,Ticket_SP,Ticket_STONO,Ticket_STONO2,Ticket_STONOQ,Ticket_SWPP,Ticket_WC,Ticket_WEP,Ticket_XXX
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [34]:
#Cria variáveis para representar o tamanho da família e também cada categoria
family = pd.DataFrame()

#Cria nova característica que representa o tamanho da família (quantidade de membros)
family['FamilySize'] = df_full['Parch'] + df_full['SibSp'] + 1

#Cria nova características para representar o tipo de família 
family['Family_Single'] = family['FamilySize'].map(lambda s : 1 if s == 1 else 0)
family['Family_Small']  = family['FamilySize'].map(lambda s : 1 if 2 <= s <= 4 else 0)
family['Family_Large']  = family['FamilySize'].map(lambda s : 1 if 5 <= s else 0)

family.head()

Unnamed: 0,FamilySize,Family_Single,Family_Small,Family_Large
0,2,0,1,0
1,2,0,1,0
2,1,1,0,0
3,2,0,1,0
4,1,1,0,0


In [35]:
#Seleciona as características que serão incluídas no descritor (vetor de características)
full_X = pd.concat([imputed, embarked, family, sex, title] , axis=1)
full_X.head()

Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,FamilySize,Family_Single,Family_Small,Family_Large,Sex,Master,Miss,Mr,Mrs,Officer,Royalty
0,22.0,7.25,0,0,1,2,0,1,0,1,0,0,1,0,0,0
1,38.0,71.2833,1,0,0,2,0,1,0,0,0,0,0,1,0,0
2,26.0,7.925,0,0,1,1,1,0,0,0,0,1,0,0,0,0
3,35.0,53.1,0,0,1,2,0,1,0,0,0,0,0,1,0,0
4,35.0,8.05,0,0,1,1,1,0,0,1,0,0,1,0,0,0
