In [48]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import impute
from sklearn.dummy import DummyClassifier
#from sklearn import metrics

from sklearn import (
    ensemble,
    preprocessing,
    tree,
)
from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
)
from yellowbrick.classifier import (
    ConfusionMatrix,
    ROCAUC,
)
from yellowbrick.model_selection import (
    LearningCurve,
)
from sklearn.experimental import (
    enable_iterative_imputer,
)

# Collecting Data

In [2]:
url = (
    "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"
)
df = pd.read_excel(url)
orig_df = df

In [3]:
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


# Cleaning Data

In [4]:
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [5]:
df.shape

(1309, 14)

In [6]:
df.describe().iloc[:, :2]

Unnamed: 0,pclass,survived
count,1309.0,1309.0
mean,2.294882,0.381971
std,0.837836,0.486055
min,1.0,0.0
25%,2.0,0.0
50%,3.0,0.0
75%,3.0,1.0
max,3.0,1.0


In [7]:
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [8]:
df.isnull().sum(axis=1).loc[:10]

0     1
1     1
2     2
3     1
4     2
5     1
6     1
7     2
8     1
9     2
10    1
dtype: int64

In [9]:
mask = df.isnull().any(axis=1)

In [10]:
mask

0       True
1       True
2       True
3       True
4       True
        ... 
1304    True
1305    True
1306    True
1307    True
1308    True
Length: 1309, dtype: bool

In [11]:
(mask == None).sum()

0

In [12]:
df[mask].body.head()

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [13]:
df.sex.value_counts(dropna=False)

male      843
female    466
Name: sex, dtype: int64

In [14]:
df.embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

# Creating Attributes

In [15]:
name = df.name

In [16]:
name

0                         Allen, Miss. Elisabeth Walton
1                        Allison, Master. Hudson Trevor
2                          Allison, Miss. Helen Loraine
3                  Allison, Mr. Hudson Joshua Creighton
4       Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
                             ...                       
1304                               Zabour, Miss. Hileni
1305                              Zabour, Miss. Thamine
1306                          Zakarian, Mr. Mapriededer
1307                                Zakarian, Mr. Ortin
1308                                 Zimmerman, Mr. Leo
Name: name, Length: 1309, dtype: object

In [17]:
df = df.drop(
    columns=[
        "name",
        "ticket",
        "home.dest",
        "boat",
        "body",
        "cabin",
    ]
)

In [18]:
df = pd.get_dummies(df)

In [19]:
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [20]:
df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1,1,29.0,0,0,211.3375,1,0,0,0,1
1,1,1,0.9167,1,2,151.55,0,1,0,0,1
2,1,0,2.0,1,2,151.55,1,0,0,0,1
3,1,0,30.0,1,2,151.55,0,1,0,0,1
4,1,0,25.0,1,2,151.55,1,0,0,0,1


In [21]:
df = df.drop(columns="sex_male")

In [22]:
df = pd.get_dummies(df, drop_first=True)

In [23]:
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [24]:
y = df.survived
X = df.drop(columns="survived")

# Separating Samples

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Imputing Data

In [26]:
    num_cols=[
        "pclass",
        "age",
        "sibsp",
        "parch",
        "fare",
        "sex_female",
    ]

In [27]:
imputer = impute.IterativeImputer()
imputed = imputer.fit_transform(
    X_train[num_cols]
)
X_train.loc[:, num_cols] = imputed
imputed = imputer.transform(X_test[num_cols])
X_test.loc[:, num_cols] = imputed

## Imputing Data using Median

In [28]:
meds = X_train.median()
X_train = X_train.fillna(meds)
X_test = X_test.fillna(meds)

# Normalizing Data

In [29]:
# cols = "pclass,age,sibsp,fare".split(",")
cols = X.columns
cols

Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_female', 'embarked_C',
       'embarked_Q', 'embarked_S'],
      dtype='object')

In [30]:
sca = preprocessing.StandardScaler()

In [31]:
X_train = sca.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=cols)

In [32]:
X_train = X_train.drop(
    columns=[
        "sex_female",
        "embarked_C",
        "embarked_Q",
        "embarked_S"
    ]
)

In [33]:
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare
0,0.825248,-0.128878,-0.498616,-0.432553,-0.473599
1,0.825248,-0.205639,-0.498616,-0.432553,-0.48812
2,-0.363317,-0.751431,-0.498616,-0.432553,-0.145224
3,0.825248,-2.198733,6.897852,1.805972,0.679618
4,0.825248,-0.049698,-0.498616,-0.432553,-0.490408


In [34]:
X_test = sca.fit_transform(X_test)
X_test = pd.DataFrame(X_test, columns=cols)

In [35]:
X_test = X_test.drop(
    columns=[
        "sex_female",
        "embarked_C",
        "embarked_Q",
        "embarked_S"
    ]
)

In [36]:
X_test.head()

Unnamed: 0,pclass,age,sibsp,parch,fare
0,0.882021,0.290523,-0.433273,-0.47975,-0.513001
1,0.882021,-0.761661,0.637647,0.777195,-0.337639
2,0.882021,-0.271347,-0.433273,-0.47975,-0.497314
3,0.882021,-0.271424,-0.433273,-0.47975,-0.494176
4,0.882021,-0.271274,-0.433273,-0.47975,-0.500281


# Refactoring

In [37]:
def tweak_titanic(df):
    df = df.drop(
        columns=[
            "name",
            "ticket",
            "home.dest",
            "boat",
            "body",
            "cabin",
        ]
    ).pipe(pd.get_dummies, drop_first=True)
    return df

In [38]:
def get_train_X_y(df, y_col, size=0.3, std_cols=None):
    y = df[y_col]
    X = df.drop(columns=y_col)
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=size, random_state=42
    )
    
    cols = X.columns
    num_cols = [
        "pclass",
        "age",
        "sibsp",
        "parch",
        "fare",
    ]
    
    fi = impute.IterativeImputer()
    
    X_train.loc[:, num_cols] = fi.fit_transform(X_train[num_cols])
    X_test.loc[:, num_cols] = fi.transform(X_test[num_cols])
    
    if std_cols:
        std = preprocessing.StandardScaler()
        X_train.loc[:, std_cols] = std.fit_transform(X_train[std_cols])
        X_test.loc[:, std_cols] = std.transform(X_test[std_cols])
        
    return X_train, X_test, y_train, y_test

In [39]:
ti_df = tweak_titanic(orig_df)
std_cols = "pclass,age,sibsp,fare".split(",")
X_train, X_test, y_train, y_test = get_train_X_y(
    ti_df, "survived", std_cols=std_cols
)

# Base Model

In [40]:
bm = DummyClassifier()

In [41]:
bm.fit(X_train, y_train)
bm.score(X_test, y_test) #precisão

0.5699745547073791

In [49]:
metrics.precision_score(y_test, bm.predict(X_test))

  _warn_prf(average, modifier, msg_start, len(result))


0.0