In [88]:
import numpy as np
import pandas as pd

import aquire

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" may also hide pandas / scikit-learn
# deprecation and copy-related warnings raised by later cells — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [89]:
# Load the Titanic passenger data through the `aquire` module imported above.
# The result is a pandas DataFrame (see the `df.info()` output that follows).
df = aquire.get_titanic_data()

In [90]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
passenger_id    891 non-null int64
survived        891 non-null int64
pclass          891 non-null int64
sex             891 non-null object
age             714 non-null float64
sibsp           891 non-null int64
parch           891 non-null int64
fare            891 non-null float64
embarked        889 non-null object
class           891 non-null object
deck            203 non-null object
embark_town     889 non-null object
alone           891 non-null int64
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [91]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,445.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.602694
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.489615
min,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,222.5,0.0,2.0,20.125,0.0,0.0,7.9104,0.0
50%,445.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0
75%,667.5,1.0,3.0,38.0,1.0,0.0,31.0,1.0
max,890.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [92]:
# (rows, columns) of the loaded frame.
df.shape

(891, 13)

In [93]:
# Number of missing values in each column.
df.isnull().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
deck            688
embark_town       2
alone             0
dtype: int64

In [94]:
# Frequency of each embarkation code, with missing values counted as well.
df["embarked"].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: embarked, dtype: int64

In [95]:
# `deck` is missing for 688 of the 891 rows (see the null counts above),
# which is presumably why the column is discarded instead of imputed.
# Rebinding `df` (rather than `inplace=True`) together with errors="ignore"
# keeps the cell safe to re-run: a second execution is a no-op instead of
# raising KeyError because the column is already gone.
df = df.drop(columns="deck", errors="ignore")

In [96]:
# Re-inspect columns and non-null counts after the `deck` column was dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
passenger_id    891 non-null int64
survived        891 non-null int64
pclass          891 non-null int64
sex             891 non-null object
age             714 non-null float64
sibsp           891 non-null int64
parch           891 non-null int64
fare            891 non-null float64
embarked        889 non-null object
class           891 non-null object
embark_town     889 non-null object
alone           891 non-null int64
dtypes: float64(2), int64(6), object(4)
memory usage: 83.6+ KB


In [97]:
# Replace every missing value in `df` with np.nan, modifying the frame in place.
# NOTE(review): filling NaN with NaN looks like a no-op; presumably this is
# meant to turn `None` placeholders in object columns into np.nan so that
# SimpleImputer(missing_values=np.nan) recognises them — confirm.
df.fillna(np.nan,inplace=True)

In [98]:
# Split into train (80% of the rows) and test (the remainder).
# The fixed random_state makes the shuffle reproducible across runs.
train, test = train_test_split(
    df,
    train_size=0.8,
    random_state=123,
)

In [99]:
# Embarkation code frequencies in the training split, missing values included.
train["embarked"].value_counts(dropna=False)

S      515
C      128
Q       67
NaN      2
Name: embarked, dtype: int64

In [100]:
# Fill missing `embarked` values with the most frequent category.
# The imputer is fitted on the training split only, so the fill value that
# is applied to `test` is learned without looking at the held-out rows.
imp_mode = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

imp_mode.fit(train[["embarked"]])

# transform() returns an (n_rows, 1) array; flatten it so a plain 1-D column
# is stored. Bracket assignment is used (matching the `age` imputation cell)
# because attribute-style assignment would silently create a plain Python
# attribute, not a column, if `embarked` were ever absent from the frame.
train["embarked"] = imp_mode.transform(train[["embarked"]]).ravel()
test["embarked"] = imp_mode.transform(test[["embarked"]]).ravel()

In [101]:
# Check the training split's embarkation counts again; with dropna=False any
# remaining missing values would show up as their own row.
train["embarked"].value_counts(dropna=False)

S    517
C    128
Q     67
Name: embarked, dtype: int64

In [102]:
# Median-impute `age`. The median is learned from the training split only
# and then applied unchanged to both splits.
imp_median = SimpleImputer(
    missing_values=np.nan,
    strategy="median",
).fit(train[["age"]])

train["age"] = imp_median.transform(train[["age"]])
test["age"] = imp_median.transform(test[["age"]])

In [103]:
# Count of missing `age` values left in the training split after imputation.
train["age"].isnull().sum()

0

In [104]:
# Replace the `embarked` codes with integer labels.
# The label mapping is learned from the training split and the same fitted
# encoder is then applied to both splits.
int_encoder = LabelEncoder().fit(train["embarked"])

train["embarked"] = int_encoder.transform(train["embarked"])
test["embarked"] = int_encoder.transform(test["embarked"])

In [105]:
# Frequencies of the integer-encoded embarkation labels in the training split.
train["embarked"].value_counts()

2    517
0    128
1     67
Name: embarked, dtype: int64

In [112]:
# One-hot encode the integer-coded `embarked` column of the training split.
# The encoder takes a 2-D input, hence the single-column (n_rows, 1) reshape.
embarked_array = np.array(train["embarked"]).reshape(-1, 1)

# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and 1.4
# removed the old spelling. Try the current name first and fall back to the
# legacy one so this cell runs on either side of the rename; both spellings
# request a dense ndarray rather than a sparse matrix.
try:
    ohe = OneHotEncoder(sparse_output=False, categories="auto")
except TypeError:
    ohe = OneHotEncoder(sparse=False, categories="auto")

# Learn the categories from the training data and encode it in one step.
embarked_ohe = ohe.fit_transform(embarked_array)
embarked_ohe[0:5]

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [113]:
# One-hot encode the test split with the encoder that was fitted on the
# training split (transform only, no refit). Note that `embarked_array` is
# rebound here and now holds the test column as an (n_rows, 1) array.
embarked_array = np.array(test["embarked"]).reshape(-1, 1)

embarked_test_ohe = ohe.transform(embarked_array)
embarked_test_ohe[:5]

array([[0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])