In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load data

In [2]:
# Read the Titanic passenger table from the CSV at the relative path ../data/titanic.csv.
df = pd.read_csv('../data/titanic.csv')

In [3]:
# Display the loaded frame to inspect its columns.
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [4]:
# Keep only the three categorical predictors and the target.
# .copy() makes the subset an independent frame, so assigning to one of its
# columns later does not raise SettingWithCopyWarning.
df = df[['sex', 'embarked', 'cabin', 'survived']].copy()

In [5]:
# Confirm that only the selected columns remain.
df

Unnamed: 0,sex,embarked,cabin,survived
0,female,S,B5,1
1,male,S,C22,1
2,female,S,C22,0
3,male,S,C22,0
4,female,S,C22,0
...,...,...,...,...
1304,female,C,,0
1305,female,C,,0
1306,male,C,,0
1307,male,C,,0


In [6]:
# Reduce each cabin code to its first character (e.g. 'C22' -> 'C').
df['cabin'] = df['cabin'].str.get(0)
df.head(n=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,sex,embarked,cabin,survived
0,female,S,B,1
1,male,S,C,1
2,female,S,C,0
3,male,S,C,0
4,female,S,C,0


# Split data 

In [7]:

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[['sex', 'embarked', 'cabin']],  # predictors
    df['survived'],                    # target
    test_size=0.3,
    random_state=0,
)

# Row/column counts of the two feature sets.
X_train.shape, X_test.shape

((916, 3), (393, 3))

# Explore cardinality

In [8]:
# Distinct values of 'sex' in the training set.
X_train['sex'].unique()

array(['female', 'male'], dtype=object)

In [9]:
# Distinct values of 'embarked' in the training set (unique() also reports NaN).
X_train['embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [10]:
# Distinct cabin letters in the training set (unique() also reports NaN).
X_train['cabin'].unique()

array([nan, 'E', 'C', 'D', 'B', 'A', 'F', 'T', 'G'], dtype=object)

# (1) One hot encoding with pandas 

In [11]:
# k dummies: one indicator column for every category of 'sex'.
temp = X_train['sex'].pipe(pd.get_dummies)
temp.head(n=5)

Unnamed: 0,female,male
501,1,0
588,1,0
402,1,0
1193,0,1
686,1,0


In [12]:
# k-1 dummies: drop_first=True omits the column of the first category.
temp = X_train['sex'].pipe(pd.get_dummies, drop_first=True)
temp.head(n=5)

Unnamed: 0,male
501,0
588,0
402,0
1193,1
686,0


In [13]:
# Indicator columns for 'embarked'. By default get_dummies creates no column
# for NaN, so only the observed ports appear.
temp = X_train['embarked'].pipe(pd.get_dummies)
temp.head(n=5)

Unnamed: 0,C,Q,S
501,0,0,1
588,0,0,1
402,1,0,0
1193,0,1,0
686,0,1,0


In [14]:
# Indicator columns for the cabin letter. Rows whose cabin is NaN get 0 in
# every column, because get_dummies ignores missing values by default.
temp = X_train['cabin'].pipe(pd.get_dummies)
temp.head(n=5)

Unnamed: 0,A,B,C,D,E,F,G,T
501,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0
402,0,0,0,0,0,0,0,0
1193,0,0,0,0,0,0,0,0
686,0,0,0,0,0,0,0,0


In [15]:
# Raw cabin values of the first training rows, for comparison with their dummy encoding.
X_train['cabin'].head()

501     NaN
588     NaN
402     NaN
1193    NaN
686     NaN
Name: cabin, dtype: object

In [16]:
# Encoding the whole training frame at once returns 13 dummy columns
# (2 for sex, 3 for embarked, 8 for cabin).
pd.get_dummies(X_train)

Unnamed: 0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,cabin_A,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_T
501,1,0,0,0,1,0,0,0,0,0,0,0,0
588,1,0,0,0,1,0,0,0,0,0,0,0,0
402,1,0,1,0,0,0,0,0,0,0,0,0,0
1193,0,1,0,1,0,0,0,0,0,0,0,0,0
686,1,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,1,0,0,0,1,0,0,0,0,0,0,0,0
835,0,1,0,0,1,0,0,0,0,0,0,0,0
1216,1,0,0,1,0,0,0,0,0,0,0,0,0
559,1,0,0,0,1,0,0,0,0,0,0,0,0


In [17]:
# The test set yields only 12 columns: cabin_T is absent because get_dummies
# builds its columns from the values present in the frame it is given.
# Train and test end up with different feature sets. Not suitable for production!
pd.get_dummies(X_test)

Unnamed: 0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,cabin_A,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G
1139,0,1,0,0,1,0,0,0,0,0,0,0
533,1,0,0,0,1,0,0,0,0,0,0,0
459,0,1,0,0,1,0,0,0,0,0,0,0
1150,0,1,0,0,1,0,0,0,0,0,0,0
393,0,1,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
914,0,1,0,0,1,0,0,0,0,0,0,0
580,1,0,0,0,1,0,0,0,0,0,0,0
1080,0,1,0,1,0,0,0,0,0,0,0,0
1249,0,1,0,1,0,0,0,0,0,0,1,0


# (2) One hot encoding with sklearn 

In [18]:
# drop='first' returns k-1 dummies per feature; use drop=None (the default)
# to keep all k. sparse=False makes transform() return a dense ndarray.
# NOTE: scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed
# the old name, so switch the keyword when running on a newer release.
encoder = OneHotEncoder(
    categories='auto',
    drop='first',
    sparse=False,
)

# Replace NaN with an explicit 'Missing' label so that it is learned as a
# regular category.
encoder.fit(X_train.fillna('Missing'))

OneHotEncoder(drop='first', sparse=False)

In [19]:
# Categories learned for each input column, in column order (sex, embarked, cabin).
encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['C', 'Missing', 'Q', 'S'], dtype=object),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'Missing', 'T'], dtype=object)]

In [20]:
# Apply the same 'Missing' fill used at fit time, then encode the training set.
temp = encoder.transform(X_train.fillna('Missing'))
temp

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [21]:
# Encode the test set with the encoder that was fitted on the training set,
# again filling NaN with 'Missing' first.
temp = encoder.transform(X_test.fillna('Missing'))
temp

array([[1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])