In [160]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns

In [161]:
data = pd.read_csv('titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [162]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Dealing With Categorical  Data

In [163]:
cat_cols = data.select_dtypes(include= 'object').columns
cat_cols

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')

1: Check null values

In [164]:
data[cat_cols].isnull().sum()

Name          0
Sex           0
Ticket        0
Cabin       687
Embarked      2
dtype: int64

In [165]:
data['Sex'].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [166]:
#for just to two values, we can use replace function: 
#data['Sex'].replace({'male':0,'female':1})
# Step 2: Label Encoding (e.g., 'sex')
le = LabelEncoder()
data['Sex'] = le.fit_transform(data['Sex'])  # male=1, female=0

In [167]:
data['Sex'].value_counts()#so it become numerical col!

Sex
1    577
0    314
Name: count, dtype: int64

In [200]:
le.inverse_transform([1,  0])

array(['male', 'female'], dtype=object)

In [201]:
le.classes_

array(['female', 'male'], dtype=object)

In [168]:
data['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [169]:
#first, let's handle two null values in Embarked Column:
sc_Embraked = SimpleImputer(strategy='most_frequent')
data[['Embarked']] = sc_Embraked.fit_transform(data[['Embarked']])

In [170]:
#After Imputting
data['Embarked'].value_counts()

Embarked
S    646
C    168
Q     77
Name: count, dtype: int64

In [171]:
# we will use One-Hot Encoding (e.g., 'embarked') , we can use pd.get_dummies
# ohe = pd.get_dummies(df['embarked'], prefix='embarked')
# df = pd.concat([df, ohe], axis=1)
#but it's Not keep their information about the transformation, so we'll use onehotEncoding class from scikit-learn.

In [172]:
# we can apply one hot encoding here:
onehot_embarked = OneHotEncoder( drop='first', sparse_output=False , handle_unknown='ignore')
encoded_ = onehot_embarked.fit_transform(data[['Embarked']])
encoded_

array([[0., 1.],
       [0., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 0.],
       [1., 0.]], shape=(891, 2))

In [173]:
encoded_cols = onehot_embarked.get_feature_names_out(['Embarked'])
encoded_cols

array(['Embarked_Q', 'Embarked_S'], dtype=object)

In [174]:
#combine and drop the col name:
data[encoded_cols] = encoded_
data.drop(columns = ['Embarked'], inplace= True)

In [175]:
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,0.0,1.0


In [176]:
data['Ticket'].value_counts()

Ticket
347082              7
1601                7
CA. 2343            7
3101295             6
CA 2144             6
                   ..
PC 17590            1
17463               1
330877              1
373450              1
STON/O2. 3101282    1
Name: count, Length: 681, dtype: int64

In [177]:
#replace non-digital to nothing
data['Ticket'] = data['Ticket'].str.replace(r'\D+', '', regex= True)

In [178]:
data['Ticket'].value_counts()


Ticket
347082      7
2343        7
1601        7
3101295     6
2144        6
           ..
392076      1
211536      1
112053      1
111369      1
23101287    1
Name: count, Length: 679, dtype: int64

In [179]:
data['Cabin'].isnull().sum() /data.shape[0] #so we remove  Cabin feature

np.float64(0.7710437710437711)

In [180]:
data['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [181]:
# make sure titles exist in each name
data['Name'].apply(lambda x: '.' in x).all()

np.True_

In [182]:
data['Name'].apply(lambda x: x.split(', '))

0                              [Braund, Mr. Owen Harris]
1      [Cumings, Mrs. John Bradley (Florence Briggs T...
2                               [Heikkinen, Miss. Laina]
3         [Futrelle, Mrs. Jacques Heath (Lily May Peel)]
4                             [Allen, Mr. William Henry]
                             ...                        
886                              [Montvila, Rev. Juozas]
887                       [Graham, Miss. Margaret Edith]
888           [Johnston, Miss. Catherine Helen "Carrie"]
889                              [Behr, Mr. Karl Howell]
890                                [Dooley, Mr. Patrick]
Name: Name, Length: 891, dtype: object

In [183]:
w = data['Name'].apply(lambda x: x.split(', ')[-1])
w

0                                 Mr. Owen Harris
1      Mrs. John Bradley (Florence Briggs Thayer)
2                                     Miss. Laina
3              Mrs. Jacques Heath (Lily May Peel)
4                               Mr. William Henry
                          ...                    
886                                   Rev. Juozas
887                          Miss. Margaret Edith
888                Miss. Catherine Helen "Carrie"
889                               Mr. Karl Howell
890                                   Mr. Patrick
Name: Name, Length: 891, dtype: object

In [184]:
w.apply(lambda x:x.split('. '))

0                                 [Mr, Owen Harris]
1      [Mrs, John Bradley (Florence Briggs Thayer)]
2                                     [Miss, Laina]
3              [Mrs, Jacques Heath (Lily May Peel)]
4                               [Mr, William Henry]
                           ...                     
886                                   [Rev, Juozas]
887                          [Miss, Margaret Edith]
888                [Miss, Catherine Helen "Carrie"]
889                               [Mr, Karl Howell]
890                                   [Mr, Patrick]
Name: Name, Length: 891, dtype: object

In [185]:
w.apply(lambda x:x.split('. '))

0                                 [Mr, Owen Harris]
1      [Mrs, John Bradley (Florence Briggs Thayer)]
2                                     [Miss, Laina]
3              [Mrs, Jacques Heath (Lily May Peel)]
4                               [Mr, William Henry]
                           ...                     
886                                   [Rev, Juozas]
887                          [Miss, Margaret Edith]
888                [Miss, Catherine Helen "Carrie"]
889                               [Mr, Karl Howell]
890                                   [Mr, Patrick]
Name: Name, Length: 891, dtype: object

In [186]:
w.apply(lambda x:x.split('. ')[0])

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: Name, Length: 891, dtype: object

In [187]:
# extracting titles from names
titles = data['Name'].apply(lambda x: x.split(', ')[-1].split('. ')[0])

In [188]:
titles.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [189]:
len(titles.unique())

17

In [190]:
data.drop('Name', axis = 1, inplace = True)
data['title']=titles
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_Q,Embarked_S,title
0,1,0,3,1,22.0,1,0,521171,7.25,,0.0,1.0,Mr
1,2,1,1,0,38.0,1,0,17599,71.2833,C85,0.0,0.0,Mrs
2,3,1,3,0,26.0,0,0,23101282,7.925,,0.0,1.0,Miss
3,4,1,1,0,35.0,1,0,113803,53.1,C123,0.0,1.0,Mrs
4,5,0,3,1,35.0,0,0,373450,8.05,,0.0,1.0,Mr


In [191]:
data['title'].nunique()

17

In [192]:
# widen categories (small categories may have the same meaning)
data['title'] = data['title'].replace({
                  'Col': 'Colonel',
                  'Mme': 'Mrs',
                  'Capt': 'Captain',
                  'Mlle': 'Miss',
                  'Dona': 'Mrs',
                  'Don': 'Mr',
                  'Rev': 'Reverend',
                  'Lady': 'Mrs',
                  'Sir' : 'Mrs',
                  'Jonkheer' :'Mr'

              })
data['title']

0            Mr
1           Mrs
2          Miss
3           Mrs
4            Mr
         ...   
886    Reverend
887        Miss
888        Miss
889          Mr
890          Mr
Name: title, Length: 891, dtype: object

In [193]:
data['title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Reverend', 'Dr', 'Ms', 'Major',
       'Colonel', 'Captain', 'the Countess'], dtype=object)

In [194]:
data['title'].nunique()

11

In [195]:
# we can apply one hot encoding here:
onehot_name = OneHotEncoder( drop='first', sparse_output=False , handle_unknown='ignore')
encoded_ = onehot_name.fit_transform(data[['title']])
encoded_

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(891, 10))

In [196]:
encoded_cols = onehot_name.get_feature_names_out(['title'])
encoded_cols

array(['title_Colonel', 'title_Dr', 'title_Major', 'title_Master',
       'title_Miss', 'title_Mr', 'title_Mrs', 'title_Ms',
       'title_Reverend', 'title_the Countess'], dtype=object)

In [197]:
encoded_df = pd.DataFrame(encoded_, columns = onehot_name.get_feature_names_out(['title']))
encoded_df

Unnamed: 0,title_Colonel,title_Dr,title_Major,title_Master,title_Miss,title_Mr,title_Mrs,title_Ms,title_Reverend,title_the Countess
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
887,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
888,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
889,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [198]:
#combine and drop the col name:
data[encoded_cols] = encoded_

In [199]:
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,title_Colonel,title_Dr,title_Major,title_Master,title_Miss,title_Mr,title_Mrs,title_Ms,title_Reverend,title_the Countess
0,1,0,3,1,22.0,1,0,521171,7.25,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,1,1,0,38.0,1,0,17599,71.2833,C85,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3,1,3,0,26.0,0,0,23101282,7.925,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
