In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load seaborn's bundled 'titanic' example dataset into a DataFrame.
data = sns.load_dataset('titanic')

In [3]:
# List the column names of the dataset.
data.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [4]:
# Summarise each column: non-null count and dtype, plus overall memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Count the missing values in every column of the frame (not only 'embarked').
data.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [7]:
# Drop every row that contains at least one missing value (how='any').
# No `thresh` is used here, so a single NaN is enough to discard a row.
# NOTE: 'deck' is missing in 688 of the 891 rows, so this step alone removes
# most of the dataset (182 rows remain).
# Reassign rather than mutate in place so the transformation is explicit.
data = data.dropna(axis=0, how='any')

In [8]:
# Shape after dropping incomplete rows. The raw dataset had 891 rows and 15
# features; only the rows without any missing value are left at this point.
data.shape

(182, 15)

In [9]:
# Number of remaining passengers per value of 'sex'.
data['sex'].value_counts()

male      94
female    88
Name: sex, dtype: int64

In [10]:
# Fill missing ages with the mean age.
# Assign the result back instead of calling fillna(inplace=True) on the
# data['age'] selection: that chained in-place call may act on a temporary
# object and does not update the frame under pandas Copy-on-Write.
# NOTE: rows with any missing value were already dropped earlier, so 'age'
# has no NaN left here and this step currently changes nothing.
data['age'] = data['age'].fillna(data['age'].mean())

In [11]:
# Verify that no missing values remain in 'age'.
data['age'].isnull().sum()

0

In [12]:
# Count the occurrences of each value in the column 'embarked'.
data[['embarked']].value_counts()

embarked
S           115
C            65
Q             2
dtype: int64

In [13]:
# Boolean Series telling, for each row, whether it duplicates an earlier row.
data.duplicated()

1      False
3      False
6      False
10     False
11     False
       ...  
871    False
872    False
879    False
887    False
889    False
Length: 182, dtype: bool

In [14]:
# Remove fully duplicated rows, keeping the first occurrence of each.
data = data.drop_duplicates()

In [15]:
# Shape after removing duplicates (one row fewer than before: 182 -> 181).
# The raw, unfiltered dataset was (891, 15).
data.shape


(181, 15)

In [16]:
# Rename a feature. With inplace=False, rename returns a new frame that is only
# displayed here; `data` itself keeps its 'fare' column.
# NOTE(review): 'sex1' is a confusing new name for 'fare' — presumably this is
# only a demonstration of rename; confirm before reusing.
data.rename(columns= {'fare':'sex1'},inplace=False)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,sex1,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [17]:
data.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [18]:
# Display the cleaned frame (pandas truncates the middle rows in the output).
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True




---



---
 
# 1 - Encode the categorical data

In [19]:
# Create a LabelEncoder instance.
encoder = LabelEncoder()
# Fit the encoder on the 'deck' feature and overwrite the column with the
# resulting integer labels.
data['deck'] = encoder.fit_transform(data['deck'])


In [20]:
# Count how many rows carry each encoded 'deck' label.
data['deck'].value_counts()

2    51
1    42
3    31
4    30
0    12
5    11
6     4
Name: deck, dtype: int64



---



---

# 2 - Use a dictionary to replace categorical data

In [21]:
# Start again from the raw dataset and keep only rows without missing values.
data = sns.load_dataset('titanic').dropna(axis=0)

# Nested mapping in the form {column: {old value: new value}}.
# NOTE(review): decks 'F' and 'G' have no entry, so they are not replaced and
# the column ends up mixing integers with the strings 'F' and 'G'.
# TODO: add codes for 'F' and 'G' if a fully numeric column is intended.
deck_dictionary = {
    'deck': {'A': 0, 'B': 12, 'C': 20, 'D': 13, 'E': 4},
}

In [22]:
# Apply the per-column mapping; only values listed for 'deck' are replaced.
data = data.replace(to_replace=deck_dictionary)

In [23]:
# Inspect the 'deck' column after applying the replacement mapping.
data.deck

1      20
3      20
6       4
10      G
11     20
       ..
871    13
872    12
879    20
887    12
889    20
Name: deck, Length: 182, dtype: category
Categories (7, object): [0, 12, 20, 13, 4, 'F', 'G']

In [24]:
# Frequency of each 'deck' value after the replacement.
data.deck.value_counts()

20    51
12    43
13    31
4     30
0     12
F     11
G      4
Name: deck, dtype: int64

# 3 - ONE HOT ENCODING

In [25]:
# Reload the raw dataset; rows with missing values are kept this time.
data = sns.load_dataset('titanic')
# Build one indicator column per 'deck' category.
one_hot = pd.get_dummies(data['deck'])

In [26]:
# Remove the original 'deck' column now that its one-hot columns exist.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError because
# 'deck' is already gone.
data = data.drop(columns='deck', errors='ignore')

In [27]:
# Attach the one-hot 'deck' columns to the frame, aligned on the row index.
# The joined frame is only displayed, not assigned, so `data` is unchanged.
data.join(one_hot)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,embark_town,alive,alone,A,B,C,D,E,F,G
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,Southampton,no,False,0,0,0,0,0,0,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,Cherbourg,yes,False,0,0,1,0,0,0,0
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,...,Southampton,yes,True,0,0,0,0,0,0,0
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,Southampton,yes,False,0,0,1,0,0,0,0
4,0,3,male,35.0,0,0,8.0500,S,Third,man,...,Southampton,no,True,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,...,Southampton,no,True,0,0,0,0,0,0,0
887,1,1,female,19.0,0,0,30.0000,S,First,woman,...,Southampton,yes,True,0,1,0,0,0,0,0
888,0,3,female,,1,2,23.4500,S,Third,woman,...,Southampton,no,False,0,0,0,0,0,0,0
889,1,1,male,26.0,0,0,30.0000,C,First,man,...,Cherbourg,yes,True,0,0,1,0,0,0,0
