### Numerical and Categorical Features

In [1]:
import pandas as pandas
import numpy as np 
import seaborn as sns

In [58]:
# Load seaborn's example "penguins" dataset.
penguins = sns.load_dataset("penguins")

# Keep a separate copy of the frame as loaded, taken before later cells add columns to `penguins`.
penguinscopy = penguins.copy()
# Preview the first rows.
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


>## What other datasets can we get from seaborn?


In [11]:
# Left commented out, so this cell produces no output.
# Uncomment the call below to list the dataset names seaborn offers.
# sns.get_dataset_names()

In [12]:
# Summarize each column's non-null count and dtype, to spot missing values and object-typed features.
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


- We need to transform these object-type features.
- They are categorical features and must be converted to numerical ones.

In [13]:
# Peek at the first two rows to see what the categorical columns look like.
penguins.head(2)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female


In [20]:
# Label encoding
# very simple, for some models, it can be not as good

species_coded = {
    'Adelie': 0,
    'Chinstrap': 1,
    'Gentoo': 2
}

# create a new column species_1
penguins['species_1'] = penguins['species'].map(species_coded)
penguins.sample(5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_1
192,Chinstrap,Dream,49.0,19.5,210.0,3950.0,Male,1
335,Gentoo,Biscoe,55.1,16.0,230.0,5850.0,Male,2
92,Adelie,Dream,34.0,17.1,185.0,3400.0,Female,0
132,Adelie,Dream,36.8,18.5,193.0,3500.0,Female,0
302,Gentoo,Biscoe,47.4,14.6,212.0,4725.0,Female,2


In [27]:
# Inspect the raw `sex` column; besides Male/Female it contains missing values (NaN).
penguins['sex']

0        Male
1      Female
2      Female
3         NaN
4      Female
        ...  
339       NaN
340    Female
341      Male
342    Female
343      Male
Name: sex, Length: 344, dtype: object

In [39]:
# Do the same thing for sex: label-encode it with a hand-written mapping.
# Missing values are real NaN floats, not the string 'NaN', so a 'NaN' key in
# the mapping could never match; rows with missing values are dropped instead.

sex_coded = {
    'Male': 0,
    'Female': 1,
}

# Build a new frame without mutating `penguins`:
#   1. dropna() removes every row that has a missing value in any column
#   2. assign() adds the new column sex_1 with the integer code of each row's sex
penguins2 = (
    penguins
    .dropna()
    .assign(sex_1=lambda frame: frame['sex'].map(sex_coded))
)
penguins2.head(5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_1,sex_1
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0,0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0,1
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0,1
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,0,1
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,0,0


In [None]:
#  label encoding using SKLEARN

from sklearn.preprocessing import LabelEncoder

# Fit one encoder per categorical column and keep each of them.
# Re-fitting a single shared encoder would overwrite its learned classes, so
# the codes of the earlier columns could no longer be mapped back to labels.
# The saved output of this cell shows missing `sex` values encoded as 2.
encoders = {}
for column in ('species', 'sex', 'island'):
    encoders[column] = LabelEncoder()
    penguins[f'{column}_le'] = encoders[column].fit_transform(penguins[column])

# Keep the name `encoder` bound to the island-fitted encoder, which is the
# state a single shared encoder would be left in after the three fits.
encoder = encoders['island']

# Keep only the numeric measurements and the encoded categorical columns.
penguins_encoded = penguins[['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g','species_le','sex_le','island_le']]
penguins_encoded.head(5)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species_le,sex_le,island_le
0,39.1,18.7,181.0,3750.0,0,1,2
1,39.5,17.4,186.0,3800.0,0,0,2
2,40.3,18.0,195.0,3250.0,0,0,2
3,,,,,0,2,2
4,36.7,19.3,193.0,3450.0,0,0,2


> Label encoding assigns an arbitrary integer to each class, which implicitly gives the classes different weights.
- This can confuse a model into thinking that one class is more important than the others.

In [None]:
# Display the frame with the encoded columns added; pandas truncates the middle rows.
# NOTE(review): the saved output shows a `sex_1` column on `penguins`, but the visible
# cells only add `sex_1` to `penguins2` — re-run from a fresh kernel to confirm.
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_1,sex_1,species_le,sex_le,island_le
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0,0.0,0,1,2
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0,1.0,0,0,2
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0,1.0,0,0,2
3,Adelie,Torgersen,,,,,,0,,0,2,2
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,0,1.0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,,2,,2,2,0
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female,2,1.0,2,0,0
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male,2,0.0,2,1,0
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female,2,1.0,2,0,0
