In [1]:
import pandas as pd
import numpy as np

# Read dataset

In [2]:
data = pd.read_csv('penguins_size.csv')
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


# Shape of data

In [3]:
data.shape

(344, 7)

# Info of dataset

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


# Unique species

In [5]:
data['species'].value_counts()

species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64

# Number of 'nan' values

In [8]:
pd.DataFrame(data.isna().sum(), columns=['number_of_NaN']).T

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
number_of_NaN,0,0,2,2,2,2,10


# Number of culmen depth in each island

In [82]:
data['cleaned_culmen_depth_mm'] = data['culmen_depth_mm'].ffill()
data['cleaned_culmen_depth_mm'].isna().sum()

np.int64(0)

In [81]:
data.groupby(by='island', as_index=False)['cleaned_culmen_depth_mm'].agg(mean_culmen_depth='mean',
                                                                         count_culmen_depth='count')

Unnamed: 0,island,mean_culmen_depth,count_culmen_depth
0,Biscoe,15.861905,168
1,Dream,18.344355,124
2,Torgersen,18.421154,52


# Body mass in each sex
##### First we cleaned 'sex' and 'body_mass_g' columns, then operated them. For cleaning <b>NaN</b> values we apply forward fill method.

In [73]:
data['cleaned_sex'] = data['sex'].str.lower()
data['cleaned_sex'] = data['cleaned_sex'].replace({'.': np.nan})
data['cleaned_sex'] = data['cleaned_sex'].ffill()
data['cleaned_sex'].isna().sum()

np.int64(0)

In [74]:
data['cleaned_body_mass_g'] = data['body_mass_g'].ffill()
data['cleaned_body_mass_g'].isna().sum()

np.int64(0)

In [80]:
data.groupby(by=['cleaned_sex'], as_index=False)['cleaned_body_mass_g'].agg(mean_body_mass='mean',
                                                                            count_body_mass='count')

Unnamed: 0,cleaned_sex,mean_body_mass,count_body_mass
0,female,3864.97006,167
1,male,4518.220339,177


# Species in each island

In [83]:
data.groupby(by='island', as_index=False)['species'].agg(count_species='count')

Unnamed: 0,island,count_species
0,Biscoe,168
1,Dream,124
2,Torgersen,52


# Share of female penguins in each species

In [92]:
(data[data['cleaned_sex'] == 'female']
.groupby(by=['species'], as_index=False)['cleaned_sex']
.agg(cnt_female_panguins='count'))

Unnamed: 0,species,cnt_female_panguins
0,Adelie,74
1,Chinstrap,34
2,Gentoo,59


# Flipper length comparisons between species ( min, max, mean )
##### For cleaning <b>NaN</b> values we apply forward fill method.

In [96]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [93]:
data['cleaned_flipper_length'] = data['flipper_length_mm'].ffill()
data['cleaned_flipper_length'].isna().sum()

np.int64(0)

In [98]:
data.groupby(by=['species'], as_index=False)['cleaned_flipper_length'].agg({
                                                                            'min_flipper_length': 'min',
                                                                            'max_flipper_length': 'max',
                                                                            'mean_flipper_length': 'mean'
                                                                        })

Unnamed: 0,species,min_flipper_length,max_flipper_length,mean_flipper_length
0,Adelie,172.0,210.0,189.986842
1,Chinstrap,178.0,212.0,195.823529
2,Gentoo,203.0,231.0,217.16129
