In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import seaborn as sns
from pandas_profiling import ProfileReport
import plotly.express as px


# Setting the float precision to 2 decimal places.
# The fully qualified key "display.precision" is used on purpose: option names
# are matched as patterns, and a bare "precision" is ambiguous on pandas
# versions that also define styler precision options (it then raises OptionError).
pd.set_option("display.precision", 2)
# float_format controls how floats are rendered in DataFrame/Series output.
pd.options.display.float_format = '{:.2f}'.format

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv
/kaggle/input/palmer-archipelago-antarctica-penguin-data/penguins_lter.csv


# Inputting the data 

We are going to use the penguins_size.csv because it is a simplified dataset, as written in the description on Kaggle.

In [2]:
# Load the penguins_size.csv table into the frame used by the rest of the notebook.
penguins_size_csv = '/kaggle/input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv'
df_size = pd.read_csv(penguins_size_csv)

# First glance at the dataset

Let's have an overview of the dataset.

In [3]:
# Preview the first five rows (five is also the default row count of head()).
df_size.head(n=5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


One can notice that we mainly have information about the penguins' culmen and flipper. It is also noticeable that there are a few NaN values, so we are now going to take a deeper dive into the dataset.

# Which columns have NaN values?

In [4]:
# Count the missing entries in each column (isna is the same check as isnull).
df_size.isna().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_size.describe()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92,17.15,200.92,4201.75
std,5.46,1.97,14.06,801.95
min,32.1,13.1,172.0,2700.0
25%,39.23,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [6]:
# Build a profiling report for the full (not yet cleaned) frame and render it
# as notebook widgets.
profile = ProfileReport(
    df_size,
    title='Penguins profiling report',
    explorative=True,
)
profile.to_widgets()

Summarize dataset:   0%|          | 0/20 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

As we have only a few missing values, we are going to drop the rows that contain them.

In [7]:
# Drop every row that has at least one missing value.
# The result is bound back to the same name instead of using inplace=True, so
# the originally loaded frame object is not mutated underneath anything that
# still refers to it (for example the frame handed to ProfileReport earlier).
df_size = df_size.dropna()

# Checking if we truly have no missing values anymore

df_size.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

# How many different species does the dataset have?

In [8]:
# Number of rows recorded for each species, most frequent first.
df_size.loc[:, 'species'].value_counts()

Adelie       146
Gentoo       120
Chinstrap     68
Name: species, dtype: int64

In [9]:
# Same counts expressed as a fraction of all rows.
df_size.loc[:, 'species'].value_counts(normalize=True)

Adelie      0.44
Gentoo      0.36
Chinstrap   0.20
Name: species, dtype: float64

There are 3 different species in the dataset, and almost half of the penguins (44%) are Adelie.

# How does body mass vary by sex?

In [10]:
# The two sex labels to keep; rows with any other value in the column are left out.
sex = ['MALE', 'FEMALE']

# Build the row mask first, then select the matching rows.
has_listed_sex = df_size['sex'].isin(sex)
df_sex = df_size.loc[has_listed_sex]

# Box plot of body mass (in grams) for each sex.
fig = px.box(df_sex, x="sex", y="body_mass_g")
fig.show()

In [11]:
# Male penguins only. The mask is built from df_sex itself so it has exactly the
# index of the frame being filtered; a mask taken from df_size only lines up
# with df_sex through index alignment.
df_male = df_sex.loc[df_sex['sex'] == 'MALE']

# Summary statistics of male body mass (grams).
df_male['body_mass_g'].describe()

count    168.00
mean    4545.68
std      787.63
min     3250.00
25%     3900.00
50%     4300.00
75%     5312.50
max     6300.00
Name: body_mass_g, dtype: float64

In [12]:
# Female penguins only. The mask is built from df_sex itself so it has exactly
# the index of the frame being filtered; a mask taken from df_size only lines up
# with df_sex through index alignment.
df_female = df_sex.loc[df_sex['sex'] == 'FEMALE']

# Summary statistics of female body mass (grams).
df_female['body_mass_g'].describe()

count    165.00
mean    3862.27
std      666.17
min     2700.00
25%     3350.00
50%     3650.00
75%     4550.00
max     5200.00
Name: body_mass_g, dtype: float64

### One can understand that there is a considerable difference between female and male body mass.

About 25% of the male penguins weigh more than the heaviest female penguin.

# Flipper length

In [13]:
# Independent (deep) copy of the cleaned frame.
# NOTE(review): no cell visible here reads df_flipper — confirm it is still needed.
df_flipper = df_size.copy(deep=True)

### Let's understand the distribution of flipper length according to sex

In [14]:
# The two sex labels to keep; rows with any other value in the column are left out.
sex = ['MALE', 'FEMALE']

# selecting rows based on condition
df_sex_flipper = df_size.loc[df_size['sex'].isin(sex)]

# Plot the frame filtered in this cell (previously df_sex from an earlier cell
# was plotted and df_sex_flipper was computed but never used).
fig = px.box(df_sex_flipper, x="sex", y="flipper_length_mm")
fig.show()

# Body mass per species

In [15]:
# Independent (deep) copy of the cleaned frame.
# NOTE(review): no cell visible here reads df_species — confirm it is still needed.
df_species = df_size.copy(deep=True)

In [16]:
# The two sex labels to keep; rows with any other value in the column are left out.
sex = ['MALE', 'FEMALE']

# selecting rows based on condition
# (the variable name is kept from the flipper cell so existing references still work)
df_sex_flipper = df_size.loc[df_size['sex'].isin(sex)]

# Plot the frame filtered in this cell (previously df_sex from an earlier cell
# was plotted and the frame computed here was never used).
fig = px.box(df_sex_flipper, x="species", y="body_mass_g")
fig.show()

# Correlation

How are the variables correlated with each other?

In [17]:
# Pairwise correlation of the numeric measurement columns, shaded so that larger
# values are darker. The text columns (species, island, sex) are excluded
# explicitly: newer pandas versions raise on non-numeric columns in
# DataFrame.corr() instead of silently leaving them out.
df_size.select_dtypes(include="number").corr().style.background_gradient(cmap="Blues")

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
culmen_length_mm,1.0,-0.23,0.65,0.59
culmen_depth_mm,-0.23,1.0,-0.58,-0.47
flipper_length_mm,0.65,-0.58,1.0,0.87
body_mass_g,0.59,-0.47,0.87,1.0


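One can observe that most of the correlations are only moderate (absolute values between 0.23 and 0.65), with one clear exception: flipper length and body mass are strongly correlated (0.87). The strength of the correlation varies considerably from one pair of variables to another.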