In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
from pandas import DataFrame



##Another way to load the penguins dataset (via the palmerpenguins package)

In [None]:
!pip install palmerpenguins
from palmerpenguins import load_penguins
penguins = load_penguins()
pen = pd.DataFrame(penguins)
pen.head()


In [2]:
# Download the dataset
!wget -q https://storage.googleapis.com/download.tensorflow.org/data/palmer_penguins/penguins.csv -O /tmp/penguins.csv

# Load a dataset into a Pandas Dataframe.
df = pd.read_csv("/tmp/penguins.csv")

# Display the first 3 examples.
df.head(3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007


#Find the number of rows and columns in the dataset

In [3]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

(344, 8)

#check for the missing values in the data

In [4]:
# Count of missing (NaN) entries in each column.
df.isna().sum()

Unnamed: 0,0
species,0
island,0
bill_length_mm,2
bill_depth_mm,2
flipper_length_mm,2
body_mass_g,2
sex,11
year,0


##statistical analysis of the dataset

In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
count,342.0,342.0,342.0,342.0,344.0
mean,43.92193,17.15117,200.915205,4201.754386,2008.02907
std,5.459584,1.974793,14.061714,801.954536,0.818356
min,32.1,13.1,172.0,2700.0,2007.0
25%,39.225,15.6,190.0,3550.0,2007.0
50%,44.45,17.3,197.0,4050.0,2008.0
75%,48.5,18.7,213.0,4750.0,2009.0
max,59.6,21.5,231.0,6300.0,2009.0


##information about the dataset

In [45]:
# Column dtypes and non-null counts.
df.info()

# Observation: five columns contain missing values; `sex` has the most (11 of 344 rows).

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


#year analysis

###How many years were used to conduct this study?

In [47]:
# Distinct study years; the array is reused by the next cell.
total_yrs = df['year'].unique()
print('Total number of years are', total_yrs.size)

Total number of years are 3


###list the years that the study was conducted?

In [48]:
# `total_yrs` is the array of distinct years computed in the preceding cell.
print('The years are',total_yrs)

The years are [2007 2008 2009]


#Find the number of penguins of each species in the dataset

In [5]:
# Number of rows (penguins) per species.
df.groupby('species').size()

Unnamed: 0_level_0,0
species,Unnamed: 1_level_1
Adelie,152
Chinstrap,68
Gentoo,124


##Show the frequency of each specie in a histogram

In [6]:
# One bar per species, coloured by species, with the count printed on each bar.
px.histogram(df, x="species", color="species", text_auto=True, title="Number of each species", template="plotly_white")

##How many male and female penguins are in the dataset? (use the pivot table method)

In [7]:
# Row count per sex. Rows with a missing `sex` are not counted
# (165 + 168 = 333 of the 344 rows).
pd.pivot_table(df, index=['sex'], aggfunc='size')

Unnamed: 0_level_0,0
sex,Unnamed: 1_level_1
female,165
male,168


###Find the number of male and female in each specie, use groupby method?

In [9]:
# Row count for every (species, sex) combination.
df.groupby(['species', 'sex']).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
species,sex,Unnamed: 2_level_1
Adelie,female,73
Adelie,male,73
Chinstrap,female,34
Chinstrap,male,34
Gentoo,female,58
Gentoo,male,61


##number of male and female in each species with histogram.

In [10]:
# Count of penguins per sex, with each bar split by species colour.
px.histogram(df, x="sex", color="species", text_auto=True, title="Number of male/female", template="plotly_white")

##facet plot to compare all species

In [11]:
# Facet plot with one panel per species: bars count the male and female
# penguins of that species, coloured by sex.
fig = px.histogram(df, x="sex", color="sex", facet_col="species",
                  title="Number of Male and Female Penguins per Species",
                  template="plotly_white", text_auto = True)
fig.update_layout(barmode='group', width=1420, height = 640)
fig.show()


#bill length analysis

##Find the three highest bill length?

In [13]:
# The three largest bill lengths, with the row index of each.
df['bill_length_mm'].nlargest(3)

Unnamed: 0,bill_length_mm
185,59.6
293,58.0
253,55.9


###Find the observations where (all columns) the highest bill length are found.

In [14]:
# Full rows (all columns) for the three largest bill lengths.
# The values are taken from the data instead of being typed in by hand, so the
# cell stays correct if the dataset changes and avoids hand-copied float literals.
bill_lengths_to_find = df['bill_length_mm'].nlargest(3).tolist()
rows_with_specific_bill_lengths = df[df['bill_length_mm'].isin(bill_lengths_to_find)]
rows_with_specific_bill_lengths


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
185,Gentoo,Biscoe,59.6,17.0,230.0,6050.0,male,2007
253,Gentoo,Biscoe,55.9,17.0,228.0,5600.0,male,2009
293,Chinstrap,Dream,58.0,17.8,181.0,3700.0,female,2007


###frequency of each specie in the dataset.

In [15]:
# Same species-count histogram as earlier in the notebook, drawn with the dark template.
px.histogram(df, x="species", color="species", text_auto=True, title="Number of each species", template="plotly_dark")


##What is the penguin population in each island?

In [16]:
# Number of rows (penguins) recorded on each island.
df.groupby('island')['species'].size()

Unnamed: 0_level_0,species
island,Unnamed: 1_level_1
Biscoe,168
Dream,124
Torgersen,52


In [17]:
# Penguin count per island, with side-by-side bars for each species.
# The title names the island axis so the figure is not confused with the
# plain species-count histograms elsewhere in the notebook.
fig = px.histogram(df, x="island", color="species", text_auto=True, title="Number of each species per island", template="plotly_dark")
fig.update_layout(barmode='group', width=1420, height = 640)
fig.show()

#What is the sex(male/female) population in each island?

In [18]:
# Row count for every (island, sex) combination.
pd.pivot_table(df, index=['island', 'sex'], aggfunc='size')

Unnamed: 0_level_0,Unnamed: 1_level_0,0
island,sex,Unnamed: 2_level_1
Biscoe,female,80
Biscoe,male,83
Dream,female,61
Dream,male,62
Torgersen,female,24
Torgersen,male,23


#What is the percent of species in the dataset?

In [19]:
# Share of each species among all rows, expressed in percent.

# Rows per species.
species_counts = df.groupby('species').size()
# Divide by the total row count, then scale to percent.
species_percentages = species_counts.div(len(df)).mul(100)

print(species_percentages)


species
Adelie       44.186047
Chinstrap    19.767442
Gentoo       36.046512
dtype: float64


In [20]:
# Percentage of each species via a one-way crosstab: `normalize=True` turns the
# counts into fractions of the total, which are then scaled to percent.
pd.crosstab(df['species'], columns='count', normalize=True) * 100


col_0,count
species,Unnamed: 1_level_1
Adelie,44.186047
Chinstrap,19.767442
Gentoo,36.046512


##Display the percent of each specie as percent through pie chart.

In [21]:
# Pie chart with one slice per species, sized by its number of rows.
px.pie(df, names='species', title='Percent of Species')

##What percentage of the penguins are male and female on each island?

In [22]:
# Percentage for every (island, sex) pair.
# NOTE(review): `normalize=True` divides by the grand total, so the six values
# in the shown output sum to 100 and are shares of all counted penguins (333 rows
# with a known sex), not shares within each island.
pd.crosstab(index=[df['island'], df['sex']], columns='count', normalize=True) * 100 # Added columns argument with a value of 'count'

Unnamed: 0_level_0,col_0,count
island,sex,Unnamed: 2_level_1
Biscoe,female,24.024024
Biscoe,male,24.924925
Dream,female,18.318318
Dream,male,18.618619
Torgersen,female,7.207207
Torgersen,male,6.906907


#facet plot by species

In [23]:
# Bill depth against bill length, one panel per species, points coloured by sex.
fig = px.scatter(df, x='bill_length_mm', y = 'bill_depth_mm',  color="sex", facet_col="species", template='plotly_white')
fig.show()

#observation: Gentoo has the smallest bill depth, but not the smallest bill length (the longest bill in the dataset, 59.6 mm, belongs to a Gentoo)

##Display the bill length distribution.

In [24]:
# Re-check the missing-value count per column before plotting the distribution.
df.isnull().sum()

Unnamed: 0,0
species,0
island,0
bill_length_mm,2
bill_depth_mm,2
flipper_length_mm,2
body_mass_g,2
sex,11
year,0


In [25]:
# There are 2 missing values in the bill length column; those rows must be
# dropped before the distribution plot is created.
df_1 = df.dropna(subset=['bill_length_mm']) # Drop rows with NaN in 'bill_length_mm'

df_1.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


##checking for null values in bill_length_mm column

In [26]:
# Confirm that no missing bill lengths remain in the filtered frame.
df_1['bill_length_mm'].isnull().sum()

0

In [27]:
# Distribution plot of bill length built from the NaN-free frame `df_1`.
# NOTE(review): nothing in this cell draws random numbers, so the seed looks like
# a leftover from an example; it is kept so the global NumPy RNG state is unchanged.
np.random.seed(1)

x = df_1.bill_length_mm
hist_data = [x]
group_labels = ['bill_length_mm'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels)
fig.update_layout(title_text='Curve and Rug Plot')

fig.show()

# bill_length and bill_depth by the island/species

In [28]:
# Bill depth against bill length, one panel per island, points coloured by species.
fig = px.scatter(df, x='bill_length_mm', y = 'bill_depth_mm',  color="species", facet_col="island", template='plotly_dark')
fig.show()

In [29]:
# Column labels of the DataFrame.
df.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex', 'year'],
      dtype='object')

In [30]:
# Largest bill depth in the dataset.
df.bill_depth_mm.max()


21.5

In [31]:
# Smallest bill depth in the dataset.
df.bill_depth_mm.min()


13.1

In [33]:
# Extremes of bill length over the whole dataset, and the spread between them.
print('the max value is', df['bill_length_mm'].max())
print('the min value is', df['bill_length_mm'].min())
print('the range of bill_length', df['bill_length_mm'].max() - df['bill_length_mm'].min())


the max value is 59.6
the min value is 32.1
the range of bill_length 27.5


##Find the unusual bill length in each species?

In [None]:
# find outliers in bill_length_mm column for each specie

def find_outliers_iqr(df, column, species_col, k=1.5):
  """Finds outliers in a column for each species using the IQR method.

  For every distinct value of ``species_col`` the quartiles of ``column`` are
  computed from that species' rows only. A value counts as an outlier when it
  lies below ``Q1 - k * IQR`` or above ``Q3 + k * IQR``.

  Args:
    df: The Pandas DataFrame.
    column: The name of the column to check for outliers.
    species_col: The name of the column representing the species.
    k: Multiplier applied to the IQR to place the lower and upper fences.
      Defaults to 1.5, the value that was previously hard-coded.

  Returns:
    A dictionary where keys are species (in order of first appearance in
    ``df``) and values are lists of outlier values in the specified column.
    A species without outliers maps to an empty list.
  """
  outliers = {}
  for species in df[species_col].unique():
    values = df.loc[df[species_col] == species, column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    # Comparisons against NaN are False, so missing values are never reported.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outliers[species] = values[is_outlier].tolist()
  return outliers

# Run the per-species IQR check on the bill length column.
outliers_bill_length = find_outliers_iqr(df, 'bill_length_mm', 'species')

# Print one line per species listing its outlier values (an empty list means none).
for species, outlier_values in outliers_bill_length.items():
  print(f"Outliers in bill_length_mm for {species}: {outlier_values}")


###checking for outliers in bill_length_column for each specie.

In [None]:
# Box plots of bill length by species and sex, one facet per species.
# NOTE(review): `x` and `facet_col` are both 'species', so each facet looks like it
# only has data at its own species position — confirm this layout is intended.
fig = px.box(df, x='species', y = 'bill_length_mm',  color="sex", facet_col="species", template='plotly_white')
fig.show()



## weight analysis

###What is the mean average weight of each specie?

In [None]:
# Mean body mass per species, rounded to two decimals.
pd.pivot_table(df, index ='species', values='body_mass_g', aggfunc='mean').round(2)

# Observation: Gentoo penguins have the highest body mass.

In [None]:
# Bar chart of the mean body mass of each species.

# Mean body_mass_g per species (index: species, values: mean mass).
mean_body_mass_by_species = df.groupby('species')['body_mass_g'].mean()

# One bar per species; `labels` renames the generic x/y axis titles.
fig = px.bar(
    x=mean_body_mass_by_species.index,
    y=mean_body_mass_by_species.values,
    labels={'x': 'Species', 'y': 'Mean Body Mass (g)'},
    title='Mean Body Mass of Penguins by Species',
    text_auto= True,
    template = 'ggplot2',
)
fig.update_layout(width = 1420, height = 620)

fig.show()


##What is the body mass as percent of each specie?

In [None]:
# Share of the summed body mass contributed by each species, in percent.

# Summed body mass per species.
total_body_mass_by_species = df.groupby('species')['body_mass_g'].sum()

# Each species' sum relative to the sum over the whole column, scaled to percent.
body_mass_percentage_by_species = total_body_mass_by_species.div(df['body_mass_g'].sum()).mul(100)
print(body_mass_percentage_by_species)
print('\n')

print()
# Pie chart with one slice per species, sized by its percentage.
fig = px.pie(
    names=body_mass_percentage_by_species.index,
    values=body_mass_percentage_by_species.values,
    template = 'plotly_dark',
    title='Percentage of Body Mass by Species',
)
fig.show()
