# EDA for a simple ML project using FastAPI

- This notebook is part of a simple ML project using FastAPI for EDA (Exploratory Data Analysis).
- It includes loading a dataset and performing basic operations.
- The dataset is a CSV file loaded from `../data/penguins.csv`, a path relative to this notebook.

In [8]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np

# Read the penguins CSV; the relative path is resolved against the kernel's
# current working directory.
penguins_csv = '../data/penguins.csv'
df = pd.read_csv(penguins_csv)
# Keep the preview as the cell's last expression so it renders as a table.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,,,,,,,


In [9]:
# Summarise the frame's structure: column dtypes and non-null counts.
df.info()
# Then show the top rows as the cell's displayed result.
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355 entries, 0 to 354
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            348 non-null    object 
 1   island             348 non-null    object 
 2   bill_length_mm     346 non-null    float64
 3   bill_depth_mm      346 non-null    float64
 4   flipper_length_mm  346 non-null    float64
 5   body_mass_g        346 non-null    float64
 6   sex                337 non-null    object 
dtypes: float64(4), object(3)
memory usage: 19.5+ KB


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,,,,,,,


In [10]:
# Remove every row that has at least one missing value in any column
# (axis=0 / how='any' are the pandas defaults, spelled out for the reader).
# The original row labels are kept, so the index is no longer contiguous.
df = df.dropna(axis=0, how='any')
# Re-check structure to confirm all columns now share the same non-null count.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 337 entries, 0 to 354
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            337 non-null    object 
 1   island             337 non-null    object 
 2   bill_length_mm     337 non-null    float64
 3   bill_depth_mm      337 non-null    float64
 4   flipper_length_mm  337 non-null    float64
 5   body_mass_g        337 non-null    float64
 6   sex                337 non-null    object 
dtypes: float64(4), object(3)
memory usage: 21.1+ KB


In [11]:
# Frequency tables for the three categorical columns: species, island, sex.
species_counts = df['species'].value_counts()
island_counts = df['island'].value_counts()
sex_counts = df['sex'].value_counts()

# Print each table under its heading; headings after the first begin with a
# newline so the sections are separated by a blank line.
for heading, counts in (
    ("Species counts:", species_counts),
    ("\nIsland counts:", island_counts),
    ("\nSex counts:", sex_counts),
):
    print(heading)
    print(counts)


Species counts:
species
Adelie       147
Gentoo       121
Chinstrap     69
Name: count, dtype: int64

Island counts:
island
Biscoe       166
Dream        124
Torgersen     47
Name: count, dtype: int64

Sex counts:
sex
Male      169
Female    168
Name: count, dtype: int64


In [12]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the frame.
numerical_summary = df.describe()
# Emit the heading and the table in one call, separated by a newline.
print("\nNumerical summary:", numerical_summary, sep="\n")


Numerical summary:
       bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g
count      337.000000     337.000000         337.000000   337.000000
mean        44.006825      17.159347         200.991098  4209.347181
std          5.446907       1.963001          14.011153   807.050049
min         32.100000      13.100000         172.000000  2700.000000
25%         39.500000      15.600000         190.000000  3550.000000
50%         44.900000      17.300000         197.000000  4050.000000
75%         48.500000      18.700000         213.000000  4800.000000
max         59.600000      21.500000         231.000000  6300.000000
