In [1]:
# Load necessary modules
import pandas as pd
import numpy as np

In [2]:
# Read the raw iris measurements from the csv file into a DataFrame
data = pd.read_csv(filepath_or_buffer='iris-data.csv')

In [3]:
# Preview the first rows to check that the file loaded as expected
data.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm
count,150.0,150.0,150.0,145.0
mean,5.644627,3.054667,3.758667,1.236552
std,1.312781,0.433123,1.76442,0.755058
min,0.055,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.4
50%,5.7,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [5]:
# Check the data type of each column
data.dtypes

sepal_length_cm    float64
sepal_width_cm     float64
petal_length_cm    float64
petal_width_cm     float64
class               object
dtype: object

In [6]:
# List the distinct labels found in the categorical 'class' column
data['class'].unique()

array(['Iris-setosa', 'Iris-setossa', 'Iris-versicolor', 'versicolor',
       'Iris-virginica'], dtype=object)

It seems a few of the class labels are misspelled or inconsistent.
###### Iris-setossa = Iris-setosa
###### versicolor = Iris-versicolor
Let's rename them. In the end we should end up with 3 distinct categories.

In [7]:
# Fix the typos in the class labels so that only the 3 real species remain.
# Keys are the incorrect spellings found in the data, values are the labels
# they should be replaced with.
class_dict = {
    'Iris-setossa': 'Iris-setosa',
    'versicolor': 'Iris-versicolor'
}
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on data['class']: the in-place call acts on a
# selected column object, which pandas 2.x warns about and which no longer
# updates `data` once Copy-on-Write is enabled.
data['class'] = data['class'].replace(class_dict)
# Display the remaining labels to confirm the fix
data['class'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [8]:
# Count the missing (NaN) values in each column
data.isna().sum()

sepal_length_cm    0
sepal_width_cm     0
petal_length_cm    0
petal_width_cm     5
class              0
dtype: int64

#### We are missing 5 values in petal_width_cm.
Since this is observed data, there's not much we can do to recover them, so let's drop those rows.

In [9]:
# Remove, in place, every row that has a missing value in any column
data.dropna(axis=0, how='any', inplace=True)

In [10]:
# Confirm that no missing values remain after dropping the incomplete rows
data.isna().sum()

sepal_length_cm    0
sepal_width_cm     0
petal_length_cm    0
petal_width_cm     0
class              0
dtype: int64

In [11]:
# Re-check the summary statistics now that the rows with missing values are gone
data.describe()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm
count,145.0,145.0,145.0,145.0
mean,5.670303,3.046207,3.836552,1.236552
std,1.32654,0.435096,1.742955,0.755058
min,0.055,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.4
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


#### We also see outliers in sepal_length_cm.
The outliers are all the rows with a sepal_length_cm under 1 cm.
Although they could be typos, we have no way of knowing for sure.
So I'll drop those rows, because I think it's better to have no data than to have incorrect data.
But first, let's see what percentage of the data these rows represent.

In [12]:
# Select the suspected outliers once: rows with a sepal length below 1 cm
outlier_rows = data[data['sepal_length_cm'] < 1]
n_outliers = len(outlier_rows)
# Share of the current dataset that those rows represent, in percent
per_missing = n_outliers / len(data) * 100
print("We are missing", n_outliers, "records, which entitle for", round(per_missing, 2), "% of the data")

We are missing 5 records, which entitle for 3.45 % of the data


In this case we can safely drop them, since they are 'only' 3.45% of the data.

In [13]:
# Drop, in place, the rows whose sepal length is below 1 cm
outlier_index = data.loc[data['sepal_length_cm'] < 1].index
data.drop(index=outlier_index, inplace=True)

In [14]:
# Summary statistics after removing the sepal_length_cm outliers
data.describe()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm
count,140.0,140.0,140.0,140.0
mean,5.870714,3.06,3.827143,1.235714
std,0.806136,0.433955,1.770089,0.766574
min,4.4,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.35
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [16]:
# Rebuild a clean 0..n-1 index after the row drops. drop=True discards the old
# index instead of inserting it as an 'index' column, so no follow-up
# data.drop('index', ...) step is needed and this cell can be re-run safely.
data.reset_index(drop=True, inplace=True)

In [19]:
# Display the cleaned, re-indexed DataFrame for a final visual check
data

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
135,6.7,3.0,5.2,2.3,Iris-virginica
136,6.3,2.5,5.0,2.3,Iris-virginica
137,6.5,3.0,5.2,2.0,Iris-virginica
138,6.2,3.4,5.4,2.3,Iris-virginica


In [None]:
# Now that we have a cleaner dataset let's save a new csv.
# index=False keeps the row index out of the file; otherwise it is written as
# an unnamed first column and shows up as 'Unnamed: 0' when the csv is reloaded.
data.to_csv('iris_clean.csv', index=False)