In [13]:
import pandas as pd
import numpy as np

# Load data

In [14]:
# Location of the raw Iris measurements on the UCI repository.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# The file is read with header=None, so the column labels are supplied here.
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris = pd.read_csv(url, names=column_names, header=None)
# `iris` stays untouched as the reference frame; experiments happen on `df`.
df = iris.copy(deep=True)

# Data cleaning

## Missing values

### Is there any missing value in the dataframe?

In [15]:
# One boolean per column: True when that column holds at least one missing value.
df.isnull().any(axis=0)

sepal_length    False
sepal_width     False
petal_length    False
petal_width     False
class           False
dtype: bool

### Let's set the values of rows 10 to 29 of the column 'petal_length' to NaN

In [16]:
# Label-based slice: rows 10 through 29 (inclusive) of 'petal_length' become missing.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.x.
df.loc[10:29, 'petal_length'] = np.nan

### Which column has the maximum number of missing values?

In [17]:
# Count the missing entries per column, then report the label of the largest count.
nan_counts = df.isna().sum()
nan_counts.idxmax()

'petal_length'

### Try to substitute the NaN values with two methods:
- replace null values with column mean (apply it to a copy of the dataframe)
- replace null values with 1.0



In [18]:
# Method 1: fill the gaps with the column mean, working on a copy so `df` keeps its NaNs.
df_copy = df.copy()
petal_length_mean = df_copy['petal_length'].mean()
df_copy['petal_length'] = df_copy['petal_length'].fillna(petal_length_mean)

# Method 2: fill the gaps with the constant 1.0.
# `fillna` returns a new frame here, so `df` itself is still left unchanged.
df_filled_one = df.fillna({'petal_length': 1.0})

# Show the mean-imputed rows (labels 10 to 35 inclusive).
df_copy.loc[10:35]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
10,5.4,3.7,4.108462,0.2,Iris-setosa
11,4.8,3.4,4.108462,0.2,Iris-setosa
12,4.8,3.0,4.108462,0.1,Iris-setosa
13,4.3,3.0,4.108462,0.1,Iris-setosa
14,5.8,4.0,4.108462,0.2,Iris-setosa
15,5.7,4.4,4.108462,0.4,Iris-setosa
16,5.4,3.9,4.108462,0.4,Iris-setosa
17,5.1,3.5,4.108462,0.3,Iris-setosa
18,5.7,3.8,4.108462,0.3,Iris-setosa
19,5.1,3.8,4.108462,0.3,Iris-setosa


### Set the first 3 rows as NaN

In [19]:
# Blank out every column of the first three rows (positional slice).
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.x.
df.iloc[:3] = np.nan
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,,,,,
1,,,,,
2,,,,,
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Delete the rows that have all NaN

In [20]:
# how='all' removes only the rows in which every value is NaN.
# The default (how='any') would also discard rows that merely lack 'petal_length'.
df = df.dropna(how='all')

### Reset the index so it begins with 0 again

In [21]:
# Renumber the rows from 0; drop=True discards the old labels instead of
# keeping them as an extra column.
renumbered = df.reset_index(drop=True)
df = renumbered
df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,4.6,3.1,1.5,0.2,Iris-setosa
1,5.0,3.6,1.4,0.2,Iris-setosa
2,5.4,3.9,1.7,0.4,Iris-setosa
3,4.6,3.4,1.4,0.3,Iris-setosa
4,5.0,3.4,1.5,0.2,Iris-setosa


## Duplicates

### Does the dataframe contain duplicated rows? If any, visualize all duplicated rows (don't omit first or last occurrences)

In [23]:
df = iris.copy()
# keep=False flags every member of a duplicate group, first and last occurrences included.
is_repeated = iris.duplicated(keep=False)
iris.loc[is_repeated]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
9,4.9,3.1,1.5,0.1,Iris-setosa
34,4.9,3.1,1.5,0.1,Iris-setosa
37,4.9,3.1,1.5,0.1,Iris-setosa
101,5.8,2.7,5.1,1.9,Iris-virginica
142,5.8,2.7,5.1,1.9,Iris-virginica


### Which row is the most repeated?

In [47]:
df = iris.copy()

# Keep every occurrence of a duplicated row, group identical rows together,
# count each group and put the largest group first so the most repeated row
# is the first entry of the result.
duplicate_counts = (
    df[df.duplicated(keep=False)]
    .groupby(list(df.columns))
    .size()
    .sort_values(ascending=False)
)
duplicate_counts

sepal_length  sepal_width  petal_length  petal_width  class         
4.9           3.1          1.5           0.1          Iris-setosa       3
5.8           2.7          5.1           1.9          Iris-virginica    2
dtype: int64

### Drop duplicated rows

In [59]:
# Keep the first occurrence of each row; `iris` itself is left unchanged.
iris_dd = iris.drop_duplicates()

# Only the last expression of a cell is displayed, so print the row counts explicitly.
print(f"rows before: {iris.index.size}, rows after: {iris_dd.index.size}")

# Display the de-duplicated frame (the result of this step), not the original.
iris_dd.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Detect outliers, e.g., values that are higher than the 85th percentile or lower than the 25th percentile.

In [82]:
# Restrict the check to the four measurement columns of the de-duplicated frame.
iris_numeric = iris_dd.loc[:, 'sepal_length':'petal_width']

# One threshold per column.
q_85_serie = iris_numeric.quantile(q=.85)
q_25_serie = iris_numeric.quantile(q=.25)

# A row is flagged when any of its values lies below the column's 25th
# percentile or above the column's 85th percentile.
# Named `outlier_mask` so the builtin `filter` is not shadowed.
outlier_mask = ((iris_numeric < q_25_serie) | (iris_numeric > q_85_serie)).any(axis=1)
iris_numeric[outlier_mask]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
143,6.8,3.2,5.9,2.3
144,6.7,3.3,5.7,2.5
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9


# Data transformation

## Replace class values by removing "Iris-" prefix (use a dictionary)

In [None]:
# Distinct class labels present in the data.
classes_set = set(iris['class'].unique())
# Lookup table: original label -> label without the 'Iris-' prefix.
class_dict = dict((label, label.replace('Iris-', '')) for label in classes_set)

# `assign` returns a new frame, so `iris` keeps its original labels.
iris_copy = iris.assign(**{'class': iris['class'].map(class_dict)})
iris_copy.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## Delete columns
Delete, for example, the class column.

In [97]:
# Work on a copy and remove the 'class' column from it; `iris` is not modified.
iris_copy = iris.copy().drop(labels=['class'], axis='columns')
iris_copy.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## How to normalize all columns in a dataframe?
- Normalize all columns of df by subtracting the column mean and dividing by the standard deviation.
- Range all columns of df such that the minimum value in each column is 0 and max is 1.

In [121]:
# Independent copy of the four numeric columns, so nothing done to `iris_copy`
# can reach `iris`. This replaces a full copy that was immediately overwritten
# by a plain slice of `iris`.
iris_copy = iris.loc[:, 'sepal_length':'petal_width'].copy()

def normalize_col(column: pd.Series, mean: float, std: float) -> pd.Series:
    """Standardise a column: subtract `mean`, then divide by `std`.

    Args:
        column: values to transform.
        mean: value subtracted from every element.
        std: value every centred element is divided by.

    Returns:
        A new Series holding the transformed values.
    """
    centred = column.sub(mean)
    return centred.div(std)

def min_max_scaler(column: pd.Series, min: float, max: float) -> pd.Series:
    """Rescale a column linearly so that `min` maps to 0 and `max` maps to 1.

    Args:
        column: values to transform.
        min: value that is mapped to 0.
        max: value that is mapped to 1.

    Returns:
        A new Series holding the rescaled values.
    """
    span = max - min
    shifted = column - min
    return shifted / span

# Both transformations start from the same unscaled `iris_copy` and are stored
# under separate names, so neither result overwrites the other or the input.

# 1) Standardise: subtract the column mean and divide by the column standard deviation.
iris_standardized = iris_copy.apply(lambda col: normalize_col(col, col.mean(), col.std()), axis=0)

# 2) Range: rescale every column so that its minimum is 0 and its maximum is 1.
iris_ranged = iris_copy.apply(lambda col: min_max_scaler(col, col.min(), col.max()), axis=0)

iris_ranged.head(100)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
95,0.388889,0.416667,0.542373,0.458333
96,0.388889,0.375000,0.542373,0.500000
97,0.527778,0.375000,0.559322,0.500000
98,0.222222,0.208333,0.338983,0.416667


In [125]:
# Independent copy of the four numeric columns.
iris_copy = iris.loc[:, 'sepal_length':'petal_width'].copy()

# Vectorised versions of the two transformations; each is computed directly
# from the unscaled data.
# Standardise: subtract the column mean, divide by the column standard deviation.
iris_std = (iris_copy - iris_copy.mean()) / iris_copy.std()
# Range: rescale every column so that its minimum is 0 and its maximum is 1.
iris_scaled = (iris_copy - iris_copy.min()) / (iris_copy.max() - iris_copy.min())
iris_scaled.head(100)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
95,0.388889,0.416667,0.542373,0.458333
96,0.388889,0.375000,0.542373,0.500000
97,0.527778,0.375000,0.559322,0.500000
98,0.222222,0.208333,0.338983,0.416667


## Binning and discretization
Discretize dataframe columns in 4 bins and get the new value frequency distribution

In [139]:
iris_copy = iris.loc[:, 'sepal_length':'petal_width'].copy()

# Replace each value by the quantile-based bin (4 bins per column) it falls into.
iris_binned = iris_copy.apply(lambda col: pd.qcut(col, 4), axis=0)

# Frequency distribution of the binned values: for every column, how many rows
# landed in each bin (sort=False keeps the bins in their own order rather than
# ordering them by count).
bin_frequencies = {
    name: iris_binned[name].value_counts(sort=False)
    for name in iris_binned.columns
}

iris_binned.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,"(4.2989999999999995, 5.1]","(3.3, 4.4]","(0.999, 1.6]","(0.099, 0.3]"
1,"(4.2989999999999995, 5.1]","(2.8, 3.0]","(0.999, 1.6]","(0.099, 0.3]"
2,"(4.2989999999999995, 5.1]","(3.0, 3.3]","(0.999, 1.6]","(0.099, 0.3]"
3,"(4.2989999999999995, 5.1]","(3.0, 3.3]","(0.999, 1.6]","(0.099, 0.3]"
4,"(4.2989999999999995, 5.1]","(3.3, 4.4]","(0.999, 1.6]","(0.099, 0.3]"


## Binarize categorical data (dummy variables)
Based on the previous result, binarize all dataframe columns