# Data Processing
First rule in data science:

Garbage in -> Garbage out

It is very important to clean the data and ensure it is possible to work with it

In [2]:
import pandas as pd # used for data manipulation, Python Data Analysis Library
import numpy as np # used for numerical calculus, also pandas is built using numpy, Numerical Python
from sklearn.preprocessing import Binarizer, MinMaxScaler, StandardScaler # for feature extraction

## Missing values
Missing values are tricky to deal with. A missing value is missing information; sometimes we can afford to lose that information if our dataset is large, and in that situation we can choose to delete the rows that contain missing values.

In [3]:
# Load the raw iris sample that still contains errors.
# header=0 means the first row of the file supplies the column names.
data = pd.read_csv(r"data/iris-with-errors.csv", header=0)
print(f'Rows:\t{data.shape[0]:2.0f}\nCols:\t{data.shape[1]:2.0f}')

# Peek at the first rows: some entries are missing (NaN, Not a Number) or
# written as "?", and several rows are exact duplicates of each other.
# Both problems have to be dealt with before the data can be used.
data.head(6)

Rows:	25
Cols:	 5


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,duplicada
1,5.1,3.5,1.4,0.2,duplicada
2,?,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,5.1,3.5,1.4,0.2,duplicada
5,,3.1,1.5,0.2,setosa


In [3]:
# "?" is a second spelling of "missing" in this file (the row with index 2 has
# one in sepal_length), so it is converted to NaN first. dropna() then discards
# every row that has at least one missing entry.
data = data.replace("?", np.nan).dropna()
data.head(6)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,duplicada
1,5.1,3.5,1.4,0.2,duplicada
3,4.7,3.2,1.3,0.2,setosa
4,5.1,3.5,1.4,0.2,duplicada
6,5.0,3.6,1.4,0.2,setosa
7,5.4,3.9,1.7,0.4,duplicada


In [4]:
# Remove exact duplicate rows.
# duplicated() returns a boolean Series that flags every row repeating an
# earlier one (the first occurrence itself is not flagged). A bare expression
# in the middle of a cell displays nothing, so the count is printed explicitly.
print(f'Duplicated rows: {data.duplicated().sum()}')

# drop_duplicates() keeps the first occurrence of each repeated row.
data = data.drop_duplicates()
data.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,duplicada
3,4.7,3.2,1.3,0.2,setosa
6,5.0,3.6,1.4,0.2,setosa
7,5.4,3.9,1.7,0.4,duplicada
10,5.0,3.4,1.5,0.2,setosa


#### Next step
After removing duplicate and NaN rows we can work with the data
Always, always make sure that your data is in good condition before starting the machine learning analysis

In [5]:
# From here on only the two length measurements are kept.
data.columns  # column labels (only displayed when this is the cell's last expression)

# Columns can be dropped either by position or by label; labels are used here.
data = data.drop(columns=['sepal_width', 'petal_width'])

# Also discard the rows sitting at positions 0 and 2 of the current frame.
# data.index[[0, 2]] translates those positions into the matching index labels.
data = data.drop(index=data.index[[0, 2]])

## Replacing Missing values
Sometimes we can't afford to delete missing values, so we replace them with something that won't harm our algorithm's performance. We can replace the missing values with:

- Mean
- Median
- Other Measure

In [16]:
# Start over from the raw file so that the missing entries are present again.
data = pd.read_csv(r"data/iris-with-errors.csv", header=0)

# Turn the "?" placeholders into proper NaN values.
data = data.replace('?', np.nan)
print(data.shape)
data.head(5)

(25, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,duplicada
1,5.1,3.5,1.4,0.2,duplicada
2,,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,5.1,3.5,1.4,0.2,duplicada


In [38]:
# Mean imputation: every NaN is replaced by the mean of its own column.
# The mean is only one possible choice; the median or any other statistic
# could be substituted in exactly the same way.

# Feature matrix: every column except the last one, converted to float.
X = np.array(data.iloc[:, :-1], dtype=float)

# Per-column means, computed while ignoring the NaN entries.
avrgs = np.nanmean(X, axis=0)

# Vectorised fill: avrgs (one value per column) is broadcast across the rows,
# so each NaN cell receives the mean of the column it sits in. This replaces an
# element-by-element Python loop and leaves no loop indices behind in the
# kernel namespace for other cells to pick up by accident.
X = np.where(np.isnan(X), avrgs, X)

array([[5.1       , 3.5       , 1.4       , 0.2       ],
       [5.1       , 3.5       , 1.4       , 0.2       ],
       [5.02272727, 3.        , 1.4       , 0.2       ],
       [4.7       , 3.2       , 1.3       , 0.2       ],
       [5.1       , 3.5       , 1.4       , 0.2       ],
       [5.02272727, 3.1       , 1.5       , 0.2       ],
       [5.        , 3.6       , 1.4       , 0.2       ],
       [5.4       , 3.9       , 1.7       , 0.4       ],
       [5.4       , 3.9       , 1.7       , 0.4       ],
       [4.6       , 3.4       , 1.4       , 0.22608696],
       [5.        , 3.4       , 1.5       , 0.2       ],
       [4.4       , 2.9       , 1.4       , 0.2       ],
       [4.9       , 3.1       , 1.5       , 0.1       ],
       [5.4       , 3.7       , 1.5       , 0.2       ],
       [4.4       , 2.9       , 1.4       , 0.2       ],
       [4.8       , 3.4       , 1.6       , 0.2       ],
       [4.8       , 3.        , 1.4       , 0.1       ],
       [4.4       , 2.9       ,

In [5]:
# Load the iris data set used for the scaling examples.
data = pd.read_csv(r'data/iris.csv', header=0)
print(f'shape = {data.shape}')

# Two independent float copies of the feature columns (everything except the
# last column): X is normalised below, Z is standardised.
X = np.array(data.iloc[:, :-1], dtype=float)
Z = np.array(data.iloc[:, :-1], dtype=float)

# Optional check of the original ranges, column by column:
#     X.max(axis=0), X.min(axis=0)

## Functions to transform the data

# Normalisation: rescale every feature linearly onto the interval [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)

# Optional check after normalising (each column should now span 0 to 1):
#     X.max(axis=0), X.min(axis=0)

# Standardisation: centre every feature and scale it to unit variance.
scaler = StandardScaler()
Z = scaler.fit_transform(Z)

# Optional check after standardising:
#     Z.max(axis=0), Z.min(axis=0)

shape = (150, 5)


In [8]:
## Binarization
# Map every feature value to 0 or 1 depending on whether it exceeds a threshold.

# Feature matrix: every column except the last one, converted to float.
X = np.array(data[data.columns[0:data.shape[1]-1]], dtype=float)

# Threshold, expressed on the normalised [0, 1] scale used below.
T = 0.2

# Rescale every feature to [0, 1] first so that a single threshold is
# meaningful for all columns, whatever their original ranges were.
scaler = MinMaxScaler(feature_range=(0, 1))
X_norm = scaler.fit_transform(X)
# To inspect the result, look at X_norm.min(axis=0) / X_norm.max(axis=0).
# The cell deliberately uses no loop index from other cells, so it also runs
# on a freshly restarted kernel.

# Values strictly greater than T become 1, all others become 0.
# Binarizer is stateless, so fit() learns nothing from the data; the call is
# kept only to follow the usual scikit-learn fit/transform pattern.
binarizer = Binarizer(threshold=T).fit(X_norm)
binaryX = binarizer.transform(X_norm)