# Data preprocessing:

## Creating Test data

In [39]:
import pandas as pd
from io import StringIO

# Inline CSV sample: three rows, four columns. Two cells are left empty
# (row 1 / column C and row 2 / column D) so that they are read as NaN.
data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0,',
])

# Wrap the string in an in-memory text buffer so read_csv can consume it
# like a file.
csv_data = StringIO(data)
df = pd.read_csv(csv_data)

# Note: `df.values` exposes the DataFrame's underlying NumPy array, which is
# what we feed into a scikit-learn estimator.
# Uncomment to inspect the type:
# print(type(df.values))

## Removing missing data:

In [40]:
# Three ways to discard missing data. Each call returns a new frame; `df`
# itself is not modified.

# 1) keep only the rows that contain no NaN at all
rows_without_nan = df.dropna(axis='index')
print(rows_without_nan)

# 2) keep only the columns that contain no NaN at all
columns_without_nan = df.dropna(axis='columns')
print(columns_without_nan)

# 3) drop a row only when the NaN sits in one of the listed columns (here: 'C')
rows_with_c_present = df.dropna(subset=['C'])
print(rows_with_c_present)

     A    B    C    D
0  1.0  2.0  3.0  4.0
      A     B
0   1.0   2.0
1   5.0   6.0
2  10.0  11.0
      A     B     C    D
0   1.0   2.0   3.0  4.0
2  10.0  11.0  12.0  NaN


## Imputing missing values
### Interpolation techniques:

In [41]:
#mean imputation: every NaN is replaced by the mean of its column
import numpy as np
from sklearn.impute import SimpleImputer

raw_values = df.values

# fit() learns the statistics from the data and hands back the imputer,
# so construction and fitting can be chained.
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(raw_values)
imputed_data = imp.transform(raw_values)

# show the array before and after imputation
print(raw_values)
print(imputed_data)


[[ 1.  2.  3.  4.]
 [ 5.  6. nan  8.]
 [10. 11. 12. nan]]
[[ 1.   2.   3.   4. ]
 [ 5.   6.   7.5  8. ]
 [10.  11.  12.   6. ]]


## Handling categorical data

In [42]:
# Sample data: a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


### Mapping ordinal features
Ordinal features must be mapped manually, because their order cannot be derived automatically: we define a dictionary that assigns each category an integer reflecting its rank (here M < L < XL).

In [43]:
# Explicit ranking of the size categories: M < L < XL
size_mapping = dict(XL=3, L=2, M=1)

# Swap each size label for its rank. This overwrites the 'size' column.
# NOTE(review): re-running this cell on the already-mapped column would
# presumably produce NaN, since the integers are not keys of size_mapping.
size_as_rank = df['size'].map(size_mapping)
df['size'] = size_as_rank
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [44]:
# Invert size_mapping (rank -> label) so that encoded sizes can be decoded again
reverse_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
reverse_size_mapping

{3: 'XL', 2: 'L', 1: 'M'}

### Mapping class labels
Class labels are not ordinal, but they can be encoded in exactly the same way as the ordinal features. The only difference is that neither the order nor the particular values matter, so we can assign any integer to each label.

In [45]:
# The class label is the last column of the frame.
label_col = df.columns[-1]

# Assign an arbitrary integer to each distinct class label; the reverse
# dictionary lets us turn the integers back into the original labels.
class_mapping = {label: idx for idx, label in enumerate(np.unique(df[label_col]))}
reverse_class_mapping = {idx: label for label, idx in class_mapping.items()}

# Map class labels to integers. The column is assigned by label instead of
# via `df.iloc[:, -1] = ...`: the positional form emits a pandas warning
# because its set-in-place semantics differ between pandas versions, whereas
# label-based assignment always replaces the whole column.
df[label_col] = df[label_col].map(class_mapping)

# Reverse mapping: restore the original string labels so that the following
# cells see the frame unchanged.
df[label_col] = df[label_col].map(reverse_class_mapping)
df

  df.iloc[:, -1] = df.iloc[:, -1].map(class_mapping)


Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


### Mapping class labels using sklearn 

In [46]:
from sklearn.preprocessing import LabelEncoder

# Let scikit-learn derive the label -> integer encoding from the last column
# (the class labels) instead of building the dictionary by hand.
class_labels = df.iloc[:, -1]
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(class_labels)
encoded_labels

array([0, 1, 0])

In [47]:
# Decode the integer codes back into the original class-label strings
decoded_labels = encoder.inverse_transform(encoded_labels)
decoded_labels

array(['class1', 'class2', 'class1'], dtype=object)

### Mapping nominal values using One Hot Encoding technique

In [48]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Feature matrix: the first three columns (color, size, price) as an array
X = df.iloc[:, :3].values

# One-hot encode only column 0 (color); the remaining columns are passed
# through unchanged and end up after the dummy columns in the result.
# Legacy form kept for reference: ohe = OneHotEncoder(categorical_features=[0])
ct = ColumnTransformer(
    transformers=[("color", OneHotEncoder(), [0])],
    remainder="passthrough",
)
ct.fit_transform(X)


array([[0.0, 1.0, 0.0, 1, 10.1],
       [0.0, 0.0, 1.0, 2, 13.5],
       [1.0, 0.0, 0.0, 3, 15.3]], dtype=object)

### One hot encoding using built in pandas method
An even more convenient way to create those dummy features via one-hot encoding 
is to use the `get_dummies` function implemented in pandas.

In [49]:
# get_dummies expands only the string column (color) into dummy columns;
# the numeric columns price and size are kept as they are.
onehot_input = df[['price', 'color', 'size']]
pd.get_dummies(onehot_input)

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


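When we use one-hot encoded datasets, we have to keep in mind that the encoding introduces multicollinearity: the dummy columns of a feature always sum to 1, so any one of them can be predicted from the others. To reduce the correlation among variables, we can simply remove one feature column from the one-hot encoded array. No information is lost by doing so: if `color_green` and `color_red` are both 0, the color must be blue.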

In [50]:
# drop_first=True leaves out the first dummy column (color_blue), which is
# implied whenever color_green and color_red are both 0.
onehot_input_reduced = df[['price', 'size', 'color']]
pd.get_dummies(onehot_input_reduced, drop_first=True)

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,1,0
1,13.5,2,0,1
2,15.3,3,0,0
