# Data Preprocessing Tools

## Importing the libraries

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [30]:
df = pd.read_csv('Data.csv')

In [31]:
df.head(20)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


In [33]:
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [34]:
ix = df['Age'] > 35.1
ix

0     True
1    False
2    False
3     True
4     True
5    False
6    False
7     True
8     True
9     True
Name: Age, dtype: bool

In [35]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Taking care of missing data

In [36]:
df = df.fillna(df.mean(axis='rows', numeric_only=True))
df.head(20)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [76]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
xy = pd.read_csv('Data.csv')
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [77]:
# imp.fit(x[['Age', 'Salary']])
x = xy.iloc[:, :-1].values # Convert to np array
y = xy.iloc[:,-1].values # Create y/n vector

In [78]:
#imp.fit(x[:, 1:3]) # Fit the imputer
x[:, 1:3]= imp.fit_transform(x[:, 1:3]) # Transform the data

In [79]:
x # print the data

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encoding categorical data

### Encoding the Independent Variables

In [41]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Ensure clean data for section
xy = pd.read_csv('Data.csv')
x = xy.iloc[:, :-1].values
y = xy.iloc[:,-1].values # Create y/n vector
x[:, 1:3]= imp.fit_transform(x[:, 1:3]) # Transform the data

In [83]:
# Setup for section
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# Setup the encoder
transformers = [('encoder', OneHotEncoder(), [0])]
tf = ColumnTransformer(transformers, remainder='passthrough')

In [82]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [84]:
# Perform encoding
x = np.array(tf.fit_transform(x))

In [85]:
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### Encoding the Dependent Variable

In [87]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [88]:
y = le.fit_transform(y)

In [89]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into the Training set and Test set

In [90]:
from sklearn.model_selection import train_test_split

In [116]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)

In [113]:
# Floating stuff display rounding tests, none of them work.
# np.set_printoptions(precision=2, formatter={'float_kind':"{:.2f}".format, 'complex_kind':"{:.2}".format})
# np.get_printoptions()
xtrain

array([[0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [114]:
print(xtest)

[[0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 50.0 83000.0]]


In [94]:
ytrain

array([1, 1, 1, 0, 1, 0, 0, 1])

In [95]:
ytest

array([0, 0])

## Feature Scaling

In [96]:
from sklearn.preprocessing import StandardScaler

In [97]:
sc = StandardScaler()


In [117]:
xtrain[:, 3:]=sc.fit_transform(xtrain[:, 3:])

In [99]:
xtrain

array([[0.0, 1.0, 0.0, 0.2630675731713538, 0.1238147854838185],
       [1.0, 0.0, 0.0, -0.25350147960148617, 0.4617563176278856],
       [0.0, 0.0, 1.0, -1.9753983221776195, -1.5309334063940294],
       [0.0, 0.0, 1.0, 0.05261351463427101, -1.1114197802841526],
       [1.0, 0.0, 0.0, 1.6405850472322605, 1.7202971959575162],
       [0.0, 0.0, 1.0, -0.08131179534387283, -0.16751412153692966],
       [1.0, 0.0, 0.0, 0.9518263102018072, 0.9861483502652316],
       [1.0, 0.0, 0.0, -0.5978808481167128, -0.48214934111933727]],
      dtype=object)

In [118]:
xtest[:, 3:]=sc.transform(xtest[:, 3:])


In [120]:
xtest

array([[0.0, 1.0, 0.0, -1.4588292694047795, -0.9016629672292141],
       [0.0, 1.0, 0.0, 1.984964415747487, 2.139810822067393]],
      dtype=object)

{'edgeitems': 3,
 'threshold': 1000,
 'floatmode': 'maxprec',
 'precision': 8,
 'suppress': False,
 'linewidth': 75,
 'nanstr': 'nan',
 'infstr': 'inf',
 'sign': '-',
 'formatter': None,
 'legacy': False}