# Data Preprocessing Tools

## Importing the libraries

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import regressor
from openpyxl import load_workbook

## Importing the dataset

In [47]:
df = pd.read_csv('Data.csv')

In [16]:
df.head(20)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        10 non-null     float64
 2   Salary     10 non-null     float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


In [38]:
df.describe()

Unnamed: 0,Age,Salary
count,10.0,10.0
mean,38.777778,63777.777778
std,7.253777,11564.099406
min,27.0,48000.0
25%,35.5,55000.0
50%,38.388889,62388.888889
75%,43.0,70750.0
max,50.0,83000.0


In [46]:
ix = df['Age'] > 35.1
ix

0     True
1    False
2    False
3     True
4     True
5    False
6     True
7     True
8     True
9     True
Name: Age, dtype: bool

In [50]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Taking care of missing data

In [None]:
df = df.fillna(df.mean(axis='rows', numeric_only=True))
df.head(20)

In [100]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
xy = pd.read_csv('Data.csv')
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [101]:
# imp.fit(x[['Age', 'Salary']])
x = xy.iloc[:, :-1].values # Convert to np array
y = xy.iloc[:,-1].values # Create nan vector

In [102]:
#imp.fit(x[:, 1:3]) # Fit the imputer
x[:, 1:3]= imp.fit_transform(x[:, 1:3]) # Transform the data

In [103]:
x # print the data

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encoding categorical data

### Encoding the Independent Variables

In [107]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [129]:
# Setup for section
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# Setup the encoder
transformers = [('encoder', OneHotEncoder(), [0])]
tf = ColumnTransformer(transformers, remainder='passthrough')

In [130]:
# Ensure clean data for section
x = pd.read_csv('Data.csv').iloc[:, :-1].values
x[:, 1:3]= imp.fit_transform(x[:, 1:3]) # Transform the data

In [131]:
# Perform encoding
x = np.array(tf.fit_transform(x))

In [132]:
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### Encoding the Dependent Variable

In [135]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [136]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into the Training set and Test set

In [140]:
from sklearn.model_selection import train_test_split

In [143]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)

In [176]:
# Floating stuff display rounding tests, none of them work.
# np.set_printoptions(precision=2, formatter={'float_kind':"{:.2f}".format, 'complex_kind':"{:.2}".format})
# np.get_printoptions()
xtrain

'[[0.0 1.0 0.0 40.0 63777.77777777778]\n [1.0 0.0 0.0 37.0 67000.0]\n [0.0 0.0 1.0 27.0 48000.0]\n [0.0 0.0 1.0 38.77777777777778 52000.0]\n [1.0 0.0 0.0 48.0 79000.0]\n [0.0 0.0 1.0 38.0 61000.0]\n [1.0 0.0 0.0 44.0 72000.0]\n [1.0 0.0 0.0 35.0 58000.0]]'

In [146]:
xtest

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0]], dtype=object)

In [147]:
ytrain

array([1, 1, 1, 0, 1, 0, 0, 1])

In [148]:
ytest

array([0, 0])

## Feature Scaling