In [205]:
"""
independent variables: inputs, predictors, or features that are used to explain or influence the outcomes 
dependent variables: responses or outcomes influenced by changes in independent variables

training set: data that is given or used to train and from which a model learns and predicts or figures out relationships between inputs and outputs
test set: isolated from training set that could have unseen and real world data to evaluate model's performance

feature scaling: a method to standardized independent variables that adjusts the range of data features so that all features are equally proportionally
"""

"\nindependent variables: inputs, predictors, or features that are used to explain or influence the outcomes \ndependent variables: responses or outcomes influenced by changes in independent variables\n\ntraining set: data that is given or used to train and from which a model learns and predicts or figures out relationships between inputs and outputs\ntest set: isolated from training set that could have unseen and real world data to evaluate model's performance\n\nfeature scaling: a method to standardized independent variables that adjusts the range of data features so that all features are equally proportionally\n"

In [206]:
"""
min-max normalization: transform data so that every feature value falls into desired range (commonly being [0, 1])

it could be mapped via a function where f(x) if x is minimum is mapped to 0 and if it is max it is mapped to 1

f(x_min) = a * x_min + b = 0 => b = -a * x_min
f(x_max) = a * x_max + b = 1 => substituting b with -a * x_min => 1  = a (x_max - x_min) = 1
a = 1 / (x_max - x_min)
f(x) = (1/(x_max - x_min)) * x - x_min/ (x_max - x_min) = (x - x_min )/ (x_max - x_min)
"""

'\nmin-max normalization: transform data so that every feature value falls into desired range (commonly being [0, 1])\n\nit could be mapped via a function where f(x) if x is minimum is mapped to 0 and if it is max it is mapped to 1\n\nf(x_min) = a * x_min + b = 0 => b = -a * x_min\nf(x_max) = a * x_max + b = 1 => substituting b with -a * x_min => 1  = a (x_max - x_min) = 1\na = 1 / (x_max - x_min)\nf(x) = (1/(x_max - x_min)) * x - x_min/ (x_max - x_min) = (x - x_min )/ (x_max - x_min)\n'

In [207]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer

In [208]:
data_set = pd.read_csv("../data/Data.csv")

# extract features from the dataset by locating the indexes of the columns and rows
X = data_set.iloc[:, :-1]

# extract the dependent values by locating the last index which is the output or dependent column
y = data_set.iloc[:, -1]
print(X)
print(y)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0
0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


In [209]:
# imputer module of sklearn helps in handling missing data, of which SImpleImputer is a class which replaces missing values with some statistical value
# missing_values: all the missing values that have the type of values specified 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# connect or apply the imputer to matrix of features and use the transform method to replace the missing values with respective strategy
imputer.fit(X=X.iloc[:, 1:])

# update the columns of X with new values with updated missing values
X.iloc[:, 1:] = imputer.transform(X=X.iloc[:, 1:])
print(X)
print(y)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000
0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


In [210]:
# encoding categorical data or features: convert into n number of columns for n categorie with values where they have respectively 0 or 1 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# object instance of Columntranfoomer class, remainder: passthrough: remainders are kept or preserved as they are
ct = ColumnTransformer(
    transformers = [('encoder', OneHotEncoder(), [0])], 
    remainder='passthrough')
X = np.array(ct.fit_transform(X))

print(X)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]]


In [211]:
# encode the dependent variables of yes/no 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)


In [212]:
iris = load_iris()
# pd.Dataframe: pandas function that conevrts numpy array of iris data intoa pandas dataframe
# iris.data: numpy array containing feature values of the dataset iris with each row corresponding to one iris flower and each column representing one of the four measurements
# iris.feature_names: list of strings that represents each column that can be used by pandas as names of the columns
iris_pd = pd.DataFrame(iris.data, columns=iris.feature_names)

# iris.target: a numpy array that has target values or labels for each of the iris sample corresponding to respective iris species
# by adding a new column target that assigns specific information for each sample
iris_pd['target'] = iris.target

X_iris = iris_pd.iloc[:, :-1]
y_iris = iris_pd.iloc[:, -1]

print(X_iris)
print(y_iris)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
4                  5.0               3.6                1.4               0.2
..                 ...               ...                ...               ...
145                6.7               3.0                5.2               2.3
146                6.3               2.5                5.0               1.9
147                6.5               3.0                5.2               2.0
148                6.2               3.4                5.4               2.3
149                5.9               3.0                5.1               1.8

[150 rows x 4 columns]
0      0
1      0
2      0
3      0
4   

In [213]:
# splitting the data into training and tests set
# training set: data that is fed to develop the model
# test set: to validate and correct the model based on the way the function is developed based on the trainig set and its output values
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=32)


In [214]:
# feature scaling is operated on data to ensure that all the values or features have equal emphasis
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.fit_transform(X_test[:, 3:])

In [215]:
print(X_train)
print(X_test)

[[ 0.          1.          0.         -1.68580983 -1.05553882]
 [ 0.          1.          0.          1.53377393  1.64209282]
 [ 0.          0.          1.         -0.39797633 -0.40438635]
 [ 1.          0.          0.          0.5678988   0.61885324]
 [ 0.          1.          0.         -0.07601795 -0.14599252]
 [ 0.          0.          1.         -0.27277029 -1.24158238]
 [ 1.          0.          0.         -0.88091389 -0.68345169]
 [ 1.          0.          0.          1.21181555  1.2700057 ]]
[[ 1.  0.  0.  1.  1.]
 [ 0.  0.  1. -1. -1.]]
