In [1]:
# load the Libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [2]:
#load the dataset

df = pd.read_csv('https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv', header=None)

In [3]:
#select column from dataset
df = df[[0,1,2]]

In [4]:
#rename the columns of the dataset
df.columns =['Class', 'Alcohol', 'Malic']

In [5]:
df.head()

Unnamed: 0,Class,Alcohol,Malic
0,1,14.23,1.71
1,1,13.2,1.78
2,1,13.16,2.36
3,1,14.37,1.95
4,1,13.24,2.59


In [6]:
#Take copy of the dataset
dfnorm = df.copy()
dfstand = df.copy()

In [7]:
from sklearn.preprocessing import MinMaxScaler
scaling = MinMaxScaler()
dfnorm[['Alcohol', 'Malic']] = scaling.fit_transform(df[['Alcohol', 'Malic']])

In [8]:
dfnorm.head()

Unnamed: 0,Class,Alcohol,Malic
0,1,0.842105,0.1917
1,1,0.571053,0.205534
2,1,0.560526,0.320158
3,1,0.878947,0.23913
4,1,0.581579,0.365613


In [9]:
from sklearn.preprocessing import StandardScaler
scaling = StandardScaler()
dfstand[['Alcohol', 'Malic']] = scaling.fit_transform(df[['Alcohol', 'Malic']])

In [10]:
dfstand.head()

Unnamed: 0,Class,Alcohol,Malic
0,1,1.518613,-0.56225
1,1,0.24629,-0.499413
2,1,0.196879,0.021231
3,1,1.69155,-0.346811
4,1,0.2957,0.227694


# Now we can see the performance of the Rawdata, Normalize data and Standardize data 

### 1. Raw data 

In [11]:
#copy the original data
raw_data = df.copy()
#split the data into X(independent variable) and Y(Dependent Variable) 
Y = raw_data[['Class']]
X = raw_data[['Alcohol', 'Malic']]

#split the dataset into test and train
x_train, x_test, y_train, ytest = train_test_split(X,Y, test_size = 0.2, random_state = 42)\

# define K-nn Model with k = 5
knn = neighbors.KNeighborsRegressor(n_neighbors=5)
y_pred = knn.fit(x_train, y_train).predict(x_test)

y_pred = [int(i) for i in y_pred] #Convert continuous values to discreate values
print('Square root mean squared error:  '+str(np.sqrt(mean_squared_error(ytest,y_pred))))
print('multilabel_confusion_matrix :  '+ str(accuracy_score(ytest,y_pred)))

Square root mean squared error:  0.5773502691896257
multilabel_confusion_matrix :  0.75


### 2. Normalization

### Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. 
### It is also known as Min-Max scaling
####  Xnorm = Xi - X.min() / X.max() - X.min()

In [12]:
### 2.Normalization data

#split the data into X(independent variable) and Y(Dependent Variable) 
Y = dfnorm[['Class']]
X = dfnorm[['Alcohol', 'Malic']]

#split the dataset into train and test dataset
x_train, x_test, y_train, ytest = train_test_split(X,Y, test_size = 0.2, random_state = 42)

#define knn model with k = 5
knn = neighbors.KNeighborsRegressor(n_neighbors=5)
y_pred = knn.fit(x_train, y_train).predict(x_test)

y_pred = [int(i) for i in y_pred] # convert continuous value to discreate values

#Performance Metrics
print('Square root mean squared error:  '+str(np.sqrt(mean_squared_error(ytest,y_pred))))
print('multilabel_confusion_matrix :  '+ str(accuracy_score(ytest,y_pred)))


Square root mean squared error:  0.6666666666666666
multilabel_confusion_matrix :  0.7222222222222222


### 3. Standardization

### Standardization is another scaling technique where the values are centered around the mean with a unit standard deviation. This means that the mean of the attribute becomes zero and the resultant distribution has a unit standard deviation.

#### Here’s the formula for standardization: Xstand = Xi - mean / standard deviation

In [13]:


#split the data into X(independent variable) and Y(Dependent Variable) 
Y = dfstand[['Class']]
X = dfstand[['Alcohol', 'Malic']]

#splitting train and test dataset
x_train, x_test, y_train, ytest = train_test_split(X,Y, test_size = 0.2, random_state = 42)

#define the knn model with k=5
knn = neighbors.KNeighborsRegressor(n_neighbors=5)

# fit and predict value using knn model
y_pred = knn.fit(x_train, y_train).predict(x_test) 

y_pred = [int(i) for i in y_pred] # convert continuous value to discreate values

#performance metrics
print('Square root mean squared error:  '+str(np.sqrt(mean_squared_error(ytest,y_pred))))
print('multilabel_confusion_matrix :  '+ str(accuracy_score(ytest,y_pred)))


Square root mean squared error:  0.6454972243679028
multilabel_confusion_matrix :  0.75
