In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
import os

In [10]:
# Switch to the folder that holds the CSV files read below. The location is
# machine-specific, so it can be overridden through the GLASS_DATA_DIR
# environment variable; the original path remains the default.
os.chdir(os.environ.get("GLASS_DATA_DIR", "D:\\meridianthe4\\PML\\Cases\\Glass_Identification"))

In [33]:
# Load the labelled glass dataset; the relative path resolves against the
# current working directory.
glass = pd.read_csv("Glass.csv")

In [34]:
# Display the loaded frame as the cell output (features plus the 'Type' label).
glass

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,building_windows_float_processed
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,building_windows_float_processed
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,headlamps
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,headlamps
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,headlamps
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,headlamps


In [35]:
# Encode the string class labels as integer codes WITHOUT overwriting
# glass['Type']. Mutating the frame in place made this cell non-idempotent:
# on a second run the fresh encoder would be fitted on the integer codes, and
# le.inverse_transform would then hand back codes instead of class names.
le = LabelEncoder()
y = pd.Series(le.fit_transform(glass['Type']), index=glass.index, name='Type')
X = glass.drop(columns='Type')

# Stratified 70/30 hold-out split; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25, stratify=y
)

### Without Scaling

In [None]:
# Baseline sweep on the raw (unscaled) features: fit one KNN per k in 1..15 and
# score its predicted class probabilities on the hold-out split with log loss.
ks = np.arange(1, 16)
k_loss_data = []

for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    y_proba = knn.fit(X_train, y_train).predict_proba(X_test)  # fit() returns the estimator
    loss = log_loss(y_test, y_proba)
    k_loss_data += [[k, loss]]

# One row per k, lowest log loss first (ascending is the sort_values default).
results_df = pd.DataFrame(data=k_loss_data, columns=['k', 'log_loss'])
results_df.sort_values('log_loss')

Unnamed: 0,k,log_loss
14,15,1.300383
6,7,1.654489
7,8,1.662591
8,9,1.685935
9,10,1.700234
10,11,1.743965
11,12,1.76407
12,13,1.777022
13,14,1.805283
5,6,2.138643


### Standard Scaling

In [41]:
# Learn the standardisation statistics from the training split only, then apply
# that same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [42]:
# Repeat the k = 1..15 sweep on the scaled features held in
# X_train_scaled / X_test_scaled, scoring each model by hold-out log loss.
ks = np.arange(1, 16)
k_loss_data = []

for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    y_proba = knn.fit(X_train_scaled, y_train).predict_proba(X_test_scaled)  # fit() returns the estimator
    loss = log_loss(y_test, y_proba)
    k_loss_data += [[k, loss]]

# One row per k, lowest log loss first (ascending is the sort_values default).
results_df = pd.DataFrame(data=k_loss_data, columns=['k', 'log_loss'])
results_df.sort_values('log_loss')

Unnamed: 0,k,log_loss
12,13,1.311004
13,14,1.33042
14,15,1.332879
3,4,2.047242
4,5,2.113347
5,6,2.167274
6,7,2.215624
7,8,2.234334
8,9,2.255101
9,10,2.301887


### MinMax Scaling

In [43]:
# Learn the per-feature min/max from the training split only, then apply that
# same fitted transform to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
# Repeat the k = 1..15 sweep on the scaled features held in
# X_train_scaled / X_test_scaled, scoring each model by hold-out log loss.
ks = np.arange(1, 16)
k_loss_data = []

for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    y_proba = knn.fit(X_train_scaled, y_train).predict_proba(X_test_scaled)  # fit() returns the estimator
    loss = log_loss(y_test, y_proba)
    k_loss_data += [[k, loss]]

# One row per k, lowest log loss first (ascending is the sort_values default).
results_df = pd.DataFrame(data=k_loss_data, columns=['k', 'log_loss'])
results_df.sort_values('log_loss')

Unnamed: 0,k,log_loss
8,9,1.298911
11,12,1.312763
9,10,1.320336
10,11,1.321956
12,13,1.335309
13,14,1.351946
14,15,1.357313
7,8,1.778786
6,7,2.25659
3,4,2.612423


### Using Pipeline

In [48]:
# Same k sweep, but with the scaling step folded into a Pipeline: every
# (k, scaler) combination is fitted on the training split and scored on the
# hold-out split with log loss.
std_scaler = StandardScaler()
mm_scaler = MinMaxScaler()
Ks = np.arange(1, 16)
scores = []

for k in Ks:
    # None in the scaler slot gives the "no scaling" variant of the pipeline.
    for s in (std_scaler, mm_scaler, None):
        knn = KNeighborsClassifier(n_neighbors=k)
        pipe = Pipeline(steps=[('SCI', s), ('KNN', knn)]).fit(X_train, y_train)  # fit() returns the pipeline
        y_proba = pipe.predict_proba(X_test)
        scores += [[k, s, log_loss(y_test, y_proba)]]

# One row per (k, scaler) pair, lowest log loss first.
df_scores = pd.DataFrame(data=scores, columns=['k', 'scaler', 'log_loss'])
df_scores.sort_values('log_loss')

Unnamed: 0,k,scaler,log_loss
25,9,MinMaxScaler(),1.298911
44,15,,1.300383
36,13,StandardScaler(),1.311004
34,12,MinMaxScaler(),1.312763
28,10,MinMaxScaler(),1.320336
31,11,MinMaxScaler(),1.321956
39,14,StandardScaler(),1.33042
42,15,StandardScaler(),1.332879
37,13,MinMaxScaler(),1.335309
40,14,MinMaxScaler(),1.351946


### Testing

In [49]:
# Load the samples to be scored; the relative path resolves against the
# current working directory.
glass_test = pd.read_csv("tst_Glass.csv")

In [51]:
# Final model: MinMax scaling followed by 9-nearest-neighbours, refitted on the
# full labelled set (X, y) rather than just the training split.
knn = KNeighborsClassifier(n_neighbors=9)
final_steps = [('MMS', MinMaxScaler()), ('KNN', knn)]
pipe = Pipeline(steps=final_steps)
pipe.fit(X, y)

0,1,2
,steps,"[('MMS', ...), ('KNN', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_neighbors,9
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [52]:
# Apply the model to the unlabeled data.
# Select the training feature columns explicitly so that the column order
# matches what the pipeline was fitted on, and so that this cell can be re-run:
# after the first run glass_test also carries 'Predicted_Type', which must not
# be fed back into the model.
test_features = glass_test[X.columns]
predictions = pipe.predict(test_features)
pred_proba = pipe.predict_proba(test_features)
# Map the integer class codes back to the original glass type names.
glass_test['Predicted_Type'] = le.inverse_transform(predictions)

In [53]:
# Show the scored samples, now including the 'Predicted_Type' column.
glass_test

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Predicted_Type
0,1.5321,14.0,0.0,0.34,70.23,0.001,6.7,1.23,0.0,building_windows_non_float_processed
1,1.5212,15.0,3.0,1.23,75.9,0.1,7.0,0.0,0.44,building_windows_non_float_processed
2,1.5112,13.0,3.5,2.3,73.0,3.4,14.0,2.3,0.22,building_windows_non_float_processed
3,1.5,12.4,1.23,3.22,74.22,4.5,10.0,3.1,0.1,headlamps
4,1.52,13.0,2.4,0.34,71.22,3.2,9.0,1.44,0.001,building_windows_float_processed
5,1.51,16.0,2.7,4.0,70.0,2.0,6.0,2.9,0.89,building_windows_non_float_processed


### Model Inferencing