In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('Rice_Cammeo_Osmancik.csv')

In [3]:
df.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,15231,525.578979,229.749878,85.093788,0.928882,15617,0.572896,Cammeo
1,14656,494.311005,206.020065,91.730972,0.895405,15072,0.615436,Cammeo
2,14634,501.122009,214.106781,87.768288,0.912118,14954,0.693259,Cammeo
3,13176,458.342987,193.337387,87.448395,0.891861,13368,0.640669,Cammeo
4,14688,507.166992,211.743378,89.312454,0.906691,15262,0.646024,Cammeo


## Dataframe information review

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3810 entries, 0 to 3809
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Area               3810 non-null   int64  
 1   Perimeter          3810 non-null   float64
 2   Major_Axis_Length  3810 non-null   float64
 3   Minor_Axis_Length  3810 non-null   float64
 4   Eccentricity       3810 non-null   float64
 5   Convex_Area        3810 non-null   int64  
 6   Extent             3810 non-null   float64
 7   Class              3810 non-null   object 
dtypes: float64(5), int64(2), object(1)
memory usage: 238.2+ KB


## Is NULL?

In [5]:
df.isnull().sum()

Area                 0
Perimeter            0
Major_Axis_Length    0
Minor_Axis_Length    0
Eccentricity         0
Convex_Area          0
Extent               0
Class                0
dtype: int64

## Data description

In [6]:
df.describe()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent
count,3810.0,3810.0,3810.0,3810.0,3810.0,3810.0,3810.0
mean,12667.727559,454.23918,188.776222,86.31375,0.886871,12952.49685,0.661934
std,1732.367706,35.597081,17.448679,5.729817,0.020818,1776.972042,0.077239
min,7551.0,359.100006,145.264465,59.532406,0.777233,7723.0,0.497413
25%,11370.5,426.144752,174.353855,82.731695,0.872402,11626.25,0.598862
50%,12421.5,448.852493,185.810059,86.434647,0.88905,12706.5,0.645361
75%,13950.0,483.683746,203.550438,90.143677,0.902588,14284.0,0.726562
max,18913.0,548.445984,239.010498,107.54245,0.948007,19099.0,0.86105


## Separate a column from dataframe

In [7]:
new_df = df.drop(['Class'], axis=1)
new_df.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent
0,15231,525.578979,229.749878,85.093788,0.928882,15617,0.572896
1,14656,494.311005,206.020065,91.730972,0.895405,15072,0.615436
2,14634,501.122009,214.106781,87.768288,0.912118,14954,0.693259
3,13176,458.342987,193.337387,87.448395,0.891861,13368,0.640669
4,14688,507.166992,211.743378,89.312454,0.906691,15262,0.646024


## Standardization

In [8]:
def scale_feature(df, feature):
    mean = df[feature].mean()
    std = df[feature].std()
    return (df[feature] - mean) / std

def scale_dataframe(df):
    df_scaled = pd.DataFrame()
    for feature in ['Area', 'Perimeter', 'Major_Axis_Length', 'Minor_Axis_Length', 'Eccentricity', 'Convex_Area', 'Extent']:
        df_scaled[feature] = scale_feature(df, feature)
    return df_scaled

df_scaled = scale_dataframe(new_df)
df_scaled.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent
0,1.479635,2.004091,2.348238,-0.212915,2.018073,1.499463,-1.15277
1,1.14772,1.125705,0.988261,0.945444,0.409964,1.192761,-0.602
2,1.13502,1.317041,1.451718,0.253854,1.212797,1.126356,0.405558
3,0.293398,0.115285,0.261405,0.198025,0.23972,0.233826,-0.275315
4,1.166191,1.486858,1.316269,0.523351,0.952096,1.299685,-0.205986


In [9]:
df_scaled.mean().round(2)

Area                 0.0
Perimeter            0.0
Major_Axis_Length    0.0
Minor_Axis_Length   -0.0
Eccentricity        -0.0
Convex_Area         -0.0
Extent              -0.0
dtype: float64

In [10]:
df_scaled.std()

Area                 1.0
Perimeter            1.0
Major_Axis_Length    1.0
Minor_Axis_Length    1.0
Eccentricity         1.0
Convex_Area          1.0
Extent               1.0
dtype: float64

## KNN (Nearest Neighbors Algorithm)

Calculate the Euclidean distance of the new data and guess what is the label of the new data based on 17 nearest neighbors?

In [11]:
new_data = {'Area':[13693],'Perimeter':[479.479004],
     'Major_Axis_Length':[200.397293],'Minor_Axis_Length':[88.174461]
     ,'Eccentricity':[0.897999],'Convex_Area':[14097],'Extent':[0.695253]}

test_data = pd.DataFrame(new_data)

### New data standardization

In [12]:
def scale_new_data(df1, df2, feature):
    mean = df2[feature].mean()
    std = df2[feature].std()
    return (df1[feature] - mean) / std

def scale_data(df1, df2):
    df_scaled = pd.DataFrame()
    for feature in ['Area', 'Perimeter', 'Major_Axis_Length', 'Minor_Axis_Length', 'Eccentricity', 'Convex_Area', 'Extent']:
        df_scaled[feature] = scale_new_data(df1, df2, feature)
    return df_scaled

data_scaled = scale_data(test_data, new_df)
data_scaled.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent
0,0.591833,0.709042,0.666014,0.324742,0.534571,0.644075,0.431376


### Convert new data to numeric array

In [13]:
np_test_scaled = data_scaled.to_numpy()
np_test_scaled

array([[0.59183304, 0.70904194, 0.66601439, 0.3247417 , 0.53457051,
        0.64407493, 0.43137612]])

### Convert dataframe to numeric array

In [14]:
np_df_scaled = df_scaled.to_numpy()
np_df_scaled

array([[ 1.47963532,  2.00409125,  2.34823834, ...,  2.01807257,
         1.49946262, -1.15276961],
       [ 1.14771964,  1.12570533,  0.98826071, ...,  0.40996432,
         1.19276111, -0.60199975],
       [ 1.13502026,  1.31704138,  1.45171791, ...,  1.21279729,
         1.12635602,  0.40555776],
       ...,
       [-0.13318625, -0.32980758, -0.29820597, ..., -0.27506304,
        -0.17304541, -0.45567127],
       [-1.60804635, -1.74009162, -1.58076367, ..., -0.59874276,
        -1.60694529, -0.03716269],
       [-0.71216264, -1.39138341, -1.58733812, ..., -2.93877438,
        -0.76618924,  1.82570729]])

### Calculate Euclidean Distance

In [15]:
euclidean_dist = np.zeros(shape=(len(np_df_scaled)))

print(type(euclidean_dist))

<class 'numpy.ndarray'>


In [16]:
for i in range(len(np_df_scaled)):
    euclidean_dist[i] = np.sqrt(np.sum([(a-b)*(a-b) for a, b in zip(np_test_scaled, np_df_scaled[i])]))

print(euclidean_dist)

[2.46331511 1.59797953 1.56515495 ... 1.85800583 5.73911932 3.37582205]


### Add Euclidean distance to dataframe

In [17]:
df_scaled['euclidean_dist'] = euclidean_dist
df_scaled.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,euclidean_dist
0,1.479635,2.004091,2.348238,-0.212915,2.018073,1.499463,-1.15277,2.463315
1,1.14772,1.125705,0.988261,0.945444,0.409964,1.192761,-0.602,1.59798
2,1.13502,1.317041,1.451718,0.253854,1.212797,1.126356,0.405558,1.565155
3,0.293398,0.115285,0.261405,0.198025,0.23972,0.233826,-0.275315,0.775775
4,1.166191,1.486858,1.316269,0.523351,0.952096,1.299685,-0.205986,1.64578


### Returning the labels that we set aside for standardization

In [18]:
df_scaled['Calss'] = df['Class']
df_scaled.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,euclidean_dist,Calss
0,1.479635,2.004091,2.348238,-0.212915,2.018073,1.499463,-1.15277,2.463315,Cammeo
1,1.14772,1.125705,0.988261,0.945444,0.409964,1.192761,-0.602,1.59798,Cammeo
2,1.13502,1.317041,1.451718,0.253854,1.212797,1.126356,0.405558,1.565155,Cammeo
3,0.293398,0.115285,0.261405,0.198025,0.23972,0.233826,-0.275315,0.775775,Cammeo
4,1.166191,1.486858,1.316269,0.523351,0.952096,1.299685,-0.205986,1.64578,Cammeo


### Sort the data by Euclidean distance

In [19]:
sorted_df = df_scaled.sort_values(by=['euclidean_dist'])
sorted_df[0:17] # Because K was 17, we will check the first 17 rows.

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,euclidean_dist,Calss
1525,0.557776,1.037748,1.166696,-0.450279,1.41596,0.631694,1.630997,0.337675,Cammeo
834,0.556621,0.890462,0.942119,-0.084191,1.030905,0.536026,-1.447627,0.33768,Cammeo
3403,0.558353,0.036711,0.037559,1.005946,-0.622186,0.468495,1.120988,0.337684,Osmancik
1355,0.556044,0.542399,0.270381,0.906715,-0.277185,0.668836,-1.057041,0.337692,Cammeo
1528,0.55893,0.781351,0.78106,0.099488,0.783541,0.579358,-1.0089,0.337699,Cammeo
1545,0.55893,0.758962,0.868395,-0.023319,0.934113,0.526459,-1.453005,0.337699,Cammeo
1381,0.559507,0.348732,0.272685,0.705168,-0.122089,0.498321,0.38501,0.337721,Cammeo
1570,0.559507,0.794217,0.941201,-0.155245,1.073947,0.562475,-1.363587,0.337721,Cammeo
3427,0.554889,0.461803,0.771878,0.015076,0.830239,0.468495,-1.23214,0.337738,Osmancik
1121,0.560662,0.791521,0.685439,0.247713,0.603501,0.613123,0.885067,0.337786,Cammeo


### Conclusion

According to the class of the nearest neighbors, the new data will be **Cammeo**.