In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split


In [2]:
# attributes:

# Sex / nominal / -- / M, F, and I (infant) 
# Length / continuous / mm / Longest shell measurement 
# Diameter	/ continuous / mm / perpendicular to length 
# Height / continuous / mm / with meat in shell 
# Whole weight / continuous / grams / whole abalone 
# Shucked weight / continuous	/ grams / weight of meat 
# Viscera weight / continuous / grams / gut weight (after bleeding) 
# Shell weight / continuous / grams / after being dried 
# Rings / integer / -- / +1.5 gives the age in years 

In [3]:
abalone = pd.read_csv("abalonedata.csv",names=['target','Length','diameter','height','whole wt','Shucked weight','Viscera weight','Shell weight','Rings'])
abalone

Unnamed: 0,target,Length,diameter,height,whole wt,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
5,I,0.425,0.300,0.095,0.3515,0.1410,0.0775,0.1200,8
6,F,0.530,0.415,0.150,0.7775,0.2370,0.1415,0.3300,20
7,F,0.545,0.425,0.125,0.7680,0.2940,0.1495,0.2600,16
8,M,0.475,0.370,0.125,0.5095,0.2165,0.1125,0.1650,9
9,F,0.550,0.440,0.150,0.8945,0.3145,0.1510,0.3200,19


In [4]:
features=abalone.loc[:,'Length':]
features

Unnamed: 0,Length,diameter,height,whole wt,Shucked weight,Viscera weight,Shell weight,Rings
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
5,0.425,0.300,0.095,0.3515,0.1410,0.0775,0.1200,8
6,0.530,0.415,0.150,0.7775,0.2370,0.1415,0.3300,20
7,0.545,0.425,0.125,0.7680,0.2940,0.1495,0.2600,16
8,0.475,0.370,0.125,0.5095,0.2165,0.1125,0.1650,9
9,0.550,0.440,0.150,0.8945,0.3145,0.1510,0.3200,19


In [5]:
target_values = abalone.loc[:,'target']
target_values

0       M
1       M
2       F
3       M
4       I
5       I
6       F
7       F
8       M
9       F
10      F
11      M
12      M
13      F
14      F
15      M
16      I
17      F
18      M
19      M
20      M
21      I
22      F
23      F
24      F
25      F
26      F
27      M
28      M
29      M
       ..
4147    M
4148    M
4149    I
4150    I
4151    I
4152    I
4153    I
4154    I
4155    I
4156    M
4157    M
4158    I
4159    F
4160    F
4161    F
4162    M
4163    I
4164    I
4165    I
4166    I
4167    M
4168    F
4169    M
4170    M
4171    M
4172    F
4173    M
4174    M
4175    F
4176    M
Name: target, Length: 4177, dtype: object

In [6]:
output = []
i=0
length = len(target_values)
for i in range(length):
    if(target_values[i] =='I'):
        output.append(0)
    elif(target_values[i] == 'M'):
        output.append(1)
    else:
        output.append(2)
        


In [7]:
target = pd.DataFrame(output,columns=['TARGET'])
target

Unnamed: 0,TARGET
0,1
1,1
2,2
3,1
4,0
5,0
6,2
7,2
8,1
9,2


In [8]:
data = pd.concat([features,target],axis=1)
data

Unnamed: 0,Length,diameter,height,whole wt,Shucked weight,Viscera weight,Shell weight,Rings,TARGET
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,1
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,1
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,2
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,1
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,0
5,0.425,0.300,0.095,0.3515,0.1410,0.0775,0.1200,8,0
6,0.530,0.415,0.150,0.7775,0.2370,0.1415,0.3300,20,2
7,0.545,0.425,0.125,0.7680,0.2940,0.1495,0.2600,16,2
8,0.475,0.370,0.125,0.5095,0.2165,0.1125,0.1650,9,1
9,0.550,0.440,0.150,0.8945,0.3145,0.1510,0.3200,19,2


In [9]:
correlation = data.corr('pearson') 

In [10]:
correlation

Unnamed: 0,Length,diameter,height,whole wt,Shucked weight,Viscera weight,Shell weight,Rings,TARGET
Length,1.0,0.986812,0.827554,0.925261,0.897914,0.903018,0.897706,0.55672,0.503697
diameter,0.986812,1.0,0.833684,0.925452,0.893162,0.899724,0.90533,0.57466,0.51645
height,0.827554,0.833684,1.0,0.819221,0.774972,0.798319,0.817338,0.557467,0.47785
whole wt,0.925261,0.925452,0.819221,1.0,0.969405,0.966375,0.955355,0.54039,0.501511
Shucked weight,0.897914,0.893162,0.774972,0.969405,1.0,0.931961,0.882617,0.420884,0.459731
Viscera weight,0.903018,0.899724,0.798319,0.966375,0.931961,1.0,0.907656,0.503819,0.505693
Shell weight,0.897706,0.90533,0.817338,0.955355,0.882617,0.907656,1.0,0.627574,0.499103
Rings,0.55672,0.57466,0.557467,0.54039,0.420884,0.503819,0.627574,1.0,0.401445
TARGET,0.503697,0.51645,0.47785,0.501511,0.459731,0.505693,0.499103,0.401445,1.0


In [11]:
correlation['TARGET'].sort_values()

Rings             0.401445
Shucked weight    0.459731
height            0.477850
Shell weight      0.499103
whole wt          0.501511
Length            0.503697
Viscera weight    0.505693
diameter          0.516450
TARGET            1.000000
Name: TARGET, dtype: float64

In [12]:
features = (features - features.min())/(features.max()-features.min())

In [13]:
xtrain,xtest,ytrain,ytest = train_test_split(features,target,test_size = 0.3)

In [14]:
xtrain

Unnamed: 0,Length,diameter,height,whole wt,Shucked weight,Viscera weight,Shell weight,Rings
2626,0.195946,0.176471,0.044248,0.016646,0.013786,0.012508,0.013453,0.107143
2341,0.317568,0.310924,0.079646,0.050823,0.040013,0.040816,0.043348,0.214286
2895,0.628378,0.605042,0.097345,0.218523,0.184600,0.196840,0.174390,0.321429
67,0.702703,0.739496,0.163717,0.454401,0.279085,0.294273,0.481814,0.428571
3646,0.540541,0.495798,0.110619,0.148929,0.127438,0.103357,0.133533,0.285714
910,0.358108,0.327731,0.066372,0.062511,0.044048,0.059250,0.043348,0.142857
1186,0.824324,0.815126,0.190265,0.602267,0.445864,0.480579,0.470354,0.464286
447,0.662162,0.672269,0.163717,0.327430,0.237391,0.206715,0.372197,0.535714
3437,0.432432,0.411765,0.079646,0.098105,0.089442,0.063858,0.073244,0.250000
3598,0.864865,0.781513,0.159292,0.565964,0.475454,0.463463,0.441953,0.392857


In [15]:
f1 = 'diameter'
f2 = 'Viscera weight'

basex = np.array(xtrain[[f1,f2]])
basey = np.array(ytrain["TARGET"])

testx = np.array(xtest[[f1,f2]])
testy = np.array(ytest["TARGET"])


In [16]:
basex

array([[0.17647059, 0.01250823],
       [0.31092437, 0.04081633],
       [0.60504202, 0.19684003],
       ...,
       [0.76470588, 0.35549704],
       [0.79831933, 0.44963792],
       [0.41176471, 0.08229098]])

In [17]:
testx

array([[0.21008403, 0.0164582 ],
       [0.1512605 , 0.00987492],
       [0.42857143, 0.09743252],
       ...,
       [0.77310924, 0.38709677],
       [0.77310924, 0.26662278],
       [0.82352941, 0.33706386]])

In [18]:
basey

array([0, 0, 0, ..., 1, 2, 0], dtype=int64)

In [19]:
testx

array([[0.21008403, 0.0164582 ],
       [0.1512605 , 0.00987492],
       [0.42857143, 0.09743252],
       ...,
       [0.77310924, 0.38709677],
       [0.77310924, 0.26662278],
       [0.82352941, 0.33706386]])

In [42]:
def euclidean_distance(train_point,test_point):
    distance = np.sum((train_point - test_point)**2)
   
    
    return np.sqrt(distance)

In [43]:
# calculates and returns the euclidean_distance of the test point from all the train data points.


def calc_distance_from_all(all_points,given_point,predictions): 
    all_distances = []
    
    for i, each in enumerate(all_points):
        distance = euclidean_distance(each,given_point)
       
        
        all_distances.append((distance,int(predictions[i]))) 

    all_distances.sort(key=lambda tup: tup[0])  

   
    
    return all_distances

In [44]:
# This is used to get the no. of train data points which are closest to the test point

def get_neighbours(distances,count):        
    return distances[:count]              

In [45]:
def predict(all_points,given_point,predictions,k):  
    distances = calc_distance_from_all(all_points,given_point,predictions) 
   
    neighbours = get_neighbours(distances,k)
    op = [row[-1] for row in neighbours]      
    predictions = max(set(op),key=op.count)    

    return predictions        

In [46]:
def accuracy(basex,basey,testx,testy,k):  
    correct =0
    for i in range(len(testx)):
        p = predict(basex,testx[i],basey,k) 
        if p == testy[i]:     
            correct += 1
            
    return f"Accuracy: {correct*100/len(testy)}"

In [47]:
test1 = [testx[0][0], testx[1][0]]

In [48]:
test1

[0.21008403361344538, 0.15126050420168066]

In [51]:
k=14
accuracy(basex,basey,testx,testy,k)


'Accuracy: 53.50877192982456'