In [214]:
import pprint
import operator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

# Preprocessing

In [235]:
dataset = pd.read_csv('biodiversity-br.csv')
dataset

Unnamed: 0,alt,temp2m,temp2mrange,humidity,precip,atm,wind,m.fapar,classe
0,0.361674,0.719384,0.719384,0.614403,0.594679,0.979630,0.192118,0.665940,0
1,0.067043,0.916464,0.916464,0.579856,0.516432,0.994444,0.170624,0.365190,0
2,0.039864,0.856448,0.856448,0.818496,0.624413,1.000000,0.123337,0.798657,0
3,0.064889,0.821573,0.821573,0.872974,0.632238,1.000000,0.173509,0.999390,0
4,0.025681,0.788321,0.788321,1.000000,0.871674,1.000000,0.283287,0.896407,0
...,...,...,...,...,...,...,...,...,...
860,0.118726,0.142741,0.142741,0.707414,0.397496,0.112963,0.250426,0.623997,0
861,0.014968,0.197891,0.197891,0.718044,0.369327,0.112963,0.252041,0.455267,1
862,0.129396,0.142741,0.142741,0.680308,0.367762,0.061111,0.242492,0.627834,0
863,0.000000,0.213301,0.213301,0.712463,0.350548,0.061111,0.240703,0.397279,0


In [236]:
y = dataset['classe']

In [237]:
dataset.drop(['classe'], axis=1, inplace=True)
dataset

Unnamed: 0,alt,temp2m,temp2mrange,humidity,precip,atm,wind,m.fapar
0,0.361674,0.719384,0.719384,0.614403,0.594679,0.979630,0.192118,0.665940
1,0.067043,0.916464,0.916464,0.579856,0.516432,0.994444,0.170624,0.365190
2,0.039864,0.856448,0.856448,0.818496,0.624413,1.000000,0.123337,0.798657
3,0.064889,0.821573,0.821573,0.872974,0.632238,1.000000,0.173509,0.999390
4,0.025681,0.788321,0.788321,1.000000,0.871674,1.000000,0.283287,0.896407
...,...,...,...,...,...,...,...,...
860,0.118726,0.142741,0.142741,0.707414,0.397496,0.112963,0.250426,0.623997
861,0.014968,0.197891,0.197891,0.718044,0.369327,0.112963,0.252041,0.455267
862,0.129396,0.142741,0.142741,0.680308,0.367762,0.061111,0.242492,0.627834
863,0.000000,0.213301,0.213301,0.712463,0.350548,0.061111,0.240703,0.397279


In [238]:
dataset.drop(['temp2mrange'], axis=1, inplace=True)
dataset

Unnamed: 0,alt,temp2m,humidity,precip,atm,wind,m.fapar
0,0.361674,0.719384,0.614403,0.594679,0.979630,0.192118,0.665940
1,0.067043,0.916464,0.579856,0.516432,0.994444,0.170624,0.365190
2,0.039864,0.856448,0.818496,0.624413,1.000000,0.123337,0.798657
3,0.064889,0.821573,0.872974,0.632238,1.000000,0.173509,0.999390
4,0.025681,0.788321,1.000000,0.871674,1.000000,0.283287,0.896407
...,...,...,...,...,...,...,...
860,0.118726,0.142741,0.707414,0.397496,0.112963,0.250426,0.623997
861,0.014968,0.197891,0.718044,0.369327,0.112963,0.252041,0.455267
862,0.129396,0.142741,0.680308,0.367762,0.061111,0.242492,0.627834
863,0.000000,0.213301,0.712463,0.350548,0.061111,0.240703,0.397279


In [239]:
xTrain, xTest, yTrain, yTest = train_test_split(dataset, y)

In [240]:
yTrain.values[0]

0

In [241]:
X = xTrain.iloc[:,1:xTrain.shape[0]].values
y = []

print(len(X[0]))

for i in range(yTrain.shape[0]):
    y.append(np.delete(X[i], len(X[0])-1))

y = np.asarray(y)
print(y)

6
[[0.63179238 0.521924   0.25039124 0.72037037 0.21320793]
 [0.8540146  0.33138453 0.20813772 0.97962963 0.21762211]
 [0.73317113 0.35370715 0.42410016 0.87962963 0.22024754]
 ...
 [0.71695053 0.35397289 0.36306729 0.68703704 0.16840253]
 [0.93836172 0.19346266 0.11580595 0.97407407 0.22809498]
 [0.73154907 0.25564709 0.342723   0.83333333 0.24289547]]


# Fuzzy KNN

In [257]:
class FuzzyKNN(BaseEstimator, ClassifierMixin):
    def __init__(self, k=3, plot=False):
        self.k = k
        self.plot = plot
        
        
    def fit(self, X, y=None):
        self._check_params(X,y)
        self.X = X
        self.y = y
        
        self.xdim = X.shape[0]
        self.n = len(y)
        
        classes = list(set(y))
        classes.sort()
        self.classes = classes
        
        self.df = pd.DataFrame(self.X)
        self.df['y'] = self.y
        
        self.memberships = self._compute_memberships()
        
        self.df['membership'] = self.memberships
        
        self.fitted_ = True
        return self
    
    
    def predict(self, X):
        if self.fitted_ == None:
            raise Exception('predict() called before fit()')
        else:
            m = 2
            y_pred = []
            
            for x in X:
                neighbors = self._find_k_nearest_neighbors(pd.DataFrame.copy(self.df), x)
                
                votes = {}
                for c in self.classes:
                    den = 0
                    for n in range(self.k):
                        dist = np.linalg.norm(x - neighbors.iloc[n,0:self.xdim])
                        den += 1 / (dist ** (2 / (m-1)))
                    
                    neighbors_votes = []
                    for n in range(self.k):
                        dist = np.linalg.norm(x - neighbors.iloc[n,0:self.xdim])
                        num = (neighbors.iloc[n].membership[c]) / (dist ** (2 / (m-1)))
                        
                        vote = num/den
                        neighbors_votes.append(vote)
                    votes[c] = np.sum(neighbors_votes)
                    
                pred = max(votes.items(), key=operator.itemgetter(1))[0]
                y_pred.append((pred, votes))
                
            return y_pred
        
        
    def score(self, X, y):
        if self.fitted_ == None:
            raise Exception('score() called before fit()')
        else:
            predictions = self.predict(X)
            y_pred = [t[0] for t in predictions]
            confidences = [t[1] for t in predictions]
            
            return accuracy_score(y_pred=y_pred, y_true=y)
    
        
    def _find_k_nearest_neighbors(self, df, x):
        X = df.iloc[:,1:self.xdim].values
        aux = []

        for i in range(self.xdim):
            aux.append(np.delete(X[i], len(X[0])-1))

        aux = np.asarray(aux)
        
        df['distances'] = [np.linalg.norm(aux[i].astype(float) - float(x)) for i in range(self.n)]
        
        df.sort_values(by='distances', ascending=True, inplace=True)
        neighbors = df.iloc[0:self.k]
        
        return neighbors

                
    def _get_counts(self, neighbors):
        groups = neighbors.groupby('y')
        counts = {group[1]['y'].iloc[0]:group[1].count()[0] for group in groups}
        
        return counts
        
        
    def _compute_memberships(self):
        memberships = []
        aux = self.y.values
        
        for c in self.X: #para cada coluna do conjunto de dados de teste...
            l = self.X[c].values
            for i in range(self.n):
                x = l[i]

                if i != 0:
                    y = aux[i]

                neighbors = self._find_k_nearest_neighbors(pd.DataFrame.copy(self.df), x)
                counts = self._get_counts(neighbors)

                membership = dict()
                for c in self.classes:
                    try:
                        uci = 0.49 * (counts[c] / self.k)
                        if c == y:
                            uci += 0.51
                        membership[c] = uci
                    except:
                        membership[c] = 0

                memberships.append(membership)
            return memberships
        
        
    def _check_params(self, X, y):
        if type(self.k) != int:
            raise Exception('"k" should have type int')
        elif self.k >= len(y):
            raise Exception('"k" should be less than no of feature sets')
        elif self.k % 2 == 0:
            raise Exception('"k" should be odd')
            
        if type(self.plot) != bool:
            raise Exception('"plot" should have type bool')

# Running Fuzzy K-NN Model

In [258]:
custModel = FuzzyKNN()

In [259]:
custModel.fit(xTrain, yTrain)

FuzzyKNN()

In [260]:
print(cross_val_score(cv=5, estimator=custModel, X=xTest, y=yTest))

Traceback (most recent call last):
  File "D:\Users\yello\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "D:\Users\yello\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 105, in __call__
    score = scorer(estimator, *args, **kwargs)
  File "D:\Users\yello\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 418, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "C:\Users\User\AppData\Local\Temp\ipykernel_12268\3995457824.py", line 66, in score
    predictions = self.predict(X)
  File "C:\Users\User\AppData\Local\Temp\ipykernel_12268\3995457824.py", line 38, in predict
    neighbors = self._find_k_nearest_neighbors(pd.DataFrame.copy(self.df), x)
  File "C:\Users\User\AppData\Local\Temp\ipykernel_12268\3995457824.py", line 82, in _find_k_nearest_neighbors
    df['distances'] = [np.linalg.norm(aux[i].astype(float) - float(x)) for i in range(self.n)

[nan nan nan nan nan]


Traceback (most recent call last):
  File "D:\Users\yello\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "D:\Users\yello\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 105, in __call__
    score = scorer(estimator, *args, **kwargs)
  File "D:\Users\yello\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 418, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "C:\Users\User\AppData\Local\Temp\ipykernel_12268\3995457824.py", line 66, in score
    predictions = self.predict(X)
  File "C:\Users\User\AppData\Local\Temp\ipykernel_12268\3995457824.py", line 38, in predict
    neighbors = self._find_k_nearest_neighbors(pd.DataFrame.copy(self.df), x)
  File "C:\Users\User\AppData\Local\Temp\ipykernel_12268\3995457824.py", line 82, in _find_k_nearest_neighbors
    df['distances'] = [np.linalg.norm(aux[i].astype(float) - float(x)) for i in range(self.n)