In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import pandas as pd
import copy

%matplotlib inline

In [16]:
iris_dataset = sklearn.datasets.load_iris()

df = pd.DataFrame(iris_dataset['data'])
df['class'] = pd.Series(iris_dataset.target)
df.rename(columns={
    0: 'sle',
    1: 'swi',
    2: 'ple',
    3: 'pwi'
}, inplace=True)

df.head()
entities = df.to_dict(orient='records')
samples = []
for idx,params in enumerate(entities):
    samples += [{
        'id': idx,
        'params': params
    }]
    
samples

[{'id': 0,
  'params': {'sle': 5.1, 'swi': 3.5, 'ple': 1.4, 'pwi': 0.2, 'class': 0.0}},
 {'id': 1,
  'params': {'sle': 4.9, 'swi': 3.0, 'ple': 1.4, 'pwi': 0.2, 'class': 0.0}},
 {'id': 2,
  'params': {'sle': 4.7, 'swi': 3.2, 'ple': 1.3, 'pwi': 0.2, 'class': 0.0}},
 {'id': 3,
  'params': {'sle': 4.6, 'swi': 3.1, 'ple': 1.5, 'pwi': 0.2, 'class': 0.0}},
 {'id': 4,
  'params': {'sle': 5.0, 'swi': 3.6, 'ple': 1.4, 'pwi': 0.2, 'class': 0.0}},
 {'id': 5,
  'params': {'sle': 5.4, 'swi': 3.9, 'ple': 1.7, 'pwi': 0.4, 'class': 0.0}},
 {'id': 6,
  'params': {'sle': 4.6, 'swi': 3.4, 'ple': 1.4, 'pwi': 0.3, 'class': 0.0}},
 {'id': 7,
  'params': {'sle': 5.0, 'swi': 3.4, 'ple': 1.5, 'pwi': 0.2, 'class': 0.0}},
 {'id': 8,
  'params': {'sle': 4.4, 'swi': 2.9, 'ple': 1.4, 'pwi': 0.2, 'class': 0.0}},
 {'id': 9,
  'params': {'sle': 4.9, 'swi': 3.1, 'ple': 1.5, 'pwi': 0.1, 'class': 0.0}},
 {'id': 10,
  'params': {'sle': 5.4, 'swi': 3.7, 'ple': 1.5, 'pwi': 0.2, 'class': 0.0}},
 {'id': 11,
  'params': {'sle':

In [17]:
def build_agds(df):

    params = {}

    for param in df.columns.tolist():
        params.update({
            param: sorted(set(df[param].tolist()))
        })

    for (param, values) in params.items():
        connected_nodes = []
        for node in values:
            connected_nodes += [{
                    'value': node,
                    'samples': [sample for sample in samples if sample['params'][param] == node]
                }]
            params[param] = connected_nodes

    return params

agds = build_agds(df)
agds

{'sle': [{'value': 4.3,
   'samples': [{'id': 13,
     'params': {'sle': 4.3,
      'swi': 3.0,
      'ple': 1.1,
      'pwi': 0.1,
      'class': 0.0}}]},
  {'value': 4.4,
   'samples': [{'id': 8,
     'params': {'sle': 4.4, 'swi': 2.9, 'ple': 1.4, 'pwi': 0.2, 'class': 0.0}},
    {'id': 38,
     'params': {'sle': 4.4, 'swi': 3.0, 'ple': 1.3, 'pwi': 0.2, 'class': 0.0}},
    {'id': 42,
     'params': {'sle': 4.4,
      'swi': 3.2,
      'ple': 1.3,
      'pwi': 0.2,
      'class': 0.0}}]},
  {'value': 4.5,
   'samples': [{'id': 41,
     'params': {'sle': 4.5,
      'swi': 2.3,
      'ple': 1.3,
      'pwi': 0.3,
      'class': 0.0}}]},
  {'value': 4.6,
   'samples': [{'id': 3,
     'params': {'sle': 4.6, 'swi': 3.1, 'ple': 1.5, 'pwi': 0.2, 'class': 0.0}},
    {'id': 6,
     'params': {'sle': 4.6, 'swi': 3.4, 'ple': 1.4, 'pwi': 0.3, 'class': 0.0}},
    {'id': 22,
     'params': {'sle': 4.6, 'swi': 3.6, 'ple': 1.0, 'pwi': 0.2, 'class': 0.0}},
    {'id': 47,
     'params': {'sle': 4.6,
   

In [4]:
def new_id(agds):
    samples = []
    for attr, params in agds.items():
        for param in params:
            for sample in param['samples']:
                samples += [sample]
    return max([s['id'] for s in samples]) + 1

def attach(sample, agds, samples):
    new_sample = {'id': new_id(agds), 'params': sample}
    
    cloned = agds
    
    for attr, params in cloned.items():
        param_value_found = False
        for param in params:
            if attr in sample and sample[attr] == param['value']:
                param['samples'].append(new_sample)
                param_value_found = True
        if attr in sample and not param_value_found:
            params.append({
                'value': sample[attr],
                'samples': [new_sample]
            })
            cloned[attr] = sorted(params, key=lambda a: a['value'])
            
    samples += [new_sample]
            
    return new_sample 
    

sample = {'swi': 1, 'sle': 4.4, 'pwi': 1, 'ple': 1}
attached = attach(sample, agds, samples)
attached

{'id': 150, 'params': {'swi': 1, 'sle': 4.4, 'pwi': 1, 'ple': 1}}

In [5]:
def init_weigths(agds, samples):
    for attr, params in agds.items():
        for param in params:
            param['weight'] = .0
    for sample in samples:
        sample['weight'] = .0
            
    return agds, samples

agds, samples = init_weigths(agds, samples)
agds, samples

({'sle': [{'value': 4.3,
    'samples': [{'id': 13,
      'params': {'sle': 4.3, 'swi': 3.0, 'ple': 1.1, 'pwi': 0.1, 'class': 0.0},
      'weight': 0.0}],
    'weight': 0.0},
   {'value': 4.4,
    'samples': [{'id': 8,
      'params': {'sle': 4.4, 'swi': 2.9, 'ple': 1.4, 'pwi': 0.2, 'class': 0.0},
      'weight': 0.0},
     {'id': 38,
      'params': {'sle': 4.4, 'swi': 3.0, 'ple': 1.3, 'pwi': 0.2, 'class': 0.0},
      'weight': 0.0},
     {'id': 42,
      'params': {'sle': 4.4, 'swi': 3.2, 'ple': 1.3, 'pwi': 0.2, 'class': 0.0},
      'weight': 0.0},
     {'id': 150,
      'params': {'swi': 1, 'sle': 4.4, 'pwi': 1, 'ple': 1},
      'weight': 0.0}],
    'weight': 0.0},
   {'value': 4.5,
    'samples': [{'id': 41,
      'params': {'sle': 4.5, 'swi': 2.3, 'ple': 1.3, 'pwi': 0.3, 'class': 0.0},
      'weight': 0.0}],
    'weight': 0.0},
   {'value': 4.6,
    'samples': [{'id': 3,
      'params': {'sle': 4.6, 'swi': 3.1, 'ple': 1.5, 'pwi': 0.2, 'class': 0.0},
      'weight': 0.0},
     {'id

In [7]:
def fill_value_weights(predicted, agds, samples):
    attached = attach(predicted, agds, samples)
    agds, samples = init_weigths(agds, samples)
    
    # init predicted sample and its value nodes with weights 1.0
    attached['weight'] = 1.0
    for attr, params in agds.items():
        idx = None
        if not attr in predicted:
            continue
        for i, param in enumerate(params):
            if param['value'] == predicted[attr]:
                param['weight'] = 1.0
                idx = i
                break
        # propagate weights back to the left
        for i in reversed(range(idx)):
            params[i]['weight'] = (1 - abs(params[i]['value'] - params[i+1]['value'])
                                   /(params[-1]['value'] - params[0]['value'])) * params[i+1]['weight']
        # propagate weights to the right
        for i in range(idx+1, len(params)):
            params[i]['weight'] = (1 - abs(params[i]['value'] - params[i-1]['value'])
                                   /(params[-1]['value'] - params[0]['value'])) * params[i-1]['weight']
        
    
    for sample in samples:
        param_weights = []
        for attr, value in sample['params'].items():
            for param in agds[attr]:
                if param['value'] == value:
                    param_weights += [param['weight']]
        sample['weight'] = sum([w / len(sample['params']) for w in param_weights])
    
    return agds, samples

def find_n_samples_with_max_weights(samples, n):
    return sorted(samples, reverse=True, key=lambda x: x['weight'])[:n]

agds, samples = fill_value_weights({'swi': 2.6, 'sle': 5.8, 'pwi': 1.2, 'ple': 4.0}, agds, samples)
find_n_samples_with_max_weights(samples, 5)

[{'id': 151,
  'params': {'swi': 2.6, 'sle': 5.8, 'pwi': 1.2, 'ple': 4.0},
  'weight': 1.0},
 {'id': 152,
  'params': {'swi': 2.6, 'sle': 5.8, 'pwi': 1.2, 'ple': 4.0},
  'weight': 1.0},
 {'id': 92,
  'params': {'sle': 5.8, 'swi': 2.6, 'ple': 4.0, 'pwi': 1.2, 'class': 1.0},
  'weight': 0.8},
 {'id': 82,
  'params': {'sle': 5.8, 'swi': 2.7, 'ple': 3.9, 'pwi': 1.2, 'class': 1.0},
  'weight': 0.7907278165503491},
 {'id': 67,
  'params': {'sle': 5.8, 'swi': 2.7, 'ple': 4.1, 'pwi': 1.0, 'class': 1.0},
  'weight': 0.7744083721059045}]

In [36]:
import functools
from collections import Counter

def predict(sample, X, y):
    df = pd.DataFrame(X)
    df['class'] = pd.Series(y)
    df.rename(columns={
        0: 'sle',
        1: 'swi',
        2: 'ple',
        3: 'pwi'
    }, inplace=True)
    
    entities = df.to_dict(orient='records')
    samples = []
    for idx,params in enumerate(entities):
        samples += [{
            'id': idx,
            'params': params
        }]
    agds = build_agds(df)
    
    agds, samples = fill_value_weights(sample, agds, samples)
    n_max = find_n_samples_with_max_weights(samples, 6)
    most_similar = list(filter(lambda x: 'class' in x['params'], n_max))
    
    return Counter(list(map(lambda x: x['params']['class'], most_similar))).most_common(1)[0][0]
 
predict({'swi': 2.6, 'sle': 5.8, 'pwi': 1.2, 'ple': 4.0}, iris_dataset['data'], iris_dataset['target'])

1.0

In [79]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'], iris_dataset['target'], test_size=0.33)

tests_count = X_test.shape[0]
tests_passed = 0
for x,y in zip(X_test, y_test):
    predicted = predict({'swi': x[1], 'sle': x[0], 'pwi': x[3], 'ple': x[2]}, X_train, y_train)
    if int(predicted) == int(y):
        tests_passed += 1
        
print("Iris dataset splitted with 2:1 ratio. {0:.0%} of test set was recognized correctly.".format(tests_passed/ tests_count))

Iris dataset splitted with 2:1 ratio. 96% of test set was recognized correctly.
