In [186]:
import numpy as np
import pandas as pd
from enum import Enum
import json

In [3]:
# Load the student dataset (semicolon-separated, first row is the header).
# The path uses a forward slash so it resolves on Windows, Linux and macOS
# alike; a backslash is only a path separator on Windows.
df = pd.read_csv('students/student-mat.csv', sep=';', header=0)

In [88]:
# Notebook display settings: cap the number of rendered rows at 20 and
# never hide columns when a frame is shown.
pd.options.display.max_rows = 20
pd.options.display.max_columns = None

In [89]:
# Show the loaded frame as the cell's rich output.
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,course,other,1,2,2,no,yes,yes,no,yes,yes,no,no,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,no,no,yes,yes,no,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,course,other,1,1,3,no,no,no,no,no,yes,no,no,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,course,mother,3,1,0,no,no,no,no,no,yes,yes,no,4,4,1,3,4,5,0,11,12,10


In [169]:
class FeatureType(Enum):
    """Measurement scale of a dataset column.

    Each member's value is an ``(index, letter)`` pair; the letter is the
    compact code written to JSON by :meth:`to_json`.
    """

    ABSOLUTE = (0, 'a')
    NOMINAL = (1, 'n')
    ORDERED = (2, 'o')

    def to_json(self):
        """Return the one-letter code (second element of the member's value)."""
        _, letter = self.value
        return letter

In [170]:
class MyEnumEncoder(json.JSONEncoder):
    """JSON encoder that writes ``FeatureType`` members via their ``to_json()`` code."""

    def default(self, obj):
        """Encode ``FeatureType`` members; defer everything else to the base class.

        The base implementation raises ``TypeError`` for unsupported objects.
        """
        if not isinstance(obj, FeatureType):
            return super().default(obj)
        return obj.to_json()

In [101]:
# Set the scale type of every column from the console (either the letter or
# the index of a FeatureType member), or load them from the file below.
featureTypes = dict()
for columnName in df:
    # Keep asking until the answer matches a FeatureType member, so a single
    # typo does not abort the whole session with a bare StopIteration.
    while True:
        ch = input(f'{columnName}:\n').strip()
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        key = int(ch) if ch.isdecimal() else ch
        chosen = next((obj for obj in FeatureType if key in obj.value), None)
        if chosen is not None:
            break
        print(f'Unknown scale type {ch!r}; expected one of '
              f'{[obj.value for obj in FeatureType]}')
    featureTypes[columnName] = chosen

In [246]:
# Serialise before opening the file: mode 'w' truncates it immediately, so an
# encoding error raised mid-dump would otherwise destroy the saved mapping.
serializedTypes = json.dumps(featureTypes, cls=MyEnumEncoder)
with open('featuretypes.json', 'w') as f:
    f.write(serializedTypes)

In [247]:
# Restore the column -> FeatureType mapping from the saved codes.
with open('featuretypes.json', 'r') as src:
    savedCodes = json.load(src)

restoredTypes = {}
for n, v in savedCodes.items():
    # A stored code matches a member when it appears in the member's value tuple.
    restoredTypes[n] = next(member for member in FeatureType if v in member.value)
featureTypes = restoredTypes

In [230]:
# Show the current column -> FeatureType mapping.
featureTypes

{'school': <FeatureType.NOMINAL: (1, 'n')>,
 'sex': <FeatureType.NOMINAL: (1, 'n')>,
 'age': <FeatureType.ABSOLUTE: (0, 'a')>,
 'address': <FeatureType.NOMINAL: (1, 'n')>,
 'famsize': <FeatureType.NOMINAL: (1, 'n')>,
 'Pstatus': <FeatureType.NOMINAL: (1, 'n')>,
 'Medu': <FeatureType.ORDERED: (2, 'o')>,
 'Fedu': <FeatureType.ORDERED: (2, 'o')>,
 'Mjob': <FeatureType.NOMINAL: (1, 'n')>,
 'Fjob': <FeatureType.NOMINAL: (1, 'n')>,
 'reason': <FeatureType.NOMINAL: (1, 'n')>,
 'guardian': <FeatureType.NOMINAL: (1, 'n')>,
 'traveltime': <FeatureType.ORDERED: (2, 'o')>,
 'studytime': <FeatureType.ORDERED: (2, 'o')>,
 'failures': <FeatureType.ORDERED: (2, 'o')>,
 'schoolsup': <FeatureType.NOMINAL: (1, 'n')>,
 'famsup': <FeatureType.NOMINAL: (1, 'n')>,
 'paid': <FeatureType.NOMINAL: (1, 'n')>,
 'activities': <FeatureType.NOMINAL: (1, 'n')>,
 'nursery': <FeatureType.NOMINAL: (1, 'n')>,
 'higher': <FeatureType.NOMINAL: (1, 'n')>,
 'internet': <FeatureType.NOMINAL: (1, 'n')>,
 'romantic': <FeatureTy

In [207]:
def diffNominalScale(i, l, data):
    """Distance between two values on a nominal scale.

    Counts the entries of ``data`` that equal exactly one of ``i`` and ``l``
    and divides that count by ``len(data)``.
    """
    separating = sum(1 for k in data if (i == k) != (l == k))
    return separating / len(data)

In [232]:
def diffOrderedScale(i, l, data):
    """Distance between two values on an ordered scale.

    Every entry of ``data`` lying strictly between ``i`` and ``l`` counts as 1;
    every entry equal to exactly one of them counts as 0.5.  The total is
    divided by ``len(data) - 1``.
    """
    strictlyBetween = 0
    ties = 0
    for k in data:
        if i < k < l or l < k < i:
            strictlyBetween += 1
        elif (i == k) != (l == k):
            ties += 1
    return (strictlyBetween + 0.5 * ties) / (len(data) - 1)

In [233]:
def diffAbsoluteScale(a, b, data):
    """Distance between two values on an absolute (numeric) scale.

    Returns ``|a - b|`` normalised by the value range of ``data``
    (``max(data) - min(data)``).

    If every entry of ``data`` is identical the range is zero; such a feature
    cannot separate objects, so 0.0 is returned instead of the NaN/inf that
    a division by zero would produce.
    """
    span = np.max(data) - np.min(data)
    if span == 0:
        return 0.0
    return np.abs(a - b) / span

In [234]:
def diffTotal(obj1, obj2, data, featureTypes, functionsMap):
    """Combined distance between two objects over all columns of ``data``.

    For every column the per-feature distance function is looked up through
    ``functionsMap[featureTypes[column]]`` and called with the two objects'
    values and the whole column.  The result is the root of the summed squared
    per-feature distances divided by the root of the number of columns.

    Parameters
    ----------
    obj1, obj2 : row objects whose values are ordered like the columns of
        ``data`` (a ``pd.Series`` row or any positionally indexable sequence).
    data : DataFrame providing the columns.
    featureTypes : mapping column name -> scale type.
    functionsMap : mapping scale type -> ``f(value1, value2, column)``.
    """
    def valueAt(obj, pos):
        # Integer keys on a label-indexed Series are deprecated positional
        # fallbacks in pandas; use .iloc when the object offers it and plain
        # indexing otherwise (lists, tuples, arrays).
        return getattr(obj, 'iloc', obj)[pos]

    sumsq = 0
    for i, colName in enumerate(data):
        f = functionsMap[featureTypes[colName]]
        sumsq += f(valueAt(obj1, i), valueAt(obj2, i), data[colName]) ** 2
    return np.sqrt(sumsq) / np.sqrt(data.shape[1])


In [238]:
def calcMatrix(data:pd.DataFrame, featureTypes, functionsMap):
    """Build the symmetric matrix of pairwise distances between rows of ``data``.

    ``matrix[i, j]`` holds ``diffTotal`` for the rows at positions ``i`` and
    ``j``; the diagonal stays 0.  Only the upper triangle is computed and then
    mirrored.

    Rows are taken from the ``data`` argument (not from a notebook global) and
    addressed by position, so frames with a non-default index work as well.
    """
    n = data.shape[0]
    matrix = np.zeros((n, n))
    # Materialise the rows once instead of re-slicing the frame per outer step.
    rows = [row for _, row in data.iterrows()]
    for i in range(n - 1):
        for j in range(i + 1, n):
            d = diffTotal(rows[i], rows[j], data, featureTypes, functionsMap)
            matrix[i, j] = d
            matrix[j, i] = d
    return matrix


In [236]:
# Dispatch table: scale type -> per-feature distance function.
functionsMap = {}
functionsMap[FeatureType.ABSOLUTE] = diffAbsoluteScale
functionsMap[FeatureType.NOMINAL] = diffNominalScale
functionsMap[FeatureType.ORDERED] = diffOrderedScale

In [239]:
# Compute the pairwise distance matrix for the loaded dataset.
matrix = calcMatrix(df, featureTypes, functionsMap)

In [242]:
# Show the computed distance matrix.
matrix

array([[0.        , 0.48786061, 0.47206084, ..., 0.5642948 , 0.57815843,
        0.54541345],
       [0.48786061, 0.        , 0.46896242, ..., 0.46634525, 0.51185358,
        0.4718182 ],
       [0.47206084, 0.46896242, 0.        , ..., 0.56554811, 0.50015124,
        0.46108614],
       ...,
       [0.5642948 , 0.46634525, 0.56554811, ..., 0.        , 0.38476678,
        0.43754371],
       [0.57815843, 0.51185358, 0.50015124, ..., 0.38476678, 0.        ,
        0.39236845],
       [0.54541345, 0.4718182 , 0.46108614, ..., 0.43754371, 0.39236845,
        0.        ]])

In [None]:
# Ensure that d(a, a) = 0: compare the first object with itself feature by
# feature.  Values are read by column name, so the check does not depend on
# featureTypes being ordered like the frame's columns.
obj1 = df.iloc[0]
obj2 = df.iloc[0]
for i, (n, f) in enumerate(featureTypes.items()):
    d = functionsMap[f](obj1[n], obj2[n], df[n])
    print(i, n, f, d)
    # Fail loudly instead of relying on a visual scan of the printed values.
    assert d == 0, f'd(a, a) != 0 for feature {n!r}: {d}'