In [2]:
import pandas as pd
import numpy as np
import math

# Read the spreadsheet with no header row (header=None) and keep the values
# as a plain NumPy array for the functions below.
df = pd.read_excel("dataset2.xls", header=None)
data = df.to_numpy()

def var(array):
    """Return the population variance of ``array``.

    The sum of squared deviations from the mean is divided by ``len(array)``
    (not ``len(array) - 1``).
    """
    center = mean(array)
    squared_deviations = (pow(value - center, 2) for value in array)
    return sum(squared_deviations) / len(array)

def mean(array):
    """Return the arithmetic mean of ``array`` (sum divided by length)."""
    total = sum(array)
    count = len(array)
    return total / count

def z_score(array):
    """Standardize ``array`` element by element.

    Each value has the mean of ``array`` subtracted and is divided by the
    population standard deviation (square root of ``var(array)``).
    Returns a list with one standardized value per input element.
    """
    center = mean(array)
    spread = math.sqrt(var(array))  # standard deviation
    standardized = []
    for value in array:
        standardized.append((value - center) / spread)
    return standardized

# a
# output: numpy Array
def normalization(data):
    """Z-score every column of the 2-D array ``data`` independently.

    Returns a NumPy array with the same row/column layout as ``data``.
    """
    column_count = np.size(data, 1)
    standardized_columns = [z_score(data[:, col]) for col in range(column_count)]
    # The columns were collected as rows, so transpose back.
    return np.transpose(np.array(standardized_columns))

# b
# output: numpy Array
def correlation_matrix(data):
    """Return the Pearson correlation matrix of the columns of ``data``.

    Parameters
    ----------
    data : array-like, 2-D
        Rows are observations, columns are attributes.

    Returns
    -------
    numpy.ndarray
        Square ``(n_col, n_col)`` array whose ``[x, y]`` entry is
        ``cov(x, y) / (std_x * std_y)``, using population (divide-by-n)
        statistics. A column with zero variance produces ``nan`` entries
        because its standard deviation is zero.
    """
    values = np.asarray(data, dtype=float)
    n_row = np.size(values, 0)

    # Centre every column once instead of recomputing means per column pair.
    centered = values - values.mean(axis=0)

    # Population standard deviation of each column.
    std = np.sqrt((centered ** 2).sum(axis=0) / n_row)

    # Population covariance between every pair of columns.
    covariance = centered.T @ centered / n_row

    # The product std_x * std_y belongs in the denominator: correlation is
    # covariance divided by both standard deviations, not multiplied by them.
    return covariance / np.outer(std, std)


def minkowski_distance(data, h):
    """Lower-triangular matrix of pairwise Minkowski distances of order ``h``.

    Entry ``[i, j]`` with ``j <= i`` is the distance between rows ``i`` and
    ``j`` of ``data``; every entry above the diagonal is ``nan``.
    """
    row_count = np.size(data, 0)
    col_count = np.size(data, 1)

    def pair_distance(a, b):
        # Sum of |difference|^h over all attributes, then the h-th root.
        total = sum(pow(abs(data[a, k] - data[b, k]), h) for k in range(col_count))
        return total ** (1 / h)

    return np.array([
        [np.nan if b > a else pair_distance(a, b) for b in range(row_count)]
        for a in range(row_count)
    ])


# c
# output: numpy Array
def manhattan_distance(data):
    """Pairwise Manhattan (L1) distances: the Minkowski distance of order 1."""
    order = 1
    return minkowski_distance(data, order)


# d
# output: numpy Array
def euclidean_distance(data):
    """Pairwise Euclidean (L2) distances: the Minkowski distance of order 2."""
    order = 2
    return minkowski_distance(data, order)


#e
# output: numpy Array
def supremum_distance(data):
    """Lower-triangular matrix of pairwise supremum (Chebyshev) distances.

    Parameters
    ----------
    data : numpy.ndarray, 2-D
        Rows are objects, columns are numeric attributes.

    Returns
    -------
    numpy.ndarray
        ``(n_row, n_row)`` array. Entry ``[i, j]`` with ``j <= i`` is
        ``max_k |data[i, k] - data[j, k]|``; entries above the diagonal
        are ``nan``.
    """
    n_row = np.size(data, 0)
    n_col = np.size(data, 1)
    sup_matrix = []
    for i in range(n_row):
        row = []
        for j in range(n_row):
            if j > i:
                row.append(np.nan)
            else:
                # Use the absolute per-attribute gap: with signed differences
                # a large negative gap would be ignored and the "distance"
                # could even come out negative.
                row.append(np.max([abs(data[i, k] - data[j, k]) for k in range(n_col)]))

        sup_matrix.append(row)

    return np.array(sup_matrix)


# f
# output: numpy Array
def cosine_similarity(data):
    """Lower-triangular matrix of pairwise cosine similarities between rows.

    Entry ``[i, j]`` with ``j <= i`` is the dot product of rows ``i`` and
    ``j`` divided by the product of their Euclidean norms; every entry above
    the diagonal is ``nan``.
    """
    row_count = np.size(data, 0)
    col_count = np.size(data, 1)

    def dot(a, b):
        return sum(data[a, k] * data[b, k] for k in range(col_count))

    # Euclidean norm of every row, looked up per pair below.
    norms = [
        math.sqrt(sum([data[a, k] ** 2 for k in range(col_count)]))
        for a in range(row_count)
    ]

    result = []
    for a in range(row_count):
        entries = []
        for b in range(row_count):
            if b <= a:
                entries.append(dot(a, b) / (norms[a] * norms[b]))
            else:
                entries.append(np.nan)
        result.append(entries)

    return np.array(result)

            



# Standardize every column; the (currently disabled) measures below all take
# the standardized array as input.
normal_data = normalization(data)
# Uncomment any of the following lines to compute the corresponding matrix.
# cor_matrix = correlation_matrix(normal_data)
# man_matrix = manhattan_distance(normal_data)
# euc_matrix = euclidean_distance(normal_data)
# sup_matrix = supremum_distance(normal_data)
# cosine_matrix = cosine_similarity(normal_data)
print(normal_data)













[[ 1.00831277  0.56986043 -0.80595947 ...  1.49861442 -0.07326831
  -0.20201979]
 [ 1.00831277 -0.24400101 -0.00810203 ... -1.74175564 -0.07326831
  -0.20201979]
 [ 1.00831277 -0.24400101 -0.00810203 ... -1.51828184 -0.07326831
  -0.20201979]
 ...
 [ 1.00831277 -0.24400101 -1.64008316 ...  1.49861442 -0.07326831
  -0.02653216]
 [-1.58736044 -0.24400101  0.68095666 ... -0.00983371 -0.07326831
  -0.20201979]
 [ 0.57570057 -1.05786246 -2.02087875 ...  0.26950853 -0.07326831
  -0.20201979]]


In [341]:
import pandas as pd
import numpy as np
import math

# Read the CSV with no header row (header=None) and keep the values as a
# plain NumPy array.
df = pd.read_csv("hayes-roth.csv", header=None)
data = df.to_numpy()

# get number of ranks (Mf)
def get_Mf(array):
    """Return the number of distinct values in ``array`` (the rank count Mf)."""
    distinct = []
    for value in array:
        # Record a value only the first time it is seen.
        if value not in distinct:
            distinct.append(value)

    return len(distinct)


def z(data):
    """Rescale every column of ``data`` with ``(r - 1) / (Mf - 1)``.

    ``Mf`` is the number of distinct values in the column (see ``get_Mf``).
    When a column holds the ranks ``1..Mf`` this maps it onto ``[0, 1]``.
    Returns a NumPy array with the same row/column layout as ``data``.
    """
    column_count = np.size(data, 1)
    scaled_columns = []
    for index in range(column_count):
        column = data[:, index]
        span = get_Mf(column) - 1
        scaled_columns.append([(rank - 1) / span for rank in column])

    # Columns were collected as rows, so transpose back.
    return np.transpose(np.array(scaled_columns))
        

# i and j are 1-D vectors
# i and j must have the same length
def distance(i, j):
    """Return the fraction of positions at which ``i`` and ``j`` differ.

    Computed as ``(p - m) / p`` where ``p`` is the number of elements in
    ``i`` and ``m`` is the number of positions holding equal values.
    """
    p = np.size(i)
    matches = sum(1 for k in range(p) if i[k] == j[k])
    return (p - matches) / p


def distance_matrix(data):
    """Lower-triangular matrix of pairwise ``distance`` values between rows.

    Parameters
    ----------
    data : numpy.ndarray, 2-D
        One object per row.

    Returns
    -------
    numpy.ndarray
        ``(n_row, n_row)`` array. Entry ``[i, j]`` with ``j <= i`` is
        ``distance(data[i, :], data[j, :])``; entries above the diagonal
        are ``nan``.
    """
    nr = np.size(data, 0)
    matrix = []
    for i in range(nr):
        # Compute only the entries on and below the diagonal ...
        row = [distance(data[i, :], data[j, :]) for j in range(i + 1)]
        # ... and pad the rest of the row with nan.
        row.extend([np.nan] * (nr - i - 1))
        matrix.append(row)

    return np.array(matrix)
                
        
        

# The result is the same if the `data = z(data)` assignment is skipped.
# z() normalizes each ordinal attribute to the range 0 to 1.
data = z(data)
print(distance_matrix(data))



[[0.  nan nan ... nan nan nan]
 [0.4 0.  nan ... nan nan nan]
 [0.8 0.8 0.  ... nan nan nan]
 ...
 [0.6 0.6 1.  ... 0.  nan nan]
 [1.  0.8 0.8 ... 0.4 0.  nan]
 [0.2 0.6 0.6 ... 0.6 1.  0. ]]


In [30]:
import pandas as pd
import numpy as np
import math

# Read the CSV with no header row (header=None) and keep the values as a
# plain NumPy array.
df = pd.read_csv("semeion.csv", header=None)
data = df.to_numpy()


# i and j are 1-D vectors
# i and j must have the same length
# def contingency_table(i, j):
#     q = 0
#     r = 0
#     s = 0
#     t = 0
#     n = len(i)
    
#     for x in range(n):
#         if i[x] == 1 and j[x] == 1:
#             q += 1
#         elif i[x] == 1 and j[x] == 0:
#             r += 1
#         elif i[x] == 0 and j[x] == 1:
#             s += 1
#         else:
#             t += 1
    
#     return [[q,r],[s,t]]

def sim(i, j):
    """Return the Jaccard similarity of two binary vectors.

    Parameters
    ----------
    i, j : array-like, 1-D, same length
        Binary attribute vectors. Any nonzero entry counts as 1, so
        integer, boolean and float 0/1 data are all accepted.

    Returns
    -------
    float
        ``q / (p - t)``: the number of positions where both vectors are 1
        divided by the number of positions where at least one is 1.
        ``nan`` when neither vector has a nonzero entry, since the ratio is
        then undefined.
    """
    a = np.asarray(i).astype(bool)
    b = np.asarray(j).astype(bool)

    q = np.sum(a & b)       # positions where both are 1
    union = np.sum(a | b)   # positions where at least one is 1, i.e. p - t

    # Both vectors all-zero: return nan explicitly instead of dividing by zero.
    if union == 0:
        return np.nan
    return q / union


def jaccard_sim(data):
    """Lower-triangular matrix of pairwise ``sim`` values between rows.

    Entry ``[i, j]`` with ``j <= i`` is ``sim(data[i, :], data[j, :])``;
    every entry above the diagonal is ``nan``.
    """
    row_count = np.size(data, 0)
    rows = []
    for a in range(row_count):
        rows.append([
            sim(data[a, :], data[b, :]) if b <= a else np.nan
            for b in range(row_count)
        ])

    return np.array(rows)
                

# Print the lower-triangular similarity matrix for every pair of rows.
print(jaccard_sim(data))





[[1.                nan        nan ...        nan        nan        nan]
 [0.23976608 1.                nan ...        nan        nan        nan]
 [0.33793103 0.3        1.         ...        nan        nan        nan]
 ...
 [0.2247191  0.18421053 0.11724138 ... 1.                nan        nan]
 [0.24561404 0.19047619 0.13768116 ... 0.38167939 1.                nan]
 [0.46938776 0.17880795 0.26984127 ... 0.1572327  0.17763158 1.        ]]


0.42857142857142855
