In [17]:
import numpy as np
import pandas as pd
import sklearn.utils as utils
import sklearn.linear_model as models
from sklearn.model_selection import cross_validate
from similartiy_fun import numerical_similarity_fun, categorical_similarity_fun, voting_similarity

In [18]:
def similarity1(o1, o2):
    """Score two observations by their dot product.

    Parameters
    ----------
    o1, o2 : array-like
        The two observations; both are passed unchanged to ``np.dot``.

    Returns
    -------
    Whatever ``np.dot(o1, o2)`` evaluates to.
    """
    dot_product = np.dot(o1, o2)
    return dot_product

In [19]:
def get_data(path, sep = ',', names = None):
    """Load a header-less delimited file and return its cells as an array.

    Parameters
    ----------
    path : str or file-like
        Location handed to ``pd.read_table``.
    sep : str, default ','
        Field delimiter.
    names : list of str, optional
        Column names forwarded to ``pd.read_table``.

    Returns
    -------
    numpy.ndarray
        The ``.values`` of the parsed frame (one row per input line).
    """
    # header=None: the first line of the file is data, not column titles.
    frame = pd.read_table(path, sep=sep, header=None, names=names)
    return frame.values

# Ten column labels (nine attributes followed by the 'cmc' target).
# Not referenced by any other cell visible in this notebook.
names = [
    'age',
    'W eduction',
    'H eduction',
    'no of children',
    'religon',
    'working',
    'H of occupation',
    'standard of living',
    'media expouser',
    'cmc',
]

In [20]:
def processed_data(data,  numerical_columns):
    """Split a data matrix into a numerical view and a categorical view.

    The last column of ``data`` is treated as the label and is appended to
    both views. Every non-label column that is not listed in
    ``numerical_columns`` is considered categorical.

    Parameters
    ----------
    data : array-like, 2-D
        One row per observation; the final column is the label.
    numerical_columns : sequence of int
        Indices of the numerical feature columns.

    Returns
    -------
    (processed_data_n, processed_data_c) : tuple of numpy.ndarray
        Numerical columns + label, and categorical columns + label.
    """
    data = np.asarray(data)

    # Take the column count from the shape rather than from a specific row,
    # so a single-row matrix does not raise IndexError.
    feature_count = data.shape[1] - 1
    label_column = data[:, -1].reshape(-1, 1)

    # setdiff1d returns the sorted feature indices that are not numerical.
    categorical_cols = np.setdiff1d(np.arange(feature_count), numerical_columns)

    processed_data_n = np.concatenate((data[:, numerical_columns], label_column), axis=1)
    processed_data_c = np.concatenate((data[:, categorical_cols], label_column), axis=1)

    return processed_data_n, processed_data_c

In [21]:
def get_similarity_Matrix(data, similarity_function):
    """Compute pairwise similarities and same-label flags for all row pairs.

    Only pairs ``(i, j)`` with ``j >= i`` are evaluated, i.e. the upper
    triangle of the similarity matrix including its diagonal. The last entry
    of each row is the label and is excluded from the similarity computation.

    Parameters
    ----------
    data : sequence of rows
        Each row holds the features followed by the label.
    similarity_function : callable
        Called as ``similarity_function(features_i, features_j)``.

    Returns
    -------
    (similarities, same_label) : tuple of numpy.ndarray
        Two column vectors (reshaped to ``(-1, 1)``), one entry per pair:
        the similarity value and whether both rows carry the same label.
    """
    row_count = len(data)
    # Row-major walk over the upper triangle, diagonal included.
    index_pairs = [
        (first, second)
        for first in range(row_count)
        for second in range(first, row_count)
    ]

    similarities = [
        similarity_function(data[first][:-1], data[second][:-1])
        for first, second in index_pairs
    ]
    same_label = [
        data[first][-1] == data[second][-1]
        for first, second in index_pairs
    ]

    return np.array(similarities).reshape(-1, 1), np.array(same_label).reshape(-1, 1)

In [22]:
def sample_data(classifer_data, random_state=None):
    """Return a class-balanced random sample of (measure, label) rows.

    Rows whose label (column 1) equals 0 form one group and all remaining
    rows form the other. Each group is shuffled and the same number of rows,
    the size of the smaller group, is taken from each.

    Parameters
    ----------
    classifer_data : array-like, 2-D
        Column 0 holds the similarity measure, column 1 the label.
    random_state : int, RandomState instance or None, default None
        Seed or generator for the shuffles. ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    numpy.ndarray
        The sampled label-0 rows followed by the sampled other rows. Empty
        (zero rows) when either group is missing.
    """
    classifer_data = np.asarray(classifer_data)

    is_label0 = classifer_data[:, 1] == 0
    label0_rows = classifer_data[is_label0]
    label1_rows = classifer_data[~is_label0]
    slice_size = min(len(label0_rows), len(label1_rows))

    # With a missing class there is nothing to balance. The old
    # ``array[-slice_size:]`` returned *every* row here because ``-0 == 0``.
    if slice_size == 0:
        return classifer_data[:0]

    # One generator for both shuffles so a single seed makes the whole
    # sample reproducible without giving both groups the same permutation.
    rng = utils.check_random_state(random_state)
    sampled0 = utils.shuffle(label0_rows, random_state=rng)[:slice_size]
    sampled1 = utils.shuffle(label1_rows, random_state=rng)[:slice_size]

    return np.concatenate((sampled0, sampled1), axis=0)

In [23]:
def get_similarity_metrics(path_to_data, data_sep, numerical_columns, similarity_fn = None, similarity_type=1):
    """Evaluate how well a similarity measure predicts "same label" for row pairs.

    The dataset is loaded, split into numerical and categorical views, turned
    into pairwise similarity values with matching same-label flags, balanced
    by ``sample_data`` and fed to a 5-fold cross-validated logistic regression
    whose single input feature is the similarity value.

    Parameters
    ----------
    path_to_data : str
        Path or URL of the header-less data file.
    data_sep : str
        Field separator of the data file.
    numerical_columns : sequence of int
        Indices of the numerical columns.
    similarity_fn : callable, optional
        Pairwise similarity function; required unless ``similarity_type`` is 3.
    similarity_type : int, default 1
        1 - numerical columns only, 2 - categorical columns only,
        3 - mixed (``voting_similarity`` on both views, averaged).
        Any other value applies ``similarity_fn`` to the full, unsplit matrix.

    Returns
    -------
    None
        The mean of every cross-validation result except ``fit_time`` and
        ``score_time`` is printed.

    Raises
    ------
    ValueError
        If ``similarity_fn`` is missing for a non-mixed similarity type.
    """
    # Fail before loading any data instead of with a "NoneType is not
    # callable" TypeError deep inside the pairwise loop.
    if similarity_type != 3 and similarity_fn is None:
        raise ValueError('similarity_fn is required unless similarity_type is 3')

    data = get_data(path_to_data, data_sep)
    processed_data_n, processed_data_c = processed_data(data, numerical_columns)

    if similarity_type == 3:
        # The third positional argument is defined by similartiy_fun.voting_similarity;
        # presumably it selects the numerical variant - TODO confirm.
        def voting_similarity_n(o1, o2):
            return voting_similarity(o1, o2, True)

        measures_n, _ = get_similarity_Matrix(processed_data_n, voting_similarity_n)
        measures_c, lables_c = get_similarity_Matrix(processed_data_c, voting_similarity)
        # The tuple stacks to shape (2, n_pairs, 1). Averaging over axis 0
        # combines the two measures per pair and keeps shape (n_pairs, 1);
        # axis=1 collapsed the pairs instead and broke the concatenate below.
        measures = np.average((measures_n, measures_c), axis=0)
        lables = lables_c
    else:
        if similarity_type == 1:
            data = processed_data_n
        elif similarity_type == 2:
            data = processed_data_c
        # Any other value keeps the full, unsplit matrix.
        measures, lables = get_similarity_Matrix(data, similarity_fn)

    classifier_data = sample_data(np.concatenate((measures, lables), axis=1))
    classifer_input = classifier_data[:, 0].reshape(-1, 1)
    classifer_targets = classifier_data[:, 1]

    clf = models.LogisticRegression(max_iter=400)
    # Train and score the classifier with 5-fold cross-validation.
    scoring = ['f1_macro', 'precision_macro', 'recall_macro']
    scores = cross_validate(clf, classifer_input, classifer_targets, cv=5, scoring=scoring, return_train_score=False)

    for score, values in scores.items():
        # Timing entries are not quality metrics.
        if score == 'fit_time' or score == 'score_time':
            continue
        print(score, ': ', np.average(values))
    return

In [24]:
# Dataset descriptors: (data URL, field separator, indices of the numerical columns).
maram_data = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data',
    ',',
    [0, 3],
)
# NOTE(review): column 0 of abalone.data is presumably the categorical "Sex"
# field - confirm that [0, 1] really are the numerical columns.
safana_data = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',
    ',',
    [0, 1],
)
nosiba_data = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-training-true.data',
    ',',
    [1, 3, 5, 7, 9],
)
ola_data = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data',
    ' ',
    [1, 4, 7, 10, 12, 15, 17],
)

In [None]:
# Driver cell: one banner is printed per similarity family.
# The numerical and categorical runs are commented out, so only their banners
# appear in the output; the mixed run at the bottom is the only experiment
# that actually executes.
print('############### Numerical Similarity ###############')
# for similarity_fun in numerical_similarity_fun:
#     print('------------',similarity_fun,'----------------------')
#     get_similarity_metrics(*nosiba_data, numerical_similarity_fun[similarity_fun], similarity_type=1)

print('############### Categorical Similarity ###############')
# for similarity_fun in categorical_similarity_fun:
#     print('------------',similarity_fun,'----------------------')
#     get_similarity_metrics(*nosiba_data, categorical_similarity_fun[similarity_fun], similarity_type=2)
    
print('############### Mix Similarity ###############')
print('------------','Mixed fun','----------------------')
# Mixed mode (similarity_type=3) on the maram_data descriptor; no
# similarity_fn is passed, so the function's default of None is used.
get_similarity_metrics(*maram_data, similarity_type=3)

############### Numerical Similarity ###############
############### Categorical Similarity ###############
############### Mix Similarity ###############
------------ Mixed fun ----------------------


In [8]:
def print_metrics(path_to_data, data_sep, numerical_columns, numerical_similarity_fun, categorical_similarity_fun):
    """Run and print the similarity metrics of one dataset for every measure.

    Each numerical similarity function is evaluated on the numerical columns
    (``similarity_type=1``), then each categorical similarity function on the
    categorical columns (``similarity_type=2``).

    Parameters
    ----------
    path_to_data : str
        Path or URL of the data file.
    data_sep : str
        Field separator of the data file.
    numerical_columns : sequence of int
        Indices of the numerical columns.
    numerical_similarity_fun : dict
        Maps a display name to a numerical similarity function.
    categorical_similarity_fun : dict
        Maps a display name to a categorical similarity function.

    Returns
    -------
    None
    """
    # Use the dataset that was passed in. The previous body always used the
    # global ``maram_data`` and supplied ``similarity_type`` twice (once
    # positionally as ``[0,3]``), which raised TypeError on the first call.
    print('############### Numerical Similarity ###############')
    for fun_name, fun in numerical_similarity_fun.items():
        print('------------',fun_name,'----------------------')
        get_similarity_metrics(path_to_data, data_sep, numerical_columns, similarity_fn=fun, similarity_type=1)

    print('############### Categorical Similarity ###############')
    for fun_name, fun in categorical_similarity_fun.items():
        print('------------',fun_name,'----------------------')
        get_similarity_metrics(path_to_data, data_sep, numerical_columns, similarity_fn=fun, similarity_type=2)

    return

In [11]:
# Dataset descriptors keyed by the label printed in each banner.
all_data_args = {
    'Maram': maram_data,
    'Safana': safana_data,
    'Nosiba': nosiba_data,
    'Ola': ola_data,
}

# Evaluate every numerical and categorical similarity measure on each dataset.
for name, data_args in all_data_args.items():
    print('************************', name ,'***************************')
    print_metrics(*data_args, numerical_similarity_fun, categorical_similarity_fun)