In [1]:
import numpy as np
import pandas as pd
import sklearn.utils as utils
import sklearn.linear_model as models
from sklearn.model_selection import cross_validate
from similartiy_fun import numerical_similarity_fun, categorical_similarity_fun

In [2]:
def similarity1(o1, o2):
    return np.dot(o1,o2)

In [3]:
def get_data(path, sep = ',', names = None):
    df = pd.read_table(path , header=None, sep=sep, names = names)
    data = df.values
    return data

names = ['age', 'W eduction', 'H eduction', 'no of children', 'religon', 'working', 'H of occupation', 'standard of living', 'media expouser', 'cmc']

In [4]:
def processed_data(data,  numerical_columns):
    processed_data = []
    
    processed_data_n = data[:,numerical_columns]
    processed_data_n = np.concatenate((processed_data_n, data[:,-1].reshape(-1,1)), axis = 1)
    
    categorical_cols = np.setdiff1d(range(len(data[1])-1),numerical_columns)
    processed_data_c = data[:,categorical_cols]
    processed_data_c = np.concatenate((processed_data_c, data[:,-1].reshape(-1,1)), axis = 1)
                    
    return processed_data_n, processed_data_c

In [31]:
cols = range(5)
ncols = [1,3]
np.setdiff1d(cols,ncols)

array([0, 2, 4])

In [5]:
def get_similarity_Matrix(data, similarity_function):
    #sm = np.empty((len(data),len(data)),dtype=float)
    sm = []
    lables = []
    #lables = np.empty((len(data),len(data)))
    for i in range(len(data)):
        for j in range(i,len(data)): # only get the upper right of the similarity matrix
#             set_trace()
            sm.append(similarity_function(data[i][:-1],data[j][:-1])) #exculde the label
            lables.append(data[i][-1] == data[j][-1])
           
    return np.array(sm).reshape(-1,1),np.array(lables).reshape(-1,1)

In [6]:
def sample_data(classifer_data):
    classifier_data = classifer_data[classifer_data[:,1].argsort()] #sort by lables
    lable0_count = (classifier_data[:,1] == 0).sum()
    lable1_count = len(classifier_data)- lable0_count
    slice_size = min(lable0_count, lable1_count)    
    # shuffle each lable part individually
    classifier_data[:lable0_count] = utils.shuffle(classifier_data[:lable0_count]) 
    classifier_data[lable0_count:] = utils.shuffle(classifier_data[lable0_count:])
   # sample from each lable by the slice size
    classifier_data0 = classifier_data[:slice_size]
    classifier_data1 = classifier_data[-slice_size:]
    # concatente the samples
    classifier_data = np.concatenate((classifier_data0, classifier_data1), axis= 0)
    return classifier_data

In [12]:
def get_similarity_metrics(path_to_data, data_sep, numerical_columns, similarity_fn, similarity_type=1): #similarity types 1 - numerical 2 - categorical 3 - mixed
    # numerical columns = indecies of numerical cols
    data = get_data(path_to_data, data_sep)
    processed_data_n, processed_data_c = processed_data(data, numerical_columns)
    
    
    if (similarity_type == 1):
        data = processed_data_n
    elif (similarity_type == 2):
        data = processed_data_c
    else:
        data = np.concatenate((processed_data_n[:][:-1], processed_data_c), axis = 1)
   
    measures,lables = get_similarity_Matrix(data, similarity_fn)
        
        
    classifier_data = sample_data(np.concatenate((measures,lables),axis=1))
    classifer_input = classifier_data[:,0].reshape(-1,1)
    classifer_targets = classifier_data[:,1]
    
    
    clf = models.LogisticRegression(max_iter=400)
    #train the classifier
    scoring = ['f1_macro', 'precision_macro', 'recall_macro']
    scores = cross_validate(clf, classifer_input, classifer_targets, cv=5, scoring=scoring, return_train_score=False)
    
    for score in scores:
        if(score == 'fit_time' or score == 'score_time'):
            continue
        print(score, ': ', np.average(scores[score]))
    return

In [None]:
get_similarity_metrics("../cmc.data", similarity1, [0,3], similarity_type=1)

In [None]:
get_similarity_metrics("covtype.data",',', similarity1, [0,3], similarity_type=1)

In [10]:
from IPython.core.debugger import set_trace

In [13]:
print('############### Numerical Similarity ###############')
for similarity_fun in numerical_similarity_fun:
    print('------------',similarity_fun,'----------------------')
    get_similarity_metrics(*maram_data, numerical_similarity_fun[similarity_fun], similarity_type=1)

print('############### Categorical Similarity ###############')
for similarity_fun in categorical_similarity_fun:
    print('------------',similarity_fun,'----------------------')
    get_similarity_metrics(*maram_data, categorical_similarity_fun[similarity_fun], similarity_type=2)

############### Numerical Similarity ###############
------------ cosine ----------------------
------------ Euclidean_distance ----------------------




test_f1_macro :  0.5117475032153039
test_precision_macro :  0.5117485235017738
test_recall_macro :  0.5117484351353397
------------ wieghted_euclidean ----------------------




test_f1_macro :  0.5099290646412662
test_precision_macro :  0.5110502726484383
test_recall_macro :  0.5109579230256218
------------ mahalanobis ----------------------




test_f1_macro :  0.507285254812209
test_precision_macro :  0.5101805644160656
test_recall_macro :  0.5099593811286367
------------ minkowski ----------------------




test_f1_macro :  0.5075208416029461
test_precision_macro :  0.510402406953008
test_recall_macro :  0.5101778080354155
############### Categorical Similarity ###############
------------ cosine ----------------------




test_f1_macro :  0.5061010960569666
test_precision_macro :  0.5068482517801292
test_recall_macro :  0.5068090447711663
------------ overlap ----------------------




test_f1_macro :  0.4933789237195142
test_precision_macro :  0.5075672699349842
test_recall_macro :  0.5067674373768176
------------ Jaccard_Distance ----------------------




test_f1_macro :  0.5057452309212177
test_precision_macro :  0.5063993361747916
test_recall_macro :  0.5063669784430651
------------ Monge_Elkan ----------------------


TypeError: Input is expected to be a string

In [10]:
l = np.array([2, 3, 1, 1, 2, 3, 0])

In [12]:
l.tolist()

[2, 3, 1, 1, 2, 3, 0]

In [95]:
list(np.array([2, 3, 1, 1, 2, 3, 0]))

[2, 3, 1, 1, 2, 3, 0]

In [8]:
def print_metrics(path_to_data, data_sep, numerical_columns, numerical_similarity_fun, categorical_similarity_fun):
    
    print('############### Numerical Similarity ###############')
    for similarity_fun in numerical_similarity_fun:
        print('------------',similarity_fun,'----------------------')
        get_similarity_metrics(*maram_data, numerical_similarity_fun[similarity_fun], [0,3], similarity_type=1)

    print('############### Categorical Similarity ###############')
    for similarity_fun in categorical_similarity_fun:
        print('------------',similarity_fun,'----------------------')
        get_similarity_metrics(*maram_data, categorical_similarity_fun[similarity_fun], [0,3], similarity_type=2)
        
    return 

In [9]:
maram_data = ('https://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data', ',', [0,3]) # fine
safana_data = ('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data', ',', [0,1]) # need to process the categorical char cols
nosiba_data = ''
ola_data = ('https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data', ' ', [1,4]) # fine

In [25]:
all_data_args = {
    'Maram': maram_data
    ,'Safana': safana_data
    ,'Nosiba': nosiba_data
    ,'Ola': ola_data
}
for name, data_args in all_data_args.items():
    print('************************', name ,'***************************')
    print_metrics(*data_args, numerical_similarity_fun, categorical_similarity_fun)

SyntaxError: invalid syntax (<ipython-input-25-fa76a51f7ba9>, line 3)

In [52]:
get_similarity_metrics(ola_data[0],ola_data[1], similarity1, [0], similarity_type=1)

  


test_f1_macro :  0.5806479272372341
test_precision_macro :  0.5863541318498509
test_recall_macro :  0.5837765025407394


In [49]:
data = pd.read_table(ola_data[0], sep='  ')

  """Entry point for launching an IPython kernel.


In [50]:
data.head()

Unnamed: 0,1,6,4,12,5,5.1,3,4.1,1.1,67,...,0,0.1,1.3,0.2,0.3,1.4,0.4,0.5,1.5,1.6
0,2,48,2,60,1,3,2,2,1,22,...,0,0,1,0,0,1,0,0,1,2.0
1,4,12,4,21,1,4,3,3,1,49,...,0,0,1,0,0,1,0,1,0,1.0
2,1,42,2,79,1,4,3,4,2,45,...,0,0,0,0,0,0,0,0,1,1.0
3,1,24,3,49,1,3,3,4,4,53,...,1,0,1,0,0,0,0,0,1,2.0
4,4,36,2,91,5,3,3,4,4,35,...,0,0,1,0,0,0,0,1,0,1.0


In [63]:
data = get_data(*ola_data)

In [64]:
data

array([['A11', 6, 'A34', ..., 'A192', 'A201', 1],
       ['A12', 48, 'A32', ..., 'A191', 'A201', 2],
       ['A14', 12, 'A34', ..., 'A191', 'A201', 1],
       ...,
       ['A14', 12, 'A32', ..., 'A191', 'A201', 1],
       ['A11', 45, 'A32', ..., 'A192', 'A201', 2],
       ['A12', 45, 'A34', ..., 'A191', 'A201', 1]], dtype=object)

In [None]:
with open('safana_data.txt','w') as file:
    np.savetxt(file,data,delimiter=',')