e.g., fuzz.ratio("a", "abc"), T = 1 + 3 = 4; M = 1 (only a matches); so 2.0*1/4 = 0.5 (i.e., 50)


Where T is the total number of elements in both sequences, and M is the number of matches, this is 2.0*M / T

MinMaxScaler with range [0.01, 0.99]: <br>
eq: <br>
    X_scaled = scale * X + min - X.min(axis=0) * scale <br>
    where scale = (max - min) / (X.max(axis=0) - X.min(axis=0)) <br>
    
e.g., transform the val 800 to the range [0.01, 0.99]:

scale = (0.99 - 0.01)/ (1000 - 12) <br>
X_scaled = scale * 800 + 0.01 - 12 * scale


In [191]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

data = pd.read_csv('data/som_clustering_dataset1.csv', sep = ',')

In [2]:
### step 1, normalize a numerical series to a given range
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0.01, 0.99))

for i in data.columns[0:-1]:
    x = data[i].values.reshape(-1, 1) 
    x_scaled = min_max_scaler.fit_transform(x)
    data[i] = x_scaled

In [3]:
# step 2, transform the scaled numerical series to an alphabetical sequence
seq_dict = {0:'a',1:'b',2:'c',3:'d',4:'e',5:'f',6:'g',7:'h',8:'i',9:'j',10:'k',11:'l',
            12:'m',13:'n',14:'o',15:'p',16:'q',17:'r',18:'s',19:'t',20:'u',21:'v',22:'w',23:'x',24:'y',25:'z'}
ref_seq = [] ### for reference sequence
arr = '' 
slot_num = 26
# Put array elements in different alphabet 
for row in range(len(data)):
    for col in range(len(data.columns[0:-1])):
        arr = arr + seq_dict[int(slot_num*data.iloc[row,col])]
    ref_seq.append(arr)   
    arr = ''

In [300]:
# step 3, match the alphabetical sequence to the reference alphabetical sequence
# this step is with a clustering method.

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(ref_seq)
true_k = 128 ### set some clusters 
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=1,verbose=1)
model.fit(X)

Initialization complete
Iteration  0, inertia 406.000
Iteration  1, inertia 203.000
Converged at iteration 1: center shift 0.000000e+00 within tolerance 3.012021e-07


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=500,
    n_clusters=128, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=1)

In [303]:
# step 3 result:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
clustering_pattern = []
for i in range(true_k):
    print ("Cluster %d:" % i,)
    for ind in order_centroids[i, :1]:
        print (' %s' % terms[ind],)
        print(ind)
        clustering_pattern.append(terms[ind])
    print()

Top terms per cluster:
Cluster 0:
 bccbdabbeaecbabbcbadbacchbaadbaccebaabababbcccaaacbcbbcdabacbadacdbccaebbbebbacabbfbbbbbcaabcacdcadddbcbcacbebdabdbcdabdcebfalcaaabcbaadadbacbdcba
165

Cluster 1:
 aadhdgdbdbcgbedcgeiijkehmcdjifhdqhcedjfcegbcbeddgljjjefdefhcgaijmlhkeihjdegiibhgkffnjdghbedhqknnfignlluhihjjhligdmhinnqlpfkilshdjnrmeicnjdffdpnhbe
33

Cluster 2:
 bbaaaaaaccaaaabaaabbcababbcdaaabaabaaacabddeciijkkijiikrkoefhqzjmrppnneeebddgadgccaebcbebcbedcdbbiacfdccbcdabbbcabbdfccffdbbfmddcdhjcfahhieebgebba
148

Cluster 3:
 acbdaddbaahccdbchebaaeccgccccbaahdbbbbaddaaaecbbececebacafbebbcaaeecbabbcbbhcegbecedjacbaaabffbbbbbddafbddebcgbcbceadbaafdbdfccbbbabbadghecfbadbbb
61

Cluster 4:
 alcbcccihhmledgeifgkefgiogeinclbbhbebfcdihihgrlnppmnlklmmoigejeddhejighejpfddbcgkdbekcedageeeedidedecdieebdfdhfdcbbbefiegbbdghfdefbggedkdecfcebdbh
128

Cluster 5:
 godebffeecegccddgfebbicgmffgrhdcohhebkfggdggficdcceiidfebjfeeecccgffcdfhdebfhfhejaiipbbhfbgcfkghcgegikmgcgfjgjjedgidhddckifgimjefmfflkfnolekcgiefe


In [304]:
# step 4, choose a target seq to match clusters

def cluster_matching(vec_be_calculated, accept_range, accept_ratio):
    len_of_vec = len(vec_be_calculated)
    tmp_list = []
    for j in range(len(clustering_pattern)):
        match = 0
        for i in range(len_of_vec):
            if(abs(ord(vec_be_calculated[i]) - ord(clustering_pattern[j][i])) <= accept_range): # e.g., 'a' in ASCII = 97; 'b' in ASCII = 98; abs(97 - 98) = 1
                match += 1
        if (round((2*match)/(2*len_of_vec) > accept_ratio)):
            tmp_list.append([j,round((2*match)/(2*len_of_vec),2)]) # [id in clustering_pattern, matching ratio]
            #print(j, round((2*match)/(2*len_of_vec),2))
    return tmp_list

def takeSecond(elem):
    return elem[1]

In [313]:
vec_be_calculated = ref_seq[6]
accept_range = 2
accept_ratio = 0.7
query_ts = cluster_matching(vec_be_calculated, accept_range, accept_ratio)
query_ts.sort(key=takeSecond, reverse=True)
print(query_ts)

[[34, 0.92], [79, 0.9], [44, 0.89], [88, 0.88], [60, 0.86], [7, 0.85], [20, 0.84], [54, 0.84], [0, 0.83], [41, 0.83], [39, 0.82], [40, 0.82], [51, 0.82], [56, 0.82], [19, 0.81], [71, 0.81], [75, 0.81], [119, 0.81], [87, 0.8], [6, 0.79], [127, 0.79], [107, 0.78], [118, 0.78], [52, 0.77], [58, 0.77], [105, 0.77], [35, 0.76], [98, 0.75], [17, 0.73], [80, 0.73], [95, 0.73], [32, 0.71]]


In [326]:
# get the original time seris
for j in query_ts:
    for i in range(len(ref_seq)):
        if(ref_seq[i] == clustering_pattern[j[0]]):
            print("row:", i ,"in the original dataset")
            break;

row: 3 in the original dataset
row: 8 in the original dataset
row: 15 in the original dataset
row: 20 in the original dataset
row: 12 in the original dataset
row: 5 in the original dataset
row: 98 in the original dataset
row: 50 in the original dataset
row: 78 in the original dataset
row: 55 in the original dataset
row: 51 in the original dataset
row: 63 in the original dataset
row: 66 in the original dataset
row: 47 in the original dataset
row: 70 in the original dataset
row: 10 in the original dataset
row: 43 in the original dataset
row: 18 in the original dataset
row: 39 in the original dataset
row: 35 in the original dataset
row: 30 in the original dataset
row: 24 in the original dataset
row: 101 in the original dataset
row: 53 in the original dataset
row: 65 in the original dataset
row: 186 in the original dataset
row: 79 in the original dataset
row: 92 in the original dataset
row: 104 in the original dataset
row: 27 in the original dataset
row: 67 in the original dataset
row: 80 

################################################################################################################################