In [1]:
import math
import json
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from glob import glob
from pathlib import Path
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import pairwise_distances

# Interval tag for this analysis. Presumably corresponds to the "interval_30"
# component of the data directory names used in later cells — TODO confirm
# (the paths below are hard-coded and do not read this constant).
INTERVAL = 30

In [2]:
# Input files: the per-IP label table and the JSON file whose contents are
# handed to `read_csv` as the `dtype` argument.
typefile = "timeseries_feature/interval_30_src_feature/types.json"
labelfile = "timeseries_feature/interval_30_src_feature/rfcm_results.csv"

with open(typefile) as f:
    types = json.load(f)

# The first CSV column becomes the row index; every remaining column is then
# down-cast to float16 regardless of the dtype it was parsed with.
df = pd.read_csv(labelfile, index_col=0, dtype=types).astype(np.float16)
df.head()

Unnamed: 0_level_0,bytes-bytes_packets,bytes-flows,bytes-flows_(bytes_packets),bytes-nDstIP,bytes-nDstPort,bytes-nSrcPort,bytes_packets-flows_(bytes_packets),bytes_packets-nDstIP,bytes_packets-nDstPort,bytes_packets-nSrcPort,...,nDstIP-nDstPort,nDstIP-nSrcPort,nSrcPort-nDstPort,packets-bytes,packets-bytes_packets,packets-flows,packets-flows_(bytes_packets),packets-nDstIP,packets-nDstPort,packets-nSrcPort
ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.215.192.227,6.0,6.0,6.0,6.0,6.0,6.0,7.0,1.0,1.0,1.0,...,5.0,5.0,5.0,6.0,7.0,8.0,5.0,8.0,8.0,8.0
10.105.0.11,6.0,6.0,6.0,6.0,6.0,6.0,7.0,1.0,1.0,1.0,...,5.0,5.0,5.0,6.0,7.0,8.0,5.0,8.0,8.0,8.0
10.105.0.19,6.0,6.0,6.0,6.0,6.0,6.0,8.0,7.0,7.0,7.0,...,8.0,8.0,8.0,6.0,8.0,8.0,5.0,8.0,8.0,8.0
10.105.0.25,6.0,6.0,6.0,6.0,6.0,6.0,7.0,1.0,1.0,1.0,...,5.0,5.0,5.0,6.0,7.0,8.0,5.0,8.0,8.0,8.0
10.105.0.3,6.0,6.0,6.0,6.0,6.0,6.0,7.0,1.0,1.0,1.0,...,2.0,2.0,2.0,6.0,7.0,8.0,3.0,8.0,8.0,8.0


In [3]:
# One value_counts() Series per column, in column order: for each feature,
# how many rows carry each distinct value.
cluster_sizes = [column.value_counts() for _, column in df.items()]

# Row count and column count of the label table.
number_entries, number_features = df.shape

In [4]:
def similarity_func(a, b, sizes=None, n_entries=None, n_features=None, gamma=5):
    """Size-weighted agreement between two rows of per-feature labels.

    For every position ``i`` where ``a[i] == b[i]`` the pair earns
    ``exp(-gamma * sizes[i][a[i]] / n_entries)``, so agreeing on a value that
    few rows share counts more than agreeing on a very common one. The
    contributions are summed and divided by ``n_features``.

    The sum is accumulated in double precision and returned as a Python
    float; accumulating in float16 would round every partial sum to about
    three significant digits.

    Parameters
    ----------
    a, b : array-like
        Two 1-D label vectors of equal length (one entry per feature).
    sizes : sequence of mappings, optional
        ``sizes[i][label]`` is the number of rows carrying ``label`` in
        feature ``i``. Defaults to the notebook-level ``cluster_sizes``.
    n_entries : int, optional
        Total number of rows. Defaults to the notebook-level
        ``number_entries``.
    n_features : int, optional
        Normalising constant. Defaults to the notebook-level
        ``number_features``.
    gamma : float, optional
        Decay rate applied to the relative cluster size (default 5).

    Returns
    -------
    float
        The normalised score; ``0.0`` when the rows agree on no feature.
    """
    # Fall back to the notebook globals only when no explicit value is given,
    # so existing two-argument calls (e.g. from pdist) behave as before.
    if sizes is None:
        sizes = cluster_sizes
    if n_entries is None:
        n_entries = number_entries
    if n_features is None:
        n_features = number_features

    # Coerce to arrays so `==` is always element-wise, even for plain lists.
    a = np.asarray(a)
    b = np.asarray(b)

    matching = np.flatnonzero(a == b)
    total = math.fsum(
        math.exp(-gamma * sizes[i][a[i]] / n_entries) for i in matching
    )
    return total / n_features
    # Unweighted alternative that was tried: sum of sizes[i][a[i]] over matches.

In [5]:
# Pairwise scores for every pair of rows in `df`, computed with the custom
# metric. pdist yields the condensed upper triangle; squareform expands it to
# a symmetric square matrix whose diagonal is left at 0 (self-pairs are never
# scored). An earlier variant used
# pairwise_distances(df, metric=similarity_func, n_jobs=4) instead.
similarity_matrix = pd.DataFrame(
    data=squareform(pdist(df, metric=similarity_func)),
    index=df.index,
    columns=df.index,
    dtype=np.float16,
)
similarity_matrix

ip,0.215.192.227,10.105.0.11,10.105.0.19,10.105.0.25,10.105.0.3,10.105.0.5,10.105.0.7,100.101.17.106,100.101.179.198,100.101.179.3,...,99.175.160.49,99.175.168.76,99.175.169.237,99.175.22.26,99.175.42.132,99.2.14.146,99.3.100.255,99.3.103.221,99.3.106.98,99.3.118.211
ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.215.192.227,0.000000,0.186768,0.011597,0.186768,0.012878,0.018387,0.012878,0.186768,0.186768,0.186768,...,0.012878,0.012878,0.011597,0.011597,0.008965,0.181396,0.011597,0.186768,0.181396,0.186768
10.105.0.11,0.186768,0.000000,0.011597,0.186768,0.012878,0.018387,0.012878,0.186768,0.186768,0.186768,...,0.012878,0.012878,0.011597,0.011597,0.008965,0.181396,0.011597,0.186768,0.181396,0.186768
10.105.0.19,0.011597,0.011597,0.000000,0.011597,0.006104,0.011597,0.006104,0.011597,0.011597,0.011597,...,0.006104,0.006104,0.054657,0.054657,0.002184,0.006104,0.356934,0.011597,0.006104,0.011597
10.105.0.25,0.186768,0.186768,0.011597,0.000000,0.012878,0.018387,0.012878,0.186768,0.186768,0.186768,...,0.012878,0.012878,0.011597,0.011597,0.008965,0.181396,0.011597,0.186768,0.181396,0.186768
10.105.0.3,0.012878,0.012878,0.006104,0.012878,0.000000,0.051544,0.019775,0.012878,0.012878,0.012878,...,0.012878,0.012878,0.044769,0.006104,0.008965,0.019775,0.006104,0.012878,0.012878,0.012878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99.2.14.146,0.181396,0.181396,0.006104,0.181396,0.019775,0.012878,0.019775,0.181396,0.181396,0.181396,...,0.012878,0.012878,0.006104,0.006104,0.008965,0.000000,0.006104,0.181396,0.181396,0.181396
99.3.100.255,0.011597,0.011597,0.356934,0.011597,0.006104,0.011597,0.006104,0.011597,0.011597,0.011597,...,0.006104,0.006104,0.066895,0.066895,0.002184,0.006104,0.000000,0.011597,0.006104,0.011597
99.3.103.221,0.186768,0.186768,0.011597,0.186768,0.012878,0.018387,0.012878,0.186768,0.186768,0.186768,...,0.012878,0.012878,0.011597,0.011597,0.008965,0.181396,0.011597,0.000000,0.181396,0.186768
99.3.106.98,0.181396,0.181396,0.006104,0.181396,0.012878,0.012878,0.012878,0.181396,0.181396,0.181396,...,0.012878,0.012878,0.006104,0.006104,0.008965,0.181396,0.006104,0.181396,0.000000,0.181396


In [11]:
# Persist the computed matrix (row and column labels included) as CSV.
similarity_matrix.to_csv(
    path_or_buf="timeseries_feature/interval_30_src_feature/similarity_matrix.csv"
)

In [None]:
# Reload the cached matrix from CSV. The first column holds the row labels
# (IP strings), so it becomes the index. The float16 cast is applied after
# parsing rather than via read_csv's `dtype=`: a scalar dtype there is
# requested for every parsed column, which is fragile when one of them
# contains the non-numeric index labels.
similarity_matrix = pd.read_csv(
    "timeseries_feature/interval_30_src_feature/similarity_matrix.csv",
    index_col=0,
).astype(np.float16)

In [6]:
# similarity_matrix = df.T.corr(similarity_func)
# similarity_matrix

In [7]:
# similarity_matrix = np.zeros((len(df), len(df)), dtype=np.float32)
# for feature in df.columns:
#     print("Feature: {}".format(feature))
#     cluster_count = df[feature].value_counts()
#     for i in range(len(df)):
#         for j in range(len(df)):
#             print("\tPair: ({}, {})".format(i, j), " " * 20, end='\r')
#             if df[feature].iloc[i] == df[feature].iloc[j]:
#                 similarity_matrix[i, j] += math.e ** (-5 * (cluster_count[df[feature].iloc[i]] - 2) / len(df))
#     print()
# similarity_matrix = similarity_matrix / len(df.columns)
# similarity_matrix = pd.DataFrame(similarity_matrix, index=df.index, columns=df.index)

In [8]:
# mathematical constant
# gamma = 5
# num_in_cluster = value_counts()
# n_min = 2
# n = len(df)
# similarity_matrix = pd.DataFrame(0, index=df.index, columns=df.index, dtype=np.float32)
# for feature in df.columns:
#     cluster_count = df[feature].value_counts()
#     for i in df.index:
#         for j in df.index:
#             if df.loc[i, feature] == df.loc[j, feature]:
#                 similarity_matrix.loc[i, j] += math.e ** (-5 * (cluster_count[df.loc[i, feature]] - 2) / len(df))
# similarity_matrix = similarity_matrix / len(df.columns)
# similarity_matrix.head()