In [107]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

abs_data_path = "../data/stats"
CONNECTIONS = [
    # B1
    ("b1", "ftj"),
    ("b1", "uci"),
    ("b1", "b6"),
    # B6
    ("b6", "ms"),
    ("b6", "uci"),
    ("b6", "b1"),
    ("b6", "ftj"),
    # UCI
    ("uci", "ftj"),
    ("uci", "b1"),
    ("uci", "b6"),
    ("uci", "cyfronet"),
     # FTJ
    ("ftj", "b1"),
    ("ftj", "uci"),
    ("ftj", "b6"),
    ("ftj", "cyfronet"),
    ]
NODES = {"cyfronet", "ftj", "uci", "b1", "b6", "ms"}

# graph is directional hence this matrix is asymetric
ADJACENCY_MATRIX = [
    [0, 0, 0, 0, 0, 0],
    [1, 0, 1, 1, 1, 0],
    [1, 1, 0, 1, 1, 0],
    [0, 1, 1, 0, 1, 0],
    [0, 1, 1, 1, 0, 1],
    [0, 0, 0, 0, 0, 0],
]

SUBSET = True

### TODO: 
- inverstigate this :
```
Num of unique collections: 284
Num of collection files: 271
```
- add a description for every column in the final dataframe

### Load a dataframe

In [43]:
files_tree = {}

for date_folder in os.listdir(abs_data_path):
    if date_folder.startswith("."):
        continue

    for log_file in os.listdir(abs_data_path+"/"+date_folder):
        if log_file[:-4] not in files_tree:
            files_tree[log_file[:-4]] = []

        files_tree[log_file[:-4]].append(abs_data_path+"/"+date_folder+"/"+log_file)

files_flat = []
for con in files_tree:
    files_flat += files_tree[con]

files_flat = set(files_flat)


Read file tree into a pandas dataframe

In [114]:
csv_headers = ["timestamp", "incoming_rate_avg", "outgoing_rate_avg", "incoming_rate_max", "outgoing_rate_max"]
col_names = csv_headers + ["collection_timestamp", "src_host", "dst_host"]

df = pd.DataFrame(columns=col_names)

dfs = []

for date_folder in os.listdir(abs_data_path):

    if date_folder.startswith("."):
        continue

    collection_dir_path = abs_data_path + "/" + date_folder

    collection_times = []
    for file_path in os.listdir(collection_dir_path):
        data_file_path = collection_dir_path + "/" + file_path

        cn = 0
        with open(data_file_path) as fil:
            firstline = fil.readline().rstrip()
            cn += 1

            if SUBSET and cn == 100:
                continue
            
        collection_times.append(firstline.split()[0])

        _df = pd.read_csv(data_file_path, sep=" ", names=csv_headers, skiprows=[0])

        # some string maniputaltion to get names of the nodes
        src_host = data_file_path.split("/")[-1].split("-")[0][:-3]
        dst_host = data_file_path.split("/")[-1].split("-")[1][:-4]

        _df["collection_timestamp"] = max(set(collection_times), key = collection_times.count)
        _df["src_host"] = src_host
        _df["dst_host"] = dst_host

        dfs.append(_df[1:])

df = pd.concat(dfs, ignore_index=True)


Column type conversion

In [115]:
df.timestamp = pd.to_datetime(df.timestamp, unit="s")
df.collection_timestamp = pd.to_datetime(df.collection_timestamp, unit="s")
df = df.astype({"incoming_rate_max": np.int64, "outgoing_rate_max": np.int64, "src_host": str, "dst_host": str})

### Collection time analysis

In [116]:
print(f"Num of unique collections: {len(df.collection_timestamp.unique())}")
print(f"Num of collection files: {len([ el for el in list(os.listdir(abs_data_path)) if not el.startswith('.') ])}")

collection_days = (df.collection_timestamp.max() - df.collection_timestamp.min())
collection_start = df.collection_timestamp.min()
collection_end = df.collection_timestamp.max()
print(f"Data collected for {collection_days}, starting from {collection_start} to {collection_end}")

df.head(5)

Num of unique collections: 281
Num of collection files: 270
Data collected for 526 days 23:00:01, starting from 2020-12-13 22:55:03 to 2022-05-24 21:55:04


Unnamed: 0,timestamp,incoming_rate_avg,outgoing_rate_avg,incoming_rate_max,outgoing_rate_max,collection_timestamp,src_host,dst_host
0,2022-04-06 21:50:04,16201,4794,16201,4794,2022-04-06 21:55:03,b6,b1
1,2022-04-06 21:50:00,16331,4831,29239,8524,2022-04-06 21:55:03,b6,b1
2,2022-04-06 21:45:00,29032,8490,29239,8524,2022-04-06 21:55:03,b6,b1
3,2022-04-06 21:40:00,13693,5999,13745,6012,2022-04-06 21:55:03,b6,b1
4,2022-04-06 21:35:00,8762,4762,22377,4793,2022-04-06 21:55:03,b6,b1


#  TODO: maybe move linespace code here

`collection_interval` - time difference between subsequent samples. 
In the dataset samples are agregated and we only need those with the highest 
granularity - lowest `collection_interval`

In [117]:
df["collection_interval"] = df.timestamp.shift(periods=1) - df.timestamp

fix wrong timestamps that appeared as a result of .shift substruction

In [118]:
df.loc[(df.collection_interval < np.timedelta64(5, "m"), "collection_interval")] = np.timedelta64(5, "m")
df.loc[0, "collection_interval"] = np.timedelta64(5, "m")

everything is uniformly spreaded - samples with 5 min intervals

In [119]:
df.loc[df.collection_interval <= np.timedelta64(6, "m")].collection_interval.unique()

array([300000000000], dtype='timedelta64[ns]')

Create a column with artificial timestamps

In [37]:
df.timestamp.count(), df.timestamp.min().value, df.timestamp.max().value

(10266750, 1539129600000000000, 1653429003000000000)

In [None]:

df.head(3)
for collection in df.collection_timestamp.unique()[:2]:
    print(collection)
    print(subset.timestamp.count(), )
    subset = df.loc[(df.collection_timestamp == collection)]

    even_ts_col = np.linspace(
        subset.timestamp.max().value,
        subset.timestamp.min().value,
        subset.timestamp.count(),
        dtype=np.int64
    )

    
    # for k in range(len(even_ts_col)):
    for k in range(5):
        print(f"* {np.array(subset.timestamp)[k]} = {pd.to_datetime(even_ts_col[k], unit='ns')}")


Save as a CSV for further processing

In [120]:
if SUBSET:
    df.loc[df.collection_interval <= np.timedelta64(6, "m")].to_csv("../data/samples_5m_subset_v1.csv")
else:
    df.loc[df.collection_interval <= np.timedelta64(6, "m")].to_csv("../data/samples_5m_v1.csv")

In [79]:
df = df.sort_values(by=["collection_timestamp", "timestamp"], ignore_index=True)

In [53]:
data_collections = df.collection_timestamp.unique().copy()[:5]
nn = 0
diff = []
for pos, coll in enumerate(data_collections):
    if pos == len(data_collections)-1:
        break
    if data_collections[pos+1] - data_collections[pos] != np.timedelta64(2, "D"):
        print(data_collections[pos+1] - data_collections[pos])
        nn+=1
        diff.append(data_collections[pos+1] - data_collections[pos] - np.timedelta64(2, "D"))

print(nn, np.array(diff).mean())

86401000000000 nanoseconds
172799000000000 nanoseconds
172801000000000 nanoseconds
3 -28799666666666 nanoseconds


In [54]:
for collection in df.collection_timestamp.unique()[:3]:
    print(collection)
    for con in CONNECTIONS[:3]:
        print(con)
        print(df.loc[
            (df.collection_timestamp == collection) & \
            (df.src_host == con[0]) &
            (df.dst_host == con[1])
        ])


2020-12-13T22:55:03.000000000
('b1', 'ftj')
                timestamp  incoming_rate_avg  outgoing_rate_avg  \
11    2018-10-10 00:00:00              35008               9212   
26    2018-10-11 00:00:00              35195              10823   
41    2018-10-12 00:00:00              61121              17110   
56    2018-10-13 00:00:00              32003              21691   
71    2018-10-14 00:00:00              23440               6710   
...                   ...                ...                ...   
37961 2020-12-13 22:35:00              16238               2264   
37976 2020-12-13 22:40:00              16804               2302   
37991 2020-12-13 22:45:00              18181               2327   
38006 2020-12-13 22:50:00              16475               2257   
38021 2020-12-13 22:50:03              16452               2257   

       incoming_rate_max  outgoing_rate_max collection_timestamp src_host  \
11                 67485             120536  2020-12-13 22:55:03       b1 

In [56]:
print(df.src_host.unique())
print(df.dst_host.unique())


['b6' 'ftj' 'uci' 'b1']
['b1' 'uci' 'ftj' 'cyfronet' 'b6' 'ms']


### Wierd code, mostly tests

In [None]:
df.groupby(df.collection_timestamp, group_keys=True).apply(lambda x: x).timestamp

collection_timestamp         
2020-12-13 15:25:03   9810450   2020-12-13 15:20:03
                      9810451   2020-12-13 15:20:00
                      9810452   2020-12-13 15:15:00
                      9810453   2020-12-13 15:10:00
                      9810454   2020-12-13 15:05:00
                                        ...        
2022-05-24 21:55:04   6160045   2020-03-24 00:00:00
                      6160046   2020-03-23 00:00:00
                      6160047   2020-03-22 00:00:00
                      6160048   2020-03-21 00:00:00
                      6160049   2020-03-20 00:00:00
Name: timestamp, Length: 10304760, dtype: datetime64[ns]

In [None]:
(df.loc[df.collection_timestamp == "2020-12-13 15:25:03"].loc[df.src_host == "b1"].loc[df.dst_host == "uci"].timestamp.reset_index(drop=True) \
- df.loc[df.collection_timestamp == "2020-12-13 15:25:03"].loc[df.src_host == "b1"].loc[df.dst_host == "ftj"].timestamp.reset_index(drop=True)).unique()

array([0], dtype='timedelta64[ns]')

In [None]:
df[df.collection_timestamp == "2020-12-13 15:25:03"].timestamp.to_numpy() - df[df.collection_timestamp == "2022-05-24 21:55:04"].timestamp.to_numpy()[:-1]

In [None]:
plt.plot(df[df.src_host == "b1rtr"][df.dst_host == "uci"][df.collection_timestamp == df[:1].collection_timestamp.to_numpy()[0]].timestamp[1:], _diff[1:].to_numpy(dtype=np.int64)/(1000000000*60))

In [None]:
_np_diff = _diff.to_numpy(dtype=np.int64)
print(len(_np_diff))
linn = np.linspace(0, 1000, num=len(_np_diff))
print(len(linn), len(_np_diff))
_np_diff.sort()
plt.plot(linn, _np_diff)

In [None]:
first_best_file = files_tree['b6rtr-b1'][0]
first_one = '/Users/nick/Dev/magisterka/data/stats/2020-12-15_00/b6rtr-b1.log'
middle_one = '/Users/nick/Dev/magisterka/data/stats/2021-06-07_00/b6rtr-b1.log'
last_one = '/Users/nick/Dev/magisterka/data/stats/2022-05-25_00/b6rtr-b1.log'

headers = ["timestamp", "incoming_rate_avg", "outgoing_rate_avg", "incoming_rate_max", "outgoing_rate_max"]
data_first = pd.read_csv(first_one, sep=" ", names=headers, skiprows=[0])
data_middle = pd.read_csv(middle_one, sep=" ", names=headers, skiprows=[0])
data_last = pd.read_csv(last_one, sep=" ", names=headers, skiprows=[0])

data_first.timestamp = pd.to_datetime(data_first.timestamp, unit="s")
data_last.timestamp = pd.to_datetime(data_last.timestamp, unit="s")
data_middle.timestamp = pd.to_datetime(data_middle.timestamp, unit="s")

In [None]:

fig = plt.figure(figsize=(24, 3))
plt.plot(data_first.timestamp, data_first.incoming_rate_avg, label="2020-12-15_00")
plt.plot(data_middle.timestamp, data_middle.incoming_rate_avg, label="2021-06-07_00")
plt.plot(data_last.timestamp, data_last.incoming_rate_avg, label="2022-05-25_00")
fig.legend()

# Visual
* Delta histogram
* Delta time series graph
* Aggregation level showed with colors 
* Use 'gggraph' for graph visualization