# Creació dataset 
### Exploració de funcionament amb un únic escenari

Donada una traça i un interval de temps crei un dataset de la següent manera:


| Escenari | Node | NUMCONT | ICT | DURC | Centr |
| --- | --- | --- | --- | --- | --- |
| Scenario01 | 1 | 32 | 23 | 3 | 5.6 |
| Scenario01 | 2 | 31 | 25 | 10 | 7.2 |


Vol dir (per la primera línia, per exemple) que a l'escenari Scenario01, el node 1 durant l'interval de temps donat ha vist 32 nodes diferents, la mitjana de temps entre contactes és 23 i la mitjana de la duració d'aquests contactes és 3 segons.

In [1]:
"""
Dependencies:
numpy
pandas
"""
import numpy as np
import pandas as pd
import random

In [2]:
dataset = pd.read_csv('Scenario01.txt', sep=" ", header=None)
dataset.columns = ["Time", "CONN", "Node", "Node2", "Type"]
dataset.drop(["CONN"], axis = 1, inplace = True)

# dades inicials de cada escenari
dataset

Unnamed: 0,Time,Node,Node2,Type
0,1.0,104,148,up
1,1.0,23,220,up
2,1.0,107,155,up
3,1.0,60,103,up
4,1.0,113,160,up
...,...,...,...,...
113454,14399.0,1,225,down
113455,14399.0,92,212,down
113456,14400.0,171,233,down
113457,14400.0,26,232,up


In [3]:
def both_ways(dataset):  # TODO. Optimize function
    """Adds rows to the dataset so that a connection between to nodes is
    counted in both directions"""
    column_list = list(dataset.columns)

    # Use the indices to swap the columns of "Node" and "Node2"
    column_list[2], column_list[3] = column_list[3], column_list[2]
    auxiliar = dataset[column_list]
    dataset = pd.concat([auxiliar, dataset], ignore_index=True)

    return dataset

# afegim el "Node2" a la columna "Node" per a tenir en compte la connexió en les dues direccions
modificat = both_ways(dataset)
modificat

Unnamed: 0,Time,Node,Type,Node2
0,1.0,104,up,148
1,1.0,23,up,220
2,1.0,107,up,155
3,1.0,60,up,103
4,1.0,113,up,160
...,...,...,...,...
226913,14399.0,1,down,225
226914,14399.0,92,down,212
226915,14400.0,171,down,233
226916,14400.0,26,up,232


In [4]:
fdset = pd.DataFrame()
fdset = modificat.groupby(['Node'])["Node2"].size().reset_index(name ='NumCont')

# dataset final on s'emmagatzemaran les dades tractades de tots els escenaris
fdset 

Unnamed: 0,Node,NumCont
0,0,992
1,1,1208
2,2,436
3,3,456
4,4,1576
...,...,...
234,234,180
235,235,140
236,236,144
237,237,64


In [5]:
def elastic_centrality(data):

    # inicialitzacions
    k = 30
    gamma = 0.98
    ec_node = 0
    ec_time = 0

    # iterem les dades
    for time, action in zip(data['Time'], data['Type']):
        
        # actualitzar els valors de centralitat segons el temps
        if ec_time >= k:
            ec_node = ec_node * gamma**k
        
        # gestionar les dades segons l'acció
        if action == 'up':
            ec_node += 1
        if action == 'down':
            ec_time = time - ec_time
    
    return pd.Series({'EC':ec_node})
            
ec_data = pd.DataFrame()
ec_data = modificat.groupby(['Node'], as_index = False).apply(elastic_centrality)

ec_data.head()

Unnamed: 0,Node,EC
0,0,0.776498
1,1,0.77655
2,2,1.200144
3,3,2.745628
4,4,0.774606


In [None]:
test_ratio = 0.2
nodes_scenario = np.unique(np.array(dataset.Node2.unique().tolist() + dataset.Node.unique().tolist()))

test_nodes = []
for _ in range(int(max(nodes_scenario)*test_ratio)):
    test_nodes.append(random.choice(nodes_scenario))

In [None]:
# TODO: generate train and test files

# data_training = dataset
# for node in test_nodes:
#     data_training = data_training[ (data_training['Node'] != node) & (data_training['Node2'] != node)]

# testing = (dataset.merge(data_training, on='Node', how='left', indicator=False).query('_merge == "left_only"').drop('_merge', 1))
# testing = (testing.merge(data_training, on='Node2', how='left', indicator=False).query('_merge == "left_only"').drop('_merge', 1))

In [6]:
# llibreria de funcions utilitzades

def find_mean(temps):
    """Calculates the mean pair value given a list"""
    # If there is no info in the list, then there is no full circle of connections
    # and so provisionally, we count as 0 (because we cannot divide by 0 later)
    if len(temps) == 0:
        return 0
    
    temps = temps.tolist()
    
    # Find each pair, compute the time between itemps and return its mean value
    t_connected = 0
    for i in range(0, len(temps), 2):
        inici = temps[i]
        final = temps[i+1]
        t_connected += (final - inici)
        
    return t_connected/(len(temps)/2)


def mean_btw_conns(temps):
    """To calculate the mean time between different connections (ICT)"""
    # If the list is even then the last value is a "down" item and we take it
    # down to not disturbe the calculations
    if len(temps)%2 == 0:
        temps = temps[:-1]
    
    # We take down the first item in the list so we have inversed pairs to calculate
    # the time mean
    temps = temps[1:]
    
    return find_mean(temps)

def mean_conn_duration(temps):
    """To calculate the mean time of the connections's duration (DURC)"""
    # If the list is not even then the last communication did not conclude
    # provisionally, we take it down
    if len(temps)%2 != 0:
        temps = temps[:-1]
    
    return find_mean(temps)

In [7]:
# calcul per parelles, encara s'ha de calcular les mitjanes totals
dtemps = dataset.groupby(['Node', 'Node2']).agg(pair_DURC = ("Time", mean_conn_duration), pair_ICT = ("Time", mean_btw_conns)).reset_index()
dtemps

Unnamed: 0,Node,Node2,pair_DURC,pair_ICT
0,0,8,21.333333,266.500000
1,0,15,5.000000,0.000000
2,0,28,16.000000,0.000000
3,0,36,20.000000,0.000000
4,0,43,24.500000,456.000000
...,...,...,...,...
16151,236,238,1.153846,1005.750000
16152,236,239,1.125000,1376.285714
16153,237,238,1.000000,2104.200000
16154,237,239,1.200000,962.111111


In [8]:
final_attrs = dtemps.groupby(['Node']).agg(DURC = ("pair_DURC", np.mean), ICT = ("pair_ICT", np.mean)).reset_index()

# visualització final dels atributs calculats
final_attrs

Unnamed: 0,Node,DURC,ICT
0,0,7.285278,1464.335950
1,1,5.650332,2440.952995
2,2,16.490136,649.230952
3,3,4.808173,1309.505403
4,4,7.398552,1684.714577
...,...,...,...
234,234,1.438571,1784.220404
235,235,1.170940,1568.518750
236,236,1.115171,1090.845238
237,237,1.100000,1533.155556


In [9]:
# merge dels datasets importants utilitzant "Node" com a clau
fdset = fdset.merge(final_attrs, how = 'inner')
fdset = fdset.merge(ec_data, how = 'inner')

# afegeix l'escenari d'on s'han extret les dades
fdset["Escenari"] = 1

# visualització final
fdset

Unnamed: 0,Node,NumCont,DURC,ICT,EC,Escenari
0,0,992,7.285278,1464.335950,0.776498,1
1,1,1208,5.650332,2440.952995,0.776550,1
2,2,436,16.490136,649.230952,1.200144,1
3,3,456,4.808173,1309.505403,2.745628,1
4,4,1576,7.398552,1684.714577,0.774606,1
...,...,...,...,...,...,...
234,234,180,1.438571,1784.220404,1.200144,1
235,235,140,1.170940,1568.518750,1.200144,1
236,236,144,1.115171,1090.845238,2.200144,1
237,237,64,1.100000,1533.155556,2.200057,1
