# Creació dataset 
### Exploració de funcionament amb un únic escenari

Donada una traça i un interval de temps crei un dataset de la següent manera:


| Escenari | Node | NUMCONT | ICT | DURC | Centr |
| --- | --- | --- | --- | --- | --- |
| Scenario01 | 1 | 32 | 23 | 3 | 5.6 |
| Scenario01 | 2 | 31 | 25 | 10 | 7.2 |


Vol dir (per la primera línia, per exemple) que a l'escenari Scenario01, el node 1 durant l'interval de temps donat ha vist 32 nodes diferents, la mitjana de temps entre contactes és 23 i la mitjana de la duració d'aquests contactes és 3 segons.

In [1]:
"""
Dependencies:
numpy
pandas
"""
import numpy as np
import pandas as pd
import random

In [2]:
dataset = pd.read_csv('Scenario01.txt', sep=" ", header=None)
dataset.columns = ["Time", "CONN", "Node", "Node2", "Type"]
dataset.drop(["CONN"], axis = 1, inplace = True)

# dades inicials de cada escenari
dataset

Unnamed: 0,Time,Node,Node2,Type
0,1.0,104,148,up
1,1.0,23,220,up
2,1.0,107,155,up
3,1.0,60,103,up
4,1.0,113,160,up
...,...,...,...,...
113454,14399.0,1,225,down
113455,14399.0,92,212,down
113456,14400.0,171,233,down
113457,14400.0,26,232,up


In [5]:
def both_ways(dataset):  # TODO. Optimize function
    """Adds rows to the dataset so that a connection between to nodes is
    counted in both directions"""
    column_list = list(dataset.columns)

    # Use the indices to swap the columns of "Node" and "Node2"
    column_list[2], column_list[3] = column_list[3], column_list[2]
    auxiliar = dataset[column_list]
    dataset = pd.concat([auxiliar, dataset], ignore_index=True)

    return dataset

# afegim el "Node2" a la columna "Node" per a tenir en compte la connexió en les dues direccions
modificat = both_ways(dataset)
modificat

Unnamed: 0,Time,Node,Type,Node2
0,1.0,104,up,148
1,1.0,23,up,220
2,1.0,107,up,155
3,1.0,60,up,103
4,1.0,113,up,160
...,...,...,...,...
226913,14399.0,1,down,225
226914,14399.0,92,down,212
226915,14400.0,171,down,233
226916,14400.0,26,up,232


In [20]:
test_ratio = 0.2
nodes_scenario = modificat.Node.unique()

test_nodes = []
for _ in range(int(max(nodes_scenario)*test_ratio)):
    test_nodes.append(random.choice(nodes_scenario))

In [21]:
# Datasets for training and testing
training = modificat[~modificat.Node.isin(test_nodes)]
testing = modificat[modificat.Node.isin(test_nodes)]

#### Generate files
***

In [23]:
fdset_train = training.groupby(['Node'])["Node2"].size().reset_index(name ='NumCont')

# dataset final on s'emmagatzemaran les dades tractades de tots els escenaris per a l'entrenament
fdset_train 

Unnamed: 0,Node,NumCont
0,0,992
1,1,1208
2,2,436
3,3,456
4,5,1100
...,...,...
193,233,184
194,234,180
195,235,140
196,236,144


In [25]:
fdset_test = testing.groupby(['Node'])["Node2"].size().reset_index(name ='NumCont')

# dataset final on s'emmagatzemaran les dades tractades de tots els escenaris per al testeig
fdset_test.head()

Unnamed: 0,Node,NumCont
0,4,1576
1,7,904
2,9,1302
3,17,1340
4,19,328


In [26]:
def elastic_centrality(data):

    # inicialitzacions
    k = 30
    gamma = 0.98
    ec_node = 0
    ec_time = 0

    # iterem les dades
    for time, action in zip(data['Time'], data['Type']):
        
        # actualitzar els valors de centralitat segons el temps
        if ec_time >= k:
            ec_node = ec_node * gamma**k
        
        # gestionar les dades segons l'acció
        if action == 'up':
            ec_node += 1
        if action == 'down':
            ec_time = time - ec_time
    
    return pd.Series({'EC':ec_node})
            

ec_data_train = training.groupby(['Node'], as_index = False).apply(elastic_centrality)
ec_data_train.head()

Unnamed: 0,Node,EC
0,0,0.776498
1,1,0.77655
2,2,1.200144
3,3,2.745628
4,5,1.173799


In [27]:
ec_data_test = testing.groupby(['Node'], as_index = False).apply(elastic_centrality)
ec_data_test.head()

Unnamed: 0,Node,EC
0,4,0.774606
1,7,0.776549
2,9,2.200144
3,17,0.903267
4,19,0.776549


In [28]:
# llibreria de funcions utilitzades

def find_mean(temps):
    """Calculates the mean pair value given a list"""
    # If there is no info in the list, then there is no full circle of connections
    # and so provisionally, we count as 0 (because we cannot divide by 0 later)
    if len(temps) == 0:
        return 0
    
    temps = temps.tolist()
    
    # Find each pair, compute the time between itemps and return its mean value
    t_connected = 0
    for i in range(0, len(temps), 2):
        inici = temps[i]
        final = temps[i+1]
        t_connected += (final - inici)
        
    return t_connected/(len(temps)/2)


def mean_btw_conns(temps):
    """To calculate the mean time between different connections (ICT)"""
    # If the list is even then the last value is a "down" item and we take it
    # down to not disturbe the calculations
    if len(temps)%2 == 0:
        temps = temps[:-1]
    
    # We take down the first item in the list so we have inversed pairs to calculate
    # the time mean
    temps = temps[1:]
    
    return find_mean(temps)

def mean_conn_duration(temps):
    """To calculate the mean time of the connections's duration (DURC)"""
    # If the list is not even then the last communication did not conclude
    # provisionally, we take it down
    if len(temps)%2 != 0:
        temps = temps[:-1]
    
    return find_mean(temps)

In [29]:
# calcul per parelles, encara s'ha de calcular les mitjanes totals
dtemps_training = training.groupby(['Node', 'Node2']).agg(pair_DURC = ("Time", mean_conn_duration), pair_ICT = ("Time", mean_btw_conns)).reset_index()
dtemps_training.head()

Unnamed: 0,Node,Node2,pair_DURC,pair_ICT
0,0,8,21.333333,93.8
1,0,15,5.0,-5.0
2,0,28,16.0,-16.0
3,0,36,20.0,-20.0
4,0,43,24.5,135.666667


In [30]:
# calcul per parelles, encara s'ha de calcular les mitjanes totals
dtemps_test = testing.groupby(['Node', 'Node2']).agg(pair_DURC = ("Time", mean_conn_duration), pair_ICT = ("Time", mean_btw_conns)).reset_index()
dtemps_test.head()

Unnamed: 0,Node,Node2,pair_DURC,pair_ICT
0,4,13,10.0,-10.0
1,4,24,33.666667,142.2
2,4,26,18.333333,7.4
3,4,33,10.0,132.0
4,4,37,20.0,-20.0


In [31]:
final_attrs_train = dtemps_training.groupby(['Node']).agg(DURC = ("pair_DURC", np.mean), ICT = ("pair_ICT", np.mean)).reset_index()

# visualització final dels atributs calculats
final_attrs_train

Unnamed: 0,Node,DURC,ICT
0,0,7.285278,562.338845
1,1,5.650332,911.405566
2,2,16.490136,253.854263
3,3,4.808173,482.433422
4,5,13.511776,723.309156
...,...,...,...
193,233,1.456349,883.080925
194,234,1.438571,823.165643
195,235,1.170940,727.757273
196,236,1.115171,517.723831


In [33]:
final_attrs_test = dtemps_test.groupby(['Node']).agg(DURC = ("pair_DURC", np.mean), ICT = ("pair_ICT", np.mean)).reset_index()

# visualització final dels atributs calculats
final_attrs_test.head()

Unnamed: 0,Node,DURC,ICT
0,4,7.398552,678.166631
1,7,9.377404,526.049259
2,9,8.02381,814.939911
3,17,6.314726,895.4102
4,19,12.477162,384.198325


In [34]:
# merge dels datasets importants utilitzant "Node" com a clau
fdset_train = fdset_train.merge(final_attrs_train, how = 'inner')
fdset_train = fdset_train.merge(ec_data_train, how = 'inner')

# afegeix l'escenari d'on s'han extret les dades
fdset_train["Escenari"] = 1

# visualització final
fdset_train

Unnamed: 0,Node,NumCont,DURC,ICT,EC,Escenari
0,0,992,7.285278,562.338845,0.776498,1
1,1,1208,5.650332,911.405566,0.776550,1
2,2,436,16.490136,253.854263,1.200144,1
3,3,456,4.808173,482.433422,2.745628,1
4,5,1100,13.511776,723.309156,1.173799,1
...,...,...,...,...,...,...
193,233,184,1.456349,883.080925,2.200144,1
194,234,180,1.438571,823.165643,1.200144,1
195,235,140,1.170940,727.757273,1.200144,1
196,236,144,1.115171,517.723831,2.200144,1


In [36]:
# merge dels datasets importants utilitzant "Node" com a clau
fdset_test = fdset_test.merge(final_attrs_test, how = 'inner')
fdset_test = fdset_test.merge(ec_data_test, how = 'inner')

# afegeix l'escenari d'on s'han extret les dades
fdset_test["Escenari"] = 1

# visualització final
fdset_test.head()

Unnamed: 0,Node,NumCont,DURC,ICT,EC,Escenari
0,4,1576,7.398552,678.166631,0.774606,1
1,7,904,9.377404,526.049259,0.776549,1
2,9,1302,8.02381,814.939911,2.200144,1
3,17,1340,6.314726,895.4102,0.903267,1
4,19,328,12.477162,384.198325,0.776549,1
