In [1]:
import pandas as pd
import numpy as np
import networkx as nx

import matplotlib.pyplot as plt

from os import listdir #Lib for interact with directoty in the computer
import tqdm #Lib for measuring the progress
import time

### Leandro Stival - Ra: 263013

### Reading the data

In [2]:
# Empty frame; the following cells fill it in one column at a time
df_networks = pd.DataFrame(data=None)

In [3]:
# Collect every network file (.txt) found in the working directory.
# listdir() returns entries in arbitrary order, so the names are sorted here:
# the hand-typed per-network values used further down are attached by position,
# which only lines up if the file order is the same on every run and machine.
list_of_network_files = sorted(
    file for file in listdir('.') if file.split('.')[-1] == 'txt'
)

# The network name is the part of the file name before the first dot
list_of_networks_names = [file.split('.')[0] for file in list_of_network_files]

In [4]:
# One row per network, labelled with the names collected from the file listing
df_networks['networks'] = list_of_networks_names

# Hand-entered values; both lists are positional, i.e. entry i belongs to row i
# of df_networks, so they must follow the same order as the network names.
# NOTE(review): N is presumably the node count of each network - confirm against table 4.1.
n_values = [
    702388, 449673, 23133, 57194, 192244,
    1039, 36595, 4941, 2018, 325729,
]
# Average degree <k> of each network, following table 4.1
average_k_values = [
    83.71, 10.43, 8.08, 1.81, 6.34,
    5.58, 2.51, 2.67, 2.90, 4.60,
]

df_networks['N'] = n_values
df_networks['<K>'] = average_k_values

df_networks

Unnamed: 0,networks,N,<K>
0,actor,702388,83.71
1,citation,449673,10.43
2,collaboration,23133,8.08
3,email,57194,1.81
4,internet,192244,6.34
5,metabolic,1039,5.58
6,phonecalls,36595,2.51
7,powergrid,4941,2.67
8,protein,2018,2.9
9,www,325729,4.6


### (A) Calculating the $K_s$

Using formula $7.15$, $K_s = \left( {\left\langle k \right\rangle N} \right)^{1/2}$, <br>
we compute the $K_s$ value for each network in table $4.1$.

In [5]:
# K_s = (<k> * N) ** 0.5, rounded to two decimals.
# The expression is evaluated on whole columns at once instead of calling a
# Python lambda once per row through apply(axis=1).
df_networks['Ks'] = ((df_networks['<K>'] * df_networks['N']) ** 0.5).round(2)

df_networks

Unnamed: 0,networks,N,<K>,Ks
0,actor,702388,83.71,7667.91
1,citation,449673,10.43,2165.66
2,collaboration,23133,8.08,432.34
3,email,57194,1.81,321.75
4,internet,192244,6.34,1104.0
5,metabolic,1039,5.58,76.14
6,phonecalls,36595,2.51,303.07
7,powergrid,4941,2.67,114.86
8,protein,2018,2.9,76.5
9,www,325729,4.6,1224.07


### (B) Is $K_s$ smaller than $K_{max}$?

Considering the plots in Image $7.10$ and comparing $K_{nn}(k)$ with $K_{nn}^{R-S}(k)$ for each network, the list <em>is_not_disassortative</em> was created with a False or True value per network.

In [6]:
# Hand-entered flags, one per row of df_networks (positional).
# The labels on the right follow the row order shown in the displayed table.
is_not_disassortative = [
    False,  # actor
    True,   # citation
    False,  # collaboration
    True,   # email
    False,  # internet
    True,   # metabolic
    True,   # phonecalls
    False,  # powergrid
    True,   # protein
    False,  # www
]

In [7]:
# Store the predictions in the dataframe as 'Yes' / 'No' labels.
# The labels are built first and assigned in a single step: calling
# replace(..., inplace=True) on df_networks[...] is chained assignment, which
# operates on a temporary object and leaves the frame unchanged when pandas
# Copy-on-Write is active (and raises a FutureWarning on recent pandas 2.x).
df_networks['Predict Ks < Kmax ?'] = [
    'Yes' if flag else 'No' for flag in is_not_disassortative
]
df_networks

Unnamed: 0,networks,N,<K>,Ks,Predict Ks < Kmax ?
0,actor,702388,83.71,7667.91,No
1,citation,449673,10.43,2165.66,Yes
2,collaboration,23133,8.08,432.34,No
3,email,57194,1.81,321.75,Yes
4,internet,192244,6.34,1104.0,No
5,metabolic,1039,5.58,76.14,Yes
6,phonecalls,36595,2.51,303.07,Yes
7,powergrid,4941,2.67,114.86,No
8,protein,2018,2.9,76.5,Yes
9,www,325729,4.6,1224.07,No


### (C) Find the $K_{max}$ of the networks

In [8]:
list_of_kmax = [] #List with kmax values, one per file, in the order of list_of_network_files
network_interation = tqdm.tqdm(list_of_network_files) #Progress bar over the files in the folder

for list_of_edges in network_interation:
    # Label the bar with the text before the first dot, the same way the network
    # names are derived from the file names. Indexing split('.')[-3] raises
    # IndexError for any name with fewer than three dot-separated parts.
    network_interation.set_description('Processing: %s' % list_of_edges.split('.')[0])

    #If the networks is www the process is different because the header of the file
    if list_of_edges == 'www.edgelist.txt':
        # NOTE(review): this branch depends on how read_csv happens to parse this
        # particular file. The code expects every raw line to end up as a string in
        # the frame's index: it drops the first three rows, moves the index into an
        # 'index' column and splits each entry on tabs into a node pair.
        # Confirm against the actual file before changing it.
        df_edges = pd.read_csv(list_of_edges, low_memory=False)
        G_temp = nx.Graph()
        G_temp.add_edges_from(
            df_edges.iloc[3:,:].reset_index().apply(lambda x:x['index']
                                                    .split('\t'),axis=1))

    # Reading the edgelist and creating a undirect graph
    else:
        # Tab-separated file without a header row: columns 0 and 1 are the edge endpoints
        df_edges = pd.read_csv(list_of_edges, delimiter='\t', header=None)
        G_temp = nx.from_pandas_edgelist(df_edges, 0, 1,create_using=nx.Graph())

    # Largest degree in the graph; the degree view yields (node, degree) pairs,
    # so no intermediate dict is needed
    list_of_kmax.append(max(degree for _, degree in G_temp.degree()))
    # Drop the graph before the next file is loaded
    del G_temp

Processing: www: 100%|█████████████████████████████████████████████████████████████████| 10/10 [03:18<00:00, 19.88s/it]


In [9]:
# Set K_max in dataframe with value from edgelist
df_networks['K_max'] = list_of_kmax
df_networks

Unnamed: 0,networks,N,<K>,Ks,Predict Ks < Kmax ?,K_max
0,actor,702388,83.71,7667.91,No,10901
1,citation,449673,10.43,2165.66,Yes,4767
2,collaboration,23133,8.08,432.34,No,279
3,email,57194,1.81,321.75,Yes,6553
4,internet,192244,6.34,1104.0,No,1071
5,metabolic,1039,5.58,76.14,Yes,638
6,phonecalls,36595,2.51,303.07,Yes,80
7,powergrid,4941,2.67,114.86,No,19
8,protein,2018,2.9,76.5,Yes,91
9,www,325729,4.6,1224.07,No,10721


### (D) Filling the last column

In [10]:
def confirmed(row):
    """Tell whether the 'Ks < Kmax ?' prediction of a row matches the measured values.

    Parameters
    ----------
    row : mapping (e.g. a dataframe row)
        Must provide the keys 'Ks', 'K_max' and 'Predict Ks < Kmax ?'.

    Returns
    -------
    str
        'Yes' when the prediction agrees with the comparison of 'Ks' and
        'K_max', 'No' otherwise (including any prediction label other than
        'Yes' / 'No').
    """
    prediction = row['Predict Ks < Kmax ?']
    if prediction == 'Yes':
        holds = row['Ks'] < row['K_max']
    elif prediction == 'No':
        # "Ks < Kmax" is false as soon as Ks >= Kmax, so equality also confirms
        # a 'No' prediction (a strict '>' would wrongly reject that case).
        holds = row['Ks'] >= row['K_max']
    else:
        holds = False
    return 'Yes' if holds else 'No'

In [11]:
# Label every row 'Yes' or 'No' according to its prediction; the function is
# handed to apply directly, which calls it once per row (axis=1)
df_networks['Confirmed'] = df_networks.apply(confirmed, axis=1)
df_networks

Unnamed: 0,networks,N,<K>,Ks,Predict Ks < Kmax ?,K_max,Confirmed
0,actor,702388,83.71,7667.91,No,10901,No
1,citation,449673,10.43,2165.66,Yes,4767,Yes
2,collaboration,23133,8.08,432.34,No,279,Yes
3,email,57194,1.81,321.75,Yes,6553,Yes
4,internet,192244,6.34,1104.0,No,1071,Yes
5,metabolic,1039,5.58,76.14,Yes,638,Yes
6,phonecalls,36595,2.51,303.07,Yes,80,No
7,powergrid,4941,2.67,114.86,No,19,Yes
8,protein,2018,2.9,76.5,Yes,91,Yes
9,www,325729,4.6,1224.07,No,10721,No


### Plotting the table

In [12]:
# Final table: every column except the N and <K> inputs (df_networks itself is not modified)
df_networks.drop(columns=['N', '<K>'])

Unnamed: 0,networks,Ks,Predict Ks < Kmax ?,K_max,Confirmed
0,actor,7667.91,No,10901,No
1,citation,2165.66,Yes,4767,Yes
2,collaboration,432.34,No,279,Yes
3,email,321.75,Yes,6553,Yes
4,internet,1104.0,No,1071,Yes
5,metabolic,76.14,Yes,638,Yes
6,phonecalls,303.07,Yes,80,No
7,powergrid,114.86,No,19,Yes
8,protein,76.5,Yes,91,Yes
9,www,1224.07,No,10721,No
