In [1]:
import cugraph
import cudf

import numpy as np
import os

from tqdm.notebook import tqdm

In [2]:
# Path of the input edge list; it is loaded below with cudf.read_csv as a
# space-separated, headerless table with columns Source, Target, Weight.
file = 'matrices_sin_filt/cvv_siS01.txt'
# Folder that receives the params-<file_name>.txt table written by get_params().
output_folder = 'parametros_matrices_no_filtradas'

In [3]:
# Short identifier of the input file: base name of the path, then the part after
# the last '-', then everything before the first '.'. It is reused to name the
# output table. os.path.basename replaces the manual split on os.sep so the
# directory part is also removed when the path is written with '/' on a
# platform whose os.sep is a different character.
file_name = os.path.basename(file).split('-')[-1].split('.')[0]

# Load the space-separated, headerless edge list into a GPU DataFrame and name
# its three columns.
df = cudf.read_csv(
    filepath_or_buffer=file,
    header=None,
    names=['Source', 'Target', 'Weight'],
    sep=' ')

df

Unnamed: 0,Source,Target,Weight
0,5725,5726,0.524
1,5726,34087,0.531
2,5726,62425,0.521
3,5727,17896,0.527
4,5727,21783,0.503
...,...,...,...
7294127,76323,76387,0.551
7294128,76383,76384,0.572
7294129,76385,76449,0.531
7294130,76447,76448,0.608


In [4]:
# Target percentages for the threshold sweep: 50 down to 5 in steps of 5.
# Wider alternative (95 down to 5): [5 * (20 - i) for i in range(1, 20)]
percentages = list(range(50, 0, -5))

percentages

[50, 45, 40, 35, 30, 25, 20, 15, 10, 5]

In [5]:
# Every distinct edge weight is tried as a candidate threshold.
list_weights = df['Weight'].drop_duplicates().to_arrow().to_pylist()

# Long-format (Vertex, Weight) table: each edge contributes one row for its
# Source endpoint and one row for its Target endpoint.
nodes_weight = cudf.concat([
    df[['Source', 'Weight']].rename(columns={'Source': 'Vertex'}),
    df[['Target', 'Weight']].rename(columns={'Target': 'Vertex'}),
])
node_list = nodes_weight['Vertex'].drop_duplicates().to_arrow().to_pylist()

# For each candidate threshold, count the rows of nodes_weight that survive it
# and express that count as a percentage of all rows.
total_rows = len(nodes_weight)  # loop-invariant, so computed once
fields = []
for weight in tqdm(list_weights):
    n = len(nodes_weight[nodes_weight['Weight'] >= weight])
    fields.append({
        'Weight': weight,
        'N': n,
        'n%': (n / total_rows) * 100,
    })

df_percentages = cudf.DataFrame(fields).sort_values(by='n%', ascending=False)

# For every target percentage, pick the row(s) whose n% is closest to it.
# The selected frames are collected in a list and concatenated once at the end
# instead of growing a DataFrame inside the loop.
selected_rows = []
closest_percentage = []

for percentage in percentages:
    val_list = np.abs(df_percentages['n%'] - percentage).to_arrow().to_pylist()
    # Position (in the sorted frame) of the smallest absolute difference.
    val_index = val_list.index(min(val_list))
    value = float(df_percentages['n%'].values[val_index])

    # Filter once and reuse the result for both the rows and the labels.
    matches = df_percentages[df_percentages['n%'] == value]
    selected_rows.append(matches)
    closest_percentage.extend([percentage] * len(matches))

# With no target percentages there is nothing to concatenate; fall back to an
# empty frame, which is what get_params() checks for.
weight_df = cudf.concat(selected_rows) if selected_rows else cudf.DataFrame()
weight_df['closest_n%'] = closest_percentage

weight_df

  0%|          | 0/491 [00:00<?, ?it/s]

Unnamed: 0,Weight,N,n%,closest_n%
99,0.599,7287546,49.954854,50
112,0.612,6583672,45.129921,45
127,0.627,5832078,39.977875,40
143,0.643,5100118,34.960417,35
160,0.66,4394370,30.122638,30
180,0.68,3653268,25.042514,25
203,0.703,2905378,19.915858,20
229,0.729,2187452,14.994601,15
262,0.762,1451214,9.947818,10
307,0.807,732108,5.018472,5


In [6]:
def clustering(G):
    """Sum a triangle-based ratio over the vertices of ``G``.

    The in-degree table of ``G`` is merged with the per-vertex result of
    ``cugraph.triangle_count``; rows with at least one triangle are kept and
    ``counts / (degree * (degree - 1))`` is summed over them.

    NOTE(review): the value is a sum, not a mean, and carries no factor 2, so
    it is not bounded by 1 -- confirm whether normalisation happens downstream.

    Parameters
    ----------
    G : graph object exposing ``in_degree()`` and accepted by
        ``cugraph.triangle_count``.

    Returns
    -------
    The summed ratio (scalar).
    """
    # merge() is called without `on`, so the join uses whatever columns the two
    # frames share (presumably the vertex id column -- TODO confirm).
    per_vertex = G.in_degree().merge(cugraph.triangle_count(G))
    with_triangles = per_vertex[per_vertex['counts'] >= 1]

    k = with_triangles['degree']
    ratio = with_triangles['counts'] / (k * (k - 1))
    return ratio.sum()

In [7]:
def distance(G, nodes):
    """Accumulate normalised path length and inverse path length over ``nodes``.

    ``cugraph.sssp`` is run once per entry of ``nodes``. From each result only
    the rows whose ``predecessor`` is greater than -1 are used (presumably this
    drops the source itself and unreachable vertices -- confirm against the
    cugraph version in use). Their ``distance`` values, and the reciprocals of
    those values, are summed and divided by ``n * (n - 1)`` with
    ``n = len(nodes)``.

    Parameters
    ----------
    G : graph accepted by ``cugraph.sssp``.
    nodes : sized iterable of source vertices.

    Returns
    -------
    tuple
        ``(dist, inv_dist)``: the accumulated normalised distance and the
        accumulated normalised inverse distance. ``(0, 0)`` when ``nodes``
        holds fewer than two entries, because no vertex pair exists then.
    """
    n_nodes = len(nodes)
    # Number of ordered vertex pairs; loop-invariant, so computed once.
    ordered_pairs = n_nodes * (n_nodes - 1)
    if ordered_pairs == 0:
        # Avoids a division by zero for a single-node input.
        return 0, 0

    dist = 0
    inv_dist = 0

    for node in tqdm(nodes):
        distances = cugraph.sssp(G, node)
        # Filter once and reuse the result for both accumulators.
        reached = distances[distances['predecessor'] > -1]['distance']
        dist += reached.sum() / ordered_pairs
        inv_dist += (1 / reached).sum() / ordered_pairs
    return dist, inv_dist

In [8]:
def shannon_entropy(G):
    """Shannon entropy (natural log) of the positive in-degree distribution of ``G``.

    Vertices with in-degree 0 are ignored. The remaining degrees are tallied
    with ``value_counts`` and the tallies are normalised into probabilities
    ``p_k``. The function returns ``sum(-p_k * log(p_k))``.

    Parameters
    ----------
    G : graph object whose ``in_degree()`` returns a table with a ``degree``
        column.

    Returns
    -------
    The entropy as a scalar.
    """
    degree_table = G.in_degree()
    has_links = degree_table['degree'] > 0
    # Number of vertices sharing each distinct positive degree.
    counts = degree_table[has_links]['degree'].value_counts().to_numpy()

    probabilities = counts / sum(counts)
    terms = -probabilities * np.log(probabilities)
    return sum(terms)

In [9]:
def get_params():
    """Compute network metrics for every target percentage and save them.

    Reads the notebook-level variables ``weight_df``, ``percentages``, ``df``,
    ``nodes_weight``, ``output_folder`` and ``file_name``. For each entry of
    ``percentages`` the matching weight threshold is looked up in
    ``weight_df``, the edges of ``df`` with ``Weight >= threshold`` are turned
    into a ``cugraph.Graph``, and ``clustering``, ``distance`` and
    ``shannon_entropy`` are evaluated on it.

    Returns
    -------
    cudf.DataFrame
        One row per processed percentage with the columns ``Percentage``,
        ``Real Percentage``, ``Number of voxels``, ``Weight``, ``Distance``,
        ``Clustering``, ``Global Efficiency`` and ``Shannon Entropy``. An
        empty frame is returned when ``weight_df`` is empty or when no
        percentage produced a graph with edges.

    Side effects
    ------------
    Creates ``output_folder`` if it is missing and, when there are results,
    writes them tab-separated (header, no index) to
    ``{output_folder}/params-{file_name}.txt``.
    """
    print('##### BEGING THE PROCESS OF OBTAINING THE NETWORK PARAMETERS USING DIFFERENT THRESHOLDS ########')

    # A baseline pass on the unthresholded graph (built from the full `df`,
    # reported as Percentage 100 / Weight 0.500 with the same metrics) used to
    # run here; it was disabled in the original notebook and is not computed.

    fields = []

    if weight_df.empty:
        return cudf.DataFrame()

    # Create the destination before the long per-node shortest-path sweep so a
    # missing folder cannot discard the results at the final write.
    os.makedirs(output_folder, exist_ok=True)

    for percentage in percentages:

        # Look the row up once and reuse it for the threshold and the achieved
        # percentage.
        selected = weight_df[weight_df['closest_n%'] == percentage]
        weight = selected['Weight'].to_numpy()[0]
        real_percentage = selected['n%'].to_numpy()[0]

        print(f'\n {percentage}% - Weight threshold: {weight:.3f}')
        filt_df = df[df['Weight'] >= weight]
        # Distinct endpoints of the edges that survive the threshold.
        nodes = nodes_weight[nodes_weight['Weight'] >= weight]['Vertex'].drop_duplicates().to_arrow().to_pylist()

        G = cugraph.Graph()
        G.from_cudf_edgelist(filt_df, source='Source', destination='Target', renumber=False)

        if len(G.edges()) < 1:
            # Release the graph on the skip path too, as done after a full pass.
            G.clear()
            continue

        clustering_coeff = clustering(G)
        distance_coeff, efficiency_coeff = distance(G, nodes)
        shannon_coeff = shannon_entropy(G)

        fields.append({
              'Percentage': percentage,
              'Real Percentage': real_percentage,
              'Number of voxels': len(nodes),
              'Weight': weight,
              'Distance': distance_coeff,
              'Clustering': clustering_coeff,
              'Global Efficiency': efficiency_coeff,
              'Shannon Entropy': shannon_coeff
          })
        G.clear()

    dataframe = cudf.DataFrame(fields)

    if dataframe.empty:
        return cudf.DataFrame()

    dataframe.to_csv(
          path_or_buf=f'{output_folder}/params-{file_name}.txt',
          index=False,
          header=True,
          sep='\t',
      )

    return dataframe

In [10]:
# Run the threshold sweep; the returned table is shown as this cell's output.
get_params()

##### BEGING THE PROCESS OF OBTAINING THE NETWORK PARAMETERS USING DIFFERENT THRESHOLDS ########

 50% - Weight threshold: 0.599


  0%|          | 0/15620 [00:00<?, ?it/s]


 45% - Weight threshold: 0.612


  0%|          | 0/15276 [00:00<?, ?it/s]


 40% - Weight threshold: 0.627


  0%|          | 0/14846 [00:00<?, ?it/s]


 35% - Weight threshold: 0.643


  0%|          | 0/14396 [00:00<?, ?it/s]


 30% - Weight threshold: 0.660


  0%|          | 0/13856 [00:00<?, ?it/s]


 25% - Weight threshold: 0.680


  0%|          | 0/13129 [00:00<?, ?it/s]


 20% - Weight threshold: 0.703


  0%|          | 0/12284 [00:00<?, ?it/s]


 15% - Weight threshold: 0.729


  0%|          | 0/11248 [00:00<?, ?it/s]


 10% - Weight threshold: 0.762


  0%|          | 0/9737 [00:00<?, ?it/s]


 5% - Weight threshold: 0.807


  0%|          | 0/7608 [00:00<?, ?it/s]

Unnamed: 0,Percentage,Real Percentage,Number of voxels,Weight,Distance,Clustering,Global Efficiency,Shannon Entropy
0,50,49.954854,15620,0.599,3.416514,4151.944488,0.317286,6.365996
1,45,45.129921,15276,0.612,3.499682,4046.306179,0.307563,6.248608
2,40,39.977875,14846,0.627,3.620462,3909.283841,0.296462,6.111609
3,35,34.960417,14396,0.643,3.727298,3774.913296,0.282992,5.958158
4,30,30.122638,13856,0.66,3.867498,3605.938542,0.270466,5.79404
5,25,25.042514,13129,0.68,3.975356,3397.208398,0.255864,5.627726
6,20,19.915858,12284,0.703,4.201501,3143.874914,0.238884,5.416331
7,15,14.994601,11248,0.729,4.307763,2824.746347,0.216842,5.167809
8,10,9.947818,9737,0.762,4.628364,2397.979706,0.192792,4.900155
9,5,5.018472,7608,0.807,4.658894,1796.02533,0.155882,4.49549
