In [1]:
import cugraph
import cudf

import numpy as np
import os

from tqdm.notebook import tqdm

In [2]:
# Input edge list; it is read below as space-separated Source/Target/Weight rows without a header.
file = 'matrices_sin_filt/cvv_siS01.txt'
# Folder where get_params() writes its tab-separated params-<file_name>.txt table.
output_folder = 'parametros_matrices_no_filtradas'

In [3]:
# Short label used in the output file name: the base name of `file`, keeping the
# text after the last '-' and before the first '.'.
# os.path.basename is used rather than splitting on os.sep, because the configured
# path is written with '/' regardless of what os.sep happens to be.
file_name = os.path.basename(file).split('-')[-1].split('.')[0]

# Space-separated edge list with no header row: one (Source, Target, Weight) per line.
df = cudf.read_csv(
    filepath_or_buffer=file,
    header=None,
    names=['Source', 'Target', 'Weight'],
    sep=' ')

# Keep only edges whose weight is strictly above 0.8; the cells below work on this subset.
df = df[df['Weight'] > 0.800]

In [4]:
# Target percentages, from 95 down to 5 in steps of 5; each one is later matched
# to the closest available 'n%' value.
percentages = list(range(95, 0, -5))
# Coarser alternative (50 down to 5): percentages = list(range(50, 0, -5))

percentages

[95, 90, 85, 80, 75, 70, 65, 60, 55, 50, 45, 40, 35, 30, 25, 20, 15, 10, 5]

In [5]:
fields = []

# Every distinct edge weight that survived the filter is a candidate threshold.
list_weights = df['Weight'].drop_duplicates().to_arrow().to_pylist()

# One row per edge endpoint: each edge contributes its Source and its Target,
# both tagged with the weight of that edge.
nodes_weight = cudf.concat([
    df[['Source', 'Weight']].rename(columns={'Source': 'Vertex'}),
    df[['Target', 'Weight']].rename(columns={'Target': 'Vertex'}),
])
node_list = nodes_weight['Vertex'].drop_duplicates().to_arrow().to_pylist()

# Loop-invariant denominator for 'n%'.
total_endpoints = len(nodes_weight)

# For each candidate threshold, count the endpoint rows whose edge weight is at
# least that threshold. 'N' therefore counts rows of nodes_weight (two per edge),
# not unique vertices.
for weight in tqdm(list_weights):
    n = len(nodes_weight[nodes_weight['Weight'] >= weight])
    fields.append({
            'Weight': weight,
            'N': n,
            'n%': (n / total_endpoints) * 100
        })


df_percentages = cudf.DataFrame(fields).sort_values(by='n%', ascending=False)

# Collect the matching rows and concatenate once at the end, instead of growing
# a DataFrame inside the loop.
matched_rows = []
closest_percentage = []

for percentage in percentages:
    # Absolute gap between every available 'n%' and the target, in sorted row order.
    val_list = np.abs(df_percentages['n%'] - percentage).to_arrow().to_pylist()
    val_index = val_list.index(min(val_list))  # first row with the smallest gap
    value = float(df_percentages['n%'].values[val_index])

    matches = df_percentages[df_percentages['n%'] == value]
    matched_rows.append(matches)
    # One label per matched row keeps closest_percentage aligned with weight_df.
    closest_percentage.extend([percentage] * matches.shape[0])

weight_df = cudf.concat(matched_rows) if matched_rows else cudf.DataFrame()
weight_df['closest_n%'] = closest_percentage

weight_df

  0%|          | 0/198 [00:00<?, ?it/s]

Unnamed: 0,Weight,N,n%,closest_n%
4,0.805,1654180,94.476631,95
7,0.808,1583870,90.460955,90
11,0.812,1493216,85.283353,85
15,0.816,1404314,80.205816,80
19,0.82,1319568,75.365643,75
24,0.825,1219338,69.641119,70
28,0.829,1142362,65.244722,65
33,0.834,1050954,60.024056,60
38,0.839,963942,55.054464,55
43,0.844,882174,50.384376,50


In [6]:
def clustering(G):
    """Sum counts / (degree * (degree - 1)) over the vertices that lie in a triangle.

    The in-degree table of ``G`` is merged with ``cugraph.triangle_count(G)`` on
    their shared columns (presumably 'vertex' -- confirm), and vertices whose
    triangle count is below 1 are dropped before the ratio is evaluated.

    NOTE(review): the result is a sum over vertices, not a mean, and the ratio
    carries no factor of 2 -- confirm this is the intended normalisation.
    """
    per_vertex = G.in_degree().merge(cugraph.triangle_count(G))
    in_triangle = per_vertex[per_vertex['counts'] >= 1]

    k = in_triangle['degree']
    local_ratio = in_triangle['counts'] / (k * (k - 1))
    return local_ratio.sum()

In [7]:
def distance(G, nodes):
    """Accumulate normalised shortest-path distances and inverse distances.

    ``cugraph.sssp`` is run from every entry of ``nodes``. For each run, only the
    rows whose 'predecessor' is greater than -1 are kept (presumably this drops
    the source itself and unreachable vertices -- confirm against the cugraph
    docs). Their distances, and the reciprocals of those distances, are summed
    and divided by ``len(nodes) * (len(nodes) - 1)``.

    Parameters
    ----------
    G : graph object accepted by ``cugraph.sssp``
    nodes : sized iterable of source vertices

    Returns
    -------
    tuple
        ``(dist, inv_dist)``: the accumulated distance term and the accumulated
        inverse-distance term. Both are 0 when ``nodes`` is empty.
    """
    # Number of ordered vertex pairs; it does not change across sources.
    n_pairs = len(nodes) * (len(nodes) - 1)

    dist = 0
    inv_dist = 0

    for node in tqdm(nodes):
        paths = cugraph.sssp(G, node)
        # Filter once and reuse the result for both accumulators.
        reachable = paths[paths['predecessor'] > -1]['distance']
        dist += reachable.sum() / n_pairs
        inv_dist += (1 / reachable).sum() / n_pairs
    return dist, inv_dist

In [8]:
def shannon_entropy(G):
    """Shannon entropy (natural log) of the distribution of positive in-degrees.

    Vertices with in-degree 0 are ignored. The remaining degree values are
    tallied with ``value_counts``, the tallies are normalised to probabilities,
    and ``sum(-p * log(p))`` is returned.
    """
    in_degrees = G.in_degree()
    positive = in_degrees[in_degrees['degree'] > 0]
    tallies = positive['degree'].value_counts().to_numpy()

    probabilities = tallies / sum(tallies)
    surprisal_terms = -probabilities * np.log(probabilities)
    return sum(surprisal_terms)

In [9]:
def _build_graph(edges):
    """Create a cugraph.Graph from the 'Source'/'Target' columns of ``edges``.

    No edge attribute is passed, so the 'Weight' column is not attached to the graph.
    """
    G = cugraph.Graph()
    G.from_cudf_edgelist(edges, source='Source', destination='Target', renumber=False)
    return G


def _graph_metrics(G, nodes):
    """Return the four metric columns of one results row for graph ``G``."""
    clustering_coeff = clustering(G)
    distance_coeff, efficiency_coeff = distance(G, nodes)
    shannon_coeff = shannon_entropy(G)
    return {
        'Distance': distance_coeff,
        'Clustering': clustering_coeff,
        'Global Efficiency': efficiency_coeff,
        'Shannon Entropy': shannon_coeff,
    }


def get_params():
    """Compute the network metrics for the full graph and for each target percentage.

    Reads the notebook-level names ``df``, ``list_weights``, ``node_list``,
    ``nodes_weight``, ``weight_df``, ``percentages``, ``output_folder`` and
    ``file_name``.

    The first row (labelled 100%) uses every edge in ``df``. For each entry of
    ``percentages``, the weight paired with it in ``weight_df`` is used as a
    ``>=`` threshold on the edges, and the metrics are recomputed on that
    filtered graph. Thresholds that leave a graph without edges are skipped.

    Returns
    -------
    cudf.DataFrame
        One row per evaluated threshold. The same table is written, tab-separated,
        to ``{output_folder}/params-{file_name}.txt``. An empty frame is returned,
        and nothing is written, when ``weight_df`` is empty.
    """
    print('##### BEGING THE PROCESS OF OBTAINING THE NETWORK PARAMETERS USING DIFFERENT THRESHOLDS ########')
    print(f'\n 100% - Weight threshold: {min(list_weights):.3f}')

    # Without thresholds the result is an empty frame anyway, so return before
    # the expensive baseline computation rather than after it.
    if weight_df.empty:
        return cudf.DataFrame()

    # Make sure the destination exists up front: failing in to_csv at the very
    # end would throw away the whole sweep.
    os.makedirs(output_folder, exist_ok=True)

    G = _build_graph(df)
    fields = [{
        'Percentage': 100,
        'Real Percentage': 100,
        'Number of voxels': len(node_list),
        'Weight': min(list_weights),
        **_graph_metrics(G, node_list),
    }]
    G.clear()

    for percentage in percentages:
        # Look up the row paired with this percentage once and reuse it.
        match = weight_df[weight_df['closest_n%'] == percentage]
        weight = match['Weight'].to_numpy()[0]
        real_percentage = match['n%'].to_numpy()[0]
        print(f'\n {percentage}% - Weight threshold: {weight:.3f}')

        filt_df = df[df['Weight'] >= weight]
        # Unique vertices that still touch at least one edge at this threshold.
        nodes = nodes_weight[nodes_weight['Weight'] >= weight]['Vertex'].drop_duplicates().to_arrow().to_pylist()

        G = _build_graph(filt_df)

        if len(G.edges()) < 1:
            continue

        fields.append({
            'Percentage': percentage,
            'Real Percentage': real_percentage,
            'Number of voxels': len(nodes),
            'Weight': weight,
            **_graph_metrics(G, nodes),
        })
        G.clear()

    # `fields` always holds at least the 100% row, so the frame is never empty here.
    dataframe = cudf.DataFrame(fields)

    dataframe.to_csv(
          path_or_buf=f'{output_folder}/params-{file_name}.txt',
          index=False,
          header=True,
          sep='\t',
      )

    return dataframe

In [10]:
# Run the full threshold sweep; the returned table is shown as this cell's output.
get_params()

##### BEGING THE PROCESS OF OBTAINING THE NETWORK PARAMETERS USING DIFFERENT THRESHOLDS ########

 100% - Weight threshold: 0.801


  0%|          | 0/13132 [00:00<?, ?it/s]


 95% - Weight threshold: 0.805


  0%|          | 0/12924 [00:00<?, ?it/s]


 90% - Weight threshold: 0.808


  0%|          | 0/12790 [00:00<?, ?it/s]


 85% - Weight threshold: 0.812


  0%|          | 0/12583 [00:00<?, ?it/s]


 80% - Weight threshold: 0.816


  0%|          | 0/12333 [00:00<?, ?it/s]


 75% - Weight threshold: 0.820


  0%|          | 0/12116 [00:00<?, ?it/s]


 70% - Weight threshold: 0.825


  0%|          | 0/11811 [00:00<?, ?it/s]


 65% - Weight threshold: 0.829


  0%|          | 0/11586 [00:00<?, ?it/s]


 60% - Weight threshold: 0.834


  0%|          | 0/11266 [00:00<?, ?it/s]


 55% - Weight threshold: 0.839


  0%|          | 0/10972 [00:00<?, ?it/s]


 50% - Weight threshold: 0.844


  0%|          | 0/10657 [00:00<?, ?it/s]


 45% - Weight threshold: 0.850


  0%|          | 0/10266 [00:00<?, ?it/s]


 40% - Weight threshold: 0.856


  0%|          | 0/9837 [00:00<?, ?it/s]


 35% - Weight threshold: 0.863


  0%|          | 0/9352 [00:00<?, ?it/s]


 30% - Weight threshold: 0.870


  0%|          | 0/8885 [00:00<?, ?it/s]


 25% - Weight threshold: 0.878


  0%|          | 0/8336 [00:00<?, ?it/s]


 20% - Weight threshold: 0.888


  0%|          | 0/7558 [00:00<?, ?it/s]


 15% - Weight threshold: 0.899


  0%|          | 0/6737 [00:00<?, ?it/s]


 10% - Weight threshold: 0.913


  0%|          | 0/5565 [00:00<?, ?it/s]


 5% - Weight threshold: 0.933


  0%|          | 0/3934 [00:00<?, ?it/s]

Unnamed: 0,Percentage,Real Percentage,Number of voxels,Weight,Distance,Clustering,Global Efficiency,Shannon Entropy
0,100,100.0,13132,0.801,4.800618,3036.790313,0.164481,4.778159
1,95,94.476631,12924,0.805,4.784952,2965.798581,0.160195,4.727833
2,90,90.460955,12790,0.808,4.746974,2907.106878,0.156858,4.692871
3,85,85.283353,12583,0.812,4.71573,2838.159985,0.15287,4.645633
4,80,80.205816,12333,0.816,4.728963,2766.437905,0.149706,4.605067
5,75,75.365643,12116,0.82,4.630384,2698.704411,0.145997,4.557523
6,70,69.641119,11811,0.825,4.599247,2618.52657,0.141001,4.506871
7,65,65.244722,11586,0.829,4.520476,2548.819341,0.136861,4.460272
8,60,60.024056,11266,0.834,4.551358,2460.677596,0.13301,4.408947
9,55,55.054464,10972,0.839,4.464356,2371.206993,0.129597,4.346249
