# Import GMLs


In [1]:
import networkx as nx
SNet = nx.read_gml("data/graphs/snet.gml")
SNetF = nx.read_gml("data/graphs/snetf.gml")
SNetT = nx.read_gml("data/graphs/snett.gml")
UserNet = nx.read_gml("data/graphs/usernet.gml")

In [2]:
print(nx.info(SNet))
print(nx.info(SNetF))
print(nx.info(SNetT))
print(nx.info(UserNet))

Name: 
Type: Graph
Number of nodes: 4191
Number of edges: 135974
Average degree:  64.8886
Name: 
Type: Graph
Number of nodes: 4191
Number of edges: 6113
Average degree:   2.9172
Name: 
Type: Graph
Number of nodes: 39
Number of edges: 741
Average degree:  38.0000
Name: 
Type: DiGraph
Number of nodes: 18975
Number of edges: 102043
Average in degree:   5.3778
Average out degree:   5.3778


# Centrality analysis

## 16 Degree, closeness and betweenness centralities. Summary of most notable elements 

In [139]:
import pandas as pd

def degree_centrality(graph):
    dict_dc = nx.degree_centrality(graph)
    dc = pd.DataFrame.from_dict(dict_dc, orient="index", columns=["degree_centrality"])
    dc.index.name = "subreddit"
    dc.sort_values(by="degree_centrality", ascending=False, inplace=True)
    return dc

def directed_degree_centrality(graph):
    dict_dc_in = nx.in_degree_centrality(graph)
    in_dc = pd.DataFrame.from_dict(dict_dc_in, orient="index", columns=["in_degree_centrality"])
    in_dc.index.name = "subreddit"
    in_dc.sort_values(by="in_degree_centrality", ascending=False, inplace=True)
    
    dict_dc_out = nx.out_degree_centrality(graph)
    out_dc = pd.DataFrame.from_dict(dict_dc_out, orient="index", columns=["out_degree_centrality"])
    out_dc.index.name = "subreddit"
    out_dc.sort_values(by="out_degree_centrality", ascending=False, inplace=True)
    
    return in_dc, out_dc

def closeness_centrality(graph):
    dict_cc = nx.closeness_centrality(graph)
    cc = pd.DataFrame.from_dict(dict_cc, orient="index", columns=["closeness_centrality"])
    cc.index.name = "subreddit"
    cc.sort_values(by="closeness_centrality", ascending=False, inplace=True)
    return cc

def betweenness_centrality(graph):
    dict_bc = nx.betweenness_centrality(graph)
    bc = pd.DataFrame.from_dict(dict_bc, orient="index", columns=["betweenness_centrality"])
    bc.index.name = "subreddit"
    bc.sort_values(by="betweenness_centrality", ascending=False, inplace=True)
    return bc

def centralities(graph):
    centralities = ()
    if nx.is_directed(graph):
        centralities = directed_degree_centrality(graph)
        
    centralities = centralities + (degree_centrality(graph),)
    centralities = centralities + (closeness_centrality(graph),)
    centralities = centralities + (betweenness_centrality(graph),)
    return centralities

def print_top_ten_centralities(graph_name, g_dfs):
    print(f"Top ten {graph_name}'s elements by centrality")
    for i in range(len(g_dfs)):
        print(g_dfs[i].iloc[:10])
        
def print_top_ten_specific_centrality(graph_name, g_dfs, centrality_name, index):
    print(f"Top ten {graph_name}'s elements by {centrality_name} centrality")
    print(g_dfs[index].iloc[:10])

In [192]:
import pickle

def write(centrality_name, centrality):
    pickle.dump(centrality, open(f"data/centralities/{centrality_name}", "wb"))
    
def get_centrality(centrality_name, graph, overwrite_centrality=False):
    def create_and_write(centrality_name, graph):
            centrality = centralities(graph)
            write(centrality_name, centrality)
            return centrality
        
    try:
        if overwrite_centrality:
            return create_and_write(centrality_name, graph)
        else:
            return pickle.load(open(f"data/centralities/{centrality_name}", "rb"))
    except (OSError, IOError) as e:
        return create_and_write(centrality_name, graph)

In [188]:
snet_centralities = get_centrality("snet_centralities", SNet) #, overwrite_centrality=True)

In [191]:
print_top_ten_centralities("SNet", snet_centralities)

Top ten SNet's elements by centrality
               degree_centrality
subreddit                       
reddit.com              0.714320
politics                0.489976
technology              0.489499
pics                    0.480907
funny                   0.468496
science                 0.464678
entertainment           0.456325
worldnews               0.454654
programming             0.450119
WTF                     0.449403
               closeness_centrality
subreddit                          
reddit.com                 0.722837
technology                 0.576672
politics                   0.576304
pics                       0.571567
funny                      0.565136
science                    0.563258
entertainment              0.559424
worldnews                  0.558732
programming                0.555755
WTF                        0.555755
               betweenness_centrality
subreddit                            
reddit.com                   0.248260
technology          

In [196]:
snetf_centralities = get_centrality("snetf_centralities", SNetF) #, overwrite_centrality=True)

In [123]:
print_top_ten_centralities("SNetF", snetf_centralities)

Top ten SNetF's elements by centrality
               degree_centrality
subreddit                       
reddit.com              0.057041
pics                    0.050597
politics                0.050119
science                 0.049403
technology              0.049403
funny                   0.048687
worldnews               0.047971
WTF                     0.046539
entertainment           0.046062
programming             0.044391
               closeness_centrality
subreddit                          
reddit.com                 0.057056
pics                       0.051061
politics                   0.050694
science                    0.050152
technology                 0.050152
funny                      0.049623
worldnews                  0.049104
WTF                        0.048098
entertainment              0.047772
programming                0.046665
               betweenness_centrality
subreddit                            
reddit.com                   0.000857
politics           

In [177]:
snett_centralities = get_centrality("snett_centrality", SNetT)

In [178]:
print_top_ten_centralities("SNetT", snett_centralities)

Top ten SNetT's elements by centrality
             degree_centrality
subreddit                     
reddit.com                 1.0
funny                      1.0
atheism                    1.0
bestof                     1.0
technology                 1.0
WTF                        1.0
canada                     1.0
geek                       1.0
photography                1.0
history                    1.0
             closeness_centrality
subreddit                        
reddit.com                    1.0
funny                         1.0
atheism                       1.0
bestof                        1.0
technology                    1.0
WTF                           1.0
canada                        1.0
geek                          1.0
photography                   1.0
history                       1.0
             betweenness_centrality
subreddit                          
reddit.com                      0.0
funny                           0.0
atheism                         0.0
b

In [194]:
usernet_centralities = get_centrality("usernet_centralities", UserNet) #, overwrite_centrality=True) 

In [195]:
print_top_ten_centralities("UserNet", usernet_centralities)

Top ten UserNet's elements by centrality
                 in_degree_centrality
subreddit                            
NoMoreNicksLeft              0.024507
7oby                         0.022873
mutatron                     0.021292
Poromenos                    0.017392
fingers                      0.016338
matts2                       0.015969
amstrdamordeath              0.015811
MrKlaatu                     0.015811
nixonrichard                 0.015495
malcontent                   0.015284
                out_degree_centrality
subreddit                            
alllie                       0.055866
7oby                         0.049120
rmuser                       0.045272
qgyh2                        0.039791
deuteros                     0.029092
tsteele93                    0.027511
bobcat                       0.023032
grauenwolf                   0.017919
Aerik                        0.017340
AMerrickanGirl               0.017129
                 degree_centrality
subreddit   

## 17 Most important actors by *Eigenvector* centrality? What can we deduce? 

Variant of the degree centrality which takes into consideration the node's neighborhood. We can learn about it's power and influence.
* A more powerful node is that node which stands out as the one with the most edges, controling the troughput
* A more influencing node is that node which blends into a well connected neighborhood, having many edges as well

In [27]:
def eigenvector_centrality(graph):
    dict_ec = nx.eigenvector_centrality(graph, weight="weight")
    ec = pd.DataFrame.from_dict(dict_ec, orient="index", columns=["eigenvector_centrality"])
    ec.index.name = "subreddit"
    ec.sort_values(by="eigenvector_centrality", ascending=False, inplace=True)
    return ec

In [149]:
snet_centralities = snet_centralities + (eigenvector_centrality(SNet),)

In [150]:
print_top_ten_specific_centrality("SNet", snet_centralities, "Eigenvector", -1)

Top ten SNet's elements by Eigenvector centrality
               eigenvector_centrality
subreddit                            
reddit.com                   0.384282
politics                     0.315704
pics                         0.292063
funny                        0.280539
science                      0.273836
technology                   0.265365
worldnews                    0.258570
WTF                          0.253545
entertainment                0.253039
programming                  0.220225


In [151]:
snetf_centralities = snetf_centralities + (eigenvector_centrality(SNetF),)

In [153]:
print_top_ten_specific_centrality("SNetF", snetf_centralities, "Eigenvector", -1)

Top ten SNetF's elements by Eigenvector centrality
               eigenvector_centrality
subreddit                            
reddit.com                   0.384340
politics                     0.315757
pics                         0.292108
funny                        0.280581
science                      0.273877
technology                   0.265404
worldnews                    0.258607
WTF                          0.253581
entertainment                0.253076
programming                  0.220253


In [142]:
snett_centralities = snett_centralities + (eigenvector_centrality(SNetT),)

In [155]:
print_top_ten_specific_centrality("SNetT", snett_centralities, "Eigenvector", -1)

Top ten SNetT's elements by Eigenvector centrality
               eigenvector_centrality
subreddit                            
reddit.com                   0.388978
politics                     0.319382
pics                         0.295185
funny                        0.283491
science                      0.276774
technology                   0.268279
worldnews                    0.261412
WTF                          0.256010
entertainment                0.255542
programming                  0.222823


In [171]:
user_net_centralities = usernet_centralities + (eigenvector_centrality(UserNet),)

In [157]:
print_top_ten_specific_centrality("UserNet", usernet_centralities, "Eigenvector", -1)

Top ten UserNet's elements by Eigenvector centrality
                 betweenness_centrality
subreddit                              
7oby                           0.038758
alllie                         0.021461
qgyh2                          0.015406
rmuser                         0.014558
NoMoreNicksLeft                0.012480
mutatron                       0.010970
tsteele93                      0.010731
deuteros                       0.009837
bobcat                         0.008828
glengyron                      0.008512


## 18 Rank nodes using the *Katz* centrality with parameter variation. Play with the beta parameter for the subreddit "reddit.com". Show most important actors, with and without specific betas

* Katz centrality computes the centrality for a node based on the centrality of its neighbors. It is a generalization of the eigenvector centrality.
* The parameter  controls the initial centrality 

 
* Katz centrality computes the relative influence of a node within a network by measuring the number of the immediate neighbors (first degree nodes) and also all other nodes in the network that connect to the node under consideration through these immediate neighbors.

* This algorithm it uses the power method to find the eigenvector corresponding to the largest eigenvalue of the adjacency matrix of G. The parameter alpha should be strictly less than the inverse of largest eigenvalue of the adjacency matrix for the algorithm to converge. You can use max(nx.adjacency_spectrum(G)) to get  the largest eigenvalue of the adjacency matrix. The iteration will stop after max_iter iterations or an error tolerance of number_of_nodes(G) * tol has been reached.

* When $$ \alpha=\frac{1}{\lambda} $$ and $$\beta=0$$ , Katz centrality is the same as eigenvector centrality.

In [8]:
def get_alpha(graph):
    lambda_max = max(nx.adjacency_spectrum(graph))
    print(lambda_max)
    print(f"alpha need to be < {1.0/lambda_max}")

In [9]:
get_alpha(SNet), get_alpha(SNetF), get_alpha(SNetT)

(158672.32654746872+0j)
alpha need to be < (6.30229619593331e-06+0j)
(158645.5693695278+0j)
alpha need to be < (6.30335914185371e-06+0j)
(155722.39506192337+0j)
alpha need to be < (6.421683917732884e-06+0j)


(None, None, None)

Since there is no need to run UserNet against a subreddit, I will skip Katz' centrality for it.

Also, alpha need to be smaller than all three values, so I will take 6.0e-06 as a smaller value

In [18]:
alpha = 6.0e-05
special_node = "reddit.com"

def katz_centrality(graph, beta=1.0):
    dict_kc = nx.katz_centrality(graph, alpha=alpha, beta=beta, weight="weight")
    kc = pd.DataFrame.from_dict(dict_kc, orient="index", columns=["katz_centrality"])
    kc.index.name = "subreddit"
    kc.sort_values(by="katz_centrality", ascending=False, inplace=True)
    return kc

def special_katz_centrality(graph, special_beta):
    betas = [(node, special_beta if node == special_node else 1.0) for node in graph]
    return katz_centrality(graph, betas)

In [20]:
katz_centrality(SNetF)

PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 1000 iterations')

In [None]:
special_bets = [2.0, 4.0, 16.0, 256.0] # experiment

# *TODO* Since katz isn't working for me, I don't know how to test. I will get back to it 

##  19 Taking into account the previous results, create a composite centrality (heuristic) used for finding the most influencial actors. Take into consideration the direction of network's edges. 

As we saw that DC, CC, BC and EC were positivly coupled, we can merely get the product of the four.

**Composite Centrality = DC x CC x BC x EC**

In [137]:
# reset snett
snett_centralities = centralities(SNetT)
snett_centralities = snett_centralities + (eigenvector_centrality(SNetT),)

In [166]:
from numpy import ones

def add_composite_centrality_df(centralities):
    df = pd.DataFrame(ones(centralities[0].shape[0]), columns=['Rank'])
    df.index = centralities[0].index
    return centralities + (df,)

def add_rank_column(centrality, columns):
    centrality[columns[1]] = centrality[columns[0]].rank(ascending=False)

def create_composite_centralities(centralities, offset=0):
    centralities = add_composite_centrality_df(centralities)
    composite = -1
    map_centrality_columns = {0: ["degree_centrality", "Rank_DC"], 1: ["closeness_centrality", "Rank_DC"],
                        2: ["betweenness_centrality", "Rank_BC"], 3: ["eigenvector_centrality", "Rank_EC"]}
    for i in range(len(centralities) - 1):
        add_rank_column(centralities[i+offset], map_centrality_columns[i])
        centralities[composite]['Rank'] *=  centralities[i+offset][map_centrality_columns[i][1]]

    centralities[composite].sort_values(by="Rank", ascending=True, inplace=True)
    return centralities

In [151]:
snet_centralities = create_composite_centralities(snet_centralities)

In [157]:
print_top_ten_centralities("SNet",snet_centralities)

Top ten SNet's elements by centrality
               degree_centrality  Rank_DC
subreddit                                
reddit.com              0.714320      1.0
politics                0.489976      2.0
technology              0.489499      3.0
pics                    0.480907      4.0
funny                   0.468496      5.0
science                 0.464678      6.0
entertainment           0.456325      7.0
worldnews               0.454654      8.0
programming             0.450119      9.0
WTF                     0.449403     10.0
               closeness_centrality  Rank_DC
subreddit                                   
reddit.com                 0.722837      1.0
technology                 0.576672      2.0
politics                   0.576304      3.0
pics                       0.571567      4.0
funny                      0.565136      5.0
science                    0.563258      6.0
entertainment              0.559424      7.0
worldnews                  0.558732      8.0
programm

In [152]:
snetf_centralities = create_composite_centralities(snetf_centralities)

In [158]:
print_top_ten_centralities("SNetF",snetf_centralities)

Top ten SNetF's elements by centrality
               degree_centrality  Rank_DC
subreddit                                
reddit.com              0.057041      1.0
pics                    0.050597      2.0
politics                0.050119      3.0
science                 0.049403      4.5
technology              0.049403      4.5
funny                   0.048687      6.0
worldnews               0.047971      7.0
WTF                     0.046539      8.0
entertainment           0.046062      9.0
programming             0.044391     10.0
               closeness_centrality  Rank_DC
subreddit                                   
reddit.com                 0.057056      1.0
pics                       0.051061      2.0
politics                   0.050694      3.0
science                    0.050152      4.5
technology                 0.050152      4.5
funny                      0.049623      6.0
worldnews                  0.049104      7.0
WTF                        0.048098      8.0
enterta

In [143]:
snett_centralities = create_composite_centralities(snett_centralities)

In [145]:
print_top_ten_centralities("SNetT",snett_centralities)

Top ten SNetT's elements by centrality
             degree_centrality  Rank_DC
subreddit                              
reddit.com                 1.0     20.0
funny                      1.0     20.0
atheism                    1.0     20.0
bestof                     1.0     20.0
technology                 1.0     20.0
WTF                        1.0     20.0
canada                     1.0     20.0
geek                       1.0     20.0
photography                1.0     20.0
history                    1.0     20.0
             closeness_centrality  Rank_DC
subreddit                                 
reddit.com                    1.0     20.0
funny                         1.0     20.0
atheism                       1.0     20.0
bestof                        1.0     20.0
technology                    1.0     20.0
WTF                           1.0     20.0
canada                        1.0     20.0
geek                          1.0     20.0
photography                   1.0     20.0
history 

In [167]:
usernet_centralities = create_composite_centralities(usernet_centralities, 2)

KeyError: 'eigenvector_centrality'

In [172]:
usernet_centralities[5]

IndexError: tuple index out of range

In [161]:
print_top_ten_centralities("UserNet",usernet_centralities)

Top ten UserNet's elements by centrality
                 in_degree_centrality
subreddit                            
NoMoreNicksLeft              0.024507
7oby                         0.022873
mutatron                     0.021292
Poromenos                    0.017392
fingers                      0.016338
matts2                       0.015969
amstrdamordeath              0.015811
MrKlaatu                     0.015811
nixonrichard                 0.015495
malcontent                   0.015284
                out_degree_centrality
subreddit                            
alllie                       0.055866
7oby                         0.049120
rmuser                       0.045272
qgyh2                        0.039791
deuteros                     0.029092
tsteele93                    0.027511
bobcat                       0.023032
grauenwolf                   0.017919
Aerik                        0.017340
AMerrickanGirl               0.017129
                 degree_centrality
subreddit   