# Import Modules

In [13]:
import tarfile
import pandas as pd
import networkx as nx
import random
import matplotlib.pyplot as plt
import os
import community as community_louvain



# Extracting Data from facebook user network.

In [2]:
with tarfile.open('facebook.tar.gz') as tar:
    tar.extractall(path='data')


FileNotFoundError: ignored

In [None]:

data_dir = 'data/facebook/'

circles = {}
edges = {}
egofeat = {}
feat = {}
featnames = {}


for filename in os.listdir(data_dir):
    if filename.endswith('.circles'):
        path = os.path.join(data_dir, filename)
        df = pd.read_csv(path)
        circles[filename] = df

    if filename.endswith('.edges'):
        path = os.path.join(data_dir, filename)
        df = pd.read_csv(path)
        edges[filename] = df
      
    if filename.endswith('.egofeat'):
        path = os.path.join(data_dir, filename)
        df = pd.read_csv(path)
        egofeat[filename] = df

    if filename.endswith('.feat'):
        path = os.path.join(data_dir, filename)
        df = pd.read_csv(path)
        feat[filename] = df

    if filename.endswith('.featnames'):
        path = os.path.join(data_dir, filename)
        df = pd.read_csv(path)
        featnames[filename] = df

In [None]:
circles
edges
egofeat
feat
featnames

{'1912.featnames':               0 birthday;anonymized feature 729
 0             1 birthday;anonymized feature 730
 1             2 birthday;anonymized feature 731
 2             3 birthday;anonymized feature 732
 3               4 birthday;anonymized feature 1
 4               5 birthday;anonymized feature 2
 ..                                          ...
 474  475 work;start_date;anonymized feature 168
 475  476 work;start_date;anonymized feature 202
 476  477 work;start_date;anonymized feature 169
 477  478 work;start_date;anonymized feature 170
 478  479 work;start_date;anonymized feature 681
 
 [479 rows x 1 columns],
 '698.featnames':                       0 birthday;anonymized feature 2
 0                     1 birthday;anonymized feature 3
 1       2 education;classes;id;anonymized feature 335
 2       3 education;classes;id;anonymized feature 336
 3   4 education;concentration;id;anonymized featur...
 4         5 education;degree;id;anonymized feature 22
 5        6 educatio

In [None]:
feat
#3437
#3980
#1683
#414
#0
#348
#686
#1912
#698
#107

{'3437.feat':     3438 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0    3439 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...                                                                                                                                                                                                                                                                                                                                                                                                              

# Extract data from Facebook Pages network

In [3]:
with tarfile.open('gemsec_facebook_dataset.tar.gz') as tar:
    tar.extractall(path='gemsec_data')

In [4]:
artist = pd.read_csv('gemsec_data/facebook_clean_data/artist_edges.csv')
athlete = pd.read_csv('gemsec_data/facebook_clean_data/athletes_edges.csv')

In [5]:
athlete
#each row represents an edge between two nodes
#there are 86858 edges in total

Unnamed: 0,node_1,node_2
0,0,7061
1,0,5911
2,0,13203
3,0,13704
4,0,11031
...,...,...
86853,13770,13787
86854,13784,13709
86855,13789,13801
86856,13806,13820


In [7]:
G = nx.Graph()

for index, row in athlete.iterrows():
    G.add_edge(row['node_1'], row['node_2'])

# Plotting the Graph



In [None]:
# Note these all also take a long time to run
nx.draw(G, with_labels=True)
plt.show()

In [None]:
pos = nx.spring_layout(G, seed=42)  # or try other layouts like nx.circular_layout(G), nx.kamada_kawai_layout(G)
nx.draw(G, pos, with_labels=True, node_size=50, font_size=8)
plt.show()

In [None]:
# Community detection to identify and assign different colors to nodes belonging to different communities
# which can make the visualizations more informative.

partition = community_louvain.best_partition(G)
colors = [partition[node] for node in G.nodes()]
nx.draw(G, pos, with_labels=True, node_size=50, font_size=8, node_color=colors, cmap=plt.cm.jet)
plt.show()


In [None]:
# Export the graph as a GraphML file for data viz in Cytoscape
nx.write_graphml(G, "graph.graphml")


# Centrality Measures

In [10]:
'''
Note this takes an incredibly long time to run, may be worth seperating into different cells.
Must calculate centrality measures for each nodes 100,000's+
'''

# Degree Centrality
degree_centrality = nx.degree_centrality(G)

# Closeness Centrality
closeness_centrality = nx.closeness_centrality(G)

# Betweenness Centrality
betweenness_centrality = nx.betweenness_centrality(G)

# Eigenvector Centrality
eigenvector_centrality = nx.eigenvector_centrality(G)

# Information Centrality
information_centrality = nx.information_centrality(G)


print('Degree centrality is:', degree_centrality)
print('Closeness centrality is:', closeness_centrality)
print('Betweenness centrality is:', betweenness_centrality)
print('Eigenvector centrality is:', eigenvector_centrality)
print('The information centrality is:', information_centrality)


KeyboardInterrupt: ignored

In [12]:
print('Degree centrality is:', degree_centrality)
print('The max degree centrality node is', max(degree_centrality))
print('Closeness centrality is:', closeness_centrality)
print('The max degree centrality node is', max(closeness_centrality))

##### Now note that both degree and closeness centrality have the same node with the highest centrality
# Node 13865 for the athlete graph, should try to figure out what that node is

Degree centrality is: {0: 0.0022358456545257845, 7061: 0.0024522178146411827, 5911: 0.005553552109628561, 13203: 0.0020915975477821852, 13704: 0.00043274432023079695, 11031: 0.0033177064551027768, 2095: 0.0023800937612693833, 10340: 0.001298232960692391, 2801: 0.0007933645870897945, 970: 0.003678326721961774, 5983: 0.000576992426974396, 4753: 0.0024522178146411827, 2561: 0.0016588532275513884, 593: 0.007068157230436351, 9074: 0.000144248106743599, 10586: 0.006491164803461955, 9640: 0.002596465921384782, 4498: 0.008366390191128741, 2218: 0.004615939415795168, 5034: 0.0005048683736025965, 5036: 0.00432744320230797, 12430: 0.0024522178146411827, 5474: 0.01291020555355211, 4273: 0.0030292102416155787, 6409: 0.0013703570140641903, 5784: 0.0015146051208077894, 1057: 0.00021637216011539847, 7280: 0.007573025604038947, 12126: 0.0041110710421925715, 9910: 0.012261089073205915, 13103: 0.0035340786152181754, 3879: 0.003101334294987378, 1: 7.21240533717995e-05, 4381: 0.001009736747205193, 2: 0.002

# Information Modeling

In [None]:
import random

def independent_cascade_model(G, seed_nodes, prob=0.1, max_iter=100):
  '''
  The independent_cascade_model function simulates the Independent Cascade Model on a given graph G 
  with an initial set of activated nodes (seed nodes) and a fixed activation probability prob. 
  The function returns the final set of activated nodes after the diffusion process has stopped.
  '''
    activated_nodes = set(seed_nodes)
    newly_activated_nodes = set(seed_nodes)

    for _ in range(max_iter):
        next_activated_nodes = set()

        for node in newly_activated_nodes:
            neighbors = G.neighbors(node)
            for neighbor in neighbors:
                if neighbor not in activated_nodes and random.random() < prob:
                    next_activated_nodes.add(neighbor)

        if not next_activated_nodes:
            break

        activated_nodes |= next_activated_nodes
        newly_activated_nodes = next_activated_nodes

    return activated_nodes

# Example usage
seed_nodes = [random.randrang(len(athlete)), random.randrang(len(athlete))]
activated_nodes = independent_cascade_model(G, seed_nodes, prob=0.1)


In [None]:
def sir_model(G, seed_nodes, infection_prob=0.1, recovery_prob=0.5, max_iter=100):
  '''
  This defines the SIR model function mainly used for disease propagation but can also model information flow.
  Succeptable refers to the population that has not recieved information, whereas infected would have recieved the information.
  Recovered would mean that the information has influenced their belief permanently, 
  Doing an SI model might also be appropriate
  '''
    susceptible_nodes = set(G.nodes()) - set(seed_nodes)
    infected_nodes = set(seed_nodes)
    recovered_nodes = set()

    for _ in range(max_iter):
        new_infected_nodes = set()
        new_recovered_nodes = set()

        for node in infected_nodes:
            neighbors = G.neighbors(node)
            for neighbor in neighbors:
                if neighbor in susceptible_nodes and random.random() < infection_prob:
                    new_infected_nodes.add(neighbor)

            if random.random() < recovery_prob:
                new_recovered_nodes.add(node)

        infected_nodes = (infected_nodes | new_infected_nodes) - new_recovered_nodes
        susceptible_nodes -= new_infected_nodes
        recovered_nodes |= new_recovered_nodes

        if not new_infected_nodes:
            break

    return susceptible_nodes, infected_nodes, recovered_nodes

# Example usage
seed_nodes = [random.randrang(len(athlete)), random.randrang(len(athlete))]
S, I, R = sir_model(G, seed_nodes, infection_prob=0.1, recovery_prob=0.5)
