## Read the graphml file and filter it accordingly

In [1]:
import networkx as nx
import pandas as pd
# Load the graphml file again
file_path = 'bioneer2.graphml'
G = nx.read_graphml(file_path)

# Filter the graph to only include channels with subscriberCount between 50000 and 8318210
G = G.subgraph([node for node in G.nodes() if 50000 <= G.nodes[node]['subscribercount'] <= 8318210])

## Find homophily for different attributes of the channels using various ways

- 2pq
- Assortativity
- E-I Index

In [2]:
# Create a dataframe with the subscriber count of each node
video_df = pd.DataFrame({'subscribercount': [G.nodes[node]['subscribercount'] for node in G.nodes()]})

# Define thresholds for low, mid, and high subscriber counts
# For example, using quantiles
low_threshold = video_df['subscribercount'].quantile(0.4)
high_threshold = video_df['subscribercount'].quantile(0.7)

# Categorize subscriber counts
def categorize_subscriber_count(count):
    if count <= low_threshold:
        return 'low'
    elif count <= high_threshold:
        return 'mid'
    else:
        return 'high'

# Add the category to each node in the graph
for node in G.nodes():
    G.nodes[node]['category'] = categorize_subscriber_count(G.nodes[node]['subscribercount'])

# Calculate the number of cross-category edges
cross_category_edges = len([edge for edge in G.edges() if G.nodes[edge[0]]['category'] != G.nodes[edge[1]]['category']])

# Calculate the total number of edges
total_edges = G.number_of_edges()

# Calculate the homophily
homophily = 1 - (cross_category_edges / total_edges)

# Output the results
print("Number of cross-category edges:", cross_category_edges)
print("Number of edges:", total_edges)
print("Homophily:", homophily)
print(low_threshold)
print(high_threshold)

Number of cross-category edges: 404
Number of edges: 604
Homophily: 0.33112582781456956
535000.0
1520000.0


In [3]:
p = sum(1 for node, attr in G.nodes(data=True) if attr.get('category') == 'low') / G.number_of_nodes()
q = 1 - p
expected_cross_edges = 2 * p * q * total_edges

# Calculate the actual number of cross-edges
actual_cross_edges = sum(1 for u, v in G.edges() if G.nodes[u]['category'] != G.nodes[v]['category'])

# Calculate the homophily
homophily = 1 - (cross_category_edges / total_edges)

print(f'Expected number of cross-edges: {expected_cross_edges}')
print(f'Actual number of cross-edges: {actual_cross_edges}')
print(f'Evidence of homophily: {homophily}')

Expected number of cross-edges: 290.5670236072153
Actual number of cross-edges: 404
Evidence of homophily: 0.33112582781456956


In [11]:
# Calculate assortativity for all the attributes
country_assortativity_coefficient = nx.attribute_assortativity_coefficient(G, 'country')
subscriber_count_assortativity_coefficient = nx.attribute_assortativity_coefficient(G,'subscribercount')
video_count_assortativity_coefficient = nx.attribute_assortativity_coefficient(G,'videocount')
view_count_assortativity_coefficient = nx.attribute_assortativity_coefficient(G, 'viewcount(100s)')

print(f"Country Assortativity Coefficient: {country_assortativity_coefficient:.3f}")
print(f"Subscriber count Assortativity Coefficient: {subscriber_count_assortativity_coefficient:.3f}")
print(f"Video count Assortativity Coefficient: {video_count_assortativity_coefficient:.3f}")
print(f"View count Assortativity Coefficient: {view_count_assortativity_coefficient:.3f}")

Country Assortativity Coefficient: 0.039
Subscriber count Assortativity Coefficient: -0.005
Video count Assortativity Coefficient: -0.005
View count Assortativity Coefficient: -0.005


### E-I Index Calculations

In [5]:
# Function to calculate the E-I Index for video count with a threshold
def calculate_ei_index(graph, threshold):
    internal_connections = 0
    external_connections = 0

    for edge in graph.edges():
        if graph.nodes[edge[0]]['videocount'] <= threshold and graph.nodes[edge[1]]['videocount'] <= threshold:
            internal_connections += 1
        elif graph.nodes[edge[0]]['videocount'] > threshold and graph.nodes[edge[1]]['videocount'] > threshold:
            internal_connections += 1
        else:
            external_connections += 1

    if internal_connections + external_connections == 0:
        return 0  # Prevent division by zero

    ei_index = (external_connections - internal_connections) / (external_connections + internal_connections)
    return ei_index

# Calculate the E-I Index for video count with a threshold of 100 videos
ei_index_video_count = calculate_ei_index(G, 100)

ei_index_video_count

-0.7880794701986755

In [6]:
def calculate_ei_index_view_count(graph, threshold):
    internal_connections = 0
    external_connections = 0

    for edge in graph.edges():
        if graph.nodes[edge[0]]['viewcount(100s)'] <= threshold and graph.nodes[edge[1]]['viewcount(100s)'] <= threshold:
            internal_connections += 1
        elif graph.nodes[edge[0]]['viewcount(100s)'] > threshold and graph.nodes[edge[1]]['viewcount(100s)'] > threshold:
            internal_connections += 1
        else:
            external_connections += 1

    if internal_connections + external_connections == 0:
        return 0  # Prevent division by zero

    ei_index = (external_connections - internal_connections) / (external_connections + internal_connections)
    return ei_index

# Calculate the E-I Index for view count with a specified threshold
threshold_view_count = 10000000  # Example threshold for view count
ei_index_view_count = calculate_ei_index_view_count(G, threshold_view_count)

ei_index_view_count

-0.8675496688741722

In [7]:
def calculate_e_i_index_by_subscriber_count(G, threshold):
    internal_edges = 0
    external_edges = 0

    for u, v in G.edges():
        # Check if 'subscribercount' attribute exists for both nodes
        if 'subscribercount' in G.nodes[u] and 'subscribercount' in G.nodes[v]:
            # Determine if both nodes are above or below the threshold
            u_above_threshold = G.nodes[u]['subscribercount'] >= threshold
            v_above_threshold = G.nodes[v]['subscribercount'] >= threshold

            if u_above_threshold == v_above_threshold:
                internal_edges += 1  # Both nodes are either above or below the threshold
            else:
                external_edges += 1  # One node is above the threshold and the other is below

    total_edges = internal_edges + external_edges
    if total_edges > 0:
        e_i_index = (external_edges - internal_edges) / total_edges
    else:
        e_i_index = None  # E-I Index is not defined for graphs without edges

    return e_i_index


# Define a subscriber count threshold
subscriber_threshold = 3000000  # example threshold

# Calculate the E-I Index
e_i_index = calculate_e_i_index_by_subscriber_count(G, subscriber_threshold)
print("E-I Index:", e_i_index)

E-I Index: -0.38741721854304634


In [8]:
def calculate_homophily_by_country(G):
    internal_edges = 0
    total_edges = 0

    for u, v in G.edges():
        # Check if the 'country' attribute exists for both nodes
        if 'country' in G.nodes[u] and 'country' in G.nodes[v]:
            total_edges += 1  # Count every edge
            if G.nodes[u]['country'] == G.nodes[v]['country']:
                internal_edges += 1  # Count internal edge

    # Calculating the proportion of internal connections
    if total_edges > 0:
        homophily_index = internal_edges / total_edges
    else:
        homophily_index = None  # Homophily is not defined for graphs without edges

    return homophily_index
# Calculate the homophily for the country attribute
country_homophily = calculate_homophily_by_country(G)
print("Homophily based on country:", country_homophily)

Homophily based on country: 0.28642384105960267


## Degree Assortativity and Assortativity

In [ ]:
# Calculate the assortativity coefficient
assortativity_coefficient = nx.degree_assortativity_coefficient(G.to_directed())
assortativity_coefficient

In [9]:
# Calculate the degree assortativity coefficient
degree_assortativity = nx.degree_assortativity_coefficient(G.to_directed())
degree_assortativity

-0.4460479336387566

In [10]:
# Calculate degree assortativity coefficients
degree_assortativity_in = nx.degree_assortativity_coefficient(G, x='in', y='in')
degree_assortativity_out = nx.degree_assortativity_coefficient(G, x='out', y='out')
degree_assortativity_in_out = nx.degree_assortativity_coefficient(G, x='in', y='out')
degree_assortativity_out_in = nx.degree_assortativity_coefficient(G, x='out', y='in')

print(f"Degree assortativity coefficient (in-in): {degree_assortativity_in}")
print(f"Degree assortativity coefficient (out-out): {degree_assortativity_out}")
print(f"Degree assortativity coefficient (in-out): {degree_assortativity_in_out}")
print(f"Degree assortativity coefficient (out-in): {degree_assortativity_out_in}")

Degree assortativity coefficient (in-in): 0.02756598604581837
Degree assortativity coefficient (out-out): -0.05595792039193227
Degree assortativity coefficient (in-out): -0.02576913534472027
Degree assortativity coefficient (out-in): -0.4460479336387566
