In [1]:
import mysql.connector
import pandas as pd
import networkx as nx
import haversine


ModuleNotFoundError: No module named 'mysql'

In [5]:
# Replace these values with your MySQL server information
host = "localhost"
user = "root"
password = "2710"
database = "pt_network_berlin"

In [9]:
query = """
    SELECT s.stop_id, s.combined_location, ls.stop_order, ls.line_id, l.line_name, s.stop_name, l.type
    FROM line_stops_1960 ls
    INNER JOIN stops_df_1960 s ON ls.stop_id = s.stop_id
    INNER JOIN line_df_1960 l ON ls.line_id = l.line_id
    ORDER BY ls.line_id, ls.stop_order
    """

In [10]:
def get_network_data(query):
    mycursor.execute(query)

    results = mycursor.fetchall()
    columns = ["stop_id", "coordinate_location", "stop_order", "line_id", "line_name", "stop_name", "type"]
    df = pd.DataFrame(results, columns=columns)

    return df

In [11]:
try:
    # Establish a connection to the MySQL server
    connection = mysql.connector.connect(
        host=host,
        user=user,
        password=password,
        database=database
    )

    if connection.is_connected():
        print("Connected to MySQL")

    # For example, let's execute a simple query to fetch data from a table:
    mycursor = connection.cursor()
    
    df = get_network_data(query)
    

    # Don't forget to close the cursor and the connection when you're done.
    mycursor.close()
    connection.close()

except mysql.connector.Error as e:
    print(f"Error connecting to MySQL: {e}")

Connected to MySQL


In [14]:
def create_network_graph(df):
    # Create a network graph
    G = nx.MultiGraph()

    # Add nodes for each stop_id with coordinates as attributes
    for index, row in df.iterrows():
        stop_id = row["stop_id"]
        coordinates = row["coordinate_location"].split(",")
        if len(coordinates) == 2:
            coordinate_y = float(coordinates[0])
            coordinate_x = float(coordinates[1])
        else:
            # Handle the case where the coordinate_location is not 2 values separated by a comma
            coordinate_y = 0
            coordinate_x = 0

        G.add_node(stop_id, x=coordinate_x, y=coordinate_y)

# Add edges
    for line in set(df["line_id"]):
        df_line = df[df["line_id"] == line].sort_values("stop_order")
        line_name = df_line.iloc[0]["line_name"]  # get line name associated with line_id
        for i in range(len(df_line) - 1):
            source = df_line.iloc[i]["stop_id"]
            target = df_line.iloc[i + 1]["stop_id"]
            edge_type = df_line.iloc[i]["type"]
            weight = 1
            line_id = df_line.iloc[i]["line_id"]
            
            # Create a dictionary with edge attributes, including "type"
            edge_attributes = {
                "weight": weight,
                "key": line_id,
                "label": line_name,
                "type": edge_type
            }
            
            G.add_edge(source, target, **edge_attributes)

    return G


In [15]:
G = create_network_graph(df)

In [16]:
def add_graph_attributes(G, df):
    node_weights = {node: G.degree(node) for node in G.nodes()}
    nx.set_node_attributes(G, node_weights, "node_weight")

    #set node labels to stop names
    stop_names = {row["stop_id"]: row["stop_name"] for index, row in df.iterrows()}
    nx.set_node_attributes(G, stop_names, "node_label")

    return G

In [19]:
def add_distance_attribute(graph):
    for u, v, data in graph.edges(data=True):
        u_coord = (graph.nodes[u]["y"], graph.nodes[u]["x"])
        v_coord = (graph.nodes[v]["y"], graph.nodes[v]["x"])
        distance = haversine.haversine(u_coord, v_coord, unit="km")
        data["distance"] = distance
        data["edge_type"] = data["type"]


    return graph

In [20]:
G = add_graph_attributes(G, df)
G = add_distance_attribute(G)

In [21]:
for source, target, attrs in G.edges(data=True):
    if source == target:
        print(f"Self-loop edge: {source} to {target} has following attributes:")
        for attr_name, attr_value in attrs.items():
            print(f"{attr_name}: {attr_value}")
    else:
        None


Analysis

In [22]:
node_degrees = dict(G.degree())

In [23]:
# Create a copy of the original graph
G_coords = G.copy()

# Find and remove nodes with (0,0) coordinates from the copy
nodes_to_remove = [node for node, data in G_coords.nodes(data=True) if data['x'] == 0 and data['y'] == 0]
G_coords.remove_nodes_from(nodes_to_remove)

In [24]:
G_coords.nodes(data=True)

NodeDataView({992: {'x': 13.429517625327765, 'y': 52.50956365130009, 'node_weight': 1, 'node_label': 'SchillingBrÃ¼cke (Ostbahnhof)'}, 1193: {'x': 13.432938339097822, 'y': 52.51808306742478, 'node_weight': 2, 'node_label': 'U-Bhf. Strausberger Platz'}, 1077: {'x': 13.432938339097822, 'y': 52.51808306742478, 'node_weight': 2, 'node_label': 'Stalinallee'}, 659: {'x': 13.4326457, 'y': 52.5232658, 'node_weight': 8, 'node_label': 'Leninplatz'}, 10: {'x': 13.4110773, 'y': 52.5233439, 'node_weight': 2, 'node_label': 'Alexanderplats, Memhardstr.'}, 931: {'x': 13.4016121, 'y': 52.5301769, 'node_weight': 8, 'node_label': 'Rosenthaler Platz'}, 212: {'x': 13.3974236, 'y': 52.5324542, 'node_weight': 4, 'node_label': 'Brunnen Ecke Invalidenstrasse'}, 521: {'x': 13.3827302, 'y': 52.5304158, 'node_weight': 8, 'node_label': 'Invaliden- Ecke Chausseestrasse'}, 1253: {'x': 13.388097732360343, 'y': 52.52168602376861, 'node_weight': 6, 'node_label': 'WeidendammBrÃ¼cke, Bhf. Friedrichstrasse'}, 36: {'x': 13

In [25]:
nx.write_gexf(G_coords, "network_graph_1960_with_coordinates.gexf")

Community detection

community.best_partition(G): This function applies the Louvain community detection algorithm to the graph G. It iteratively optimizes the division of nodes into communities based on the network's structure. The goal is to maximize modularity, a measure of the quality of the community structure. Modularity measures how well nodes within a community are connected compared to what we would expect by random chance.

In [103]:
# Perform Louvain community detection
partition = community.best_partition(G)

# Display the communities with node labels
communities = {}
for node, community_id in partition.items():
    # Assuming 'node_label' is the attribute name
    node_label = G.nodes[node]['node_label']
    
    if community_id in communities:
        communities[community_id].append(node_label)
    else:
        communities[community_id] = [node_label]

for comm_id, node_labels in communities.items():
    print(f"Community {comm_id}: {node_labels}")

Community 1: ['SchillingBrÃ¼cke (Ostbahnhof)', 'U-Bhf. Strausberger Platz', 'Stalinallee', 'Platz der Vereinten Nationen', 'MemhardstraÃŸe', 'U Rosenthaler Platz', 'BrunnenstraÃŸe/InvalidenstraÃŸe', 'U Naturkundemuseum', 'WeidendammBrÃ¼cke, Bhf. Friedrichstrasse', 'Am Kupfergraben', 'KÃ¶penicker Strasse Ecke Adalbertstrasse', 'Bhf. JannowitzBrÃ¼cke', 'MemhardstraÃŸe', 'Oranienburger Tor, Hannoversche Strasse', 'Blockdammweg', 'Karlshorster Strasse', 'Boxhagener StraÃŸe/HolteistraÃŸe', 'U Frankfurter Tor', 'Bersarinplatz', 'Landsberger Allee/Petersburger StraÃŸe', 'KniprodestraÃŸe/Danziger StraÃŸe', 'Greifswalder StraÃŸe/Danziger StraÃŸe', 'Prenzlauer Allee/Danziger StraÃŸe', 'U Eberswalder StraÃŸe', 'Oderberger Strasse', 'Rosenthal Nord', 'NordendstraÃŸe', 'Grabbeallee/Pastor-NiemÃ¶ller-Platz', 'Rathaus Pankow', 'Pankow Kirche', 'U VinetastraÃŸe', 'S+U SchÃ¶nhauser Allee', 'Rosenthal Nord', 'MÃ¼hlenstrasse', 'S Warschauer StraÃŸe', 'Boxhagener Strasse', 'S+U Frankfurter Allee', 'Roeder

In [104]:
import community as community_louvain

In [110]:
# Calculate Degree Centrality
degree_centrality = nx.degree_centrality(G)

# Calculate Betweenness Centrality
betweenness_centrality = nx.betweenness_centrality(G)

# Calculate Closeness Centrality
closeness_centrality = nx.closeness_centrality(G)


In [112]:
# Print the top 5 nodes with the highest degree centrality and their node labels
print("Top 5 Degree Centrality Nodes:")
sorted_degree_centrality = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)
for node, centrality in sorted_degree_centrality[:5]:
    node_label = G.nodes[node]["node_label"]
    print(f"Node {node_label}: {centrality}")

# Degree Centrality (C_deg(v)) = (Number of edges connected to node v) / (Total number of nodes in the network - 1)

# Example questions:
# 1. Which stations have the highest number of direct connections (degree centrality)?
# 2. What are the major transfer hubs where passengers can switch between different modes of transport?
    #for this I first need to create "supernodes" based on location that can be analysed


Top 5 Degree Centrality Nodes:
Node Wittenbergplatz: 0.011816838995568686
Node Dimitroffstrasse: 0.011816838995568686
Node S-Bahnhof Ostkreuz: 0.011816838995568686
Node U Eberswalder StraÃŸe: 0.010339734121122601
Node Bhf. Zoo: 0.010339734121122601


In [114]:
# Print the top 5 nodes with the highest betweenness centrality
print("\nTop 5 Betweenness Centrality Nodes:")
sorted_betweenness_centrality = sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)
for node, centrality in sorted_betweenness_centrality[:5]:
    node_label = G.nodes[node]["node_label"]
    print(f"Node {node_label}: {centrality}")

# based on the concept of "shortest paths." Nodes with high betweenness centrality act as bridges or intermediaries that connect different parts of the network, and they are crucial for maintaining network connectivity
# fraction of all shortest paths in the network that pass through node

# Example questions:
# 1. Which stations act as critical points for passenger flow, serving as bridges between different parts of the network?
# 2. Where are the potential bottlenecks in the network that might experience high congestion?


Top 5 Betweenness Centrality Nodes:
Node Potsdamer Strasse: 0.05205940663342261
Node Bhf. SchÃ¶neweide: 0.041706446759938835
Node Hermannplatz: 0.0408390481001322
Node Elsenstrasse: 0.038964486682892104
Node Am Treptower Park Ecke Bulgarische Strasse: 0.03868162055357283


In [116]:
# Print the top 5 nodes with the highest closeness centrality
print("\nTop 5 Closeness Centrality Nodes:")
sorted_closeness_centrality = sorted(closeness_centrality.items(), key=lambda x: x[1], reverse=True)
for node, centrality in sorted_closeness_centrality[:5]:
    node_label = G.nodes[node]["node_label"]
    print(f"Node {node_label}: {centrality}")#

#closeness centrality measures how efficiently a node can reach other nodes in the network

# Example questions:
# 1. Which stations are most accessible to the rest of the network, providing shorter travel times for passengers?
# 2. What locations are suitable for setting up key amenities or services to serve a large number of passengers?


Top 5 Closeness Centrality Nodes:
Node Potsdamer Strasse: 0.05477712282058007
Node Berliner Strasse: 0.05362919610267535
Node Hafenplatz: 0.05233171555180417
Node LÃ¼tzowplatz: 0.050909418995759056
Node Rathaus Spandau: 0.050823976614227724
