In [None]:
import pandas as pd
from geopy.distance import great_circle
import numpy as np

# Raise pandas' display limits to 1000 columns and 1000 rows.
# NOTE(review): with max_rows this high, displaying a whole frame can flood the cell output.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

def adjacency_matrix(df, g):
    """Link every pair of rows of ``df`` that lie within ``g`` miles of each other.

    Distances are ``great_circle(...).miles`` between the (LATITUDE, LONGITUDE)
    pairs of two rows. Rows are addressed by position, so the DataFrame may
    carry any index (filtered, shuffled, non-integer or with duplicates).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'LATITUDE' and 'LONGITUDE' columns.
    g : float
        Distance threshold in miles; a pair at distance ``<= g`` is adjacent.

    Returns
    -------
    avg_dist : list
        For each row, the mean distance to its neighbours, or ``None`` when
        the row has no neighbour within ``g`` miles.
    num_dist : list
        For each row, the number of neighbours, or ``None`` when there are none.
    adj_matrix : numpy.ndarray
        ``(len(df), len(df))`` float array holding 1 for adjacent pairs and 0
        elsewhere (the diagonal is always 0).

    Raises
    ------
    ValueError
        If either coordinate column is missing.
    """
    if not {'LATITUDE', 'LONGITUDE'}.issubset(df.columns):
        raise ValueError("DataFrame must have 'LATITUDE' and 'LONGITUDE' columns")

    # Pull the coordinates out once; positions in this list are the matrix indices.
    coords = list(zip(df['LATITUDE'].tolist(), df['LONGITUDE'].tolist()))
    num_universities = len(coords)
    adj_matrix = np.zeros((num_universities, num_universities))
    neighbour_distances = [[] for _ in range(num_universities)]

    # The distance is symmetric, so each unordered pair is measured once and
    # mirrored. Appending in this order keeps every row's list sorted by
    # neighbour position.
    for i in range(num_universities):
        for j in range(i + 1, num_universities):
            distance = great_circle(coords[i], coords[j]).miles
            if distance <= g:
                adj_matrix[i, j] = 1
                adj_matrix[j, i] = 1
                neighbour_distances[i].append(distance)
                neighbour_distances[j].append(distance)

    avg_dist = [np.mean(d) if d else None for d in neighbour_distances]
    num_dist = [len(d) if d else None for d in neighbour_distances]

    return avg_dist, num_dist, adj_matrix

def local_clustering_coefficients(adj_matrix):
    """Return the local clustering coefficient of every node.

    For node ``i`` the degree ``k_i`` is the i-th row sum of the adjacency
    matrix and the triangle count ``T_i`` is the i-th diagonal entry of A^3
    floor-divided by 2. The coefficient is ``2 * T_i / (k_i * (k_i - 1))``;
    nodes with ``k_i <= 1`` get 0.

    Parameters
    ----------
    adj_matrix : array-like
        Square adjacency matrix (converted with ``np.array``).

    Returns
    -------
    numpy.ndarray
        Float array with one coefficient per node.
    """
    A = np.array(adj_matrix)

    # (A @ A) @ A: the diagonal of the cube counts closed walks of length 3.
    walks_len2 = np.matmul(A, A)
    walks_len3 = np.matmul(walks_len2, A)

    k = A.sum(axis=1)
    triangles = np.diagonal(walks_len3) // 2

    # Start from zeros and divide only where the degree exceeds 1, so the
    # remaining entries keep their 0 and no division by zero is evaluated.
    coeffs = np.zeros(len(A))
    np.divide(2 * triangles, k * (k - 1), out=coeffs, where=k > 1)

    return coeffs

def count_triples_and_triangles(adj_matrix):
    """Count connected triples and triangles in an undirected simple graph.

    A connected triple is a path of length two, identified by its centre
    node: a node of degree ``k`` is the centre of ``k * (k - 1) / 2`` triples.
    Every triangle therefore contains three connected triples, and the global
    transitivity is ``3 * num_triangles / num_triples``.

    The counts assume a symmetric 0/1 matrix with a zero diagonal; self-loops
    or weights are not stripped.

    Parameters
    ----------
    adj_matrix : array-like
        Square adjacency matrix; non-ndarray input is converted with ``np.array``.

    Returns
    -------
    (int, int)
        ``(num_triples, num_triangles)``.

    Raises
    ------
    ValueError
        If the input is not a square two-dimensional matrix.
    """
    if not isinstance(adj_matrix, np.ndarray):
        adj_matrix = np.array(adj_matrix)

    if adj_matrix.ndim != 2 or adj_matrix.shape[0] != adj_matrix.shape[1]:
        raise ValueError("Input must be a square matrix")

    # trace(A^3) counts closed walks of length 3; each triangle yields six of
    # them (three starting nodes, two directions).
    adj_matrix_cube = np.linalg.matrix_power(adj_matrix, 3)
    num_triangles = int(np.trace(adj_matrix_cube) // 6)

    # Connected triples come from the degrees, not from A^3: entries of A^3
    # count length-3 walks, which are not paths of length two.
    degrees = adj_matrix.sum(axis=1)
    num_triples = int(np.sum(degrees * (degrees - 1) // 2))

    return num_triples, num_triangles

def count_triples_and_triangles_per_node(adj_matrix):
    """Count, for every node, the connected triples it centres and its triangles.

    For node ``i`` with degree ``k_i`` (i-th row sum):

    * triples:   ``k_i * (k_i - 1) // 2`` -- pairs of neighbours of ``i``;
    * triangles: ``diag(A^3)[i] // 2``    -- closed 3-walks through ``i``,
      each triangle being walked in two directions.

    ``triangles / triples`` is then the local clustering coefficient. The
    counts assume a symmetric 0/1 matrix with a zero diagonal.

    Parameters
    ----------
    adj_matrix : numpy.ndarray
        Square adjacency matrix.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(triples_per_node, triangles_per_node)``, one entry per node.

    Raises
    ------
    ValueError
        If the input is not a numpy array or not a square 2-D matrix.
    """
    if not isinstance(adj_matrix, np.ndarray):
        raise ValueError("Input must be a numpy array")

    if adj_matrix.ndim != 2 or adj_matrix.shape[0] != adj_matrix.shape[1]:
        raise ValueError("Input must be a square matrix")

    # Compute the cube of the adjacency matrix
    adj_matrix_cube = np.linalg.matrix_power(adj_matrix, 3)

    # Count the number of triangles at each node
    triangles_per_node = np.diag(adj_matrix_cube) // 2

    # Count the triples centred at each node from its degree; row sums of A^3
    # would count length-3 walks instead.
    degrees = adj_matrix.sum(axis=1)
    triples_per_node = degrees * (degrees - 1) // 2

    return triples_per_node, triangles_per_node

def school_analysis(df, g):
    """Summarise, for every row of ``df``, its neighbourhood within ``g`` miles.

    Two rows are neighbours when the ``great_circle(...).miles`` distance
    between their (LATITUDE, LONGITUDE) pairs is ``<= g``. A row is never its
    own neighbour. Rows are addressed by position, and the result carries
    ``df.index`` so it can be concatenated column-wise onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'LATITUDE' and 'LONGITUDE' columns.
    g : float
        Radius in miles; also embedded in the output column names.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with the columns

        * ``num_schools_within_{g}_miles``  -- number of neighbours ``k``;
        * ``num_triples_{g}_miles``         -- ``k * (k - 1) // 2``, the pairs
          of neighbours (connected triples centred on the row);
        * ``num_triangles_{g}_miles``       -- pairs of neighbours that are
          themselves neighbours (``diag(A^3) // 2``);
        * ``avg_distance_within_{g}_miles`` -- mean distance to the
          neighbours, NaN when there are none.

    Raises
    ------
    ValueError
        If either coordinate column is missing.
    """
    if not {'LATITUDE', 'LONGITUDE'}.issubset(df.columns):
        raise ValueError("DataFrame must have 'LATITUDE' and 'LONGITUDE' columns")

    # Positions in this list are the matrix indices, independent of df.index.
    coords = list(zip(df['LATITUDE'].tolist(), df['LONGITUDE'].tolist()))
    n = len(coords)
    distances = np.zeros((n, n))

    # The distance is symmetric: measure each unordered pair once and mirror it.
    for i in range(n):
        for j in range(i + 1, n):
            distance = great_circle(coords[i], coords[j]).miles
            distances[i, j] = distance
            distances[j, i] = distance

    # Adjacency for schools within g miles. The diagonal (distance 0 to itself)
    # is cleared: self-loops would add spurious closed walks to diag(A^3) and
    # inflate the triangle counts.
    adjacency = (distances <= g).astype(int)
    np.fill_diagonal(adjacency, 0)

    # Degree of each school in the neighbour graph.
    num_schools_within_g = adjacency.sum(axis=1)

    # Each triangle through a node is walked in two directions in A^3.
    adjacency_cube = np.linalg.matrix_power(adjacency, 3)
    triangles_per_node = np.diag(adjacency_cube) // 2

    # Connected triples centred on a node = unordered pairs of its neighbours.
    triples_per_node = num_schools_within_g * (num_schools_within_g - 1) // 2

    # Mean neighbour distance; rows without neighbours stay NaN instead of
    # evaluating 0 / 0.
    total_distance = (distances * adjacency).sum(axis=1)
    avg_distance_within_g = np.divide(
        total_distance,
        num_schools_within_g,
        out=np.full(n, np.nan),
        where=num_schools_within_g > 0,
    )

    results = pd.DataFrame(
        {
            f'num_schools_within_{g}_miles': num_schools_within_g,
            f'num_triples_{g}_miles': triples_per_node,
            f'num_triangles_{g}_miles': triangles_per_node,
            f'avg_distance_within_{g}_miles': avg_distance_within_g,
        },
        index=df.index,
    )

    return results

def plot_cluster_coefficient_vs_avg_distance(df, g, triples_col, triangles_col, avg_distance_col):
    """Scatter-plot the triangles/triples ratio against the average distance.

    Side effect: writes the ratio ``df[triangles_col] / df[triples_col]`` into
    ``df["cluster_coefficient"]`` on the DataFrame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the three named columns.
    g : number
        Radius in miles; only used in the y-axis label.
    triples_col, triangles_col, avg_distance_col : str
        Names of the columns to read.

    Raises
    ------
    ValueError
        If any of the three named columns is missing from ``df``.
    """
    needed = {triples_col, triangles_col, avg_distance_col}
    if not needed <= set(df.columns):
        raise ValueError("DataFrame must have the specified 'triples_col', 'triangles_col', and 'avg_distance_col' columns")

    # Stored on df (not a local) so the column stays available to the caller.
    ratio = df[triangles_col] / df[triples_col]
    df["cluster_coefficient"] = ratio

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df["cluster_coefficient"], df[avg_distance_col], alpha=0.8)
    ax.set(
        xlabel="Cluster Coefficient",
        ylabel=f"Average Distance (within {g} miles)",
        title="Cluster Coefficient vs Average Distance",
    )

    plt.show()



In [None]:
import pandas as pd
import numpy as np
import gc
from geopy.distance import great_circle
import matplotlib.pyplot as plt
    
# Hard-coded input and output locations.
df_pth = "N:/Classes/2023_1SPRING/Analytics_Day/Data/csc.csv"
csc_dict_pth = "N:/Classes/2023_1SPRING/Analytics_Day/Data_Dictionary.csv"

output_pth = "N:/Classes/2023_1SPRING/Analytics_Day/Data/"

# Read the identifier columns as strings rather than letting pandas infer a
# numeric dtype (presumably to keep leading zeros -- confirm against the data).
df = pd.read_csv(
    df_pth,
    dtype={"UNITID": "str", "OPEID": "str", "OPEID6": "str"},
)

# Append the school_analysis() columns for each radius, smallest first.
# Every call recomputes all pairwise distances, so this cell is slow.
for radius_miles in (60, 120, 250, 500, 1000, 1500):
    df = pd.concat([df, school_analysis(df, radius_miles)], axis=1)

df.info()


In [None]:
# Preview the first three rows of df.
df.head(3)

In [None]:
# Cluster coefficient vs. average neighbour distance, using the 250-mile columns.
plot_cluster_coefficient_vs_avg_distance(
    df,
    250,
    "num_triples_250_miles",
    "num_triangles_250_miles",
    "avg_distance_within_250_miles",
)

In [None]:
# num_schools_within_60_miles                                               20
# num_triples_60_miles                                                    3507
# num_triangles_60_miles                                                   181
# avg_distance_60_miles                                            1224.588334
# avg_distance_within_60_miles                                       29.528241
# num_schools_within_120_miles                                              39
# num_triples_120_miles                                                  62102
# num_triangles_120_miles                                                  698
# avg_distance_120_miles                                           1224.588334
# avg_distance_within_120_miles                                      59.075037
# num_schools_within_250_miles                                             117
# num_triples_250_miles                                                1564283
# num_triangles_250_miles                                                 6378
# avg_distance_250_miles                                           1224.588334
# avg_distance_within_250_miles                                     143.870306
# num_schools_within_500_miles                                             246
# num_triples_500_miles                                               17611554
# num_triangles_500_miles                                                29620
# avg_distance_500_miles                                           1224.588334
# avg_distance_within_500_miles                                      267.29473
# num_schools_within_1000_miles                                            574
# num_triples_1000_miles                                             181829746
# num_triangles_1000_miles                                              164404
# avg_distance_1000_miles                                          1224.588334
# avg_distance_within_1000_miles                                    553.906907
# num_schools_within_1500_miles                                            855
# num_triples_1500_miles                                             428819427
# num_triangles_1500_miles                                              365003
# avg_distance_1500_miles                                          1224.588334
# avg_distance_within_1500_miles                                     773.84474

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

def adj_matrix_to_graph(adj_matrix):
    """Build a networkx graph from an adjacency matrix, draw it, and return it.

    The graph is created with ``nx.Graph(adj_matrix)``, drawn with nodes placed
    by ``nx.circular_layout`` and labelled, and shown via ``plt.show()``.

    Parameters
    ----------
    adj_matrix
        Adjacency data passed unchanged to ``nx.Graph``.

    Returns
    -------
    The graph object created by ``nx.Graph``.
    """
    graph = nx.Graph(adj_matrix)

    # Styling for nx.draw, collected in one place.
    draw_options = dict(
        pos=nx.circular_layout(graph),
        with_labels=True,
        node_color="purple",
        node_size=200,
        edge_color="#FF4040",
        width=1.0,
        font_size=12,
        font_color="#000000",
        font_family="calibri",
    )
    nx.draw(graph, **draw_options)
    plt.show()

    return graph

# Draw the graph for `adj_matrix`; the returned graph object is the cell output.
# NOTE(review): `adj_matrix` is not assigned in any cell visible here -- presumably it is
# the third value returned by adjacency_matrix(df, g). Confirm it is defined (and with
# which radius) before this cell, otherwise a fresh-kernel run raises NameError.
adj_matrix_to_graph(adj_matrix)

In [None]:
# Build the networkx graph from `adj_matrix` without drawing it.
# NOTE(review): `adj_matrix` is not assigned in any cell visible here -- confirm where
# it is defined; on a fresh kernel this line would raise NameError.
graph = nx.Graph(adj_matrix)

