In [None]:
import pandas as pd
import numpy as np
import gc
from geopy.distance import great_circle
import cartopy.feature as cfeature

# Locations of the input CSV, the data-dictionary CSV and the output folder.
# Replace the placeholders before running.
df_pth = "YOUR_PATH_HERE"
csc_dict_pth = "YOUR_PATH_HERE"

output_pth = "YOUR_PATH_HERE"


# Read the raw file: "PrivacySuppressed" cells are treated as missing, the three ID
# columns are read as strings, and a fixed list of columns is dropped immediately.
df = pd.read_csv(df_pth,
                 na_values = [None, np.nan, pd.NA, "PrivacySuppressed"],
                 dtype = {"UNITID": "str",
                          "OPEID": "str",
                          "OPEID6": "str"}
                ).drop(['NPCURL', 'ALIAS', 'T4APPROVALDATE', 'CIPTITLE1', 'CIPTITLE2', 'CIPTITLE3', 'CIPTITLE4', 'CIPTITLE5', 'CIPTITLE6', 'FEDSCHCD'],
                       axis = 1)

# Row filters, applied together (same rows as applying them one after another).
excluded_states = ["FM", "GU", "PR", "VI", "AK", "HI"]
df = df[
    (df["CONTROL"] == 1)
    & (df["ICLEVEL"] == 1)
    & ~df["STABBR"].isin(excluded_states)
    & (df["CURROPER"] != 0)
    & (df["DISTANCEONLY"] != 1)
    & df["LATITUDE"].notna()
    # & df["ADM_RATE"].notna()
    & df["OMENRAP_ALL"].notna()
]


# Per-column missing-value counts and shares over the filtered rows.
missing_df = (
    df.isna().sum()
      .rename_axis("Variable")
      .reset_index(name = "nmiss")
)
missing_df["n"] = df.shape[0] - missing_df["nmiss"]
missing_df["n_pct"] = missing_df["n"] / df.shape[0]
missing_df["nmiss_pct"] = missing_df["nmiss"] / df.shape[0]

## Creates Data Dictionary
# csc_dict_df = pd.merge(left = pd.read_csv(csc_dict_pth), right = missing_df, left_on = "VARIABLE NAME", right_on = "Variable", how = "left")
# csc_dict_df = csc_dict_df[csc_dict_df["Variable"].isin(df.columns.to_list())]
# csc_dict_df = csc_dict_df.loc[:, ["Variable", "NAME OF DATA ELEMENT", "n", "nmiss", "n_pct", "nmiss_pct"]]
# csc_dict_df["n"] = csc_dict_df["n"].astype('int64')
# csc_dict_df["nmiss"] = csc_dict_df["nmiss"].astype('int64')
# csc_dict_df["n_pct"] = round(csc_dict_df["n_pct"] * 100, 0).astype('int64')
# csc_dict_df["nmiss_pct"] = round(csc_dict_df["nmiss_pct"] * 100, 0).astype('int64')
# csc_dict_df.to_csv(csc_dict_pth)

# Drop columns that are at least 90% missing, plus every column whose name
# contains "POOL" or "_SUPP".
missing_df = missing_df[missing_df["nmiss"] >= 0.9 * df.shape[0]]

df = df.drop(columns = missing_df.Variable.to_list()
                       + [col for col in df.columns.to_list() if ("POOL" in col) or ("_SUPP" in col)])

del missing_df
gc.collect()

# Sort first and renumber afterwards, so that index label i is also row position i.
# Later cells use the index labels as matrix positions and assign positional arrays
# back to df; renumbering before sorting left the labels permuted and those
# assignments misaligned.
df = df.sort_values("OMENRAP_ALL", ascending = False).reset_index(drop = True)

# Binned version of OMENRAP_ALL; -1 marks rows that fall in none of the bins.
# The upper bounds used ">" before, which made every bin open-ended: each line
# overwrote the previous one and category 3 could never survive.
# NOTE(review): [0.25, 0.375) and [0.5, 0.625) are not covered by any bin and stay
# at -1. The lower bounds are kept exactly as written -- confirm whether the gaps
# are intended.
df["CATEG_TRANSFER_RATE"] = -1

df.loc[((df["OMENRAP_ALL"] >= 0) & (df["OMENRAP_ALL"] < 0.125)), "CATEG_TRANSFER_RATE"] = 0
df.loc[((df["OMENRAP_ALL"] >= 0.125) & (df["OMENRAP_ALL"] < 0.25)), "CATEG_TRANSFER_RATE"] = 1
df.loc[((df["OMENRAP_ALL"] >= 0.375) & (df["OMENRAP_ALL"] < 0.5)), "CATEG_TRANSFER_RATE"] = 2
df.loc[((df["OMENRAP_ALL"] >= 0.625) & (df["OMENRAP_ALL"] < 0.75)), "CATEG_TRANSFER_RATE"] = 3
df.loc[df["OMENRAP_ALL"] >= 0.75, "CATEG_TRANSFER_RATE"] = 4

# Binary flags: at least 0.5, at least 0.375, and strictly above the sample mean.
df["CATEG2_TRANSFER_RATE"] = 0
df.loc[df["OMENRAP_ALL"] >= 0.5, "CATEG2_TRANSFER_RATE"] = 1

df["CATEG3_TRANSFER_RATE"] = 0
df.loc[df["OMENRAP_ALL"] >= 0.375, "CATEG3_TRANSFER_RATE"] = 1

df["CATEG_AVG_TRANSFER_RATE"] = 0
df.loc[df["OMENRAP_ALL"] > df["OMENRAP_ALL"].mean(), "CATEG_AVG_TRANSFER_RATE"] = 1

df.to_csv(output_pth + "csc.csv", index = False)

print("RESULTS:", df.shape)
df.head(3)


In [None]:
import plotnine as p9

# Print the mean of OMENRAP_ALL, then display its distribution as a 10-bin histogram.
print(f"AVERAGE: {df['OMENRAP_ALL'].mean()}")
(
    p9.ggplot(mapping = p9.aes(x = df["OMENRAP_ALL"]))
    + p9.geom_histogram(fill = "lavender", color = "grey", bins = 10)
    + p9.theme_bw()
)

In [None]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle
from itertools import combinations

def create_distance_matrix(df):
    """Return the symmetric matrix of pairwise great-circle distances in miles.

    Rows are fetched with ``df.loc[i, ...]`` for ``i`` in ``range(len(df))``, so the
    index must contain the integer labels ``0 .. len(df) - 1``. Entry ``[i, j]`` is
    the distance between the rows labelled ``i`` and ``j``; the diagonal stays 0.

    Parameters:
    df (pd.DataFrame): Frame with 'LATITUDE' and 'LONGITUDE' columns.

    Returns:
    np.ndarray: Array of shape ``(len(df), len(df))``.
    """
    size = len(df)
    result = np.zeros((size, size))

    # Each unordered pair is measured once and mirrored across the diagonal.
    for first, second in combinations(range(size), 2):
        origin = (df.loc[first, 'LATITUDE'], df.loc[first, 'LONGITUDE'])
        target = (df.loc[second, 'LATITUDE'], df.loc[second, 'LONGITUDE'])
        miles = great_circle(origin, target).miles
        result[first, second] = miles
        result[second, first] = miles

    return result

def count_triangles_and_triples(matrix, g):
    """Tally, per node, the 3-node groups it takes part in at distance threshold ``g``.

    Every unordered group ``(i, j, k)`` is inspected once. If all three pairwise
    distances are ``<= g`` the group is a triangle and all three nodes get +1 in
    ``triangles``. Otherwise, for each pair of in-range links among (i-j, j-k),
    (j-k, k-i) and (k-i, i-j), the two nodes named by that branch get +1 in
    ``triples`` (i and j, j and k, k and i respectively).

    Parameters:
    matrix (np.ndarray): Square distance matrix, indexed as ``matrix[a, b]``.
    g (float): Distance threshold (inclusive).

    Returns:
    tuple[list, list]: ``(triangles, triples)``, one counter per node.
    """
    size = len(matrix)
    triangles = [0] * size
    triples = [0] * size

    def _bump(counter, *nodes):
        # Add one to the counter of every listed node.
        for node in nodes:
            counter[node] += 1

    for i, j, k in combinations(range(size), 3):
        ij_close = matrix[i, j] <= g
        jk_close = matrix[j, k] <= g
        ki_close = matrix[k, i] <= g

        if ij_close and jk_close and ki_close:
            _bump(triangles, i, j, k)
            continue

        if ij_close and jk_close:
            _bump(triples, i, j)
        if jk_close and ki_close:
            _bump(triples, j, k)
        if ki_close and ij_close:
            _bump(triples, k, i)

    return triangles, triples

def school_summary(df, g):
    """Summarise each school's neighbourhood within ``g`` miles.

    Parameters:
    df (pd.DataFrame): Frame accepted by ``create_distance_matrix`` ('LATITUDE' and
        'LONGITUDE' columns, rows reachable as ``df.loc[0 .. len(df) - 1]``).
    g (float): Radius in miles (inclusive).

    Returns:
    pd.DataFrame: One row per school with the columns 'School',
        'Num_Schools_Within_{g}', 'Triangles_Within_{g}', 'Triples_Within_{g}' and
        'Avg_Distance_Within_{g}' (0 when no other school is in range).
    """
    distance_matrix = create_distance_matrix(df)
    triangles, triples = count_triangles_and_triples(distance_matrix, g)

    # Neighbour mask with the diagonal cleared, so a school never counts itself.
    within = distance_matrix <= g
    np.fill_diagonal(within, False)
    num_schools_within_g = within.sum(axis=1)

    # Mean over the actual neighbours. The previous `row[row <= g][1:]` dropped the
    # first in-range entry of the row, which is the school's own zero only for row 0;
    # for every other row a real neighbour was discarded and the zero was averaged in.
    neighbour_total = np.where(within, distance_matrix, 0.0).sum(axis=1)
    avg_distance_within_g = np.divide(
        neighbour_total,
        num_schools_within_g,
        out=np.zeros(len(distance_matrix)),
        where=num_schools_within_g > 0,
    )

    # NOTE: the matrix rows follow the labels 0..n-1 (see create_distance_matrix), so
    # 'School' matches the other columns only when df.index is 0..n-1 in order.
    summary = pd.DataFrame({
        'School': df.index,
        f'Num_Schools_Within_{g}': num_schools_within_g,
        f'Triangles_Within_{g}': triangles,
        f'Triples_Within_{g}': triples,
        f'Avg_Distance_Within_{g}': avg_distance_within_g
    })

    return summary

In [None]:
# One neighbourhood summary per radius (50 miles, then 100 miles).
sum_list = [school_summary(df, g = g) for g in [50, 100]]

In [None]:
# Place the per-radius summaries beside the school rows (column-wise, aligned on index).
df2 = pd.concat([df, *sum_list], axis = "columns")

df2.head(3)

In [None]:
# `cluster_coeff_100` is otherwise only created by a later cell, so a top-to-bottom
# run failed here with a KeyError. Derive it on demand when it is missing.
if "cluster_coeff_100" not in df2.columns:
    df2["cluster_coeff_100"] = df2["Triangles_Within_100"] / df2["Triples_Within_100"]

# Rows with a defined coefficient, smallest first.
df2[df2["cluster_coeff_100"].notna()].sort_values("cluster_coeff_100")

In [None]:
import plotnine as p9

# Ratio of the triangle count to the triple count at the 100-mile radius.
df2["cluster_coeff_100"] = df2["Triangles_Within_100"].div(df2["Triples_Within_100"])

print(f"AVERAGE: {df2['cluster_coeff_100'].mean()}")
# (
#     p9.ggplot(p9.aes(x = df2[f"Num_Schools_Within_{100}"]))
#         + p9.geom_histogram(bins = 10, fill = "lavender", color = "grey")
#         + p9.theme_bw()
# )
# Histogram of the ratio computed above.
(
    p9.ggplot(mapping = p9.aes(x = df2["cluster_coeff_100"]))
    + p9.geom_histogram(fill = "lavender", color = "grey", bins = 10)
    + p9.theme_bw()
)

In [None]:
def school_analysis(df, g):
    """Per-school neighbourhood statistics for the graph linking schools within ``g`` miles.

    Parameters:
    df (pd.DataFrame): Frame with 'LATITUDE' and 'LONGITUDE' columns.
    g (float): Radius in miles (inclusive) below which two schools are linked.

    Returns:
    pd.DataFrame: Indexed like ``df``, with the columns
        'num_schools_within_{g}_miles' (number of other schools in range),
        'num_triples_{g}_miles' (pairs of neighbours, k*(k-1)/2),
        'num_triangles_{g}_miles' (linked pairs of neighbours),
        'avg_distance_within_{g}_miles' (NaN when no school is in range) and
        'clust_coeff' (triangles / triples, 0 where there are no triples).

    Raises:
    ValueError: If a coordinate column is missing.
    """
    if not {'LATITUDE', 'LONGITUDE'}.issubset(df.columns):
        raise ValueError("DataFrame must have 'LATITUDE' and 'LONGITUDE' columns")

    n = len(df)
    coords = list(zip(df['LATITUDE'].to_numpy(), df['LONGITUDE'].to_numpy()))
    distances = np.zeros((n, n))

    # Distances are symmetric: measure each pair once, by row position, and mirror it.
    for i in range(n):
        for j in range(i + 1, n):
            distances[i, j] = distances[j, i] = great_circle(coords[i], coords[j]).miles

    # Links between schools within g miles. The diagonal (distance 0) must be cleared,
    # otherwise every school is linked to itself and the walk counts below are inflated.
    adjacency_matrix = (distances <= g).astype(int)
    np.fill_diagonal(adjacency_matrix, 0)

    # Degree of each node = number of other schools in range.
    num_schools_within_g = adjacency_matrix.sum(axis=1)

    # diag(A^3)[i] counts closed 3-step walks from i; every triangle through i is
    # walked once in each direction, hence the division by 2.
    triangles_per_node = np.diag(np.linalg.matrix_power(adjacency_matrix, 3)) // 2

    # Connected triples centred on the node: unordered pairs of its neighbours. This
    # matches the 2*T / (k*(k-1)) definition used by local_clustering_coefficients.
    triples_per_node = num_schools_within_g * (num_schools_within_g - 1) // 2

    # Mean distance to the in-range schools; NaN for isolated schools.
    avg_distance_within_g = np.divide(
        (distances * adjacency_matrix).sum(axis=1),
        num_schools_within_g,
        out=np.full(n, np.nan),
        where=num_schools_within_g > 0,
    )

    # Element-wise ratio. The earlier isinstance(..., int) test on arrays was always
    # False, so the coefficient was 0 for every school.
    clust_coeff = np.divide(
        triangles_per_node,
        triples_per_node,
        out=np.zeros(n),
        where=triples_per_node > 0,
    )

    # Keep the caller's index so a column-wise concat lines up row for row.
    results = pd.DataFrame({
        f'num_schools_within_{g}_miles': num_schools_within_g,
        f'num_triples_{g}_miles': triples_per_node,
        f'num_triangles_{g}_miles': triangles_per_node,
        f'avg_distance_within_{g}_miles': avg_distance_within_g,
        'clust_coeff': clust_coeff
    }, index=df.index)

    return results

df = pd.concat([df, school_analysis(df, 100)], axis = 1)
df.head(3)

In [None]:
# School name next to its 100-mile triangle/triple counts and clustering coefficient.
df[["INSTNM", "num_triangles_100_miles", "num_triples_100_miles", "clust_coeff"]]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import pandas as pd
from geopy.distance import great_circle
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from matplotlib.colors import ListedColormap

def plot_school_network(df, g, cat_col, student_counts=None):
    """Draw the schools on a map and link those within ``g`` miles of each other.

    Only links where one end has category 1 and the other end has category 0 or 1
    are drawn. Link width and opacity scale with the 'OMENRAP_ALL' value stored for
    the link; node colour and opacity follow the category.

    Parameters:
    df (pd.DataFrame): Frame with 'LATITUDE', 'LONGITUDE', 'OMENRAP_ALL' and
        ``cat_col``. Its index labels are used as matrix positions, so they must be
        the integers ``0 .. len(df) - 1``.
    g (float): Radius in miles (inclusive) below which two schools are linked.
    cat_col (str): Column holding the 0/1 category of each school.
    student_counts: Optional values looked up as ``student_counts[label]`` for each
        index label; node size is 0.001 times this value. When omitted, every node
        is drawn with the same small size.

    Raises:
    ValueError: If a required column is missing.
    """
    if not {'LATITUDE', 'LONGITUDE', 'OMENRAP_ALL', cat_col}.issubset(df.columns):
        raise ValueError("DataFrame must have 'LATITUDE', 'LONGITUDE', 'OMENRAP_ALL', and the specified 'cat_col' columns")

    n = len(df)
    distances = np.zeros((n, n))
    weights = np.zeros((n, n))

    # Measure each pair once and mirror it. weights[a, b] holds OMENRAP_ALL of school a.
    labelled_rows = list(df.iterrows())
    for offset, (i, row_i) in enumerate(labelled_rows):
        for j, row_j in labelled_rows[offset + 1:]:
            distance = great_circle((row_i['LATITUDE'], row_i['LONGITUDE']), (row_j['LATITUDE'], row_j['LONGITUDE'])).miles
            distances[i, j] = distances[j, i] = distance
            weights[i, j] = row_i['OMENRAP_ALL']
            weights[j, i] = row_j['OMENRAP_ALL']

    # Links between schools within g miles, without self-links.
    adjacency_matrix = (distances <= g).astype(int)
    np.fill_diagonal(adjacency_matrix, 0)

    # from_numpy_matrix was removed in networkx 3.0; from_numpy_array is its replacement.
    G = nx.from_numpy_array(adjacency_matrix)

    # Per-node position (lon, lat), category and student count (None when not supplied).
    nx.set_node_attributes(G, {
        i: {'pos': (row['LONGITUDE'], row['LATITUDE']),
            'category': row[cat_col],
            'students': None if student_counts is None else student_counts[i]}
        for i, row in labelled_rows
    })

    nx.set_edge_attributes(G, {(i, j): {'weight': weights[i, j]} for i, j in G.edges})

    # Base map.
    fig, ax = plt.subplots(figsize=(20, 16), subplot_kw={'projection': ccrs.PlateCarree()})

    ax.coastlines()
    ax.add_feature(cfeature.OCEAN, alpha=0.5)
    ax.add_feature(cfeature.LAKES.with_scale('10m'), alpha=0.5)
    ax.add_feature(cfeature.BORDERS, linestyle=':')
    ax.add_feature(cfeature.STATES.with_scale('10m'), linestyle=':', linewidth=0.33)
    ax.set_extent([-130, -65, 25, 45])
    ax.set_facecolor('white')

    custom_cmap = ListedColormap(['grey', 'gold'])

    # Links with a category-1 school at one end and a category-0 or -1 school at the other.
    category_edges = [(u, v, d) for u, v, d in G.edges(data=True) if (G.nodes[u]['category'] == 1 and G.nodes[v]['category'] == 0) or (G.nodes[u]['category'] == 0 and G.nodes[v]['category'] == 1) or (G.nodes[u]['category'] == 1 and G.nodes[v]['category'] == 1)]

    pos = nx.get_node_attributes(G, 'pos')
    nx.draw_networkx_edges(G,
                           pos,
                           edgelist=category_edges,
                           width=[2 * d['weight'] for _, _, d in category_edges],
                           alpha=[0.66 * d['weight'] for _, _, d in category_edges],
                           edge_color='red',
                           ax=ax)

    if student_counts is None:
        node_size = 7.5
    else:
        node_size = [0.001 * d['students'] for _, d in G.nodes(data=True)]

    nx.draw_networkx_nodes(G,
                           pos,
                           node_size=node_size,
                           node_color=[d['category'] for _, d in G.nodes(data=True)],
                           cmap=custom_cmap,
                           alpha=[0.5 + 0.5 * d['category'] for _, d in G.nodes(data=True)],
                           ax=ax)

    plt.show()


# Same map at 50 and 100 miles, coloured by CATEG3_TRANSFER_RATE and sized by UGDS.
for radius in (50, 100):
    plot_school_network(df, g = radius, cat_col = "CATEG3_TRANSFER_RATE", student_counts = df["UGDS"])

In [None]:

# Same map at 50 and 100 miles, coloured by CATEG_AVG_TRANSFER_RATE and sized by UGDS.
for radius in (50, 100):
    plot_school_network(df, g = radius, cat_col = "CATEG_AVG_TRANSFER_RATE", student_counts = df["UGDS"])


In [None]:

# The four-argument plot_school_network defined above needs student counts; the
# call omitted them and raised a TypeError. Pass UGDS like the other calls do.
plot_school_network(df, g = 110, cat_col = "CATEG2_TRANSFER_RATE", student_counts = df["UGDS"])


In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from geopy.distance import great_circle

# NOTE: this definition reuses the name of the four-argument plot_school_network
# from an earlier cell and replaces it once this cell has run.
def plot_school_network(df, g):
    """Draw every school on a map and link those within ``g`` miles of each other.

    Link opacity is the 'OMENRAP_ALL' value stored for the link.

    Parameters:
    df (pd.DataFrame): Frame with 'LATITUDE', 'LONGITUDE' and 'OMENRAP_ALL'. Its
        index labels are used as matrix positions, so they must be the integers
        ``0 .. len(df) - 1``.
    g (float): Radius in miles (inclusive) below which two schools are linked.

    Raises:
    ValueError: If a required column is missing.
    """
    if not {'LATITUDE', 'LONGITUDE', 'OMENRAP_ALL'}.issubset(df.columns):
        raise ValueError("DataFrame must have 'LATITUDE', 'LONGITUDE', and 'OMENRAP_ALL' columns")

    n = len(df)
    distances = np.zeros((n, n))
    weights = np.zeros((n, n))

    # Measure each pair once and mirror it. weights[a, b] holds OMENRAP_ALL of school a.
    labelled_rows = list(df.iterrows())
    for offset, (i, row_i) in enumerate(labelled_rows):
        for j, row_j in labelled_rows[offset + 1:]:
            distance = great_circle((row_i['LATITUDE'], row_i['LONGITUDE']), (row_j['LATITUDE'], row_j['LONGITUDE'])).miles
            distances[i, j] = distances[j, i] = distance
            weights[i, j] = row_i['OMENRAP_ALL']
            weights[j, i] = row_j['OMENRAP_ALL']

    # Links between schools within g miles, without self-links.
    adjacency_matrix = (distances <= g).astype(int)
    np.fill_diagonal(adjacency_matrix, 0)

    # from_numpy_matrix was removed in networkx 3.0; from_numpy_array is its replacement.
    G = nx.from_numpy_array(adjacency_matrix)

    # Node positions as (lon, lat), edge weights from the matrix built above.
    nx.set_node_attributes(G, {i: {'pos': (row['LONGITUDE'], row['LATITUDE'])} for i, row in labelled_rows})
    nx.set_edge_attributes(G, {(i, j): {'weight': weights[i, j]} for i, j in G.edges})

    # Base map.
    fig, ax = plt.subplots(figsize=(20, 16), subplot_kw={'projection': ccrs.PlateCarree()})

    ax.coastlines()
    ax.add_feature(cfeature.OCEAN)
    ax.add_feature(cfeature.BORDERS, linestyle=':')
    ax.add_feature(cfeature.STATES.with_scale('10m'), linestyle=':', linewidth=0.33)
    ax.set_extent([-130, -65, 25, 45])
    ax.set_facecolor('white')

    pos = nx.get_node_attributes(G, 'pos')
    nx.draw_networkx_edges(G,
                           pos,
                           width = 1.0,
                           alpha = [d['weight'] for _, _, d in G.edges(data=True)],
                           edge_color = 'red',
                           ax = ax
                          )
    nx.draw_networkx_nodes(G, pos, node_size = 7.5, node_color='purple', alpha=0.8, ax=ax)

    plt.show()


plot_school_network(df, g = 60)


In [None]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Uses the haversine formula with an Earth radius of 6371 km. The arguments go
    through numpy functions only, so scalars and arrays (broadcast together) both work.

    Parameters:
    lat1, lon1: Latitude and longitude of the first point(s), in degrees.
    lat2, lon2: Latitude and longitude of the second point(s), in degrees.

    Returns:
    Distance(s) in kilometres, shaped like the broadcast inputs.
    """
    earth_radius_km = 6371

    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1-hav))

    return earth_radius_km * central_angle

In [None]:
# Working copy with the identification, location and rate columns only.
sub_df = df.loc[:, ['INSTNM', 'LATITUDE', 'LONGITUDE', 'STABBR', 'OMENRAP_ALL', 'TRANS_4', 'DTRANS_4']].copy()

sub_df["TRANS_4"] = sub_df["TRANS_4"].fillna(0)
sub_df["DTRANS_4"] = sub_df["DTRANS_4"].fillna(0).astype('int64')

# Proportion times denominator gives a count. Round before casting: astype('int64')
# truncates, so a float product such as 28.999999999999996 would become 28.
sub_df["FTFT_TRANS_4yr_num"] = (sub_df["TRANS_4"] * sub_df["DTRANS_4"]).fillna(0).round().astype('int64')

# rename() returns a new frame; its result was discarded before, so the columns
# were never renamed.
sub_df = sub_df.rename(columns = {"OMENRAP_ALL": "WITHDRAW_RATE", "TRANS_4": "FTFT_TRANS_4yr_prop", "DTRANS_4": "FTFT_TRANS_4yr_denom"})


# Pairwise haversine distances (km), one row of the matrix per school.
coords = sub_df[['LATITUDE', 'LONGITUDE']].to_numpy()
distances = np.zeros((len(sub_df), len(sub_df)))

for i in range(len(sub_df)):
    lat1, lon1 = coords[i]
    distances[i] = haversine(lat1, lon1, coords[:, 0], coords[:, 1])

# 0.6213712 converts kilometres to miles; rows and columns are labelled by INSTNM.
distance_df = pd.DataFrame(distances * 0.6213712, index = df["INSTNM"], columns = df["INSTNM"])

# melted_distance_df = distance_df.reset_index().melt(id_vars='INSTNM', var_name='INSTNM2', value_name='distance_miles')
# melted_distance_df = melted_distance_df.rename(columns={'INSTNM': 'INSTNM1'})

# melted_distance_df = melted_distance_df[melted_distance_df["distance_miles"] != 0]

# for idx, row in df.iterrows():
#     distances_within_g = melted_distance_df.loc[(((melted_distance_df['INSTNM1'] == row['INSTNM']) | (melted_distance_df['INSTNM2'] == row['INSTNM'])) & (melted_distance_df['distance_miles'] <= 500)), ['INSTNM1', 'INSTNM2', 'distance_miles']]

# distances_within_g

In [None]:
import pandas as pd

def count_values(df, col, k, comparison='min'):
    """
    Count the non-missing values of one column that lie strictly below or strictly
    above a comparison value.

    Previously 'min' returned None and 'max' counted the values below k; both now
    follow the documented meaning.

    Parameters:
    df (pd.DataFrame): The input DataFrame
    col: Label of the column to inspect
    k (float): The comparison value
    comparison (str): 'min' for less than k, 'max' for greater than k. Defaults to 'min'.

    Returns:
    int: Number of values in df[col] that are less than k ('min') or greater than k ('max').

    Raises:
    ValueError: If comparison is neither 'min' nor 'max'.
    """

    if comparison == 'min':
        return df[col][df[col] < k].count()
    elif comparison == 'max':
        return df[col][df[col] > k].count()
    else:
        raise ValueError("Invalid comparison value. Accepted values are 'min' or 'max'.")


# Replace the row labels of distance_df with the default 0..n-1 integer index.
distance_df = distance_df.reset_index(drop = True)


In [None]:
def count_universities_within_distance(distance_matrix, threshold):
    """Count, for every row, how many entries are at most ``threshold``.

    The comparison is vectorised; ``DataFrame.applymap`` is deprecated in recent
    pandas and the result is the same. If the matrix has a zero diagonal
    (self-distance), each row's count includes that entry.

    Parameters:
    distance_matrix (pd.DataFrame): Matrix of distances.
    threshold (float): Upper bound (inclusive).

    Returns:
    pd.Series: One count per row, indexed like ``distance_matrix``.
    """
    return (distance_matrix <= threshold).sum(axis=1).fillna(0)

# Number of entries per row of distance_df that are at most 500.
count_universities_within_distance(distance_df, 500)


In [None]:
# distance_df is built from df's rows in their current order, so its values belong
# to df by position. Assign plain arrays: assigning the Series returned by
# count_universities_within_distance aligns on index labels instead, which pairs
# the wrong rows whenever df.index is not 0..n-1 in order.
df["average_distance"] = distance_df.mean().to_numpy()

# Thresholds are in miles (distance_df holds miles); the counts include the
# school's own zero-distance entry.
for miles in (250, 60, 15):
    df[f"num_within_{miles}m"] = count_universities_within_distance(distance_df, threshold = miles).to_numpy()



df.dtypes #head(3)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt

def knn_regression(df, independent_vars, dependent_var, k):
    """Fit a k-nearest-neighbours regressor and plot actual against predicted values.

    The predictors are standardised, the model is fitted on all rows of ``df`` and
    then used to predict those same rows.

    Parameters:
    df (pd.DataFrame): Data containing the predictor and target columns.
    independent_vars (list): Names of the predictor columns.
    dependent_var (str): Name of the target column.
    k (int): Number of neighbours.

    Returns:
    np.ndarray: Predicted target value for every row of ``df``.
    """
    # Standardised predictors and the raw target.
    features = StandardScaler().fit_transform(df[independent_vars])
    target = df[dependent_var]

    # Fit, then predict the training rows themselves.
    model = KNeighborsRegressor(n_neighbors=k).fit(features, target)
    y_pred = model.predict(features)

    # Actual (x) against predicted (y), coloured by the predicted value.
    plt.figure(figsize=(10, 6))
    plt.scatter(target, y_pred, c = y_pred.astype('object'), alpha=0.5)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'KNN Regression with k={k}: Actual vs. Predicted {dependent_var}')
    plt.show()

    return y_pred

# Example usage
# NOTE(review): the three 'local_clust_coeff_*' columns listed below are created by a
# cell that appears further down in this notebook, so on a fresh top-to-bottom run
# this cell raises a KeyError unless that cell has been executed first.
dependent_var = 'OMENRAP_ALL'
independent_vars = ['local_clust_coeff_60', 'local_clust_coeff_110', 'local_clust_coeff_250', 'num_within_250m', 'num_within_60m'] # 'average_distance', 'num_within_250m', 'num_within_60m', , 'ADM_RATE_ALL'
# Missing predictor values are replaced with 0 in df itself before fitting.
for col in independent_vars:
    df[col] = df[col].fillna(0)

# Number of neighbours used by the regressor.
k = 5

# Stores the KNN predictions of OMENRAP_ALL in the column "k_cluster".
df["k_cluster"] = knn_regression(df, independent_vars, dependent_var, k)


In [None]:
# NOTE(review): melted_distance_df is not assigned by any active code in the cells
# above -- the only statements that create it are commented out -- so this cell
# raises a NameError on a fresh run. Confirm whether that code should be restored.
# Attach OMENRAP_ALL of the school named in INSTNM1 (left join on the school name).
melted_distance_df = pd.merge(left = melted_distance_df,
                              right = df.loc[:, ["INSTNM", "OMENRAP_ALL"]],
                              left_on = "INSTNM1",
                              right_on = "INSTNM",
                              how = "left").drop("INSTNM", axis = 1)

# Same join for the school named in INSTNM2.
melted_distance_df = pd.merge(left = melted_distance_df,
                              right = df.loc[:, ["INSTNM", "OMENRAP_ALL"]],
                              left_on = "INSTNM2",
                              right_on = "INSTNM",
                              how = "left").drop("INSTNM", axis = 1)

melted_distance_df


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def stratified_scatterplot(df, x_col, y_col, cat_col):
    """Scatter ``y_col`` against ``x_col`` with points coloured by ``cat_col``.

    Parameters:
    df (pd.DataFrame): Data to plot.
    x_col, y_col (str): Columns for the horizontal and vertical axes.
    cat_col (str): Column used for the hue.

    Raises:
    ValueError: If any of the three columns is missing from ``df``.
    """
    needed = {x_col, y_col, cat_col}
    if needed - set(df.columns):
        raise ValueError("DataFrame must have '{}', '{}', and '{}' columns".format(x_col, y_col, cat_col))

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_col, y=y_col, hue=cat_col, data=df, legend = True)
    plt.title('Scatterplot of {} vs. {} Stratified by {}'.format(x_col, y_col, cat_col))
    plt.show()

stratified_scatterplot(df, 'k_cluster', 'average_distance', 'CONTROL')

In [None]:
import numpy as np

def adjacency_matrix(df, g):
    """Build the 0/1 adjacency matrix linking schools within ``g`` miles of each other.

    Rows and columns follow the row order of ``df`` (position, not index label), so
    the result lines up with arrays assigned back to ``df`` as new columns. Index
    labels were used as positions before, which only worked for a 0..n-1 index and
    scrambled the rows when that index was not in order.

    Parameters:
    df (pd.DataFrame): Frame with 'LATITUDE' and 'LONGITUDE' columns.
    g (float): Radius in miles (inclusive).

    Returns:
    np.ndarray: Symmetric float matrix of shape ``(len(df), len(df))`` with a zero diagonal.

    Raises:
    ValueError: If a coordinate column is missing.
    """
    if not {'LATITUDE', 'LONGITUDE'}.issubset(df.columns):
        raise ValueError("DataFrame must have 'LATITUDE' and 'LONGITUDE' columns")

    coords = list(zip(df['LATITUDE'].to_numpy(), df['LONGITUDE'].to_numpy()))
    num_universities = len(coords)
    adj_matrix = np.zeros((num_universities, num_universities))

    # Distance is symmetric: measure each pair once and set both entries.
    for first in range(num_universities):
        for second in range(first + 1, num_universities):
            if great_circle(coords[first], coords[second]).miles <= g:
                adj_matrix[first, second] = 1
                adj_matrix[second, first] = 1

    return adj_matrix

def local_clustering_coefficients(adj_matrix):
    """Local clustering coefficient of every node of an adjacency matrix.

    For a node with degree k and T triangles the coefficient is 2*T / (k*(k-1));
    nodes with degree 0 or 1 get 0. T is taken as half the diagonal entry of the
    cubed matrix (each triangle is walked in both directions).

    Parameters:
    adj_matrix (array-like): Square adjacency matrix.

    Returns:
    np.ndarray: One float coefficient per node.
    """
    graph = np.asarray(adj_matrix)

    # Diagonal of A^3 = number of closed 3-step walks starting at each node.
    closed_walks = np.diagonal(graph @ graph @ graph)
    triangle_counts = closed_walks // 2

    node_degrees = graph.sum(axis=1)
    ordered_pairs = node_degrees * (node_degrees - 1)

    # Only nodes with at least two neighbours can have a non-zero coefficient.
    coeffs = np.zeros(len(graph))
    eligible = node_degrees > 1
    coeffs[eligible] = 2 * triangle_counts[eligible] / ordered_pairs[eligible]

    return coeffs


# Adjacency matrices at 60, 110 and 250 miles, built in that order.
adj_matrix_60, adj_matrix_110, adj_matrix_250 = (
    adjacency_matrix(df, g = radius) for radius in (60, 110, 250)
)

# One clustering-coefficient column per radius.
for radius, adj in ((60, adj_matrix_60), (110, adj_matrix_110), (250, adj_matrix_250)):
    df[f"local_clust_coeff_{radius}"] = local_clustering_coefficients(adj)

df.head(3)



In [None]:
# Display the adjacency matrix built with g = 60.
adj_matrix_60

In [None]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

def plot_network(matrix, matrix_type='adjacency', threshold=None):
    """Draw a graph, on a circular layout, from an adjacency or a distance matrix.

    Parameters:
    matrix (array-like): Square adjacency matrix, or square distance matrix.
    matrix_type (str): 'adjacency' (default) or 'distance'.
    threshold (float): Required for 'distance'; pairs at most this far apart are linked.

    Raises:
    ValueError: If matrix_type is unknown, or threshold is missing for a distance matrix.
    """
    if matrix_type not in ['adjacency', 'distance']:
        raise ValueError("matrix_type should be either 'adjacency' or 'distance'.")

    if matrix_type == 'distance':
        if threshold is None:
            raise ValueError("threshold value is required for distance matrix.")
        adjacency = (np.array(matrix) <= threshold).astype(int)
        # A node is at distance 0 from itself; without this every node gets a self-loop.
        np.fill_diagonal(adjacency, 0)
    else:
        adjacency = np.asarray(matrix)

    # from_numpy_matrix was removed in networkx 3.0; from_numpy_array is its replacement.
    G = nx.from_numpy_array(adjacency)

    pos = nx.circular_layout(G)
    nx.draw(G, pos, node_color='blue', with_labels = False)
    nx.draw_networkx_edges(G, pos, alpha=0.5)
    plt.show()

plot_network(adj_matrix_60)


In [None]:
# df["local_clust_coeff"] = coeffs

import plotnine as p9

# One 30-bin histogram per clustering-coefficient column (60, 110 and 250 miles),
# printed as a list.
print([
    p9.ggplot(p9.aes(x = df[column]))
        + p9.geom_histogram(bins = 30, fill = "lavender", color = "grey")
        + p9.theme_bw()
    for column in ("local_clust_coeff_60", "local_clust_coeff_110", "local_clust_coeff_250")
])

# df

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, probplot

def normality_test_and_visualizations(series):
    """Run a Shapiro-Wilk test on ``series`` and show a histogram and a Q-Q plot.

    Prints the test statistic, the p-value and a verdict at the 0.05 level, then
    draws a 30-bin histogram with a KDE overlay and a normal Q-Q plot.

    Parameters:
    series: One-dimensional numeric data.
    """
    # Shapiro-Wilk test for normality.
    stat, p = shapiro(series)
    print(f'Shapiro-Wilk test statistic: {stat:.5f}')
    print(f'p-value: {p:.5f}')

    verdict = (
        "Fail to reject the null hypothesis - the data may be normally distributed."
        if p > 0.05
        else "Reject the null hypothesis - the data may not be normally distributed."
    )
    print(verdict)

    # Histogram for visual inspection.
    plt.figure(figsize=(10, 6))
    sns.histplot(series, bins=30, color='blue', kde=True)
    plt.title('Histogram')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.show()

    # Q-Q plot against the normal distribution.
    plt.figure(figsize=(10, 6))
    probplot(series, dist='norm', plot=plt)
    plt.title('Q-Q Plot')
    plt.xlabel('Theoretical Quantiles')
    plt.ylabel('Ordered Values')
    plt.show()


normality_test_and_visualizations(df["OMENRAP_ALL"])

In [None]:
# Rows whose OMENRAP_ALL is exactly 0.
df.loc[df["OMENRAP_ALL"].eq(0)]

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import probplot

# Candidate predictor columns. 'ADM_RATE' was listed twice; selecting df[indep_vars]
# with a repeated name yields a duplicated column, so it is kept once.
indep_vars = ['HCM2', 'MAIN', 'NUMBRANCH', 'PREDDEG', 'HIGHDEG', 'CONTROL', 'ST_FIPS', 'REGION', 'LOCALE', 'CCBASIC', 'CCUGPROF', 'CCSIZSET', 'ADM_RATE', 'ADM_RATE_ALL', 'SATVR25', 'SATVR75', 'SATMT25', 'SATMT75', 'SATVRMID', 'SATMTMID', 'ACTCM25', 'ACTCM75', 'ACTEN25', 'ACTEN75', 'ACTMT25', 'ACTMT75', 'ACTCMMID', 'ACTENMID', 'ACTMTMID', 'SAT_AVG', 'SAT_AVG_ALL', 'PCIP01', 'PCIP03', 'PCIP04', 'PCIP05', 'PCIP09', 'PCIP10', 'PCIP11', 'PCIP12', 'PCIP13', 'PCIP14', 'PCIP15', 'PCIP16', 'PCIP19', 'PCIP22', 'PCIP23', 'PCIP24', 'PCIP25', 'PCIP26', 'PCIP27', 'PCIP29', 'PCIP30', 'PCIP31', 'PCIP38', 'PCIP39', 'PCIP40', 'PCIP41', 'PCIP42', 'PCIP43', 'PCIP44', 'PCIP45', 'PCIP46', 'PCIP47', 'PCIP48', 'PCIP49', 'PCIP50', 'PCIP51', 'PCIP52', 'PCIP54', 'COSTT4_A', 'TUITIONFEE_IN', 'TUITIONFEE_OUT', 'TUITFTE', 'INEXPFTE', 'AVGFACSAL', 'PFTFAC', 'PCTPELL', 'OPENADMP', 'UGNONDS', 'GRADS']





In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import cartopy.crs as ccrs
import cartopy.feature as cfeature

def create_colored_graph_v3(df, state_col='state'):
    """Plot every institution as a dot on an Albers Equal Area map, coloured by state.

    Institutions are stored as nodes keyed by 'INSTNM'. Each distinct value of
    ``state_col`` gets one colour from seaborn's "colorblind" palette.

    Parameters:
    df (pd.DataFrame): Frame with 'INSTNM', 'LATITUDE', 'LONGITUDE' and ``state_col``.
    state_col (str): Column holding the state of each institution.

    Raises:
    ValueError: If a required column is missing.
    """
    if not {'INSTNM', 'LATITUDE', 'LONGITUDE', state_col}.issubset(df.columns):
        raise ValueError("DataFrame must have 'INSTNM', 'LATITUDE', 'LONGITUDE', and '{}' columns".format(state_col))

    G = nx.Graph()

    # Sets seaborn's default palette (global setting).
    sns.set_palette("husl")

    # One node per institution name, carrying its coordinates and state.
    for _, record in df.iterrows():
        G.add_node(record['INSTNM'], lat=record['LATITUDE'], lon=record['LONGITUDE'], state=record[state_col])

    # One palette entry per distinct state, in order of first appearance.
    unique_states = df[state_col].unique()
    palette = sns.color_palette("colorblind", len(unique_states))
    state_colors = dict(zip(unique_states, palette))

    projection = ccrs.AlbersEqualArea(central_longitude=-98.35, central_latitude=39.50)

    plt.figure(figsize=(16, 12))
    ax = plt.axes(projection=projection)

    # Map background.
    ax.add_feature(cfeature.LAND)
    # ax.add_feature(cfeature.OCEAN)
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle=':')
    ax.add_feature(cfeature.STATES.with_scale('110m'), linestyle=':', linewidth=0.33)
    ax.set_extent([-155, -65, 10, 75])

    # One marker per node; coordinates are plain lon/lat, hence the PlateCarree transform.
    for _, attrs in G.nodes(data=True):
        plt.plot(attrs['lon'], attrs['lat'], marker='o', color=state_colors[attrs['state']], markersize=2.5, transform=ccrs.PlateCarree())

    plt.title('Graph of Universities Colored by State')
    plt.show()

# Example usage:
# df = pd.read_csv("your_college_data.csv")
# create_colored_graph_v3(df, state_col='state')

create_colored_graph_v3(sub_df, state_col='STABBR')

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import cartopy.crs as ccrs
import cartopy.feature as cfeature

def create_colored_graph_v2(df, state_col='state'):
    """Plot every institution as a dot on an Albers Equal Area map, coloured by state.

    Institutions are stored as nodes keyed by 'INSTNM'. Each distinct value of
    ``state_col`` gets one colour from seaborn's "coolwarm" palette.

    Parameters:
    df (pd.DataFrame): Frame with 'INSTNM', 'LATITUDE', 'LONGITUDE' and ``state_col``.
    state_col (str): Column holding the state of each institution.

    Raises:
    ValueError: If a required column is missing.
    """
    if not {'INSTNM', 'LATITUDE', 'LONGITUDE', state_col}.issubset(df.columns):
        raise ValueError("DataFrame must have 'INSTNM', 'LATITUDE', 'LONGITUDE', and '{}' columns".format(state_col))

    G = nx.Graph()

    # Sets seaborn's default palette (global setting).
    sns.set_palette("colorblind")

    # One node per institution name, carrying its coordinates and state.
    for _, record in df.iterrows():
        G.add_node(record['INSTNM'], lat=record['LATITUDE'], lon=record['LONGITUDE'], state=record[state_col])

    # One palette entry per distinct state, in order of first appearance.
    unique_states = df[state_col].unique()
    palette = sns.color_palette("coolwarm", len(unique_states))
    state_colors = dict(zip(unique_states, palette))

    projection = ccrs.AlbersEqualArea(central_longitude=-98.35, central_latitude=39.50)

    plt.figure(figsize=(12, 8))
    ax = plt.axes(projection=projection)

    # Map background.
    ax.add_feature(cfeature.LAND)
    ax.add_feature(cfeature.OCEAN)
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle=':')
    ax.set_extent([-130, -70, 20, 60])

    # One marker per node; coordinates are plain lon/lat, hence the PlateCarree transform.
    for _, attrs in G.nodes(data=True):
        plt.plot(attrs['lon'], attrs['lat'], marker='o', color=state_colors[attrs['state']], markersize=5, transform=ccrs.PlateCarree())

    plt.title('Graph of Universities Colored by State')
    plt.show()

# Example usage:
# df = pd.read_csv("your_college_data.csv")
# create_colored_graph_v2(df, state_col='state')

create_colored_graph_v2(sub_df, state_col='STABBR')

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

def create_colored_graph(df, state_col='state'):
    """Draw every institution as an unconnected node at (longitude, latitude), coloured by state.

    Institutions are stored as nodes keyed by 'INSTNM'. Each distinct value of
    ``state_col`` gets one colour from seaborn's "coolwarm" palette.

    Parameters:
    df (pd.DataFrame): Frame with 'INSTNM', 'LATITUDE', 'LONGITUDE' and ``state_col``.
    state_col (str): Column holding the state of each institution.

    Raises:
    ValueError: If a required column is missing.
    """
    if not {'INSTNM', 'LATITUDE', 'LONGITUDE', state_col}.issubset(df.columns):
        raise ValueError("DataFrame must have 'INSTNM', 'LATITUDE', 'LONGITUDE', and '{}' columns".format(state_col))

    G = nx.Graph()

    # Sets seaborn's default palette (global setting).
    sns.set_palette("husl")

    # One node per institution name, positioned at (lon, lat) and tagged with its state.
    for _, record in df.iterrows():
        G.add_node(record['INSTNM'], pos=(record['LONGITUDE'], record['LATITUDE']), state=record[state_col])

    # One palette entry per distinct state, in order of first appearance.
    unique_states = df[state_col].unique()
    palette = sns.color_palette("coolwarm", len(unique_states))
    state_colors = dict(zip(unique_states, palette))

    # Colours in node order, as expected by draw_networkx.
    node_colors = [state_colors[attrs['state']] for _, attrs in G.nodes(data=True)]

    plt.figure(figsize=(12, 8))
    nx.draw_networkx(G, pos=nx.get_node_attributes(G, 'pos'), node_size=50, node_color=node_colors, with_labels=False)
    plt.title('Graph of Universities Colored by State')
    plt.axis('off')
    plt.show()

create_colored_graph(sub_df, state_col='STABBR')

In [None]:
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import great_circle
import networkx as nx
import folium


def INSTNM_distance_analysis_v2(df, g, k, k_kappa="max"):
    """Analyse pairwise great-circle distances between institutions and map them.

    The function (1) computes the distance in miles between every unordered
    pair of rows, (2) plots a histogram of those distances, (3) computes, for
    each row's institution, the mean distance to and the number of
    institutions within ``g`` miles, (4) scatter-plots those two statistics,
    (5) selects ``k`` institutions by average distance, (6) draws a heatmap of
    the distances among the selected institutions, and (7) plots every
    institution, plus edges among the selected ones, on a PlateCarree map.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'INSTNM', 'LATITUDE' and 'LONGITUDE'. Institutions are
        matched by name, so repeated names are pooled together.
    g : number
        Radius in miles used for the per-institution statistics.
    k : int
        How many institutions to select.
    k_kappa : {"max", 1, "1", "largest", "large", "min", 0, "0", "smallest", "small"}
        Whether to select the ``k`` largest ("max" family, the default) or
        the ``k`` smallest ("min" family) average distances.

    Returns
    -------
    distance_df : pandas.DataFrame
        Columns 'INSTNM1', 'INSTNM2', 'distance_miles'; one row per unordered
        pair of rows of ``df``, with 'INSTNM1' being the earlier row.
    INSTNM_stats_df : pandas.DataFrame
        Columns 'INSTNM', 'average_distance', 'num_schools_within_g'; one row
        per row of ``df``. 'average_distance' is NaN when nothing lies within
        ``g`` miles.
    top_k_universities : pandas.Series
        Names of the selected institutions.

    Raises
    ------
    ValueError
        If a required column is missing, or if ``k_kappa`` is not one of the
        accepted values. (Previously an invalid ``k_kappa`` only printed a
        message and then failed later on an unbound local variable.)
    """
    if not {'INSTNM', 'LATITUDE', 'LONGITUDE'}.issubset(df.columns):
        raise ValueError("DataFrame must have 'INSTNM', 'LATITUDE', and 'LONGITUDE' columns")

    # Resolve the ranking mode before doing any expensive work, so a typo
    # fails immediately instead of after the O(n^2) distance computation.
    largest_aliases = ("max", 1, "1", "largest", "large")
    smallest_aliases = ("min", 0, "0", "smallest", "small")
    if k_kappa in largest_aliases:
        pick_largest = True
    elif k_kappa in smallest_aliases:
        pick_largest = False
    else:
        raise ValueError(
            "k_kappa Error must be one of [max, 1, largest, large, min, 0, smallest, small]; "
            "got {!r}".format(k_kappa)
        )

    # Calculate distances between universities. Pairs are enumerated by row
    # position (i < j) rather than by comparing index labels, so every
    # unordered pair is produced exactly once even if the index has repeated
    # or unsorted labels.
    names = df['INSTNM'].tolist()
    points = list(zip(df['LATITUDE'].tolist(), df['LONGITUDE'].tolist()))  # (lat, lon) as geopy expects
    distances = [
        [names[i], names[j], great_circle(points[i], points[j]).miles]
        for i in range(len(names))
        for j in range(i + 1, len(names))
    ]

    distance_df = pd.DataFrame(distances, columns=['INSTNM1', 'INSTNM2', 'distance_miles'])

    # Create a histogram of distances
    plt.figure(figsize=(10, 6))
    plt.hist(distance_df['distance_miles'], bins='auto')
    plt.xlabel('Distance (miles)')
    plt.ylabel('Frequency')
    plt.title('Histogram of Distances between Universities')
    plt.show()

    # Calculate the average distance and number of schools within 'g' miles for each INSTNM.
    # The radius filter does not depend on the institution, so apply it once.
    within_g = distance_df.loc[distance_df['distance_miles'] <= g]
    INSTNM_stats = []
    for name in names:
        touches_name = (within_g['INSTNM1'] == name) | (within_g['INSTNM2'] == name)
        nearby = within_g.loc[touches_name, 'distance_miles']
        INSTNM_stats.append([name, nearby.mean(), nearby.count()])

    INSTNM_stats_df = pd.DataFrame(INSTNM_stats, columns=['INSTNM', 'average_distance', 'num_schools_within_g'])

    # Create a scatterplot of average distance vs. number of schools within 'g' miles
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=INSTNM_stats_df, x='num_schools_within_g', y='average_distance')
    plt.title(f'Average Distance vs. Number of Schools within {g} miles')
    plt.show()

    # Get the 'k' universities with the largest or smallest average distance
    if pick_largest:
        top_k_universities = INSTNM_stats_df.nlargest(k, 'average_distance')['INSTNM']
    else:
        top_k_universities = INSTNM_stats_df.nsmallest(k, 'average_distance')['INSTNM']

    # Create a heatmap of distances for the selected 'k' universities
    top_k_distance = distance_df.loc[(distance_df['INSTNM1'].isin(top_k_universities)) & (distance_df['INSTNM2'].isin(top_k_universities))]
    heatmap_data = top_k_distance.pivot_table(index='INSTNM1', columns='INSTNM2', values='distance_miles')

    plt.figure(figsize=(10, 6))
    sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt='.1f')
    plt.title(f'Heatmap of Distances between Top {k} Universities')
    plt.show()

    # Plot positions are (x, y) = (longitude, latitude) for matplotlib/cartopy
    edges = top_k_distance[['INSTNM1', 'INSTNM2']].values.tolist()
    pos = {name: (lon, lat) for name, (lat, lon) in zip(names, points)}

    # Create a simple US map with vertices and edges
    fig = plt.figure(figsize=(12, 6))
    ax = fig.add_subplot(111, projection=ccrs.PlateCarree())

    ax.set_extent([-130, -65, 22, 50])
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle=':')
    ax.add_feature(cfeature.STATES, linestyle=':')

    # Add vertices (every institution in df, not only the selected ones)
    for x, y in pos.values():
        ax.plot(x, y, marker='o', markersize=6, markeredgewidth=1, markeredgecolor='k', markerfacecolor='blue', alpha=0.8, transform=ccrs.PlateCarree())

    # Add edges among the selected institutions
    for INSTNM1, INSTNM2 in edges:
        lon1, lat1 = pos[INSTNM1]
        lon2, lat2 = pos[INSTNM2]
        ax.plot([lon1, lon2], [lat1, lat2], linewidth=2, color='blue', alpha=0.6, transform=ccrs.PlateCarree())

    plt.title("Vertices and Edges for Top {} Universities".format(k))
    plt.show()

    return distance_df, INSTNM_stats_df, top_k_universities

# 500-mile radius; k_kappa="min" selects the 25 schools with the smallest average distance.
distance_df, INSTNM_stats_df, top_k_universities = INSTNM_distance_analysis_v2(sub_df, g = 500, k = 25, k_kappa = "min")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import great_circle
import networkx as nx
import folium

def INSTNM_distance_analysis(df, g, k, k_kappa = "max"):
    """Analyse pairwise great-circle distances and save an interactive folium map.

    The function (1) computes the distance in miles between every unordered
    pair of rows, (2) plots a histogram of those distances, (3) computes, for
    each row's institution, the mean distance to and the number of
    institutions within ``g`` miles, (4) scatter-plots those two statistics,
    (5) selects ``k`` institutions by average distance, (6) draws a heatmap of
    the distances among the selected institutions, and (7) writes a folium map
    with a circle for every institution and lines among the selected ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'INSTNM', 'LATITUDE' and 'LONGITUDE'. Institutions are
        matched by name, so repeated names are pooled together.
    g : number
        Radius in miles used for the per-institution statistics.
    k : int
        How many institutions to select.
    k_kappa : {"max", 1, "1", "largest", "large", "min", 0, "0", "smallest", "small"}
        Whether to select the ``k`` largest ("max" family, the default) or
        the ``k`` smallest ("min" family) average distances.

    Returns
    -------
    distance_df : pandas.DataFrame
        Columns 'INSTNM1', 'INSTNM2', 'distance_miles'; one row per unordered
        pair of rows of ``df``, with 'INSTNM1' being the earlier row.
    INSTNM_stats_df : pandas.DataFrame
        Columns 'INSTNM', 'average_distance', 'num_schools_within_g'.
    top_k_universities : pandas.Series
        Names of the selected institutions.

    Raises
    ------
    ValueError
        If a required column is missing, or if ``k_kappa`` is not one of the
        accepted values. (Previously an invalid ``k_kappa`` only printed a
        message and then failed later on an unbound local variable.)

    Side effects
    ------------
    Writes 'INSTNM_map.html' to the current working directory and prints a
    confirmation message.

    Notes
    -----
    A later cell of this notebook defines another function with this same
    name but a different signature; once that cell has run, this definition
    is shadowed.
    """
    if not {'INSTNM', 'LATITUDE', 'LONGITUDE'}.issubset(df.columns):
        raise ValueError("DataFrame must have 'INSTNM', 'LATITUDE', and 'LONGITUDE' columns")

    # Resolve the ranking mode before doing any expensive work, so a typo
    # fails immediately instead of after the O(n^2) distance computation.
    largest_aliases = ("max", 1, "1", "largest", "large")
    smallest_aliases = ("min", 0, "0", "smallest", "small")
    if k_kappa in largest_aliases:
        pick_largest = True
    elif k_kappa in smallest_aliases:
        pick_largest = False
    else:
        raise ValueError(
            "k_kappa Error must be one of [max, 1, largest, large, min, 0, smallest, small]; "
            "got {!r}".format(k_kappa)
        )

    # Calculate distances between universities. Pairs are enumerated by row
    # position (i < j) rather than by comparing index labels, so every
    # unordered pair is produced exactly once even if the index has repeated
    # or unsorted labels.
    names = df['INSTNM'].tolist()
    points = list(zip(df['LATITUDE'].tolist(), df['LONGITUDE'].tolist()))  # (lat, lon)
    distances = [
        [names[i], names[j], great_circle(points[i], points[j]).miles]
        for i in range(len(names))
        for j in range(i + 1, len(names))
    ]

    distance_df = pd.DataFrame(distances, columns=['INSTNM1', 'INSTNM2', 'distance_miles'])

    # Create a histogram of distances
    plt.figure(figsize=(10, 6))
    plt.hist(distance_df['distance_miles'], bins='auto')
    plt.xlabel('Distance (miles)')
    plt.ylabel('Frequency')
    plt.title('Histogram of Distances between Universities')
    plt.show()

    # Calculate the average distance and number of schools within 'g' miles for each INSTNM.
    # The radius filter does not depend on the institution, so apply it once.
    within_g = distance_df.loc[distance_df['distance_miles'] <= g]
    INSTNM_stats = []
    for name in names:
        touches_name = (within_g['INSTNM1'] == name) | (within_g['INSTNM2'] == name)
        nearby = within_g.loc[touches_name, 'distance_miles']
        INSTNM_stats.append([name, nearby.mean(), nearby.count()])

    INSTNM_stats_df = pd.DataFrame(INSTNM_stats, columns=['INSTNM', 'average_distance', 'num_schools_within_g'])

    # Create a scatterplot of average distance vs. number of schools within 'g' miles
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=INSTNM_stats_df, x='num_schools_within_g', y='average_distance')
    plt.title(f'Average Distance vs. Number of Schools within {g} miles')
    plt.show()

    # Get the 'k' universities with the largest or smallest average distance
    if pick_largest:
        top_k_universities = INSTNM_stats_df.nlargest(k, 'average_distance')['INSTNM']
    else:
        top_k_universities = INSTNM_stats_df.nsmallest(k, 'average_distance')['INSTNM']

    # Create a heatmap of distances for the selected 'k' universities
    top_k_distance = distance_df.loc[(distance_df['INSTNM1'].isin(top_k_universities)) & (distance_df['INSTNM2'].isin(top_k_universities))]
    heatmap_data = top_k_distance.pivot_table(index='INSTNM1', columns='INSTNM2', values='distance_miles')

    plt.figure(figsize=(10, 6))
    sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt='.1f')
    plt.title(f'Heatmap of Distances between Top {k} Universities')
    plt.show()

    # Create a cartographic map with a vertex for each INSTNM and edges among the selected universities.
    # folium takes coordinates as (latitude, longitude); the previous code
    # passed (longitude, latitude), which put every marker and line in the wrong place.
    edges = top_k_distance[['INSTNM1', 'INSTNM2']].values.tolist()
    pos = dict(zip(names, points))  # name -> (lat, lon)

    # Calculate the center of the map
    center_LATITUDE = df['LATITUDE'].mean()
    center_LONGITUDE = df['LONGITUDE'].mean()

    # Named school_map (not `map`) so the builtin is not shadowed
    school_map = folium.Map(location=[center_LATITUDE, center_LONGITUDE], zoom_start=5)

    # Add edges, then vertices, to the map
    for INSTNM1, INSTNM2 in edges:
        coordinates = [pos[INSTNM1], pos[INSTNM2]]
        folium.PolyLine(coordinates, color='blue', weight=5, opacity=1).add_to(school_map)

    for INSTNM, location in pos.items():
        folium.Circle(location, radius=1000, popup=INSTNM, color='blue', fill=True, fill_color='blue', fill_opacity=1).add_to(school_map)

    school_map.save("INSTNM_map.html")
    print("Cartographic map with vertices and edges saved as 'INSTNM_map.html'")

    return distance_df, INSTNM_stats_df, top_k_universities


# 500-mile radius; k_kappa="min" selects the 10 schools with the smallest average distance.
# Also writes INSTNM_map.html; the returned tuple is not assigned, so it is shown as the cell output.
INSTNM_distance_analysis(sub_df, g = 500, k = 10, k_kappa = "min")



In [None]:
%%time

import pandas as pd
import numpy as np

def haversine(lat1, lon1, lat2, lon2, radius=6371):
    """Great-circle distance between points given in decimal degrees.

    All four coordinate arguments may be scalars or NumPy arrays; they are
    combined with ordinary NumPy broadcasting, so one point can be compared
    against a whole array of points in a single call.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    radius : float, default 6371
        Sphere radius. The result is in the same unit; the default is the
        Earth's mean radius in kilometres.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) along the sphere's surface, in the unit of ``radius``.
    """
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Rounding can push `a` a hair outside [0, 1] (e.g. for antipodal points),
    # which would make sqrt(1 - a) NaN; clamp it back into the valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return radius * c

# Replace 'sub_df' with your actual DataFrame
KM_TO_MILES = 0.6213712  # haversine() returns kilometres by default

coords = sub_df[['LATITUDE', 'LONGITUDE']].to_numpy()
distances = np.zeros((len(sub_df), len(sub_df)))

# Fill the matrix one row at a time: row i holds the distance from school i
# to every school (including itself, which is 0).
for i in range(len(sub_df)):
    lat1, lon1 = coords[i]
    distances[i] = haversine(lat1, lon1, coords[:, 0], coords[:, 1])

# Label rows/columns with the names from sub_df -- the frame the coordinates
# came from. Using df["INSTNM"] here mislabels the matrix (or raises a shape
# error) whenever sub_df is a strict subset of df.
distance_df = pd.DataFrame(distances * KM_TO_MILES, index = sub_df["INSTNM"], columns = sub_df["INSTNM"])

# Long format: one row per ordered pair (INSTNM1, INSTNM2)
melted_distance_df = distance_df.reset_index().melt(id_vars='INSTNM', var_name='INSTNM2', value_name='distance_miles')
melted_distance_df = melted_distance_df.rename(columns={'INSTNM': 'INSTNM1'})

# Drop zero distances: removes each school's pairing with itself (and any
# pair of schools that share identical coordinates).
melted_distance_df = melted_distance_df[melted_distance_df["distance_miles"] != 0]

print(f"{melted_distance_df.shape}")
melted_distance_df.head(3)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def INSTNM_distance_analysis(df, n = 25, ):
    """Plot a distance histogram and a heatmap for the ``n`` most remote institutions.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format pair table with columns 'INSTNM1', 'INSTNM2' and
        'distance_miles'. Both a symmetric table (each pair listed in both
        directions) and a one-directional table are accepted.
    n : int, default 25
        Number of institutions, ranked by highest average distance, to show
        in the heatmap.

    Returns
    -------
    None
        Both figures are shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If a required column is missing.

    Notes
    -----
    An institution's average distance pools every row in which it appears in
    either name column. The earlier formula added the two per-column group
    means, which is NaN (and therefore excluded from the ranking) for any
    institution that appears in only one of the columns; for a symmetric
    table both formulas agree.

    This definition reuses the name of a function defined in an earlier cell
    (which takes ``df, g, k, k_kappa``) and replaces it once this cell runs.
    """
    # Ensure input DataFrame has the required columns
    if not {'INSTNM1', 'INSTNM2', 'distance_miles'}.issubset(df.columns):
        raise ValueError("DataFrame must have 'INSTNM1', 'INSTNM2', and 'distance_miles' columns")

    # Create a histogram of distances
    plt.figure(figsize=(10, 6))
    plt.hist(df['distance_miles'], bins='auto')
    plt.xlabel('Distance (miles)')
    plt.ylabel('Frequency')
    plt.title('Histogram of Distances between Universities')
    plt.show()

    # Calculate the average distance for each INSTNM, counting a pair for
    # both of its endpoints regardless of which column holds which name
    per_endpoint = pd.concat(
        [
            df[['INSTNM1', 'distance_miles']].rename(columns={'INSTNM1': 'INSTNM'}),
            df[['INSTNM2', 'distance_miles']].rename(columns={'INSTNM2': 'INSTNM'}),
        ],
        ignore_index=True,
    )
    avg_distance = per_endpoint.groupby('INSTNM')['distance_miles'].mean()

    # Get the top n universities with the highest average distance
    top_universities = avg_distance.nlargest(n).index

    # Keep only the pairs whose two endpoints are both in the top n
    top_distance = df.loc[(df['INSTNM1'].isin(top_universities)) & (df['INSTNM2'].isin(top_universities))]

    # Pivot the DataFrame to create a matrix suitable for heatmap
    heatmap_data = top_distance.pivot_table(index='INSTNM1', columns='INSTNM2', values='distance_miles', aggfunc='mean')

    # Generate the heatmap; the title reports the n actually requested
    plt.figure(figsize=(12, 10))
    sns.heatmap(heatmap_data, annot=True, cmap='coolwarm', fmt='.1f', linewidths=.5)
    plt.title(f'Heatmap of Distances between Top {n} Universities')
    plt.show()

# Uses the default n=25 on the long-format pair table built above.
INSTNM_distance_analysis(melted_distance_df)

In [None]:
# Smallest pairwise distance in miles; exact zeros were filtered out when melted_distance_df was built.
melted_distance_df.distance_miles.min()

In [None]:
# Mean pairwise distance in miles over all rows of melted_distance_df.
melted_distance_df.distance_miles.mean()

In [None]:
# Summary statistics of the pairwise distances in miles.
melted_distance_df.distance_miles.describe()