# Permutation testing
Null: there is no relationship between location of flares and the CA minority population ('MINORPOP').
Alt: there is a relationship between flare location and the CA minority population.

Statistic: difference in proportion of MINORPOP within a 5km buffer of any flare versus the MINORPOP proportion outside the flares buffer.

1) Calculate the actual test statistic  
2) Randomize the flares location, restricted to only CA  
3) Calculate buffer zones  
4) Overlay the block groups with the new buffers  
5) Calculate the statistics based on the new overlay  
6) Put the statistics into a new DataFrame for storage  
7) Visualize the statistics with a density plot

In [None]:
# randomizing the point geometries

def _sample_points_in_polygons(polygons, n_points, max_rounds=100):
    """Draw `n_points` random points that fall inside any polygon of `polygons`.

    Uses rejection sampling: candidates are drawn uniformly from the bounding
    box of `polygons` and kept only when they intersect at least one polygon.

    Parameters
    ----------
    polygons : geopandas.GeoDataFrame
        Frame whose active geometry holds the polygons to sample within.
    n_points : int
        Number of points to return.
    max_rounds : int, default 100
        Upper bound on sampling rounds, so a degenerate input cannot loop forever.

    Returns
    -------
    list
        `n_points` point geometries in the CRS of `polygons`.

    Raises
    ------
    ValueError
        If `polygons` is empty while points are requested.
    RuntimeError
        If fewer than `n_points` points were accepted after `max_rounds`.
    """
    if n_points == 0:
        return []
    if polygons.empty:
        raise ValueError("cannot sample points: no polygons were supplied")

    minx, miny, maxx, maxy = polygons.total_bounds
    accepted = []
    for _ in range(max_rounds):
        n_missing = n_points - len(accepted)
        if n_missing <= 0:
            break
        # Oversample: part of the bounding box lies outside the polygons.
        n_draw = 2 * n_missing + 16
        candidates = gp.GeoDataFrame(
            geometry=gp.points_from_xy(
                np.random.uniform(minx, maxx, n_draw),
                np.random.uniform(miny, maxy, n_draw),
            ),
            crs=polygons.crs,
        )
        # sjoin defaults to the 'intersects' predicate; a candidate that hits
        # several polygons shows up more than once, hence unique().
        hits = gp.sjoin(candidates, polygons, how="inner")
        accepted.extend(candidates.geometry.loc[hits.index.unique()].tolist())

    if len(accepted) < n_points:
        raise RuntimeError(
            f"only {len(accepted)} of {n_points} random points fell inside "
            f"the polygons after {max_rounds} rounds"
        )
    return accepted[:n_points]


def randomize_flares(points_df, bg_df):
    """Simulate one permutation: random flare sites overlaid on block groups.

    Every flare in `points_df` is given a new random location inside the
    block-group polygons of `bg_df`, a 5 km buffer is drawn around it, and the
    buffers are intersected with the block groups.

    Parameters
    ----------
    points_df : geopandas.GeoDataFrame
        Flare points. Only the row count and the non-geometry attributes are
        used; the original locations are discarded.
    bg_df : geopandas.GeoDataFrame
        Block-group polygons with 'ACSTOTPOP' and 'MINORPOP' columns.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per (buffer, block group) intersection with the added columns
        'intersection_area', 'bg_5kbuf_prop', 'bg_5kbuf_out_prop',
        'ACSTOTPOP_bg_totprop' and 'MINORPOP_bg_totprop'.

    Notes
    -----
    Relies on the module-level `meters_crs`, presumably a projected CRS in
    metres (TODO confirm), so that 5000 means 5 km and areas are in m^2.
    NOTE(review): a block group touched by several buffers appears in several
    rows; dissolve the buffers first if each block group should count once.
    """
    # match crs (to_crs returns new frames, so the caller's data is untouched)
    points_df = points_df.to_crs(meters_crs)
    bg_df = bg_df.to_crs(meters_crs)

    # Record each block group's full area BEFORE the overlay so it travels with
    # its row. Dividing by bg_df.geometry.area afterwards would align on index
    # labels of two unrelated frames and pair areas with the wrong rows.
    bg_df["bg_total_area_m2"] = bg_df.geometry.area

    # Shuffling the existing points only reorders rows and leaves the set of
    # locations unchanged, so draw genuinely new locations inside the polygons.
    bg_polygons = gp.GeoDataFrame(geometry=bg_df.geometry.values, crs=bg_df.crs)
    random_points = gp.GeoSeries(
        _sample_points_in_polygons(bg_polygons, len(points_df)),
        crs=points_df.crs,
    )

    # set 5km buffers around randomized points and make them the geometry that
    # is overlaid (overlaying the bare points yields zero-area intersections)
    flare_attrs = pd.DataFrame(
        points_df.drop(columns=points_df.geometry.name)
    ).reset_index(drop=True)
    flare_buffers = gp.GeoDataFrame(
        flare_attrs,
        geometry=random_points.buffer(distance=5000).values,
        crs=points_df.crs,
    )

    bg_flares_sim = gp.overlay(flare_buffers, bg_df, how='intersection')

    # Calculate the area of each block group within the buffer zone
    bg_flares_sim['intersection_area'] = bg_flares_sim.geometry.area

    # Proportion of each block group within the buffer zone; the clip only
    # guards against floating-point overshoot and zero-area block groups.
    bg_flares_sim['bg_5kbuf_prop'] = (
        bg_flares_sim['intersection_area'] / bg_flares_sim['bg_total_area_m2']
    ).clip(0, 1)

    # calculate the outside-buffer proportion
    bg_flares_sim['bg_5kbuf_out_prop'] = 1 - bg_flares_sim['bg_5kbuf_prop']

    # find overall proportions for each demo var by dividing their count by
    # the block group's total population
    demo_vars = ['ACSTOTPOP', 'MINORPOP']
    for var in demo_vars:
        bg_flares_sim[var + '_bg_totprop'] = bg_flares_sim[var] / bg_flares_sim['ACSTOTPOP']

    return bg_flares_sim

In [None]:
def get_buffer_proportion(df):
    """Return the test statistic for one overlay result.

    Takes the mean of the 'MINORPOP_bg_totprop' column and subtracts its
    complement (1 - mean) from it.

    NOTE(review): the complement is derived from the same rows rather than
    from block groups outside the buffers -- confirm this is the intended
    "inside vs. outside" comparison.
    """
    minority_share = df['MINORPOP_bg_totprop'].mean()
    return minority_share - (1 - minority_share)

In [None]:
### The permutations appear to run, but the result is meaningless because the random points are not yet restricted to CA.
### Also need to make sure I understand coordinate reference systems (CRSs) and that I am converting correctly between geographic and projected CRSs throughout the project.
  
### Also getting "MemoryError: Unable to allocate 3.64 PiB for an array with shape (512769447408461,) and data type float64" when running more than 3 permutations. Look into restructuring the code so that it is less memory-intensive.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Observed statistic from the real flare/block-group overlay.
actual_value = get_buffer_proportion(bg_flares)

n_sim = 3

# Row 0 holds the observed statistic; rows 1..n_sim are filled in below.
results = pd.DataFrame({'statistic': np.concatenate(([actual_value], np.full(n_sim, np.nan)))})

for i in range(1, n_sim + 1):
    bg_flares_sim = randomize_flares(ca_flares, bg_formodel)
    results.loc[i, 'statistic'] = get_buffer_proportion(bg_flares_sim)

# Two-sided p-value: share of statistics at least as extreme in magnitude as
# the observed one. The absolute value must wrap the statistics themselves,
# not the boolean result of the comparison. The observed value (row 0) stays
# in the reference set, so the p-value can never be exactly 0.
# NOTE(review): comparing magnitudes assumes the null distribution is centred
# near 0 -- confirm for the statistic in use.
pval = (np.abs(results['statistic']) >= np.abs(actual_value)).mean()

sns.histplot(results['statistic'], kde=True, color='blue')
plt.axvline(x=actual_value, color='red')
plt.show()
print(f' P value is {pval}')
print(f' Orig diff in proportions is {round(actual_value, 2)}')