### Number of protests within given distance
Number of protests that happened in the past week within a given distance. This uses the weekly dataset, where all locations are recorded every week, with an indicator for whether a protest happened there or not.

In [26]:
import pandas as pd
import numpy as np

from collections import Counter
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation warnings raised by the libraries used below -- consider narrowing it.
warnings.filterwarnings('ignore')

In [27]:
# Weekly protest panel (one row per location per week, per the description above).
data = pd.read_csv('data/blm_weekly_protests.csv')

# Pairwise distance matrix; GEOID10 becomes the row index.
distances = pd.read_csv('data/dist_matrix_geoid.csv').set_index('GEOID10')

# Population attributes, merged onto the panel further down.
urban_pop = pd.read_csv('data/population_data.csv')

In [28]:
# Protest indicator column as a plain Python list, ready to be sliced week by week.
protest_list = data['Protest?'].tolist()

In [29]:
# Cut the flat indicator list into consecutive chunks of 3601 entries
# (presumably one chunk per week, with 3601 locations recorded each week).
week_len = 3601
split_list = [
    protest_list[start:start + week_len]
    for start in range(0, len(protest_list), week_len)
]

## Protests under a certain distance
Matrix multiplication using the distance matrix and the column recording protest occurrence

In [30]:
# Indicator matrix: 1 where the pairwise distance is at most 500, 0 where it is larger
# (the unit is presumably km, going by the output column names used later).
# NOTE(review): zero distances (presumably a location paired with itself) also become 1,
# so a protest at the location itself is counted -- confirm this is intended.
under_500 = distances.mask(distances <= 500, 1).mask(distances > 500, 0)

# One-week lag: each week's counts are attached to the following week, so the first
# week gets zeros and the last week's protests are never needed. Only the first 158
# weeks are therefore multiplied. Each product sums, for every column of the matrix,
# the indicator over the rows that had a protest that week.
zeros = [0] * 3601
protests_under_500 = [zeros] + [
    np.dot(week_protests, under_500).tolist() for week_protests in split_list[:158]
]

# Flatten week by week so the values line up with the rows of the weekly dataset.
flat_500_list = [count for week_counts in protests_under_500 for count in week_counts]

In [31]:
# Indicator matrix: 1 where the pairwise distance is at most 1000, 0 where it is larger
# (the unit is presumably km, going by the output column names used later).
# NOTE(review): zero distances (presumably a location paired with itself) also become 1,
# so a protest at the location itself is counted -- confirm this is intended.
under_1000 = distances.mask(distances <= 1000, 1).mask(distances > 1000, 0)

# One-week lag: each week's counts are attached to the following week, so the first
# week gets zeros and the last week's protests are never needed. Only the first 158
# weeks are therefore multiplied. Each product sums, for every column of the matrix,
# the indicator over the rows that had a protest that week.
zeros = [0] * 3601
protests_under_1000 = [zeros] + [
    np.dot(week_protests, under_1000).tolist() for week_protests in split_list[:158]
]

# Flatten week by week so the values line up with the rows of the weekly dataset.
flat_1000_list = [count for week_counts in protests_under_1000 for count in week_counts]

In [32]:
# Indicator matrix: 1 where the pairwise distance is at most 2000, 0 where it is larger
# (the unit is presumably km, going by the output column names used later).
# NOTE(review): zero distances (presumably a location paired with itself) also become 1,
# so a protest at the location itself is counted -- confirm this is intended.
under_2000 = distances.mask(distances <= 2000, 1).mask(distances > 2000, 0)

# One-week lag: each week's counts are attached to the following week, so the first
# week gets zeros and the last week's protests are never needed. Only the first 158
# weeks are therefore multiplied. Each product sums, for every column of the matrix,
# the indicator over the rows that had a protest that week.
zeros = [0] * 3601
protests_under_2000 = [zeros] + [
    np.dot(week_protests, under_2000).tolist() for week_protests in split_list[:158]
]

# Flatten week by week so the values line up with the rows of the weekly dataset.
flat_2000_list = [count for week_counts in protests_under_2000 for count in week_counts]

## Closest protest in the past week

First masking out the 0s (they become missing values) so that they are ignored when taking the minimum distance

In [33]:
min_dist = []
min_geoid = []

# Each week's result is attached to the following week (one-week lag), so the last
# week's protests are never needed: only the first 158 weeks are processed.
for week_protests in split_list[:158]:
    # Multiply every column by that column's protest flag, then blank out the zeros so
    # that columns without a protest (and zero distances) are skipped by the minimum.
    weighted = week_protests * distances
    candidates = weighted[weighted != 0]
    min_dist.append(candidates.min(axis=1).to_list())

    # Rows with no candidate at all stay NaN. idxmin is only called on rows that have
    # at least one value, because calling it on an all-NaN row is deprecated in recent
    # pandas and the warning would be hidden by the global warnings filter.
    closest = np.full(len(candidates), np.nan, dtype=object)
    has_candidate = candidates.notna().any(axis=1).to_numpy()
    if has_candidate.any():
        closest[has_candidate] = candidates.loc[has_candidate].idxmin(axis=1).to_numpy()
    min_geoid.append(closest.tolist())

# NOTE(review): the first week has no previous week and is filled with 0 for both the
# distance and the GEOID, whereas later rows without a candidate hold NaN. A distance
# of 0 can be misread as "a protest right here" -- consider using NaN instead.
zeros = [0] * 3601
min_dist = [zeros] + min_dist
flat_min_dist = [value for week_values in min_dist for value in week_values]

zeros = [0] * 3601
min_geoid = [zeros] + min_geoid
flat_min_geoid = [label for week_labels in min_geoid for label in week_labels]

## Total number of protests in the past week

In [34]:
# Total of the protest indicator within each weekly chunk.
sum_list = list(map(sum, split_list))

In [35]:
# Repeat each weekly total 3601 times so there is one value per row of that week.
sum_for_df = []
for weekly_total in sum_list:
    sum_for_df.extend([weekly_total] * 3601)

In [36]:
# One chunk (3601 entries) of zeros, used as the placeholder for the first week.
zeros = [0] * 3601

In [37]:
# One-week lag: a chunk of zeros leads, which pushes every weekly total one week
# forward. The list is then cut back to one value per row of `data`; what falls off
# the end is the last week's total, which has no following week to attach to.
final_sum = (zeros + sum_for_df)[:len(data)]

## Number of protests in the same state in the past week

In [38]:
# The state is taken as the second comma-separated piece of NAME10, with the
# surrounding whitespace removed.
state_part = data['NAME10'].str.split(',').str.get(1)
data['State'] = state_part.str.strip()

In [39]:
# State column as a plain Python list, in row order.
state_list = data['State'].tolist()

In [40]:
# Same chunking as for the protest indicator: consecutive blocks of 3601 states.
states_per_chunk = 3601
split_state_list = [
    state_list[start:start + states_per_chunk]
    for start in range(0, len(state_list), states_per_chunk)
]

In [41]:
# For each of the 159 weekly chunks, collect the state of every row whose protest
# indicator equals 1 (rows with any other value are skipped).
list_lists = [
    [
        state
        for flag, state in zip(split_list[week], split_state_list[week])
        if flag == 1
    ]
    for week in range(159)
]

In [42]:
# One-week lag: an empty list leads (no states for the first week) and the final
# entry is dropped so that 159 entries remain.
empty = []
list_lists = [empty] + list_lists[:158]

In [43]:
# One {state: number of occurrences} dictionary per entry of list_lists.
dicts = []
for week in range(159):
    dicts.append(dict(Counter(list_lists[week])))

In [44]:
# Single-week pass: looks up every row's state in the counts stored at index 23 only.
# The next cell rebuilds `protests_per_state` from scratch for all weeks, so this
# result is not the one that ends up in the dataframe (it looks like a leftover check).
# A state missing from the dictionary gets 0; `dict.get` replaces the former bare
# `except:`, which would also have hidden unrelated errors.
protests_per_state = [dicts[23].get(state, 0) for state in state_list]

In [45]:
# For every row, look up its state in the (already lagged) per-week counts.
# A state missing from that week's dictionary gets 0; `dict.get` replaces the former
# bare `except:`, which would also have hidden unrelated errors.
protests_per_state = [
    dicts[week].get(state, 0)
    for week in range(159)
    for state in split_state_list[week]
]

# Adding all columns created to the df

In [46]:
# Attach every lagged feature built above as a new column, in this order.
new_columns = {
    'Protests in past week': final_sum,
    'Protests in same state past week': protests_per_state,
    'Protests under 500km past week': flat_500_list,
    'Protests under 1000km past week': flat_1000_list,
    'Protests under 2000km past week': flat_2000_list,
    'Distance to closest protest past week': flat_min_dist,
    'GEOID closest protest past week': flat_min_geoid,
}
for column_name, values in new_columns.items():
    data[column_name] = values

## Population data for the closest protest in the past week

Adding population data for the closest protest in the past week, as well as difference between these and population data for the current location

In [47]:
# Convert the closest-protest GEOID column to a numeric dtype before it is used
# as a merge key.
closest_geoid_col = 'GEOID closest protest past week'
data[closest_geoid_col] = pd.to_numeric(data[closest_geoid_col])

In [48]:
# Left join: every row of `data` is kept and receives the population attributes of
# the GEOID recorded as its closest protest. Columns present on both sides get the
# default `_x` (from `data`) and `_y` (from `urban_pop`) suffixes.
final_df = pd.merge(
    data,
    urban_pop,
    how='left',
    left_on='GEOID closest protest past week',
    right_on='GEOID10',
)

In [49]:
# Columns that may hold text such as "12.3%" or "1,234" and must become floats.
columns_format = ['20 to 24 years_x', '20 to 24 years_y', '25 to 34 years_x', '25 to 34 years_y',
          '65 years and over_x', '65 years and over_y', 'Male_x', 'Male_y',
          'Black or African American_x', 'Black or African American_y',
          'White_x', 'White_y']

# Strip percent signs and commas, then cast to float. The pattern is a raw string:
# the previous '[%\,]' relied on an invalid escape sequence in a normal string.
for column in columns_format:
    final_df[column] = final_df[column].replace(r'[%,]', '', regex=True).astype(float)

# Each difference is the `_y` value (location of the closest protest, from
# `urban_pop`) minus the `_x` value (the row's own location, from `data`).
# HBCU is not part of `columns_format` and is subtracted as-is
# (presumably already numeric -- TODO confirm).
difference_columns = {
    'Dif 20-24': '20 to 24 years',
    'Dif 25-34': '25 to 34 years',
    'Dif over 65': '65 years and over',
    'Dif Male': 'Male',
    'Dif Black': 'Black or African American',
    'Dif White': 'White',
    'Dif HBCU': 'HBCU',
}
for dif_name, base_name in difference_columns.items():
    final_df[dif_name] = final_df[base_name + '_y'] - final_df[base_name + '_x']



## Exporting final dataset

In [50]:
# Write the enriched weekly dataset to disk (the dataframe index is written as well).
output_path = 'data/final_weekly_data.csv'
final_df.to_csv(output_path)