### Number of protests within given distance
Number of protests that happened in the past week within a given distance. This uses the weekly dataset, in which every location is recorded every week with an indicator for whether a protest happened there or not.

In [1]:
import pandas as pd
import numpy as np

from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Weekly protest tables, one per movement.
blm = pd.read_csv('data/blm_weekly_protests.csv')
alt_right = pd.read_csv('data/alt-right_weekly_protests.csv')

# Population table, merged in later on GEOID10.
urban_pop = pd.read_csv('data/population_data.csv')

# Distance matrices: one indexed by GEOID10, the other by NAME10.
distances = pd.read_csv('data/dist_matrix_geoid.csv').set_index('GEOID10')
distances_name = pd.read_csv('data/dist_matrix_name.csv').set_index('NAME10')

In [3]:
def protests_under_dist(data, occ, dist):
    """Count, per location and week, last week's protests within `dist`.

    Parameters
    ----------
    data : 2-D array-like (e.g. a DataFrame)
        Distance matrix. It is not modified.
    occ : sequence of sequences
        One protest-occurrence list per week, each with one entry per row
        of `data`.
    dist : number
        Distance threshold; a cell counts when its distance is <= `dist`.

    Returns
    -------
    list
        Flat list, week after week. The first week is all NaN (there is no
        previous week); week ``k`` holds ``occ[k - 1]`` dotted with the 0/1
        "within distance" matrix. An empty `occ` gives an empty list.
    """
    values = np.asarray(data, dtype=float)

    # Build the 0/1 indicator in a single comparison. Thresholding in two
    # in-place steps (<= dist -> 1, then > dist -> 0) wipes the fresh 1s
    # whenever dist < 1. NaN distances stay NaN so they still propagate
    # into the counts.
    within = np.where(np.isnan(values), np.nan, values <= dist)

    n_weeks = len(occ)

    # The last week's counts would be dropped by the one-week lag, so they
    # are not computed at all.
    weekly_counts = [np.dot(week, within).tolist() for week in occ[:-1]]

    first_week = [np.nan] * within.shape[1]
    lagged = ([first_week] + weekly_counts)[:n_weeks]

    return [count for week in lagged for count in week]

In [4]:
# Keeping only distances for locations in the same state.
#
# The state is taken as the text after the last comma of each row/column
# label. Every cell whose row and column states differ is set to infinity,
# so it can never fall under a distance threshold.
#
# This is done with one vectorised mask: assigning into the `row` yielded
# by `iterrows()` is not guaranteed to write back to the DataFrame, and a
# Python loop over every cell is very slow.
row_states = distances_name.index.str.split(",").str[-1].to_numpy()
col_states = distances_name.columns.str.split(",").str[-1].to_numpy()
different_state = row_states[:, None] != col_states[None, :]
distances_name = distances_name.mask(different_state, np.inf)

In [8]:
# Row count of the alt-right weekly table.
len(alt_right['Protest?'])

572559

In [11]:
# Length of the flattened closest-protest GEOID list built in the main loop.
len(flat_min_geoid)

572559

In [5]:
datasets = [blm, alt_right]
# Output file for each entry of `datasets`, in the same order.
output_paths = ['data/blm_final_weekly_data.csv',
                'data/alt-right_final_weekly_data.csv']

# Number of locations recorded each week, taken from the distance matrix
# (assumes the matrix is square, one row and one column per location).
n_locations = distances.shape[0]


def _lag_and_flatten(weekly_values, first_week, n_weeks):
    """Shift per-week lists one week forward and flatten them.

    `first_week` fills the first week, which has no preceding week; the
    shifted list is cut back to `n_weeks` weeks before flattening.
    """
    lagged = ([first_week] + list(weekly_values))[:n_weeks]
    return [value for week in lagged for value in week]


def _closest_protest(weekly_occurrence, dist_matrix, excluded):
    """Return per-week lists of row-wise minimum distance and its column label.

    Each week's occurrence list scales the columns of `dist_matrix`; cells
    equal to `excluded` are blanked (NaN) before the row-wise minimum is
    taken.
    """
    closest_dist = []
    closest_label = []
    for week in weekly_occurrence:
        scaled = week * dist_matrix
        candidates = scaled[scaled != excluded]
        closest_dist.append(candidates.min(axis=1).to_list())
        closest_label.append(candidates.idxmin(axis=1).to_list())
    return closest_dist, closest_label


for index, data in enumerate(datasets):
    protest_list = data['Protest?'].to_list()
    n_rows = len(protest_list)
    # One occurrence list per week, each with one entry per location.
    split_list = [protest_list[x:x+n_locations] for x in range(0, n_rows, n_locations)]
    n_weeks = len(split_list)

    # Every "past week" feature below is lagged by one week, so the last
    # week's occurrences are never needed as a source: split_list[:-1].

    # Closest protest in the past week (not same location)
    # Columns without a protest are multiplied by 0 and then dropped as
    # zeros; a zero distance (presumably the location itself) is dropped
    # the same way.
    weekly_dist, weekly_geoid = _closest_protest(split_list[:-1], distances, 0)
    flat_min_dist = _lag_and_flatten(weekly_dist, [np.nan] * n_locations, n_weeks)
    flat_min_geoid = _lag_and_flatten(weekly_geoid, [np.nan] * n_locations, n_weeks)

    # Closest protest in the past week (including same location)
    # Columns without a protest are multiplied by inf instead, giving inf
    # (or NaN where the distance is 0), so a zero distance on a protest
    # column survives the filter.
    inf_weighted = [[np.inf if occ == 0 else occ for occ in week]
                    for week in split_list[:-1]]
    weekly_dist, weekly_geoid = _closest_protest(inf_weighted, distances, np.inf)
    flat_min_dist_all = _lag_and_flatten(weekly_dist, [np.nan] * n_locations, n_weeks)
    flat_min_geoid_all = _lag_and_flatten(weekly_geoid, [np.nan] * n_locations, n_weeks)

    # Total number of protests in the past week
    # Each weekly total is repeated for every location; the first week is
    # filled with 0 (not NaN) to apply the one-week lag.
    sum_list = [sum(week) for week in split_list]
    sum_for_df = [val for val in sum_list for _ in range(n_locations)]
    final_sum = ([0] * n_locations + sum_for_df)[:n_rows]

    # Number of protests in the same state in the past week
    data['State'] = data['NAME10'].str.split(',').str[1]
    data['State'] = data['State'].str.strip()
    state_list = data['State'].to_list()
    split_state_list = [state_list[x:x+n_locations] for x in range(0, len(state_list), n_locations)]

    # Per week: how many locations with a protest (indicator == 1) fall in
    # each state. The empty Counter in front supplies the one-week lag.
    weekly_state_counts = [
        Counter(state for occurred, state in zip(week_occ, week_states) if occurred == 1)
        for week_occ, week_states in zip(split_list[:-1], split_state_list[:-1])
    ]
    weekly_state_counts = ([Counter()] + weekly_state_counts)[:n_weeks]

    # A Counter returns 0 for states with no protest that week.
    protests_per_state = [
        weekly_state_counts[week][state]
        for week in range(n_weeks)
        for state in split_state_list[week]
    ]

    # Time since last event
    # For each location, count weeks since its most recent protest. The
    # value is NaN until the first protest and, because 0 is replaced by
    # NaN, also in any week where a protest occurs. Unlike the features
    # above, this column is not shifted by a week.
    time_since = []

    for i in range(n_locations):
        elapsed = []
        weeks = np.nan
        for j in range(n_weeks):
            occurred = split_list[j][i]
            if occurred == 1:
                weeks = 0
            elif occurred == 0 and not np.isnan(weeks):
                weeks = weeks + 1
            elapsed.append(weeks)
        time_since.append([np.nan if x == 0 else x for x in elapsed])

    # time_since is per location; the column needs week-major order.
    flat_last_event = [location_times[week]
                       for week in range(n_weeks)
                       for location_times in time_since]

    # Adding all columns to the df
    data['Protests in past week'] = final_sum
    data['Protests in same state past week'] = protests_per_state
    data['Protests under 50km past week'] = protests_under_dist(distances, split_list, 50)
    data['Protests under 500km past week'] = protests_under_dist(distances, split_list, 500)
    data['Protests under 1000km past week'] = protests_under_dist(distances, split_list, 1000)
    data['Protests under 2000km past week'] = protests_under_dist(distances, split_list, 2000)
    data['Protests under 50km past week (same state)'] = protests_under_dist(distances_name, split_list, 50)
    data['Distance to closest protest past week'] = flat_min_dist
    data['GEOID closest protest past week'] = flat_min_geoid
    data['Distance to closest protest past week (inc same)'] = flat_min_dist_all
    data['GEOID closest protest past week (inc same)'] = flat_min_geoid_all
    data['Time since last event'] = flat_last_event

    # Population data for closest protest in the past week
    # Columns present in both tables get the merge suffixes: _x comes from
    # `data` (the location itself), _y from `urban_pop` (the closest protest).
    data['GEOID closest protest past week'] = pd.to_numeric(data['GEOID closest protest past week'])
    final_df = data.merge(urban_pop, left_on='GEOID closest protest past week', right_on='GEOID10', how='left')
    columns_format = ['20 to 24 years_x', '20 to 24 years_y', '25 to 34 years_x', '25 to 34 years_y',
                      '65 years and over_x', '65 years and over_y', 'Male_x', 'Male_y',
                      'Black or African American_x', 'Black or African American_y',
                      'White_x', 'White_y', 'One race_x', 'One race_y', 'Total population_x', 'Total population_y',
                      'Sex ratio (males per 100 females)_x', 'Sex ratio (males per 100 females)_y',
                      'Median age (years)_x', 'Median age (years)_y', 'Total housing units_x', 'Total housing units_y']
    # Strip '%' and ',' so the values parse as floats. The pattern is a raw
    # string so that '\,' is not treated as an invalid escape sequence.
    for column in columns_format:
        final_df[column] = final_df[column].replace(r'[%\,]', '', regex=True).astype(float)

    # NOTE(review): 'Dif median age' and 'Dif one race' are computed as
    # _x - _y, whereas every other 'Dif' column is _y - _x. Confirm the sign
    # is intended before relying on these two columns.
    final_df['Dif 20-24'] = final_df['20 to 24 years_y'] - final_df['20 to 24 years_x']
    final_df['Dif 25-34'] = final_df['25 to 34 years_y'] - final_df['25 to 34 years_x']
    final_df['Dif over 65'] = final_df['65 years and over_y'] - final_df['65 years and over_x']
    final_df['Dif median age'] = final_df['Median age (years)_x'] - final_df['Median age (years)_y']
    final_df['Dif Male'] = final_df['Male_y'] - final_df['Male_x']
    final_df['Dif Black'] = final_df['Black or African American_y'] - final_df['Black or African American_x']
    final_df['Dif White'] = final_df['White_y'] - final_df['White_x']
    final_df['Dif one race'] = final_df['One race_x'] - final_df['One race_y']
    final_df['Dif HBCU'] = final_df['HBCU_y'] - final_df['HBCU_x']
    final_df['Protests same state past week > 50km'] = final_df['Protests in same state past week'] - final_df['Protests under 50km past week (same state)']
    final_df['Protests past week not same state'] = final_df['Protests in past week'] - final_df['Protests in same state past week']

    # Exporting final dataset
    final_df.to_csv(output_paths[index])

In [12]:
# Display the frame built in the last iteration of the main loop (alt-right).
final_df

Unnamed: 0,Unnamed: 0_x,UACE10_x,GEOID10_x,NAME10_x,NAMELSAD10_x,INTPTLAT10_x,INTPTLON10_x,HBCU_x,Location_x,Label_x,...,Dif 25-34,Dif over 65,Dif median age,Dif Male,Dif Black,Dif White,Dif one race,Dif HBCU,Protests same state past week > 50km,Protests past week not same state
0,0,24310,24310,"Dixon, IL","Dixon, IL Urban Cluster",41.852951,-89.481744,0,DixonILUrbanCluster,"Dixon, IL Urban Cluster",...,,,,,,,,,,0.0
1,1,27847,27847,"Escanaba, MI","Escanaba, MI Urban Cluster",45.870484,-87.063840,0,EscanabaMIUrbanCluster,"Escanaba, MI Urban Cluster",...,,,,,,,,,,0.0
2,2,18100,18100,"Clintonville, WI","Clintonville, WI Urban Cluster",44.623220,-88.761128,0,ClintonvilleWIUrbanCluster,"Clintonville, WI Urban Cluster",...,,,,,,,,,,0.0
3,3,6166,6166,"Bedford, IN","Bedford, IN Urban Cluster",38.856653,-86.501238,0,BedfordINUrbanCluster,"Bedford, IN Urban Cluster",...,,,,,,,,,,0.0
4,4,75270,75270,"Riverdale, CA","Riverdale, CA Urban Cluster",36.431071,-119.862054,0,RiverdaleCAUrbanCluster,"Riverdale, CA Urban Cluster",...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572554,572625,95482,95482,"Williamston, MI","Williamston, MI Urban Cluster",42.686669,-84.257221,0,WilliamstonMIUrbanCluster,"Williamston, MI Urban Cluster",...,,,,,,,,,0.0,0.0
572555,572626,56980,56980,"Midland, MI","Midland, MI Urbanized Area",43.616301,-84.219878,0,MidlandMIUrbanizedArea,"Midland, MI Urbanized Area",...,,,,,,,,,0.0,0.0
572556,572627,10675,10675,"Brooklyn, MI","Brooklyn, MI Urban Cluster",42.102062,-84.274847,0,BrooklynMIUrbanCluster,"Brooklyn, MI Urban Cluster",...,,,,,,,,,0.0,0.0
572557,572628,31450,31450,"Franklin--Highland Lake, NJ","Franklin--Highland Lake, NJ Urban Cluster",41.169210,-74.546082,0,FranklinHighlandLakeNJUrbanCluster,"Franklin--Highland Lake, NJ Urban Cluster",...,,,,,,,,,0.0,0.0


## Merging datasets

Column name suffixes:
- x --> Demographic data for the location and BLM protest data
- y_x --> Data for location of closest BLM protest within the past week 
- y --> Alt-right protest data
- y_y --> Data for location of closest alt-right protest within the past week 

In [None]:
# Reload the two per-movement outputs written by the main loop.
blm_weekly = pd.read_csv('data/blm_final_weekly_data.csv')
right_weekly = pd.read_csv('data/alt-right_final_weekly_data.csv')

# The first 106 columns are taken once, from the BLM file; the remaining
# columns of each file are kept separately and tagged _x (BLM) / _y (alt-right).
shared_width = 106
common_columns = blm_weekly.iloc[:, :shared_width]
blm_weekly = blm_weekly.iloc[:, shared_width:].add_suffix('_x')
right_weekly = right_weekly.iloc[:, shared_width:].add_suffix('_y')

# Leftover 'Unnamed: 0' columns that are dropped from the combined frame.
leftover_index_columns = ['Unnamed: 0', 'Unnamed: 0_x', 'Unnamed: 0_y_x', 'Unnamed: 0_y_y']

side_by_side = pd.concat([common_columns, blm_weekly, right_weekly], axis='columns')
merged_data = side_by_side.drop(columns=leftover_index_columns)
merged_data.to_csv('data/combined_final_weekly_data.csv')