In [24]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy

#import packages
import pandas as pd
import numpy as np
import datetime as dt



In [92]:
# Import data
data_crime = pd.read_csv(r'https://raw.githubusercontent.com/mariusgruenewald/pol_viol/main/crime_data_edited.csv')
data_share = pd.read_csv(r'https://raw.githubusercontent.com/mariusgruenewald/pol_viol/main/election_data_edited.csv')

In [93]:
# get into datetime format
data_crime['year'] = data_crime['date'].astype(str).str.split('.', expand=True)[2]
data_crime['month'] = data_crime['date'].astype(str).str.split('.', expand=True)[1]
data_crime['day'] = data_crime['date'].astype(str).str.split('.', expand=True)[0]
data_crime['date'] = pd.to_datetime(data_crime[['year', 'month', 'day']].astype(float), errors='ignore')

In [57]:
# Merge crime and candidate statistics with merge over city string
data_crime = data_crime[(data_crime['Land'] == 14) | (data_crime['Land'] == 8) | (data_crime['Land'] == 16) | (data_crime['Land'] == 12)]
data_share['city'] = data_share['city'].str.replace(r'Stadt ', '')
data_share['city'] = data_share['city'].str.replace('Ä', 'A').str.replace('Ö', 'O').str.replace('Ü', 'U')

list_cities_share = data_share.sort_values('city')['city'].unique()
list_cities_crime = data_crime['city'].unique()

data_crime.rename({'Land':'state'}, axis=1, inplace=True)
data_crime['state'] = data_crime['state'].map({16:'TH', 8:'BW', 14:'SN', 12:'BB'})

In [58]:
data_share.rename({'year':'cycle_1'}, axis=1, inplace=True)
data_share['cycle_2'] = data_share['cycle_1']
data_share['cycle_3'] = data_share['cycle_1']

Merging over years and cities? How many different cutoffs? Directly on election day, one month before and three months before

Cut-off days for the three states: 05.26.2019, 04.26.2019, 02.26.2019

In [59]:
# creating different cycle variables in crime data to merge over
#
data_crime['cycle_1'] = 2014
data_crime.loc[data_crime['date'] > dt.datetime(2019, 5, 26, 0, 0), 'cycle_1'] = 2019

data_crime['cycle_2'] = 2014
data_crime.loc[data_crime['date'] > dt.datetime(2019, 4, 26, 0, 0), 'cycle_2'] = 2019

data_crime['cycle_3'] = 2014
data_crime.loc[data_crime['date'] > dt.datetime(2019, 2, 26, 0, 0), 'cycle_3'] = 2019

In [79]:
# remove duplicates from data_share
# The following mistake happened: during merger full line and line w/out merger success being kept
# If done before, file will not contain unmatched entities -> necessary for manual inspection
data_share.drop_duplicates(subset=['city','party','Anzahl Bewerber','darunter Frauen','cycle_1','state'], ignore_index=True, inplace=True)

In [81]:
# Find double cities in data share
# we cannot use plz here, since crime data doesn't have any of that -> merger on plz doesnt work
list_double = pd.DataFrame()
for city in data_share['city'].unique():
    
    data_city = data_share[data_share['city'] == city]
    for party in data_city['party'].unique():
        
        data_party = data_city[data_city['party'] == party]
        for election in data_party['cycle_1'].unique():
            
            data_election = data_party[data_party['cycle_1'] == election]
            
            if len(data_election) > 1:
                
                # append to dataset of 
                list_double = list_double.append(data_election)
                

In [89]:
# drop cities that are not uniquely identifable by cityname
list_double_cities = data_share[data_share.city.isin(list_double.city)]['city'].unique()
list_to_drop_crime = data_crime[data_crime.city.isin(list_double.city)]
data_crime = data_crime[~data_crime.city.isin(list_to_drop_crime.city)]

In [91]:
data_crime.dropna()

Unnamed: 0.1,Unnamed: 0,crime,city,law,date,background,suspects,party,state,year,month,day,cycle_1,cycle_2,cycle_3
91,91,,Aalen,,NaT,,,,BW,,,,2014,2014,2014
146,146,,Abstatt,,NaT,,,,BW,,,,2014,2014,2014
147,147,,Abtsbessingen,,NaT,,,,TH,,,,2014,2014,2014
148,148,Sachbeschadigung,Abtsgmund,303 StGB,2019-05-20,Nicht zuzuordnen,0.0,CDU,BW,2019,05,20,2014,2019,2019
149,149,Sachbeschadigung,Abtsgmund,303 StGB,2019-05-20,Nicht zuzuordnen,0.0,CDU,BW,2019,05,20,2014,2019,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60411,60411,Sachbeschadigung,Zwickau,303 StGB,2018-12-03,Rechts,0.0,,SN,2018,12,03,2014,2014,2014
60412,60412,Sachbeschadigung,Zwickau,303 StGB,2019-01-11,Links,0.0,,SN,2019,01,11,2014,2014,2014
60413,60413,Bedrohung,Zwickau,241 StGB,2019-02-13,Nicht,0.0,,SN,2019,02,13,2014,2014,2014
60414,60414,,Zwiefalten,,NaT,,,,BW,,,,2014,2014,2014


In [10]:
base_city = pd.DataFrame(data_share['city'].unique(), columns=['city'])
base_city['cycle_1'] = 2014
base_city_2 = base_city.copy()
base_city_2['cycle_1'] = 2019
base_city = base_city.append(base_city_2)
base_city.reset_index(drop=True)

Unnamed: 0,city,cycle_1
0,Stuttgart,2014
1,Aidlingen,2014
2,Boblingen,2014
3,Bondorf,2014
4,Deckenpfronn,2014
...,...,...
3597,Grossneuhausen,2019
3598,Lindewerra,2019
3599,Jacobsdorf,2019
3600,Planetal,2019


In [11]:
# Generate crime_count variable for easier aggregation
data_crime['crime_count'] = 0
data_crime.loc[~data_crime['crime'].isna(), 'crime_count'] = 1
data_crime.drop('Unnamed: 0', axis=1, inplace=True)

In [12]:
data_crime.dropna(subset=['party'], inplace=True)

In [13]:
data_crime.head()

Unnamed: 0,crime,city,law,date,background,suspects,party,state,year,month,day,cycle_1,cycle_2,cycle_3,crime_count
148,Sachbeschadigung,Abtsgmund,303 StGB,2019-05-20,Nicht zuzuordnen,0.0,CDU,BW,2019,5,20,2014,2019,2019,1
149,Sachbeschadigung,Abtsgmund,303 StGB,2019-05-20,Nicht zuzuordnen,0.0,CDU,BW,2019,5,20,2014,2019,2019,1
151,Sachbeschadigung,Abtsgmund,303 StGB,2019-05-20,Nicht zuzuordnen,0.0,SPD,BW,2019,5,20,2014,2019,2019,1
166,Verstoss gegen das Kunsturhebergesetz,Achern,KUG,2019-08-07,Links,0.0,AfD,BW,2019,8,7,2019,2019,2019,1
167,Beleidigung,Achern,185 StGB,2019-05-04,Links,0.0,AfD,BW,2019,5,4,2014,2019,2019,1


### Generate various measures of crime

In [14]:
# regular summarization across cities and parties
data_crime_base = data_crime.groupby(['city', 'background', 'party', 'cycle_1', 'state'], as_index=False).sum()
data_crime_party = data_crime.groupby(['city', 'party', 'cycle_1', 'state'], as_index=False).sum()
data_crime_lr = data_crime[(data_crime['background'] == 'Links') | (data_crime['background'] == 'Rechts') | (data_crime['background'].isna())] #drop unidentified crime for left-right analysis
data_crime_lr_merge= data_crime_lr.groupby(['city', 'background', 'cycle_1', 'state'], as_index=False).sum()
data_crime_city = data_crime.groupby(['city', 'cycle_1', 'state'], as_index=False).sum()
data_crime_base.rename({'crime_count':'crime_count_party_lr'}, axis=1, inplace=True)
data_crime_party.rename({'crime_count':'crime_count_party'}, axis=1, inplace=True)
data_crime_lr_merge.rename({'crime_count':'crime_count_lr'}, axis=1, inplace=True)
data_crime_city.rename({'crime_count':'crime_count_city'}, axis=1, inplace=True)

In [15]:
# putting all measures into on dataframe. CAUTION: Alsways do drop_duplicates and dropna if other than crime_count used
crime_master = pd.merge(base_city, data_crime, on=['city', 'cycle_1'], how='left', validate='1:m')
crime_master_background = pd.merge(crime_master, data_crime_base[['city', 'background', 'party', 'cycle_1', 'state', 'crime_count_party_lr']], on=['city', 'background', 'party', 'cycle_1', 'state'], how='left', validate='m:1')
crime_master_party = pd.merge(crime_master_background, data_crime_party[['city', 'party', 'cycle_1', 'state', 'crime_count_party']], on=['city', 'party', 'cycle_1', 'state'], how='left', validate='m:1')
crime_master_lr = pd.merge(crime_master_party, data_crime_lr_merge[['city', 'background', 'cycle_1', 'state', 'crime_count_lr']], on=['city', 'background', 'cycle_1', 'state'], how='left', validate='m:1')
crime_master = pd.merge(crime_master_lr, data_crime_city[['city', 'cycle_1', 'state', 'crime_count_city']], on=['city', 'cycle_1', 'state'], how='left', validate='m:1') 

In [16]:
len(crime_master_party[crime_master_party['crime_count'].isna()])

3173

In [17]:
crime_master

Unnamed: 0,city,cycle_1,crime,law,date,background,suspects,party,state,year,month,day,cycle_2,cycle_3,crime_count,crime_count_party_lr,crime_count_party,crime_count_lr,crime_count_city
0,Stuttgart,2014,Korperverletzung,223 StGB,2019-04-20,Links,0.0,AfD,BW,2019,04,20,2014.0,2019.0,1.0,10.0,14.0,13.0,27.0
1,Stuttgart,2014,Beleidigung,185 StGB,2019-05-06,Nicht zuzuordnen,1.0,FDP,BW,2019,05,06,2019.0,2019.0,1.0,3.0,4.0,,27.0
2,Stuttgart,2014,offentliche Aufforderung zu Straftaten,111 StGB,2019-05-18,Links,0.0,AfD,BW,2019,05,18,2019.0,2019.0,1.0,10.0,14.0,13.0,27.0
3,Stuttgart,2014,Sachbeschadigung,303 StGB,2019-05-22,Links,0.0,AfD,BW,2019,05,22,2019.0,2019.0,1.0,10.0,14.0,13.0,27.0
4,Stuttgart,2014,Uble Nachrede,186 StGB,2019-05-24,Links,0.0,AfD,BW,2019,05,24,2019.0,2019.0,1.0,10.0,14.0,13.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6110,Grossneuhausen,2019,,,NaT,,,,,,,,,,,,,,
6111,Lindewerra,2019,,,NaT,,,,,,,,,,,,,,
6112,Jacobsdorf,2019,,,NaT,,,,,,,,,,,,,,
6113,Planetal,2019,,,NaT,,,,,,,,,,,,,,


### Generate aggregation of fem_share

In [18]:
data_share

Unnamed: 0.1,Unnamed: 0,city,party,Anzahl Bewerber,darunter Frauen,cycle_1,state,plz,city_id,Land,bl_kuerzel,cycle_2,cycle_3
0,0,Stuttgart,AfD,60.0,16.0,2019.0,BW,70173.0,8111000.0,8.0,BW,2019.0,2019.0
1,1,Stuttgart,CDU,60.0,25.0,2019.0,BW,70173.0,8111000.0,8.0,BW,2019.0,2019.0
2,2,Stuttgart,DIE LINKE,60.0,29.0,2019.0,BW,70173.0,8111000.0,8.0,BW,2019.0,2019.0
3,3,Stuttgart,FDP,60.0,18.0,2019.0,BW,70173.0,8111000.0,8.0,BW,2019.0,2019.0
4,4,Stuttgart,GRÜNE,60.0,30.0,2019.0,BW,70173.0,8111000.0,8.0,BW,2019.0,2019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20616,20616,Planetal,GRÜNE,1.0,0.0,2019.0,BB,14806.0,12069474.0,,,2019.0,2019.0
21166,21166,,,,,,,15907.0,12061316.0,,,,
21167,21167,,,,,,,15749.0,12061332.0,,,,
21188,21188,,,,,,,15236.0,12064480.0,,,,


In [19]:
# Merge crime with share data
data_base_1 = pd.merge(crime_master,
                       data_share[['city','party','Anzahl Bewerber','darunter Frauen','cycle_1','state','plz','city_id','Land','bl_kuerzel']],
                       on=['cycle_1', 'city', 'party', 'state'], how='right', validate='m:1')


MergeError: Merge keys are not unique in right dataset; not a many-to-one merge

In [None]:
# fill in non-affected crime cities
for column in ('crime_count', 'crime_count_party_lr', 'crime_count_party', 'crime_count_lr', 'crime_count_city'):
    data_base_1[column].fillna(0, inplace=True)

In [None]:
city_share_agg_b = data_base_1.groupby(['city', 'city_id', 'cycle_1', 'crime_count_party_lr'], as_index=False).sum()[['city', 'city_id', 'cycle_1', 'background','crime_count_party_lr', 'Anzahl Bewerber','darunter Frauen']]
city_share_agg_b.rename({'Anzahl Bewerber':'Bewerber_city_lr', 'darunter Frauen':'Frauen_city_lr'}, axis=1, inplace=True)
city_share_agg = data_base_1.groupby(['city', 'city_id', 'cycle_1'], as_index=False).sum()[['city', 'city_id', 'cycle_1', 'Anzahl Bewerber','darunter Frauen']]
city_share_agg.rename({'Anzahl Bewerber':'Bewerber_city', 'darunter Frauen':'Frauen_city'}, axis=1, inplace=True)
city_share_agg_b

In [78]:
# p_female works with crime_count_party_lr and crime_count_party
data_base_1['p_female'] = data_base_1['darunter Frauen']/data_base_1['Anzahl Bewerber']
data_base_1

Unnamed: 0,city,cycle_1,crime,law,date,background,suspects,party,state,year,...,crime_count_party,crime_count_lr,crime_count_city,Anzahl Bewerber,darunter Frauen,plz,city_id,Land,bl_kuerzel,p_female
0,Stuttgart,2019.0,Sachbeschadigung,STGB-303,2019-07-25,Links,0.0,AfD,BW,2019,...,8.0,10.0,16.0,60.0,16.0,70173.0,8111000.0,8.0,BW,0.266667
1,Stuttgart,2019.0,Verleumdung ohne Verleumdung ohne sexuelle Gru...,STGB-187,2019-07-25,Links,0.0,AfD,BW,2019,...,8.0,10.0,16.0,60.0,16.0,70173.0,8111000.0,8.0,BW,0.266667
2,Stuttgart,2019.0,Volksverhetzung,STGB-130,2019-08-12,Rechts,1.0,AfD,BW,2019,...,8.0,6.0,16.0,60.0,16.0,70173.0,8111000.0,8.0,BW,0.266667
3,Stuttgart,2019.0,Sachbeschadigung,STGB-303,2019-08-26,Links,0.0,AfD,BW,2019,...,8.0,10.0,16.0,60.0,16.0,70173.0,8111000.0,8.0,BW,0.266667
4,Stuttgart,2019.0,Sachbeschadigung,STGB-303,2019-07-25,Links,0.0,AfD,BW,2019,...,8.0,10.0,16.0,60.0,16.0,70173.0,8111000.0,8.0,BW,0.266667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17781,Planetal,2014.0,,,NaT,,,SPD,BB,,...,0.0,0.0,0.0,2.0,0.0,14806.0,12069474.0,,,0.000000
17782,Planetal,2014.0,,,NaT,,,CDU,BB,,...,0.0,0.0,0.0,1.0,0.0,14806.0,12069474.0,,,0.000000
17783,Planetal,2019.0,,,NaT,,,CDU,BB,,...,0.0,0.0,0.0,1.0,1.0,14806.0,12069474.0,,,1.000000
17784,Planetal,2019.0,,,NaT,,,SPD,BB,,...,0.0,0.0,0.0,1.0,0.0,14806.0,12069474.0,,,0.000000


In [90]:
# regular summarization across cities and parties
data_crime_lr_agg = data_crime_lr.groupby(['city', 'background', 'cycle_1', 'state'], as_index=False).sum()
data_crime_lr_agg.drop(['suspects', 'cycle_2', 'cycle_3'], axis=1, inplace=True)
data_base_1 = pd.merge(data_crime_lr_agg, data_share, on=['cycle_1', 'city', 'state'], how='right', validate='m:1')[['city','background','cycle_1','state','crime_count','Anzahl Bewerber','Prozent Frauen','darunter Frauen']]
data_base_1

MergeError: Merge keys are not unique in right dataset; not a many-to-one merge

### What we have now:
* Panel data set in levels

### What we need:
* Difference in election shares

In [29]:
# Generate differences
data_panel_diff = data_base_1[data_base_1['cycle_1'] == 2014]
data_panel_diff.drop(['Anzahl Bewerber', 'Prozent Frauen', 'darunter Frauen'], axis=1, inplace=True)
data_panel_diff

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,city,party,cycle_1,state,crime_count
6444,Stuttgart,AfD,2014,BW,11.0
6445,Stuttgart,CDU,2014,BW,1.0
6446,Stuttgart,DIE LINKE,2014,BW,0.0
6447,Stuttgart,FDP,2014,BW,4.0
6448,Stuttgart,GRÜNE,2014,BW,0.0
...,...,...,...,...,...
16455,Neuhaus am Rennweg,GRÜNE,2014,TH,0.0
16456,Sonneberg,GRÜNE,2014,TH,0.0
16457,Rudolstadt,GRÜNE,2014,TH,0.0
16458,Saalfeld/Saale,GRÜNE,2014,TH,0.0


In [30]:
# drop units with only one observation

data_panel = pd.DataFrame()
for city in data_base_1['city'].unique():
    data_city = data_base_1[data_base_1['city'] == city]
    for party in data_city['party'].unique():
        data_party = data_city[data_city['party'] == party]
        
        if len(data_party) > 1:
            
            data_panel_diff.loc[
                (data_panel_diff['party'] == party) & (data_panel_diff['city'] == city),
                'fem_share_diff'] = data_base_1[(data_base_1['party'] == party) & (data_base_1['city'] == city) & (data_base_1['cycle_1'] == 2019)
                                               ]['Prozent Frauen'].values - data_base_1[(data_base_1['party'] == party) & (data_base_1['city'] == city) & (data_base_1['cycle_1'] == 2014)
                                                                                       ]['Prozent Frauen'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.


In [31]:
data_panel_diff['crime_count'] = data_panel_diff['crime_count'].fillna(0)
data_panel_diff.to_csv(r'C:\Users\mariu\Documents\pol_viol\pol_viol\data_election_crime_merged.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
