In [1]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy

#import packages
import pandas as pd
import numpy as np
import datetime as dt



In [2]:
# Import data
data_crime = pd.read_csv(r'https://raw.githubusercontent.com/mariusgruenewald/pol_viol/main/crime_data_edited.csv')
data_share = pd.read_csv(r'https://raw.githubusercontent.com/mariusgruenewald/pol_viol/main/election_data_edited.csv')

In [3]:
# get into datetime format
data_crime['year'] = data_crime['date'].astype(str).str.split('.', expand=True)[2]
data_crime['month'] = data_crime['date'].astype(str).str.split('.', expand=True)[1]
data_crime['day'] = data_crime['date'].astype(str).str.split('.', expand=True)[0]
data_crime['date'] = pd.to_datetime(data_crime[['year', 'month', 'day']].astype(float), errors='ignore')

In [4]:
# Merge crime and candidate statistics with merge over city string
data_crime = data_crime[(data_crime['Land'] == 14) | (data_crime['Land'] == 8) | (data_crime['Land'] == 16) | (data_crime['Land'] == 12)]
data_share['city'] = data_share['city'].str.replace(r'Stadt ', '')
data_share['city'] = data_share['city'].str.replace('Ä', 'A').str.replace('Ö', 'O').str.replace('Ü', 'U')

list_cities_share = data_share.sort_values('city')['city'].unique()
list_cities_crime = data_crime['city'].unique()

data_crime.rename({'Land':'state'}, axis=1, inplace=True)
data_crime['state'] = data_crime['state'].map({16:'TH', 8:'BW', 14:'SN', 12:'BB'})

In [8]:
data_share.rename({'year':'cycle_1'}, axis=1, inplace=True)
data_share['cycle_2'] = data_share['cycle_1']
data_share['cycle_3'] = data_share['cycle_1']

Merging over years and cities? How many different cutoffs? Directly on election day, one month before and three months before

Cut-off days for the three states: 05.26.2019, 04.26.2019, 02.26.2019

In [9]:
# creating different cycle variables in crime data to merge over
#
data_crime['cycle_1'] = 2014
data_crime.loc[data_crime['date'] > dt.datetime(2019, 5, 26, 0, 0), 'cycle_1'] = 2019

data_crime['cycle_2'] = 2014
data_crime.loc[data_crime['date'] > dt.datetime(2019, 4, 26, 0, 0), 'cycle_2'] = 2019

data_crime['cycle_3'] = 2014
data_crime.loc[data_crime['date'] > dt.datetime(2019, 2, 26, 0, 0), 'cycle_3'] = 2019

In [10]:
# Find 
list_double = pd.DataFrame()
for city in data_share['city'].unique():
    
    data_city = data_share[data_share['city'] == city]
    for party in data_city['party'].unique():
        
        data_party = data_city[data_city['party'] == party]
        for election in data_party['cycle_1'].unique():
            
            data_election = data_party[data_party['cycle_1'] == election]
            
            if len(data_election) > 1:
                
                # append to dataset of 
                list_double = list_double.append(data_election)
                

In [11]:
# drop cities that are not uniquely identifable
data_share = data_share[~data_share.city.isin(list_double.city)]
list_double_cities = data_share[data_share.city.isin(list_double.city)]['city'].unique()

In [24]:
list_double['city'].unique()

array(['Altdorf', 'Hochdorf', 'Durnau', 'Urbach', 'Talheim', 'Waldenburg',
       'Rosenberg', 'Westhausen', 'Malsch', 'Sulzfeld', 'Lichtenau',
       'Limbach', 'Walldorf', 'Schomberg', 'Seebach', 'Steinach',
       'Hohenstein', 'Altheim', 'Neukirch', 'Bernsdorf', 'Hirschfeld',
       'Reinsdorf', 'Leutersdorf', 'Bohlen', 'Heideland', 'Mittenwalde'],
      dtype=object)

In [33]:
data_crime[data_crime['city']=='Leutersdorf']

Unnamed: 0.1,Unnamed: 0,crime,city,law,date,background,suspects,party,state,year,month,day,cycle_1,cycle_2,cycle_3


In [184]:
base_city = pd.DataFrame(data_share['city'].unique(), columns=['city'])
base_city['cycle_1'] = 2014
base_city_2 = base_city.copy()
base_city_2['cycle_1'] = 2019
base_city = base_city.append(base_city_2)
base_city.reset_index(drop=True)

Unnamed: 0,city,cycle_1
0,Stuttgart,2014
1,Aidlingen,2014
2,Bondorf,2014
3,Boblingen,2014
4,Deckenpfronn,2014
...,...,...
4613,Gross Schacksdorf-Simmersdorf,2019
4614,Zossen,2019
4615,Uckerfelde,2019
4616,Zichow,2019


In [185]:
# Generate crime_count variable for easier aggregation
data_crime['crime_count'] = 0
data_crime.loc[~data_crime['crime'].isna(), 'crime_count'] = 1
data_crime.drop('Unnamed: 0', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [186]:
data_crime.dropna(subset=['party'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Generate various measures of crime

In [187]:
# regular summarization across cities and parties
data_crime_base = data_crime.groupby(['city', 'background', 'party', 'cycle_1', 'state'], as_index=False).sum()
data_crime_party = data_crime.groupby(['city', 'party', 'cycle_1', 'state'], as_index=False).sum()
data_crime_lr = data_crime[(data_crime['background'] == 'Links') | (data_crime['background'] == 'Rechts') | (data_crime['background'].isna())] #drop unidentified crime for left-right analysis
data_crime_lr_merge= data_crime_lr.groupby(['city', 'background', 'cycle_1', 'state'], as_index=False).sum()
data_crime_city = data_crime.groupby(['city', 'cycle_1', 'state'], as_index=False).sum()
data_crime_base.rename({'crime_count':'crime_count_party_lr'}, axis=1, inplace=True)
data_crime_party.rename({'crime_count':'crime_count_party'}, axis=1, inplace=True)
data_crime_lr_merge.rename({'crime_count':'crime_count_lr'}, axis=1, inplace=True)
data_crime_city.rename({'crime_count':'crime_count_city'}, axis=1, inplace=True)

In [188]:
crime_master = pd.merge(base_city, data_crime, on=['city', 'cycle_1'], how='left', validate='1:m')
crime_master = pd.merge(crime_master, data_crime_base[['city', 'background', 'party', 'cycle_1', 'state', 'crime_count_party_lr']], on=['city', 'background', 'party', 'cycle_1', 'state'], how='left', validate='m:1')
crime_master = pd.merge(crime_master, data_crime_party[['city', 'party', 'cycle_1', 'state', 'crime_count_party']], on=['city', 'party', 'cycle_1', 'state'], how='left', validate='m:1')
crime_master = pd.merge(crime_master, data_crime_lr_merge[['city', 'background', 'cycle_1', 'state', 'crime_count_lr']], on=['city', 'background', 'cycle_1', 'state'], how='left', validate='m:1')
crime_master = pd.merge(crime_master, data_crime_city[['city', 'cycle_1', 'state', 'crime_count_city']], on=['city', 'cycle_1', 'state'], how='left', validate='m:1')

In [196]:
len(crime_master[crime_master['crime_count'].isna()])

4030

In [197]:
crime_master

Unnamed: 0,city,cycle_1,crime,law,date,background,suspects,party,state,year,month,day,cycle_2,cycle_3,crime_count,crime_count_party_lr,crime_count_party,crime_count_lr,crime_count_city
0,Stuttgart,2014,Korperverletzung,223 StGB,2019-04-20,Links,0.0,AfD,BW,2019,04,20,2014.0,2019.0,1.0,10.0,14.0,13.0,27.0
1,Stuttgart,2014,Beleidigung,185 StGB,2019-05-06,Nicht zuzuordnen,1.0,FDP,BW,2019,05,06,2019.0,2019.0,1.0,3.0,4.0,,27.0
2,Stuttgart,2014,offentliche Aufforderung zu Straftaten,111 StGB,2019-05-18,Links,0.0,AfD,BW,2019,05,18,2019.0,2019.0,1.0,10.0,14.0,13.0,27.0
3,Stuttgart,2014,Sachbeschadigung,303 StGB,2019-05-22,Links,0.0,AfD,BW,2019,05,22,2019.0,2019.0,1.0,10.0,14.0,13.0,27.0
4,Stuttgart,2014,Uble Nachrede,186 StGB,2019-05-24,Links,0.0,AfD,BW,2019,05,24,2019.0,2019.0,1.0,10.0,14.0,13.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7982,Gross Schacksdorf-Simmersdorf,2019,,,NaT,,,,,,,,,,,,,,
7983,Zossen,2019,,,NaT,,,,,,,,,,,,,,
7984,Uckerfelde,2019,,,NaT,,,,,,,,,,,,,,
7985,Zichow,2019,,,NaT,,,,,,,,,,,,,,


### Generate aggregation of fem_share

In [190]:
data_share

Unnamed: 0.1,Unnamed: 0,city,party,Anzahl Bewerber,darunter Frauen,cycle_1,state,cycle_2,cycle_3
0,0,Stuttgart,AfD,60.0,16.0,2019,BW,2019,2019
1,1,Stuttgart,CDU,60.0,25.0,2019,BW,2019,2019
2,2,Stuttgart,DIE LINKE,60.0,29.0,2019,BW,2019,2019
3,3,Stuttgart,FDP,60.0,18.0,2019,BW,2019,2019
4,4,Stuttgart,GRÜNE,60.0,30.0,2019,BW,2019,2019
...,...,...,...,...,...,...,...,...,...
19001,19001,Pinnow,CDU,9.0,2.0,2019,BB,2019,2019
19002,19002,Pinnow,DIE LINKE,1.0,1.0,2019,BB,2019,2019
19003,19003,Pinnow,GRÜNE,1.0,0.0,2019,BB,2019,2019
19004,19004,Schoneberg,DIE LINKE,1.0,0.0,2019,BB,2019,2019


In [42]:
# Merge crime with share data
data_base_1 = pd.merge(data_crime_high_agg, data_share, on=['cycle_1', 'city', 'party', 'state'], how='right', validate='m:1')[['city','party','cycle_1','state','crime_count','Anzahl Bewerber','Prozent Frauen','darunter Frauen']]


In [43]:
# fill in non-affected crime cities
for data in (data_base_1,data_base_2,data_base_3,data_base_4):
    data['crime_count'].fillna(0, inplace=True)

In [89]:
data_share_lr 

Unnamed: 0.1,Unnamed: 0,city,party,Anzahl Bewerber,Prozent Frauen,darunter Frauen,cycle_1,state,cycle_2,cycle_3
0,0,Stuttgart,AfD,60.0,26.666667,16.0,2019,BW,2019,2019
1,1,Stuttgart,CDU,60.0,41.666667,25.0,2019,BW,2019,2019
2,2,Stuttgart,DIE LINKE,60.0,48.333333,29.0,2019,BW,2019,2019
3,3,Stuttgart,FDP,60.0,30.000000,18.0,2019,BW,2019,2019
4,4,Stuttgart,GRÜNE,60.0,50.000000,30.0,2019,BW,2019,2019
...,...,...,...,...,...,...,...,...,...,...
16848,16848,Neuhaus am Rennweg,GRÜNE,1.0,100.000000,1.0,2014,TH,2014,2014
16849,16849,Sonneberg,GRÜNE,7.0,14.285714,1.0,2014,TH,2014,2014
16850,16850,Rudolstadt,GRÜNE,10.0,40.000000,4.0,2014,TH,2014,2014
16851,16851,Saalfeld/Saale,GRÜNE,13.0,38.461538,5.0,2014,TH,2014,2014


In [90]:
# regular summarization across cities and parties
data_crime_lr_agg = data_crime_lr.groupby(['city', 'background', 'cycle_1', 'state'], as_index=False).sum()
data_crime_lr_agg.drop(['suspects', 'cycle_2', 'cycle_3'], axis=1, inplace=True)
data_base_1 = pd.merge(data_crime_lr_agg, data_share, on=['cycle_1', 'city', 'state'], how='right', validate='m:1')[['city','background','cycle_1','state','crime_count','Anzahl Bewerber','Prozent Frauen','darunter Frauen']]
data_base_1

MergeError: Merge keys are not unique in right dataset; not a many-to-one merge

### What we have now:
* Panel data set in levels

### What we need:
* Difference in election shares

In [29]:
# Generate differences
data_panel_diff = data_base_1[data_base_1['cycle_1'] == 2014]
data_panel_diff.drop(['Anzahl Bewerber', 'Prozent Frauen', 'darunter Frauen'], axis=1, inplace=True)
data_panel_diff

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,city,party,cycle_1,state,crime_count
6444,Stuttgart,AfD,2014,BW,11.0
6445,Stuttgart,CDU,2014,BW,1.0
6446,Stuttgart,DIE LINKE,2014,BW,0.0
6447,Stuttgart,FDP,2014,BW,4.0
6448,Stuttgart,GRÜNE,2014,BW,0.0
...,...,...,...,...,...
16455,Neuhaus am Rennweg,GRÜNE,2014,TH,0.0
16456,Sonneberg,GRÜNE,2014,TH,0.0
16457,Rudolstadt,GRÜNE,2014,TH,0.0
16458,Saalfeld/Saale,GRÜNE,2014,TH,0.0


In [30]:
# drop units with only one observation

data_panel = pd.DataFrame()
for city in data_base_1['city'].unique():
    data_city = data_base_1[data_base_1['city'] == city]
    for party in data_city['party'].unique():
        data_party = data_city[data_city['party'] == party]
        
        if len(data_party) > 1:
            
            data_panel_diff.loc[
                (data_panel_diff['party'] == party) & (data_panel_diff['city'] == city),
                'fem_share_diff'] = data_base_1[(data_base_1['party'] == party) & (data_base_1['city'] == city) & (data_base_1['cycle_1'] == 2019)
                                               ]['Prozent Frauen'].values - data_base_1[(data_base_1['party'] == party) & (data_base_1['city'] == city) & (data_base_1['cycle_1'] == 2014)
                                                                                       ]['Prozent Frauen'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.


In [31]:
data_panel_diff['crime_count'] = data_panel_diff['crime_count'].fillna(0)
data_panel_diff.to_csv(r'C:\Users\mariu\Documents\pol_viol\pol_viol\data_election_crime_merged.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
