In [12]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy

#import packages
import pandas as pd
import numpy as np
import datetime as dt



In [123]:
# Import data
data_crime = pd.read_csv(r'https://raw.githubusercontent.com/mariusgruenewald/pol_viol/main/crime_data_edited.csv')
data_share = pd.read_csv(r'https://raw.githubusercontent.com/mariusgruenewald/pol_viol/main/election_data_edited.csv')

In [124]:
# get into datetime format
data_crime['year'] = data_crime['date'].astype(str).str.split('.', expand=True)[2]
data_crime['month'] = data_crime['date'].astype(str).str.split('.', expand=True)[1]
data_crime['day'] = data_crime['date'].astype(str).str.split('.', expand=True)[0]
data_crime['date'] = pd.to_datetime(data_crime[['year', 'month', 'day']].astype(float), errors='ignore')

In [125]:
# Merge crime and candidate statistics with merge over city string
data_crime = data_crime[(data_crime['Land'] == 14) | (data_crime['Land'] == 8) | (data_crime['Land'] == 16)]
data_share['city'] = data_share['city'].str.replace(r'Stadt ', '')
data_share['city'] = data_share['city'].str.replace('Ä', 'A').str.replace('Ö', 'O').str.replace('Ü', 'U')

list_cities_share = data_share.sort_values('city')['city'].unique()
list_cities_crime = data_crime['city'].unique()

data_crime.rename({'Land':'state'}, axis=1, inplace=True)
data_crime['state'] = data_crime['state'].map({16:'TH', 8:'BW', 14:'SN'})

In [126]:
data_share.rename({'year':'cycle_1'}, axis=1, inplace=True)
data_share['cycle_2'] = data_share['cycle_1']
data_share['cycle_3'] = data_share['cycle_1']

Merging over years and cities? How many different cutoffs? Directly on election day, one month before and three months before

Cut-off days for the three states: 05.26.2019, 04.26.2019, 02.26.2019

In [127]:
# creating different cycle variables in crime data to merge over
#
data_crime['cycle_1'] = 2014
data_crime.loc[data_crime['date'] > dt.datetime(2019, 5, 26, 0, 0), 'cycle_1'] = 2019

data_crime['cycle_2'] = 2014
data_crime.loc[data_crime['date'] > dt.datetime(2019, 4, 26, 0, 0), 'cycle_2'] = 2019

data_crime['cycle_3'] = 2014
data_crime.loc[data_crime['date'] > dt.datetime(2019, 2, 26, 0, 0), 'cycle_3'] = 2019

In [128]:
list_double = pd.DataFrame()
for city in data_share['city'].unique():
    
    data_city = data_share[data_share['city'] == city]
    for party in data_city['party'].unique():
        
        data_party = data_city[data_city['party'] == party]
        for election in data_party['cycle_1'].unique():
            
            data_election = data_party[data_party['cycle_1'] == election]
            
            if len(data_election) > 1:
                
                # append to dataset of 
                list_double = list_double.append(data_election)
                

In [129]:
# drop cities that are not uniquely identifable
data_share = data_share[~data_share.city.isin(list_double.city)]

In [130]:
# Generate variable for easier aggregation
data_crime['crime_count'] = 0
data_crime.loc[data_crime['crime'] != 0, 'crime_count'] = 1
data_crime.drop('Unnamed: 0', axis=1, inplace=True)

In [131]:
# regular summarization across cities and parties
data_crime_high_agg = data_crime.groupby(['city', 'party', 'cycle_1', 'state'], as_index=False).sum()
data_crime_high_agg.drop(['suspects', 'cycle_2', 'cycle_3'], axis=1, inplace=True)

# taking background into account
data_crime_low_agg = data_crime.groupby(['city', 'party', 'cycle_1', 'background', 'state'], as_index=False).sum()
data_crime_low_agg.drop(['suspects', 'cycle_2', 'cycle_3'], axis=1, inplace=True)

# taking crime type into account
data_crime_low_agg2 = data_crime.groupby(['city', 'party', 'cycle_1', 'crime', 'state'], as_index=False).sum()
data_crime_low_agg2.drop(['suspects', 'cycle_2', 'cycle_3'], axis=1, inplace=True)

# Taking both factors into account
data_crime_vlow_agg = data_crime.groupby(['city', 'party', 'cycle_1', 'crime', 'background', 'state'], as_index=False).sum()
data_crime_vlow_agg.drop(['suspects', 'cycle_2', 'cycle_3'], axis=1, inplace=True)

In [136]:
# Merge crime with share data
data_base_1 = pd.merge(data_crime_high_agg, data_share, on=['cycle_1', 'city', 'party', 'state'], how='right', validate='m:1')[['city','party','cycle_1','state','crime_count','Anzahl Bewerber','Prozent Frauen','darunter Frauen']]
data_base_2 = pd.merge(data_crime_low_agg, data_share, on=['cycle_1', 'city', 'party', 'state'], how='right', validate='m:1')[['city','party','cycle_1','background','state','crime_count','Anzahl Bewerber','Prozent Frauen','darunter Frauen']]
data_base_3 = pd.merge(data_crime_low_agg2, data_share, on=['cycle_1', 'city', 'party', 'state'], how='right', validate='m:1')[['city','party','cycle_1','crime','state','crime_count','Anzahl Bewerber','Prozent Frauen','darunter Frauen']]
data_base_4 = pd.merge(data_crime_vlow_agg, data_share, on=['cycle_1', 'city', 'party', 'state'], how='right', validate='m:1')[['city','party','cycle_1','crime','background','state','crime_count','Anzahl Bewerber','Prozent Frauen','darunter Frauen']]

In [137]:
# fill in non-affected crime cities
for data in (data_base_1,data_base_2,data_base_3,data_base_4):
    data['crime_count'].fillna(0, inplace=True)

Unnamed: 0,city,party,cycle_1,crime,background,state,crime_count,Anzahl Bewerber,Prozent Frauen,darunter Frauen
0,Stuttgart,AfD,2019,Sachbeschadigung,Links,BW,4.0,60.0,26.666667,16.0
1,Stuttgart,AfD,2019,Verleumdung ohne Verleumdung ohne sexuelle Gru...,Links,BW,1.0,60.0,26.666667,16.0
2,Stuttgart,AfD,2019,Verleumdung ohne sexuelle Grundlage,Links,BW,1.0,60.0,26.666667,16.0
3,Stuttgart,AfD,2019,Volksverhetzung,Rechts,BW,2.0,60.0,26.666667,16.0
4,Stuttgart,CDU,2019,Verleumdung ohne sexuelle Grundlage,Rechts,BW,2.0,60.0,41.666667,25.0
...,...,...,...,...,...,...,...,...,...,...
16699,Neuhaus am Rennweg,GRÜNE,2014,,,TH,,1.0,100.000000,1.0
16700,Sonneberg,GRÜNE,2014,,,TH,,7.0,14.285714,1.0
16701,Rudolstadt,GRÜNE,2014,,,TH,,10.0,40.000000,4.0
16702,Saalfeld/Saale,GRÜNE,2014,,,TH,,13.0,38.461538,5.0


### What we have now:
* Panel data set in levels

### What we need:
* Difference in election shares

In [230]:
# Generate differences
data_panel_diff = data_base_1[data_base_1['cycle_1'] == 2014]
data_panel_diff.drop(['Anzahl Bewerber', 'Prozent Frauen', 'darunter Frauen'], axis=1, inplace=True)
data_panel_diff

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,city,party,cycle_1,state,crime_count
6444,Stuttgart,AfD,2014,BW,11.0
6445,Stuttgart,CDU,2014,BW,1.0
6446,Stuttgart,DIE LINKE,2014,BW,
6447,Stuttgart,FDP,2014,BW,4.0
6448,Stuttgart,GRÜNE,2014,BW,
...,...,...,...,...,...
16455,Neuhaus am Rennweg,GRÜNE,2014,TH,
16456,Sonneberg,GRÜNE,2014,TH,
16457,Rudolstadt,GRÜNE,2014,TH,
16458,Saalfeld/Saale,GRÜNE,2014,TH,


In [231]:
# drop units with only one observation

data_panel = pd.DataFrame()
for city in data_base_1['city'].unique():
    data_city = data_base_1[data_base_1['city'] == city]
    for party in data_city['party'].unique():
        data_party = data_city[data_city['party'] == party]
        
        if len(data_party) > 1:
            
            data_panel_diff.loc[
                (data_panel_diff['party'] == party) & (data_panel_diff['city'] == city),
                'fem_share_diff'] = data_base_1[(data_base_1['party'] == party) & (data_base_1['city'] == city) & (data_base_1['cycle_1'] == 2019)
                                               ]['Prozent Frauen'].values - data_base_1[(data_base_1['party'] == party) & (data_base_1['city'] == city) & (data_base_1['cycle_1'] == 2014)
                                                                                       ]['Prozent Frauen'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.


In [232]:
data_panel_diff['crime_count'] = data_panel_diff['crime_count'].fillna(0)
data_panel_diff.to_csv(r'C:\Users\mariu\Documents\pol_viol\pol_viol\data_election_crime_merged.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
