In [2]:
# import sys
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install statsmodels
# !{sys.executable} -m pip install tabulate
# !{sys.executable} -m pip install httpimport

import pandas as pd
import statsmodels.api as sm
import numpy as np
from statsmodels.iolib.summary2 import summary_col

In [9]:
data_panel_diff = pd.read_csv(r'https://raw.githubusercontent.com/mariusgruenewald/pol_viol/main/data_election_crime_merged.csv')

In [11]:
data_panel_diff = pd.concat([data_panel_diff, pd.get_dummies(data_panel_diff['party'], prefix='party')],axis=1)
data_panel_diff = pd.concat([data_panel_diff, pd.get_dummies(data_panel_diff['state'], prefix='bl')],axis=1)

In [10]:
data_panel_diff['state'].unique()

array(['BW', 'SN', 'TH', 'BB'], dtype=object)

### Let's consider the standard measure of crime (against a party in a city)

In [12]:
data_panel_diff = data_panel_diff[data_panel_diff['cycle_1'] == 2014]
data_panel_diff = data_panel_diff.drop_duplicates(['city', 'party', 'plz', 'state', 'p_female_diff', 'crime_count_party'], ignore_index=True)
data_panel_diff

Unnamed: 0.1,Unnamed: 0,crime,city,law,date,background,suspects,party,state,plz,...,party_Piraten,party_Pro Deutschland,party_REP,party_SPD,party_SPD.1,party_Tierschutzpartei,bl_BB,bl_BW,bl_SN,bl_TH
0,17,Beleidigung,Stuttgart,185 StGB,2019-01-09,Links,1.0,AfD,BW,70173.0,...,0,0,0,0,0,0,0,1,0,0
1,19,,Stuttgart,,,,,CDU,BW,70173.0,...,0,0,0,0,0,0,0,1,0,0
2,20,,Stuttgart,,,,,DIE LINKE,BW,70173.0,...,0,0,0,0,0,0,0,1,0,0
3,21,,Stuttgart,,,,,FDP,BW,70173.0,...,0,0,0,0,0,0,0,1,0,0
4,22,,Stuttgart,,,,,GRÜNE,BW,70173.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9030,20336,,Pinnow,,,,,DIE LINKE,BB,16278.0,...,0,0,0,0,0,0,1,0,0,0
9031,20337,,Pinnow,,,,,CDU,BB,16278.0,...,0,0,0,0,0,0,1,0,0,0
9032,20338,,Pinnow,,,,,GRÜNE/B90,BB,16278.0,...,0,0,0,0,0,0,1,0,0,0
9033,20342,,Passow,,,,,CDU,BB,16306.0,...,0,0,0,0,0,0,1,0,0,0


In [13]:
data_panel_diff['p_female_diff'] = data_panel_diff['p_female_diff']*100

In [14]:
data_panel_diff['p_female_diff'].mean()

1.7664718018598529

In [15]:
data_panel_diff[['p_female_diff', 'city', 'crime_count_party']]

Unnamed: 0,p_female_diff,city,crime_count_party
0,6.666667,Stuttgart,14.0
1,-5.000000,Stuttgart,1.0
2,1.666667,Stuttgart,0.0
3,-1.666667,Stuttgart,4.0
4,0.000000,Stuttgart,1.0
...,...,...,...
9030,20.000000,Pinnow,0.0
9031,5.555556,Pinnow,0.0
9032,,Pinnow,0.0
9033,0.000000,Passow,0.0


In [170]:
print("Conditional means based on party (no restriction to wether a crime has happened or whether the party stood twice)")
data_panel_diff.groupby(['party'], as_index=False).mean()[['party', 'crime_count_party', 'p_female_diff']], len(data_panel_diff[~data_panel_diff['p_female_diff'].isna()])

Conditional means based on party (no restriction to wether a crime has happened or whether the party stood twice)


(       party  crime_count_party  p_female_diff
 0        AfD           0.340058       5.092337
 1        CDU           0.075219       0.876552
 2  DIE LINKE           0.000000       2.532890
 3        FDP           0.031177       0.588290
 4      GRÜNE           0.068636       2.112651
 5        SPD           0.139840       0.964442, 3017)

In [16]:
# Drop missing values
data_panel_diff.dropna(subset=['p_female_diff', 'crime_count_party'], inplace=True)
print("Conditional means based on party when they stood for office twice")
data_panel_diff.groupby(['party'], as_index=False).mean()[['party', 'crime_count_party', 'p_female_diff']], len(data_panel_diff[~data_panel_diff['p_female_diff'].isna()])

Conditional means based on party when they stood for office twice


(        party  crime_count_party  p_female_diff
 0         AfD           5.702128       6.342469
 1         CDU           0.106383       1.378859
 2        CDU            0.000000      -4.166667
 3   DIE LINKE           0.000000       2.876802
 4         DKP           0.000000     100.000000
 5   Die Linke           0.000000      16.666667
 6     FAMILIE           0.000000     -10.000000
 7         FDP           0.121951       1.608198
 8       GRÜNE           0.226300       2.148809
 9   GRÜNE/B90           0.000000       9.930556
 10        NPD           0.000000      -9.090909
 11       NPD            0.000000       0.000000
 12    PIRATEN           0.000000      24.000000
 13        SPD           0.267215       1.324384,
 3785)

In [150]:
X = data_panel_diff['crime_count_party']
y = data_panel_diff['p_female_diff']
X = sm.add_constant(X)
model1 = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': data_panel_diff['party_lr']})
model1.summary()

0,1,2,3
Dep. Variable:,p_female_diff,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,18.63
Date:,"Sat, 11 Sep 2021",Prob (F-statistic):,0.145
Time:,18:14:20,Log-Likelihood:,-12683.0
No. Observations:,3017,AIC:,25370.0
Df Residuals:,3015,BIC:,25380.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.2169,0.369,3.302,0.001,0.495,1.939
crime_count_party,0.0522,0.012,4.316,0.000,0.028,0.076

0,1,2,3
Omnibus:,460.807,Durbin-Watson:,2.044
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5299.815
Skew:,0.333,Prob(JB):,0.0
Kurtosis:,9.459,Cond. No.,2.32


In [151]:
model2 = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': data_panel_diff['party']})
model2.summary()

0,1,2,3
Dep. Variable:,p_female_diff,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,2.716
Date:,"Sat, 11 Sep 2021",Prob (F-statistic):,0.16
Time:,18:14:32,Log-Likelihood:,-12683.0
No. Observations:,3017,AIC:,25370.0
Df Residuals:,3015,BIC:,25380.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.2169,0.277,4.397,0.000,0.674,1.759
crime_count_party,0.0522,0.032,1.648,0.099,-0.010,0.114

0,1,2,3
Omnibus:,460.807,Durbin-Watson:,2.044
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5299.815
Skew:,0.333,Prob(JB):,0.0
Kurtosis:,9.459,Cond. No.,2.32


In [153]:
print(summary_col([model1,model2]))
# A increase by one crime against a party is associated with an 0.0005% higher share of females in the following period. 


                  p_female_diff I p_female_diff II
--------------------------------------------------
const             1.2169          1.2169          
                  (0.3685)        (0.2768)        
crime_count_party 0.0522          0.0522          
                  (0.0121)        (0.0317)        
Standard errors in parentheses.


In [154]:
data_panel_diff2 = data_panel_diff.drop_duplicates(['city', 'plz', 'state', 'p_female_city_diff', 'crime_count_city'], ignore_index=True)
data_panel_diff2.dropna(subset=['p_female_city_diff', 'crime_count_city'], inplace=True)

X2 = data_panel_diff2["crime_count_city"]
y = data_panel_diff2["p_female_city_diff"]
X2 = sm.add_constant(X2)
model4 = sm.OLS(y, X2).fit(cov_type='cluster', cov_kwds={'groups': data_panel_diff2['party_lr']})
model4.summary()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0,1,2,3
Dep. Variable:,p_female_city_diff,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.4491
Date:,"Sat, 11 Sep 2021",Prob (F-statistic):,0.624
Time:,18:18:12,Log-Likelihood:,1053.7
No. Observations:,1386,AIC:,-2103.0
Df Residuals:,1384,BIC:,-2093.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0010,0.002,0.427,0.670,-0.004,0.006
crime_count_city,6.721e-05,0.000,0.670,0.503,-0.000,0.000

0,1,2,3
Omnibus:,278.124,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2307.081
Skew:,-0.689,Prob(JB):,0.0
Kurtosis:,9.169,Cond. No.,6.05


In [155]:
X2 = data_panel_diff2["crime_count_city"]
y = data_panel_diff2["p_female_city_diff"]
X2 = sm.add_constant(X2)
model4 = sm.OLS(y, X2).fit(cov_type='cluster', cov_kwds={'groups': data_panel_diff2['party']})
model4.summary()

0,1,2,3
Dep. Variable:,p_female_city_diff,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.3571
Date:,"Sat, 11 Sep 2021",Prob (F-statistic):,0.576
Time:,18:18:14,Log-Likelihood:,1053.7
No. Observations:,1386,AIC:,-2103.0
Df Residuals:,1384,BIC:,-2093.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0010,0.002,0.458,0.647,-0.003,0.005
crime_count_city,6.721e-05,0.000,0.598,0.550,-0.000,0.000

0,1,2,3
Omnibus:,278.124,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2307.081
Skew:,-0.689,Prob(JB):,0.0
Kurtosis:,9.169,Cond. No.,6.05


In [158]:
data_panel_lr = data_panel_diff.drop_duplicates(['city', 'party', 'plz', 'state', 'p_female_party_lr', 'crime_count_party_lr'], ignore_index=True)
data_panel_lr.dropna(subset=['p_female_party_lr', 'crime_count_party_lr'], inplace=True)

X2 = data_panel_lr["crime_count_party_lr"]
y = data_panel_lr["p_female_party_lr"]
X2 = sm.add_constant(X2)
model4 = sm.OLS(y, X2).fit(cov_type='HC3')
model4.summary()

0,1,2,3
Dep. Variable:,p_female_party_lr,R-squared:,0.037
Model:,OLS,Adj. R-squared:,-0.014
Method:,Least Squares,F-statistic:,0.3286
Date:,"Sat, 11 Sep 2021",Prob (F-statistic):,0.573
Time:,18:18:59,Log-Likelihood:,10.958
No. Observations:,21,AIC:,-17.92
Df Residuals:,19,BIC:,-15.83
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2664,0.041,6.467,0.000,0.186,0.347
crime_count_party_lr,-0.0014,0.002,-0.573,0.566,-0.006,0.003

0,1,2,3
Omnibus:,0.792,Durbin-Watson:,1.368
Prob(Omnibus):,0.673,Jarque-Bera (JB):,0.751
Skew:,-0.209,Prob(JB):,0.687
Kurtosis:,2.173,Cond. No.,25.5


In [159]:
data_panel_diff_left = data_panel_diff[data_panel_diff['background'] == 'Rechts']
#data_panel_diff_left = data_panel_diff_left.dropna(subset=['p_female_diff'])
data_panel_diff_left

Unnamed: 0.1,Unnamed: 0,crime,city,law,date,background,suspects,party,state,plz,...,p_female_diff,p_female_city_diff,p_female_lr_diff,p_female_party_lr_diff,party_AfD,party_CDU,party_DIE LINKE,party_FDP,party_GRÜNE,party_SPD
2662,5442,Volksverhetzung,Nussloch,130 StGB,2019-02-21,Rechts,1.0,GRÜNE,BW,69226.0,...,-11.111111,-0.074074,,,0,0,0,0,1,0
4403,9030,Gemeinschadliche Sachbeschadigung Sachbeschadi...,Konstanz,304 StGB,2019-01-14,Rechts,0.0,SPD,BW,78462.0,...,0.0,0.075,,,0,0,0,0,0,1
6164,12820,Bedrohung,Chemnitz,241 StGB,2019-01-10,Rechts,1.0,SPD,SN,9111.0,...,3.769841,0.045759,0.042815,0.037698,0,0,0,0,0,1
6173,12832,offentliche Aufforderung zu Straftaten Aufford...,Dresden,111 StGB,2018-08-27,Rechts,0.0,AfD,SN,1067.0,...,3.678161,-0.005475,0.257281,0.036782,1,0,0,0,0,0
6176,12847,Verwenden von Verwenden von Kennzeichen verfas...,Leipzig,86a StGB,2019-01-07,Rechts,0.0,SPD,SN,4109.0,...,12.844037,0.070671,0.115274,0.12844,0,0,0,0,0,1
6878,13559,Verstoß gegen das Versammlungsgesetz gegen das...,Gorlitz,VersG,2019-01-30,Rechts,4.0,CDU,SN,2826.0,...,9.615385,-0.090937,0.096154,0.096154,0,1,0,0,0,0
7850,17671,Sachbeschadigung,Meiningen,303 StGB,2019-01-10,Rechts,0.0,SPD,TH,98617.0,...,-4.367816,-0.019982,,,0,0,0,0,0,1


In [95]:
%run estout_func.py

In [160]:
estout_ols(modellist=[model1, model2], modellist_str = ["model1","model2"], y='Share Female Candidates', caption="Basic Regression", label="Basic Regression", list_regression_sets=[X,X], p_values=True)

\begin{table}[htbp] \caption{Basic Regression \label{Basic Regression}}
\resizebox{0.9\textwidth}{!}{ \centering
\begin{tabular}{lcc} \hline
  & (I) & (II) \\ 
Dependent Variable & \multicolumn{2}{c}{Share Female Candidates} \\ \hline \vspace{4pt} 
& \begin{footnotesize}\end{footnotesize} & \begin{footnotesize}\end{footnotesize}  \\ 
const & 1.2169*** & 1.2169*** \\ 
 \vspace{4pt} & \begin{footnotesize}(0.3685) \end{footnotesize} & \begin{footnotesize}(0.2768) \end{footnotesize} \\ 
crime_count_party & 0.0522*** & 0.0522* \\ 
 \vspace{4pt} & \begin{footnotesize}(0.0121) \end{footnotesize} & \begin{footnotesize}(0.0317) \end{footnotesize} \\ 
Observations & 3017 & 3017 \\ 
Adj. $R^2$ & -0.0003 & -0.0003 \\ 
F-statistic & [[18.62865534]] & [[2.71583122]] \\ 
Mean Squared Error & 43.10309127706569 & 43.10309127706569 \\ \hline
\multicolumn{3}{c}{\begin{footnotesize} model1 has cluster,  model2 has cluster, standard errors in parentheses. \end{footnotesize}} \\ 
\multicolumn{3}{c}{\begin{f

In [184]:
data_panel_diff.columns

Index(['Unnamed: 0', 'crime', 'city', 'law', 'date', 'background', 'suspects',
       'party', 'state', 'plz', 'city_id', 'bl_kuerzel', 'year', 'month',
       'day', 'cycle_1', 'cycle_2', 'cycle_3', 'crime_count',
       'Anzahl Bewerber', 'darunter Frauen', 'Land', 'crime_count_party_lr',
       'crime_count_party', 'crime_count_lr', 'crime_count_city', 'party_lr',
       'Bewerber_city', 'Frauen_city', 'bewerber_hit_by_lr', 'fem_hit_by_lr',
       'bewerber_party_hit_by_lr', 'fem_party_hit_by_lr', 'p_female',
       'p_female_lr', 'p_female_city', 'p_female_party_lr', 'p_female_diff',
       'p_female_city_diff', 'p_female_lr_diff', 'p_female_party_lr_diff',
       'party_AfD', 'party_CDU', 'party_DIE LINKE', 'party_FDP', 'party_GRÜNE',
       'party_SPD', 'bl_BW', 'bl_SN', 'bl_TH'],
      dtype='object')

In [185]:
X3 = data_panel_diff[["crime_count_party", "party_GRÜNE", "party_SPD", "party_CDU", "party_FDP", "party_DIE LINKE", 'bl_BW', 'bl_SN', 'bl_TH']]
y2 = data_panel_diff["p_female_diff"]
X3 = sm.add_constant(X3)
model6 = sm.OLS(y2, X3).fit(cov_type='cluster', cov_kwds={'groups': data_panel_diff['party_lr']})
model6.summary()



0,1,2,3
Dep. Variable:,p_female_diff,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,-397200000000000.0
Date:,"Sat, 11 Sep 2021",Prob (F-statistic):,1.0
Time:,18:23:24,Log-Likelihood:,-12680.0
No. Observations:,3017,AIC:,25380.0
Df Residuals:,3008,BIC:,25430.0
Df Model:,8,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,3.8087,0.307,12.407,0.000,3.207,4.410
crime_count_party,-0.0126,0.011,-1.116,0.264,-0.035,0.010
party_GRÜNE,-2.9197,0.453,-6.442,0.000,-3.808,-2.031
party_SPD,-4.0427,0.487,-8.303,0.000,-4.997,-3.088
party_CDU,-4.1794,0.430,-9.711,0.000,-5.023,-3.336
party_FDP,-4.5260,0.353,-12.810,0.000,-5.218,-3.834
party_DIE LINKE,-2.7783,0.105,-26.376,0.000,-2.985,-2.572
bl_BW,1.0557,0.367,2.873,0.004,0.336,1.776
bl_SN,1.7768,0.568,3.129,0.002,0.664,2.890

0,1,2,3
Omnibus:,440.475,Durbin-Watson:,2.044
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5219.576
Skew:,0.265,Prob(JB):,0.0
Kurtosis:,9.422,Cond. No.,3140000000000000.0


In [172]:
X3 = data_panel_party[["crime_count_party", "party_CDU", "party_AfD"]]
y2 = data_panel_party["p_female_diff"]
X3 = sm.add_constant(X3)
model7 = sm.OLS(y2, X3).fit(cov_type='cluster', cov_kwds={'groups': data_panel_diff_nogreens['party']})
model7.summary()

NameError: name 'data_panel_diff_nogreens' is not defined

In [56]:
data_panel_diff_balanced_noafd = data_panel_diff[data_panel_diff['party_AfD'] != 1]
X4 = data_panel_diff_balanced_noafd[["crime_count_party", "party_SPD", "party_FDP", "party_CDU", "party_GRÜNE"]]
y3 = data_panel_diff_balanced_noafd["p_female_diff"]
X4 = sm.add_constant(X4)
model6 = sm.OLS(y3, X4).fit(cov_type='cluster', cov_kwds={'groups': data_panel_diff_balanced_noafd['party']})
model6.summary()



0,1,2,3
Dep. Variable:,p_female_diff,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,28.12
Date:,"Wed, 01 Sep 2021",Prob (F-statistic):,0.00607
Time:,17:37:25,Log-Likelihood:,1199.5
No. Observations:,2999,AIC:,-2387.0
Df Residuals:,2993,BIC:,-2351.0
Df Model:,5,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0253,5.97e-17,4.24e+14,0.000,0.025,0.025
crime_count_party,0.0008,0.002,0.495,0.621,-0.002,0.004
party_SPD,-0.0159,0.000,-39.055,0.000,-0.017,-0.015
party_FDP,-0.0196,0.000,-81.614,0.000,-0.020,-0.019
party_CDU,-0.0166,0.000,-102.290,0.000,-0.017,-0.016
party_GRÜNE,-0.0044,0.000,-11.638,0.000,-0.005,-0.004

0,1,2,3
Omnibus:,447.035,Durbin-Watson:,2.046
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5199.069
Skew:,0.3,Prob(JB):,0.0
Kurtosis:,9.422,Cond. No.,7.93
