#MIP Analysis

This script analyses the data generated by the merger of the ownership data and the company panel information

In [38]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

Load the datasets

In [18]:
df_merged_companies = pd.read_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_companies.csv")
df_merged_owners = pd.read_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_owners.csv")

  df_merged_companies = pd.read_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_companies.csv")
  df_merged_owners = pd.read_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_owners.csv")


Two columns indicating if the company was involved in a ownership change in any of the panel years, one for `b_crefo` and one for `ownerid`

In [19]:
df_merged_companies["is_owner_change_company"] = df_merged_companies.groupby(["companyid"])["is_owner_change"].transform('any')
df_merged_companies["is_ownerid_change_company"] = df_merged_companies.groupby(["companyid"])["is_ownerid_change"].transform('any')

Further six columns are added, in order to indicate when the ownership change took place. The periods are constructed so that there is cleary a survey before and after the acquisition (2009, whose period is 2006-2008; 2015, whose period is 2012-2014; and 2021, which emcompasses 2018-2020)

Change of ownership based on `b_crefo`: 
- 2009-2011 = `is_owner_change_2009`
- 2015-2017 = `is_owner_change_2015`
- 2009-2017 = `is_owner_change_2009_2017`

Change of ownership based on `ownerid`:
- 2009-2011 = `is_ownerid_change_2009`
- 2015-2017 = `is_ownerid_change_2015`
- 2009-2017 = `is_ownerid_change_2009_2017`

In [43]:
# b_crefo, 2009-2014
df_merged_companies["is_owner_change_2009"] = np.where((df_merged_companies["is_owner_change"] == True) & (df_merged_companies["panel_year"] > 2008) & (df_merged_companies["panel_year"] < 2012), True, False)
df_merged_companies["is_owner_change_2009"] = df_merged_companies.groupby(["companyid"])["is_owner_change_2009"].transform('any')

# b_crefo, 2015-2017
df_merged_companies["is_owner_change_2015"] = np.where((df_merged_companies["is_owner_change"] == True) & (df_merged_companies["panel_year"] > 2014) & (df_merged_companies["panel_year"] < 2018), True, False)
df_merged_companies["is_owner_change_2015"] = df_merged_companies.groupby(["companyid"])["is_owner_change_2015"].transform('any')

# b_crefo, 2009-2017
df_merged_companies["is_owner_change_2009_2017"] = np.where((df_merged_companies["is_owner_change"] == True) & (df_merged_companies["panel_year"] > 2008) & (df_merged_companies["panel_year"] < 2018), True, False)
df_merged_companies["is_owner_change_2009_2017"] = df_merged_companies.groupby(["companyid"])["is_owner_change_2009_2017"].transform('any')

# ownerid, 2009-2014
df_merged_companies["is_ownerid_change_2009"] = np.where((df_merged_companies["is_ownerid_change"] == True) & (df_merged_companies["panel_year"] > 2008) & (df_merged_companies["panel_year"] < 2012), True, False)
df_merged_companies["is_ownerid_change_2009"] = df_merged_companies.groupby(["companyid"])["is_ownerid_change_2009"].transform('any')

# ownerid, 2015-2017
df_merged_companies["is_ownerid_change_2015"] = np.where((df_merged_companies["is_ownerid_change"] == True) & (df_merged_companies["panel_year"] > 2014) & (df_merged_companies["panel_year"] < 2018), True, False)
df_merged_companies["is_ownerid_change_2015"] = df_merged_companies.groupby(["companyid"])["is_ownerid_change_2009"].transform('any')

# ownerid, 2009-2017
df_merged_companies["is_ownerid_change_2009_2017"] = np.where((df_merged_companies["is_ownerid_change"] == True) & (df_merged_companies["panel_year"] > 2008) & (df_merged_companies["panel_year"] < 2018), True, False)
df_merged_companies["is_ownerid_change_2009_2017"] = df_merged_companies.groupby(["companyid"])["is_ownerid_change_2009_2017"].transform('any')

As previously discussed in the MIP merge file, the environmental innovations survey questions changed from 2009 to 2015 and 2021, and due to that, the individual variables do not all have the same meaning depending on the year (e.g. `oekpz6` in 2015 onwards refers to noise pollution, while in 2009 it refers to soil contamination). Therefore, I will construct the variables `oekpz_avg` and `oekpd_avg`, which are averages of the other variables and can show a holistic view of eco-innovation at the respective companies.

In [21]:
eco_innovations = ["oekpz1", "oekpz2", "oekpz3", "oekpz4", "oekpz5", "oekpz6", "oekpz7", "oekpz8", "oekpz9"] 
eco_product_innovations = ["oekpd1", "oekpd2", "oekpd3", "oekpd4"]
df_merged_companies["oekpz_avg"] = df_merged_companies[eco_innovations].mean(axis=1)
df_merged_companies["oekpd_avg"] = df_merged_companies[eco_product_innovations].mean(axis=1)
df_merged_owners["oekpz_avg"] = df_merged_owners[eco_innovations].mean(axis=1)
df_merged_owners["oekpd_avg"] = df_merged_owners[eco_product_innovations].mean(axis=1)

However, in other to not lose information through the aggregation of all the variables into averages, I matched, were it was possible, the different variables throughout the years. The variable descriptions as they appear in the scientific use guide for MIP will be commented.

In [26]:
#oekpz1, 2015/2021: Reduction of energy consumption
#oekpz2, 2009: Reduction in energy consumption 
df_merged_companies["energy_consumption"] = np.where((df_merged_companies["jahr"] == 2014) | (df_merged_companies["jahr"] == 2020), df_merged_companies["oekpz1"], df_merged_companies["oekpz2"])
df_merged_owners["energy_consumption"] = np.where((df_merged_owners["jahr"] == 2014) | (df_merged_owners["jahr"] == 2020), df_merged_owners["oekpz1"], df_merged_owners["oekpz2"])

In [27]:
#oekpz2, 2015/2021: Reduction of material / water consumption
#oekpz1, 2009: Reduction in material use
df_merged_companies["material_use"] = np.where((df_merged_companies["jahr"] == 2014) | (df_merged_companies["jahr"] == 2020), df_merged_companies["oekpz2"], np.where(df_merged_companies["jahr"] == 2008, df_merged_companies["oekpz1"], None))
df_merged_owners["material_use"] = np.where((df_merged_owners["jahr"] == 2014) | (df_merged_owners["jahr"] == 2020), df_merged_owners["oekpz2"], np.where(df_merged_owners["jahr"] == 2008, df_merged_owners["oekpz1"], None))

In [28]:
#oekpz3: Reduction of CO2 emissions (2009, 2015/2021)
df_merged_companies["emissions_CO2"] = df_merged_companies["oekpz3"]
df_merged_owners["emissions_CO2"] = df_merged_owners["oekpz3"]
#oekpz4: Reduction of other air pollution (2015/2021), Reduction of other emissions (2009) 
df_merged_companies["emissions_other"] = df_merged_companies["oekpz4"]
df_merged_owners["emissions_other"] = df_merged_owners["oekpz4"]
#oekpz8: Replacement of dangerous material (2015/2021), Replament of hazardous material (2009)
df_merged_companies["hazardous_material"] = df_merged_companies["oekpz8"]
df_merged_owners["hazardous_material"] = df_merged_owners["oekpz8"]
#oekpz9: Recycling of waste / waste water / material (2015/2021), Improvement in recycling (2009)
df_merged_companies["recycling"] = df_merged_companies["oekpz9"]
df_merged_owners["recycling"] = df_merged_owners["oekpz9"] 

#Regression

In [48]:
# Filter the data for the relevant years
df_did = df_merged_companies[(df_merged_companies["jahr"] == 2008) | (df_merged_companies["jahr"] == 2014)]
df_did = df_did[["companyid", "jahr", "oekpz_avg", "is_owner_change_2009", "is_owner_change_2015", "is_owner_change_2009_2017"]]
df_did["jahr_2014"] = np.where(df_did["jahr"] == 2014, 1, 0)
df_did["is_owner_change_2009"] = np.where(df_did["is_owner_change_2009"] == True, 1, 0)

# Set up the difference-in-differences model
model_did = smf.ols(formula='oekpz_avg ~ is_owner_change_2009 + jahr_2014 + is_owner_change_2009:jahr_2014', data=df_did)

# Fit the model
results_did = model_did.fit()

# Print the model summary
print(results_did.summary())


                            OLS Regression Results                            
Dep. Variable:              oekpz_avg   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     44.85
Date:                Wed, 06 Dec 2023   Prob (F-statistic):           9.20e-29
Time:                        17:18:43   Log-Likelihood:                -7004.2
No. Observations:                8938   AIC:                         1.402e+04
Df Residuals:                    8934   BIC:                         1.404e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

In [59]:
# Filter the data for the relevant years
df_did = df_merged_companies[(df_merged_companies["jahr"] == 2020) | (df_merged_companies["jahr"] == 2014)]
df_did = df_did[["companyid", "jahr", "oekpz_avg", "is_owner_change_2009", "is_owner_change_2015", "is_owner_change_2009_2017"]]
df_did["jahr_2020"] = np.where(df_did["jahr"] == 2020, 1, 0)
df_did["is_owner_change_2015"] = np.where(df_did["is_owner_change_2015"] == True, 1, 0)

# Set up the difference-in-differences model
model_did = smf.ols(formula='oekpz_avg ~ is_owner_change_2015 + jahr_2020 + is_owner_change_2015:jahr_2020', data=df_did)

# Fit the model
results_did = model_did.fit()

# Print the model summary
print(results_did.summary())

                            OLS Regression Results                            
Dep. Variable:              oekpz_avg   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.509
Date:                Wed, 06 Dec 2023   Prob (F-statistic):              0.210
Time:                        17:32:10   Log-Likelihood:                -5161.5
No. Observations:                8213   AIC:                         1.033e+04
Df Residuals:                    8209   BIC:                         1.036e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

In [61]:
# Filter the data for the relevant years
df_did = df_merged_companies[(df_merged_companies["jahr"] == 2008) | (df_merged_companies["jahr"] == 2020)]
df_did = df_did[["companyid", "jahr", "oekpz_avg", "is_owner_change_2009", "is_owner_change_2015", "is_owner_change_2009_2017"]]
df_did["jahr_2020"] = np.where(df_did["jahr"] == 2020, 1, 0)
df_did["is_owner_change_2009_2017"] = np.where(df_did["is_owner_change_2015"] == True, 1, 0)

# Set up the difference-in-differences model
model_did = smf.ols(formula='oekpz_avg ~ is_owner_change_2009_2017 + jahr_2020 + is_owner_change_2009_2017:jahr_2020', data=df_did)

# Fit the model
results_did = model_did.fit()

# Print the model summary
print(results_did.summary())

                            OLS Regression Results                            
Dep. Variable:              oekpz_avg   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     46.48
Date:                Wed, 06 Dec 2023   Prob (F-statistic):           1.06e-38
Time:                        17:35:48   Log-Likelihood:                -6668.9
No. Observations:                8625   AIC:                         1.335e+04
Df Residuals:                    8620   BIC:                         1.338e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
In

#Descriptive Data

In [6]:
pd.crosstab(index=df_merged_companies["jahr"], columns=df_merged_companies["is_owner_change_company"], values=df_merged_companies["oekpz_avg"], aggfunc="mean")

is_owner_change_company,False,True
jahr,Unnamed: 1_level_1,Unnamed: 2_level_1
2008.0,0.457171,0.45457
2014.0,0.323571,0.331623
2020.0,0.335706,0.368314


In [7]:
pd.crosstab(index=df_merged_companies["jahr"], columns=df_merged_companies["is_owner_change_company"], values=df_merged_companies["oekpd_avg"], aggfunc="mean")

is_owner_change_company,False,True
jahr,Unnamed: 1_level_1,Unnamed: 2_level_1
2008.0,0.455366,0.450533
2014.0,0.281693,0.278795
2020.0,0.372911,0.425392


In [8]:
df_merged_companies.groupby(["jahr"])["bges"].agg(['mean', 'median', 'std', 'count'])


Unnamed: 0_level_0,mean,median,std,count
jahr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006.0,542.109978,40.0,5984.766289,3628
2007.0,583.347796,38.0,6319.867209,4333
2008.0,468.610769,36.0,5966.092973,5200
2009.0,434.296854,32.0,5495.838428,4562
2010.0,286.095613,30.0,2148.879617,4832
2011.0,353.437609,25.0,3531.832354,5169
2012.0,372.831137,28.0,3621.839783,5235
2013.0,306.746721,29.0,3209.099099,4422
2014.0,392.093429,30.0,3858.244195,4733
2015.0,279.18467,29.0,2837.011337,3953


In [9]:
df_merged_companies[df_merged_companies["is_owner_change_company"] == True].groupby(["jahr"])["bges"].agg(['mean', 'median', 'std', 'count'])

Unnamed: 0_level_0,mean,median,std,count
jahr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006.0,423.283477,49.0,2909.586604,1277
2007.0,644.264026,47.0,7148.08255,1515
2008.0,348.996269,45.0,2717.945332,1876
2009.0,356.035586,40.0,2777.263198,1665
2010.0,313.550529,37.0,2745.695913,1702
2011.0,320.169973,34.0,2392.155579,1865
2012.0,378.869845,37.0,3229.576037,1867
2013.0,292.320859,38.0,2478.048064,1630
2014.0,357.721387,40.0,3333.375683,1730
2015.0,250.735393,38.0,1753.899547,1489


In [10]:
df_merged_owners.groupby(["jahr"])["bges"].agg(['mean', 'median', 'std', 'count'])

Unnamed: 0_level_0,mean,median,std,count
jahr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006.0,13909.441606,425.0,64985.50572,274
2007.0,16671.676871,525.0,58531.076246,294
2008.0,15481.123209,516.0,68875.934342,349
2009.0,10890.707655,359.0,53005.911794,307
2010.0,12023.026936,289.0,52905.425905,297
2011.0,8042.19891,280.0,22628.61369,367
2012.0,14637.414365,271.0,50525.185462,362
2013.0,9709.268382,230.0,29400.177432,272
2014.0,9435.865204,209.0,28866.119711,319
2015.0,9301.507519,173.5,29031.068166,266


The following `crosstab()` show how many entries there are with eco-innovation to the firm infos (`oekpz_avg`) per year, separated with regards to change of ownership.

In [11]:
pd.crosstab(df_merged_companies["jahr"], df_merged_companies["is_owner_change_company"], values=df_merged_companies["oekpz_avg"], aggfunc="count")

is_owner_change_company,False,True
jahr,Unnamed: 1_level_1,Unnamed: 2_level_1
2006.0,0,0
2007.0,0,0
2008.0,3010,1711
2009.0,0,0
2010.0,0,0
2011.0,0,0
2012.0,0,0
2013.0,0,0
2014.0,2676,1541
2015.0,0,0


In [12]:
pd.crosstab(df_merged_companies["jahr"], df_merged_companies["is_owner_change_2009"], values=df_merged_companies["oekpz_avg"], aggfunc="count")

is_owner_change_2009,False,True
jahr,Unnamed: 1_level_1,Unnamed: 2_level_1
2006.0,0,0
2007.0,0,0
2008.0,3910,811
2009.0,0,0
2010.0,0,0
2011.0,0,0
2012.0,0,0
2013.0,0,0
2014.0,3499,718
2015.0,0,0


In [13]:
pd.crosstab(df_merged_companies["jahr"], df_merged_companies["is_owner_change_2015"], values=df_merged_companies["oekpz_avg"], aggfunc="count")

is_owner_change_2015,False,True
jahr,Unnamed: 1_level_1,Unnamed: 2_level_1
2006.0,0,0
2007.0,0,0
2008.0,4316,405
2009.0,0,0
2010.0,0,0
2011.0,0,0
2012.0,0,0
2013.0,0,0
2014.0,3820,397
2015.0,0,0


In [14]:
pd.crosstab(df_merged_companies["jahr"], df_merged_companies["is_ownerid_change_company"], values=df_merged_companies["oekpz_avg"], aggfunc="count")

is_ownerid_change_company,False,True
jahr,Unnamed: 1_level_1,Unnamed: 2_level_1
2006.0,0,0
2007.0,0,0
2008.0,4534,187
2009.0,0,0
2010.0,0,0
2011.0,0,0
2012.0,0,0
2013.0,0,0
2014.0,4068,149
2015.0,0,0


In [15]:
pd.crosstab(df_merged_companies["jahr"], df_merged_companies["is_ownerid_change_2009"], values=df_merged_companies["oekpz_avg"], aggfunc="count")

is_ownerid_change_2009,False,True
jahr,Unnamed: 1_level_1,Unnamed: 2_level_1
2006.0,0,0
2007.0,0,0
2008.0,4637,84
2009.0,0,0
2010.0,0,0
2011.0,0,0
2012.0,0,0
2013.0,0,0
2014.0,4136,81
2015.0,0,0


In [16]:
pd.crosstab(df_merged_companies["jahr"], df_merged_companies["is_ownerid_change_2015"], values=df_merged_companies["oekpz_avg"], aggfunc="count")

is_ownerid_change_2015,False,True
jahr,Unnamed: 1_level_1,Unnamed: 2_level_1
2006.0,0,0
2007.0,0,0
2008.0,4637,84
2009.0,0,0
2010.0,0,0
2011.0,0,0
2012.0,0,0
2013.0,0,0
2014.0,4136,81
2015.0,0,0


In [17]:
df_merged_companies.groupby("jahr")["um"].median()

jahr
2006.0    9.974733
2007.0    9.055493
2008.0    8.288808
2009.0    7.139757
2010.0    7.236571
2011.0    5.867490
2012.0    6.356447
2013.0    6.180423
2014.0    7.177896
2015.0    6.773039
2016.0    6.698718
2017.0    6.063073
2018.0    6.991114
2019.0    7.641428
2020.0    6.649822
2021.0    7.162249
Name: um, dtype: float64