# MIP Regression Analysis

This notebook analyses the data generated by merging the ownership data with the company panel information.

In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

Load the datasets

In [2]:
# Read the merged company-level and owner-level panels from the outputs folder.
# Both frames are loaded with pandas' default CSV settings.
df_merged_companies, df_merged_owners = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_companies.csv",
        r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_owners.csv",
    )
)

  df_merged_companies = pd.read_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_companies.csv")
  df_merged_owners = pd.read_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_owners.csv")


Two columns indicating whether the company was involved in an ownership change in any of the panel years, one based on `b_crefo` and one based on `ownerid`

In [3]:
# Broadcast "changed owner in any panel year" to every row of a company:
# a company's rows are all True as soon as one of them has the change flag set.
company_groups = df_merged_companies.groupby(["companyid"])
ever_owner_change = company_groups["is_owner_change"].transform('any')
ever_ownerid_change = company_groups["is_ownerid_change"].transform('any')

df_merged_companies["is_owner_change_company"] = ever_owner_change
df_merged_companies["is_ownerid_change_company"] = ever_ownerid_change

Six further columns are added to indicate when the ownership change took place. The periods are constructed so that there is clearly a survey before and after the acquisition (2009, whose period is 2006-2008; 2015, whose period is 2012-2014; and 2021, which encompasses 2018-2020)

Change of ownership based on `b_crefo`: 
- 2009-2011 = `is_owner_change_2009`
- 2015-2017 = `is_owner_change_2015`
- 2009-2017 = `is_owner_change_2009_2017`

Change of ownership based on `ownerid`:
- 2009-2011 = `is_ownerid_change_2009`
- 2015-2017 = `is_ownerid_change_2015`
- 2009-2017 = `is_ownerid_change_2009_2017`

In [4]:
# Company-level flags for an ownership change inside a window of panel years
# (bounds inclusive): 2009-2011, 2015-2017 and 2009-2017.
# Every row of a company is True if any of that company's rows records a change
# whose panel year falls inside the window.
owner_change_windows = {
    "2009": (2009, 2011),
    "2015": (2015, 2017),
    "2009_2017": (2009, 2017),
}

# "is_owner_change" is the b_crefo-based indicator, "is_ownerid_change" the ownerid-based one.
for change_col in ("is_owner_change", "is_ownerid_change"):
    for suffix, (first_year, last_year) in owner_change_windows.items():
        flag_col = f"{change_col}_{suffix}"

        # Row level: a change is recorded in this row and its panel year lies in the window.
        df_merged_companies[flag_col] = np.where(
            (df_merged_companies[change_col] == True)
            & (df_merged_companies["panel_year"] > first_year - 1)
            & (df_merged_companies["panel_year"] < last_year + 1),
            True,
            False,
        )

        # Company level: aggregate the flag's OWN row-level column. Building the
        # column name once fixes the copy-paste slip where `is_ownerid_change_2015`
        # was aggregated from `is_ownerid_change_2009`.
        df_merged_companies[flag_col] = df_merged_companies.groupby(["companyid"])[flag_col].transform('any')

Same logic, but now for `df_merged_owners` and `ownerid`

In [5]:
# Same window flags as for the company panel, here on `df_merged_owners` and for the
# ownerid-based indicator only. Windows are inclusive ranges of panel years.
ownerid_change_windows = {
    "2009": (2009, 2011),
    "2015": (2015, 2017),
    "2009_2017": (2009, 2017),
}

for suffix, (first_year, last_year) in ownerid_change_windows.items():
    flag_col = f"is_ownerid_change_{suffix}"

    # Row level: an ownerid change is recorded in this row and its panel year lies in the window.
    df_merged_owners[flag_col] = np.where(
        (df_merged_owners["is_ownerid_change"] == True)
        & (df_merged_owners["panel_year"] > first_year - 1)
        & (df_merged_owners["panel_year"] < last_year + 1),
        True,
        False,
    )

    # Company level: aggregate the flag's OWN row-level column. This fixes the
    # copy-paste slip where `is_ownerid_change_2015` was aggregated from
    # `is_ownerid_change_2009`.
    df_merged_owners[flag_col] = df_merged_owners.groupby(["companyid"])[flag_col].transform('any')

As previously discussed in the MIP merge file, the environmental innovation survey questions changed between 2009 and the 2015 and 2021 surveys, so the individual variables do not all have the same meaning in every year (e.g. `oekpz6` refers to noise pollution from 2015 onwards, while in 2009 it refers to soil contamination). Therefore, I will construct the variables `oekpz_avg` and `oekpd_avg`, which are averages of the individual variables and give a holistic view of eco-innovation at the respective companies.

In [6]:
# Column lists: oekpz1..oekpz9 and oekpd1..oekpd4
eco_innovations = [f"oekpz{i}" for i in range(1, 10)]
eco_product_innovations = [f"oekpd{i}" for i in range(1, 5)]

# Row-wise means over each group of indicators, added to both panels
for panel in (df_merged_companies, df_merged_owners):
    panel["oekpz_avg"] = panel[eco_innovations].mean(axis=1)
    panel["oekpd_avg"] = panel[eco_product_innovations].mean(axis=1)

However, in order not to lose information by aggregating all the variables into averages, I matched the different variables across the years where possible. The variable descriptions, as they appear in the scientific use guide for MIP, are given in the code comments.

In [7]:
# Energy consumption is coded differently across survey waves:
#   2015/2021 surveys -> oekpz1 (Reduction of energy consumption)
#   2009 survey       -> oekpz2 (Reduction in energy consumption)
# Rows with jahr 2014 or 2020 take oekpz1; all other rows take oekpz2.
for panel in (df_merged_companies, df_merged_owners):
    uses_later_coding = panel["jahr"].isin([2014, 2020])
    panel["energy_consumption"] = np.where(uses_later_coding, panel["oekpz1"], panel["oekpz2"])

In [8]:
# Material use is coded differently across survey waves:
#   2015/2021 surveys -> oekpz2 (Reduction of material / water consumption)
#   2009 survey       -> oekpz1 (Reduction in material use)
# Rows with jahr 2014 or 2020 take oekpz2; all other rows take oekpz1.
for panel in (df_merged_companies, df_merged_owners):
    uses_later_coding = panel["jahr"].isin([2014, 2020])
    panel["material_use"] = np.where(uses_later_coding, panel["oekpz2"], panel["oekpz1"])

In [9]:
# Variables whose source column is the same in all survey waves get a descriptive alias.
#   oekpz3: Reduction of CO2 emissions (2009, 2015/2021)
#   oekpz4: Reduction of other air pollution (2015/2021), Reduction of other emissions (2009)
#   oekpz8: Replacement of dangerous material (2015/2021), Replacement of hazardous material (2009)
#   oekpz9: Recycling of waste / waste water / material (2015/2021), Improvement in recycling (2009)
stable_variables = {
    "emissions_CO2": "oekpz3",
    "emissions_other": "oekpz4",
    "hazardous_material": "oekpz8",
    "recycling": "oekpz9",
}

for alias, source_col in stable_variables.items():
    for panel in (df_merged_companies, df_merged_owners):
        panel[alias] = panel[source_col]

# Regression

Regression with no control variables, merger between 2009-2011

In [10]:
y = "emissions_CO2"

# Keep only the 2008 and 2014 waves, i.e. the rows before and after the 2009-2011 window
wave_rows = df_merged_companies["jahr"].isin([2008, 2014])
did_columns = ["companyid", "jahr", y, "is_owner_change_2009", "is_owner_change_2015", "is_owner_change_2009_2017"]
df_did = df_merged_companies[wave_rows][did_columns]

# 0/1 dummies for the post period and for the ownership-change group
df_did["jahr_2014"] = df_did["jahr"].eq(2014).astype(int)
df_did["is_owner_change_2009"] = df_did["is_owner_change_2009"].eq(True).astype(int)

# Difference-in-differences: group dummy, period dummy and their interaction
did_formula = y + ' ~ is_owner_change_2009 + jahr_2014 + is_owner_change_2009:jahr_2014'
model_did = smf.ols(formula=did_formula, data=df_did)

# Estimate by OLS and show the regression table
results_did = model_did.fit()
print(results_did.summary())


                            OLS Regression Results                            
Dep. Variable:          emissions_CO2   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.014
Method:                 Least Squares   F-statistic:                     42.72
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           2.14e-27
Time:                        16:02:34   Log-Likelihood:                -9406.4
No. Observations:                8701   AIC:                         1.882e+04
Df Residuals:                    8697   BIC:                         1.885e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

Regression with no control variables, mergers between 2015-2017

In [11]:
# TODO: loop over all candidate outcome variables (currently a single outcome is analysed)
y = "emissions_CO2"

# Keep only the 2014 and 2020 waves, i.e. the rows before and after the 2015-2017 window
wave_rows = df_merged_companies["jahr"].isin([2014, 2020])
did_columns = ["companyid", "jahr", y, "is_owner_change_2015"]
df_did = df_merged_companies[wave_rows][did_columns]

# 0/1 dummies for the post period and for the ownership-change group
df_did["jahr_2020"] = df_did["jahr"].eq(2020).astype(int)
df_did["is_owner_change_2015"] = df_did["is_owner_change_2015"].eq(True).astype(int)

# Difference-in-differences: group dummy, period dummy and their interaction
did_formula = y + ' ~ is_owner_change_2015 + jahr_2020 + is_owner_change_2015:jahr_2020'
model_did = smf.ols(formula=did_formula, data=df_did)

# Estimate by OLS and show the regression table
results_did = model_did.fit()
print(results_did.summary())

                            OLS Regression Results                            
Dep. Variable:          emissions_CO2   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     15.81
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           3.03e-10
Time:                        16:02:35   Log-Likelihood:                -7745.1
No. Observations:                8005   AIC:                         1.550e+04
Df Residuals:                    8001   BIC:                         1.553e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept   

Regression with no control variables, mergers between 2009-2017

In [22]:
y = "oekpd_avg"

# Keep only the 2008 and 2020 waves, i.e. the rows before and after the 2009-2017 window
wave_rows = df_merged_companies["jahr"].isin([2008, 2020])
did_columns = ["companyid", "jahr", y, "is_owner_change_2009_2017"]
df_did = df_merged_companies[wave_rows][did_columns]

# 0/1 dummies for the post period and for the ownership-change group
df_did["jahr_2020"] = df_did["jahr"].eq(2020).astype(int)
df_did["is_owner_change_2009_2017"] = df_did["is_owner_change_2009_2017"].eq(True).astype(int)

# Difference-in-differences: group dummy, period dummy and their interaction
did_formula = y + ' ~ is_owner_change_2009_2017 + jahr_2020 + is_owner_change_2009_2017:jahr_2020'
model_did = smf.ols(formula=did_formula, data=df_did)

# Estimate by OLS and show the regression table
results_did = model_did.fit()
print(results_did.summary())

                            OLS Regression Results                            
Dep. Variable:              oekpd_avg   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     8.916
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           6.78e-06
Time:                        16:15:53   Log-Likelihood:                -7880.5
No. Observations:                8616   AIC:                         1.577e+04
Df Residuals:                    8612   BIC:                         1.580e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
In

Regression with `bges` as control variable, mergers within 2009-2017 period

In [23]:
y = "oekpd_avg"

# Keep only the 2008 and 2020 waves; `bges` is carried along as a control variable
wave_rows = df_merged_companies["jahr"].isin([2008, 2020])
did_columns = ["companyid", "jahr", y, "is_owner_change_2009_2017", "bges"]
df_did = df_merged_companies[wave_rows][did_columns]

# 0/1 dummies for the post period and for the ownership-change group
df_did["jahr_2020"] = df_did["jahr"].eq(2020).astype(int)
df_did["is_owner_change_2009_2017"] = df_did["is_owner_change_2009_2017"].eq(True).astype(int)

# Difference-in-differences terms plus `bges` as an additional regressor
did_formula = y + ' ~ is_owner_change_2009_2017 + jahr_2020 + is_owner_change_2009_2017:jahr_2020 + bges'
model_did = smf.ols(formula=did_formula, data=df_did)

# Estimate by OLS and show the regression table
results_did = model_did.fit()
print(results_did.summary())

                            OLS Regression Results                            
Dep. Variable:              oekpd_avg   R-squared:                       0.012
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     25.71
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           3.28e-21
Time:                        16:16:01   Log-Likelihood:                -7765.2
No. Observations:                8528   AIC:                         1.554e+04
Df Residuals:                    8523   BIC:                         1.558e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
In

Regression with `bges` and `branche` as control variables, mergers within 2009-2017 period

In [19]:
y = "emissions_other"

# Keep only the 2008 and 2020 waves; `bges` and `branche` are carried along as controls
wave_rows = df_merged_companies["jahr"].isin([2008, 2020])
did_columns = ["companyid", "jahr", y, "is_owner_change_2009_2017", "bges", "branche"]
df_did = df_merged_companies[wave_rows][did_columns]

# 0/1 dummies for the post period and for the ownership-change group
df_did["jahr_2020"] = df_did["jahr"].eq(2020).astype(int)
df_did["is_owner_change_2009_2017"] = df_did["is_owner_change_2009_2017"].eq(True).astype(int)

# Difference-in-differences terms plus `bges` and `branche`; C(...) makes the
# formula treat `branche` as categorical
did_formula = y + ' ~ is_owner_change_2009_2017 + jahr_2020 + is_owner_change_2009_2017:jahr_2020 + bges + C(branche)'
model_did = smf.ols(formula=did_formula, data=df_did)

# Estimate by OLS and show the regression table
results_did = model_did.fit()
print(results_did.summary())

                            OLS Regression Results                            
Dep. Variable:        emissions_other   R-squared:                       0.058
Model:                            OLS   Adj. R-squared:                  0.055
Method:                 Least Squares   F-statistic:                     20.28
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           6.49e-85
Time:                        16:09:54   Log-Likelihood:                -7422.1
No. Observations:                7902   AIC:                         1.489e+04
Df Residuals:                    7877   BIC:                         1.507e+04
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                                                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------

# Regression with `df_merged_owners`

Keeping the data together like this does not yield anything significant: there are too few entries.

In [15]:
y = "emissions_CO2"

# Company rows of the 2008 and 2020 waves
wave_rows = df_merged_companies["jahr"].isin([2008, 2020])
company_side = df_merged_companies[wave_rows][["companyid", "jahr", y, "is_ownerid_change_2009_2017", "bges", "branche"]]
owner_side = df_merged_owners[['companyid', 'ownerid', 'jahr', y, 'bges', 'branche']]

# Left join on company and year keeps every company row. Columns present on both
# sides receive pandas' default suffixes (_x = company side, _y = owner side).
df_did = company_side.merge(owner_side, on=['companyid', 'jahr'], how='left')

# 0/1 dummies for the post period and for the ownership-change group
df_did["jahr_2020"] = df_did["jahr"].eq(2020).astype(int)
df_did["is_ownerid_change_2009_2017"] = df_did["is_ownerid_change_2009_2017"].eq(True).astype(int)

# Difference-in-differences on the owner-side outcome (the `_y` column from the merge)
did_formula = y + "_y" + ' ~ is_ownerid_change_2009_2017 + jahr_2020 + is_ownerid_change_2009_2017:jahr_2020'
model_did = smf.ols(formula=did_formula, data=df_did)

# Estimate by OLS and show the regression table
results_did = model_did.fit()
print(results_did.summary())

                            OLS Regression Results                            
Dep. Variable:        emissions_CO2_y   R-squared:                       0.058
Model:                            OLS   Adj. R-squared:                  0.049
Method:                 Least Squares   F-statistic:                     6.097
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           0.000489
Time:                        16:02:39   Log-Likelihood:                -355.68
No. Observations:                 299   AIC:                             719.4
Df Residuals:                     295   BIC:                             734.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

In [16]:
y = "emissions_CO2"

# Company rows of the 2008 and 2020 waves, marked with owner = 0
df_did = df_merged_companies[(df_merged_companies["jahr"] == 2008) | (df_merged_companies["jahr"] == 2020)]
df_did = df_did[["companyid", "jahr", y, "is_ownerid_change_2009_2017", "bges", "branche"]]
df_did["owner"] = 0

# Stack the owner rows underneath; they have no `owner` column, so it is NaN for them after the concat.
# NOTE(review): the owner rows are not restricted to jahr 2008/2020 here, so rows from any other
# year would enter the regression with jahr_2020 = 0 - confirm this is intended.
df_did = pd.concat([df_did, df_merged_owners[['companyid', 'jahr', y, 'is_ownerid_change_2009_2017' ,'bges', 'branche']]], ignore_index=True)

# Mark the stacked owner rows with owner = 1. Assign the result back instead of calling
# fillna(inplace=True) on the column selection: that chained in-place fill is deprecated
# and leaves `df_did` unchanged under pandas copy-on-write.
df_did["owner"] = df_did["owner"].fillna(1)

# 0/1 dummies for the post period and for the ownership-change group
df_did["jahr_2020"] = np.where(df_did["jahr"] == 2020, 1, 0)
df_did["is_ownerid_change_2009_2017"] = np.where(df_did["is_ownerid_change_2009_2017"] == True, 1, 0)

# Set up the difference-in-differences model, with additional interactions on the owner dummy
model_did = smf.ols(formula=y + ' ~ is_ownerid_change_2009_2017 + jahr_2020 + owner + jahr_2020:owner + is_ownerid_change_2009_2017:jahr_2020 + is_ownerid_change_2009_2017:jahr_2020:owner', data=df_did)

# Fit the model
results_did = model_did.fit()

# Print the model summary
print(results_did.summary())

                            OLS Regression Results                            
Dep. Variable:          emissions_CO2   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     10.74
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           5.98e-12
Time:                        16:02:39   Log-Likelihood:                -16620.
No. Observations:               14939   AIC:                         3.325e+04
Df Residuals:                   14932   BIC:                         3.331e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------

Save the data

In [17]:
# Write both enriched panels back to the outputs folder under new file names,
# using pandas' default CSV settings.
for panel, csv_path in (
    (df_merged_companies, r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_companies_reganalysis.csv"),
    (df_merged_owners, r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_owners_reganalysis.csv"),
):
    panel.to_csv(csv_path)