#MIP Regression Analysis

This script analyses the data generated by the merger of the ownership data and the company panel information

In [22]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

Load the datasets

In [23]:
df_merged_companies = pd.read_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_companies.csv")
df_merged_owners = pd.read_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_owners.csv")

  df_merged_companies = pd.read_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_companies.csv")
  df_merged_owners = pd.read_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_owners.csv")


Two columns indicating if the company was involved in a ownership change in any of the panel years, one for `b_crefo` and one for `ownerid`

In [24]:
df_merged_companies["is_owner_change_company"] = df_merged_companies.groupby(["companyid"])["is_owner_change"].transform('any')
df_merged_companies["is_ownerid_change_company"] = df_merged_companies.groupby(["companyid"])["is_ownerid_change"].transform('any')

Further six columns are added, in order to indicate when the ownership change took place. The periods are constructed so that there is cleary a survey before and after the acquisition (2009, whose period is 2006-2008; 2015, whose period is 2012-2014; and 2021, which emcompasses 2018-2020)

Change of ownership based on `b_crefo`: 
- 2009-2011 = `is_owner_change_2009`
- 2015-2017 = `is_owner_change_2015`
- 2009-2017 = `is_owner_change_2009_2017`

Change of ownership based on `ownerid`:
- 2009-2011 = `is_ownerid_change_2009`
- 2015-2017 = `is_ownerid_change_2015`
- 2009-2017 = `is_ownerid_change_2009_2017`

In [25]:
# b_crefo, 2009-2014
df_merged_companies["is_owner_change_2009"] = np.where((df_merged_companies["is_owner_change"] == True) & (df_merged_companies["panel_year"] > 2008) & (df_merged_companies["panel_year"] < 2012), True, False)
df_merged_companies["is_owner_change_2009"] = df_merged_companies.groupby(["companyid"])["is_owner_change_2009"].transform('any')

# b_crefo, 2015-2017
df_merged_companies["is_owner_change_2015"] = np.where((df_merged_companies["is_owner_change"] == True) & (df_merged_companies["panel_year"] > 2014) & (df_merged_companies["panel_year"] < 2018), True, False)
df_merged_companies["is_owner_change_2015"] = df_merged_companies.groupby(["companyid"])["is_owner_change_2015"].transform('any')

# b_crefo, 2009-2017
df_merged_companies["is_owner_change_2009_2017"] = np.where((df_merged_companies["is_owner_change"] == True) & (df_merged_companies["panel_year"] > 2008) & (df_merged_companies["panel_year"] < 2018), True, False)
df_merged_companies["is_owner_change_2009_2017"] = df_merged_companies.groupby(["companyid"])["is_owner_change_2009_2017"].transform('any')

# ownerid, 2009-2014
df_merged_companies["is_ownerid_change_2009"] = np.where((df_merged_companies["is_ownerid_change"] == True) & (df_merged_companies["panel_year"] > 2008) & (df_merged_companies["panel_year"] < 2012), True, False)
df_merged_companies["is_ownerid_change_2009"] = df_merged_companies.groupby(["companyid"])["is_ownerid_change_2009"].transform('any')

# ownerid, 2015-2017
df_merged_companies["is_ownerid_change_2015"] = np.where((df_merged_companies["is_ownerid_change"] == True) & (df_merged_companies["panel_year"] > 2014) & (df_merged_companies["panel_year"] < 2018), True, False)
df_merged_companies["is_ownerid_change_2015"] = df_merged_companies.groupby(["companyid"])["is_ownerid_change_2015"].transform('any')

# ownerid, 2009-2017
df_merged_companies["is_ownerid_change_2009_2017"] = np.where((df_merged_companies["is_ownerid_change"] == True) & (df_merged_companies["panel_year"] > 2008) & (df_merged_companies["panel_year"] < 2018), True, False)
df_merged_companies["is_ownerid_change_2009_2017"] = df_merged_companies.groupby(["companyid"])["is_ownerid_change_2009_2017"].transform('any')

Now for `df_merged_owners` with `ownerid`

In [26]:
# ownerid, 2009-2014
df_merged_owners["is_ownerid_change_2009"] = np.where((df_merged_owners["is_ownerid_change"] == True) & (df_merged_owners["panel_year"] > 2008) & (df_merged_owners["panel_year"] < 2012), True, False)
df_merged_owners["is_ownerid_change_2009"] = df_merged_owners.groupby(["companyid"])["is_ownerid_change_2009"].transform('any')

# ownerid, 2015-2017
df_merged_owners["is_ownerid_change_2015"] = np.where((df_merged_owners["is_ownerid_change"] == True) & (df_merged_owners["panel_year"] > 2014) & (df_merged_owners["panel_year"] < 2018), True, False)
df_merged_owners["is_ownerid_change_2015"] = df_merged_owners.groupby(["companyid"])["is_ownerid_change_2015"].transform('any')

# ownerid, 2009-2017
df_merged_owners["is_ownerid_change_2009_2017"] = np.where((df_merged_owners["is_ownerid_change"] == True) & (df_merged_owners["panel_year"] > 2008) & (df_merged_owners["panel_year"] < 2018), True, False)
df_merged_owners["is_ownerid_change_2009_2017"] = df_merged_owners.groupby(["companyid"])["is_ownerid_change_2009_2017"].transform('any')

As previously discussed in the MIP merge file, the environmental innovations survey questions changed from 2009 to 2015 and 2021, and due to that, the individual variables do not all have the same meaning depending on the year (e.g. `oekpz6` in 2015 onwards refers to noise pollution, while in 2009 it refers to soil contamination). Therefore, I will construct the variables `oekpz_avg` and `oekpd_avg`, which are averages of the other variables and can show a holistic view of eco-innovation at the respective companies.

In [27]:
eco_innovations = ["oekpz1", "oekpz2", "oekpz3", "oekpz4", "oekpz5", "oekpz6", "oekpz7", "oekpz8", "oekpz9"] 
eco_product_innovations = ["oekpd1", "oekpd2", "oekpd3", "oekpd4"]
df_merged_companies["oekpz_avg"] = df_merged_companies[eco_innovations].mean(axis=1)
df_merged_companies["oekpd_avg"] = df_merged_companies[eco_product_innovations].mean(axis=1)
df_merged_owners["oekpz_avg"] = df_merged_owners[eco_innovations].mean(axis=1)
df_merged_owners["oekpd_avg"] = df_merged_owners[eco_product_innovations].mean(axis=1)

However, in other to not lose information through the aggregation of all the variables into averages, I matched, were it was possible, the different variables throughout the years. The variable descriptions as they appear in the scientific use guide for MIP will be commented.

In [28]:
#oekpz1, 2015/2021: Reduction of energy consumption
#oekpz2, 2009: Reduction in energy consumption 
df_merged_companies["energy_consumption"] = np.where((df_merged_companies["jahr"] == 2014) | (df_merged_companies["jahr"] == 2020), df_merged_companies["oekpz1"], df_merged_companies["oekpz2"])
df_merged_owners["energy_consumption"] = np.where((df_merged_owners["jahr"] == 2014) | (df_merged_owners["jahr"] == 2020), df_merged_owners["oekpz1"], df_merged_owners["oekpz2"])

In [29]:
#oekpz2, 2015/2021: Reduction of material / water consumption
#oekpz1, 2009: Reduction in material use
df_merged_companies["material_use"] = np.where((df_merged_companies["jahr"] == 2014) | (df_merged_companies["jahr"] == 2020), df_merged_companies["oekpz2"], df_merged_companies["oekpz1"])
df_merged_owners["material_use"] = np.where((df_merged_owners["jahr"] == 2014) | (df_merged_owners["jahr"] == 2020), df_merged_owners["oekpz2"], df_merged_owners["oekpz1"])

In [30]:
#oekpz3: Reduction of CO2 emissions (2009, 2015/2021)
df_merged_companies["emissions_CO2"] = df_merged_companies["oekpz3"]
df_merged_owners["emissions_CO2"] = df_merged_owners["oekpz3"]
#oekpz4: Reduction of other air pollution (2015/2021), Reduction of other emissions (2009) 
df_merged_companies["emissions_other"] = df_merged_companies["oekpz4"]
df_merged_owners["emissions_other"] = df_merged_owners["oekpz4"]
#oekpz8: Replacement of dangerous material (2015/2021), Replacement of hazardous material (2009)
df_merged_companies["hazardous_material"] = df_merged_companies["oekpz8"]
df_merged_owners["hazardous_material"] = df_merged_owners["oekpz8"]
#oekpz9: Recycling of waste / waste water / material (2015/2021), Improvement in recycling (2009)
df_merged_companies["recycling"] = df_merged_companies["oekpz9"]
df_merged_owners["recycling"] = df_merged_owners["oekpz9"] 

#Regression

Define all possible dependent variables

In [31]:
dependent_variables = ["oekpz_avg", "oekpd_avg", "energy_consumption", "material_use", "emissions_CO2", "emissions_other", "hazardous_material", "recycling"]

Regression with no control variables, merger between 2009-2011

In [32]:
#Loop over all possible dependent variables
for y in dependent_variables:
    # Filter the data for the relevant years
    df_did = df_merged_companies[(df_merged_companies["jahr"] == 2008) | (df_merged_companies["jahr"] == 2014)]
    df_did = df_did[["companyid", "jahr", y, "is_owner_change_2009", "is_owner_change_2015", "is_owner_change_2009_2017"]]
    df_did["jahr_2014"] = np.where(df_did["jahr"] == 2014, 1, 0)
    df_did["is_owner_change_2009"] = np.where(df_did["is_owner_change_2009"] == True, 1, 0)

    # Set up the difference-in-differences model
    model_did = smf.ols(formula= y + ' ~ is_owner_change_2009 + jahr_2014 + is_owner_change_2009:jahr_2014', data=df_did)

    # Fit the model
    results_did = model_did.fit()

    # Print the model summary
    print("Regression results for " + y + " in the period 2009-2011:\n")
    print(results_did.summary())
    print("\n\n")


Regression results for oekpz_avg in the period 2009-2011:

                            OLS Regression Results                            
Dep. Variable:              oekpz_avg   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     44.85
Date:                Sat, 06 Jan 2024   Prob (F-statistic):           9.20e-29
Time:                        18:13:44   Log-Likelihood:                -7004.2
No. Observations:                8938   AIC:                         1.402e+04
Df Residuals:                    8934   BIC:                         1.404e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------

Regression with no control variables, mergers between 2015-2017

In [33]:
#Loop over all possible dependent variables
for y in dependent_variables:
    # Filter the data for the relevant years
    df_did = df_merged_companies[(df_merged_companies["jahr"] == 2014) | (df_merged_companies["jahr"] == 2020)]
    df_did = df_did[["companyid", "jahr", y, "is_owner_change_2015"]]
    df_did["jahr_2020"] = np.where(df_did["jahr"] == 2020, 1, 0)
    df_did["is_owner_change_2015"] = np.where(df_did["is_owner_change_2015"] == True, 1, 0)

    # Set up the difference-in-differences model
    model_did = smf.ols(formula= y + ' ~ is_owner_change_2015 + jahr_2020 + is_owner_change_2015:jahr_2020', data=df_did)

    # Fit the model
    results_did = model_did.fit()

    # Print the model summary
    print("Regression results for " + y + " in the period 2015-2017:\n")
    print(results_did.summary())
    print("\n\n")

Regression results for oekpz_avg in the period 2015-2017:

                            OLS Regression Results                            
Dep. Variable:              oekpz_avg   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.509
Date:                Sat, 06 Jan 2024   Prob (F-statistic):              0.210
Time:                        18:13:47   Log-Likelihood:                -5161.5
No. Observations:                8213   AIC:                         1.033e+04
Df Residuals:                    8209   BIC:                         1.036e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------

Regression with no control variables, mergers between 2009-2017

In [34]:
#Loop over all possible dependent variables
for y in dependent_variables:
    # Filter the data for the relevant years
    df_did = df_merged_companies[(df_merged_companies["jahr"] == 2008) | (df_merged_companies["jahr"] == 2020)]
    df_did = df_did[["companyid", "jahr", y, "is_owner_change_2009_2017"]]
    df_did["jahr_2020"] = np.where(df_did["jahr"] == 2020, 1, 0)
    df_did["is_owner_change_2009_2017"] = np.where(df_did["is_owner_change_2009_2017"] == True, 1, 0)

    # Set up the difference-in-differences model
    model_did = smf.ols(formula=y + ' ~ is_owner_change_2009_2017 + jahr_2020 + is_owner_change_2009_2017:jahr_2020', data=df_did)

    # Fit the model
    results_did = model_did.fit()

    # Print the model summary
    print("Regression results for " + y + " in the period 2009-2017:\n")
    print(results_did.summary())
    print("\n\n")

Regression results for oekpz_avg in the period 2009-2017:

                            OLS Regression Results                            
Dep. Variable:              oekpz_avg   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     31.52
Date:                Sat, 06 Jan 2024   Prob (F-statistic):           2.93e-20
Time:                        18:13:51   Log-Likelihood:                -6777.4
No. Observations:                8717   AIC:                         1.356e+04
Df Residuals:                    8713   BIC:                         1.359e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------

Regression with `bges` as control variable, mergers within 2009-2017 period

In [35]:
#Loop over all possible dependent variables
for y in dependent_variables:
    # Filter the data for the relevant years
    df_did = df_merged_companies[(df_merged_companies["jahr"] == 2008) | (df_merged_companies["jahr"] == 2020)]
    df_did = df_did[["companyid", "jahr", y, "is_owner_change_2009_2017", "bges"]]
    df_did["jahr_2020"] = np.where(df_did["jahr"] == 2020, 1, 0)
    df_did["is_owner_change_2009_2017"] = np.where(df_did["is_owner_change_2009_2017"] == True, 1, 0)

    # Set up the difference-in-differences model
    model_did = smf.ols(formula=y + ' ~ is_owner_change_2009_2017 + jahr_2020 + is_owner_change_2009_2017:jahr_2020 + bges', data=df_did)

    # Fit the model
    results_did = model_did.fit()

    # Print the model summary
    print("Regression results for " + y + " in the period 2009-2017, with bges (# of employees) as a control:\n")
    print(results_did.summary())
    print("\n\n")

Regression results for oekpz_avg in the period 2009-2017, with bges (# of employees) as a control:

                            OLS Regression Results                            
Dep. Variable:              oekpz_avg   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     46.87
Date:                Sat, 06 Jan 2024   Prob (F-statistic):           4.93e-39
Time:                        18:13:54   Log-Likelihood:                -6668.1
No. Observations:                8625   AIC:                         1.335e+04
Df Residuals:                    8620   BIC:                         1.338e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
------

Regression with `bges` and `branche` as control variables, mergers within 2009-2017 period

In [36]:
#Loop over all possible dependent variables
for y in dependent_variables:
    # Filter the data for the relevant years
    df_did = df_merged_companies[(df_merged_companies["jahr"] == 2008) | (df_merged_companies["jahr"] == 2020)]
    df_did = df_did[["companyid", "jahr", y, "is_owner_change_2009_2017", "bges", "branche"]]
    df_did["jahr_2020"] = np.where(df_did["jahr"] == 2020, 1, 0)
    df_did["is_owner_change_2009_2017"] = np.where(df_did["is_owner_change_2009_2017"] == True, 1, 0)

    # Set up the difference-in-differences model
    model_did = smf.ols(formula=y + ' ~ is_owner_change_2009_2017 + jahr_2020 + is_owner_change_2009_2017:jahr_2020 + bges + C(branche)', data=df_did)

    # Fit the model
    results_did = model_did.fit()

    # Print the model summary
    print("Regression results for " + y + " in the period 2009-2017, with bges (# of employees) and branche (industry) as controls:\n")
    print(results_did.summary())

Regression results for oekpz_avg in the period 2009-2017, with bges (# of employees) and branche (industry) as controls:

                            OLS Regression Results                            
Dep. Variable:              oekpz_avg   R-squared:                       0.072
Model:                            OLS   Adj. R-squared:                  0.069
Method:                 Least Squares   F-statistic:                     26.21
Date:                Sat, 06 Jan 2024   Prob (F-statistic):          9.60e-113
Time:                        18:13:57   Log-Likelihood:                -6074.0
No. Observations:                8104   AIC:                         1.220e+04
Df Residuals:                    8079   BIC:                         1.237e+04
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                                                          coef    std err          t    

After adding control variables to the regression model, run Variance Inflation Factor calculations for the different regression configurations to check for multicollinearity

In [37]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_data = pd.DataFrame()
vif_data["X"] = df_merged_companies[["is_owner_change_2009_2017", "jahr_2020"]].columns
vif_data["VIF"] = [variance_inflation_factor(df_did[["is_owner_change_2009_2017", "jahr_2020"]].dropna(), i) 
                          for i in range(len(df_did[["is_owner_change_2009_2017", "jahr_2020"]].columns))]
vif_data 

KeyError: "['jahr_2020'] not in index"

In [38]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_data = pd.DataFrame()
vif_data["X"] = df_did[["is_owner_change_2009_2017","jahr_2020", "bges"]].columns
vif_data["VIF"] = [variance_inflation_factor(df_did[["is_owner_change_2009_2017","jahr_2020", "bges"]].dropna(), i) 
                          for i in range(len(df_did[["is_owner_change_2009_2017","jahr_2020", "bges"]].columns))]
vif_data 

Unnamed: 0,X,VIF
0,is_owner_change_2009_2017,1.118296
1,jahr_2020,1.11928
2,bges,1.002389


I was not able to run the VIF with branche since it is not numerical


#Regression with `df_merged_owners`

Keeping the data together like this doesn't result in anything significant, too few entries

In [39]:
#Loop over all possible dependent variables
for y in dependent_variables:
    df_did = df_merged_companies[(df_merged_companies["jahr"] == 2008) | (df_merged_companies["jahr"] == 2020)]
    df_did = df_did[["companyid", "jahr", y, "is_ownerid_change_2009_2017", "bges", "branche"]]
    df_did = pd.merge(df_did, df_merged_owners[['companyid', 'ownerid', 'jahr', y, 'bges', 'branche']], on=['companyid', 'jahr'], how='left')
    df_did["jahr_2020"] = np.where(df_did["jahr"] == 2020, 1, 0)
    df_did["is_ownerid_change_2009_2017"] = np.where(df_did["is_ownerid_change_2009_2017"] == True, 1, 0)
    # Set up the difference-in-differences model
    model_did = smf.ols(formula=y + "_y" + ' ~ is_ownerid_change_2009_2017 + jahr_2020 + is_ownerid_change_2009_2017:jahr_2020', data=df_did)

    # Fit the model
    results_did = model_did.fit()

    # Print the model summary
    print("Regression results for " + y + " in the period 2009-2017 for owners:\n")
    print(results_did.summary())
    print("\n\n")

Regression results for oekpz_avg in the period 2009-2017 for owners:

                            OLS Regression Results                            
Dep. Variable:            oekpz_avg_y   R-squared:                       0.077
Model:                            OLS   Adj. R-squared:                  0.068
Method:                 Least Squares   F-statistic:                     8.389
Date:                Sat, 06 Jan 2024   Prob (F-statistic):           2.26e-05
Time:                        18:15:35   Log-Likelihood:                -273.47
No. Observations:                 304   AIC:                             554.9
Df Residuals:                     300   BIC:                             569.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------

Here there is a additional dummy variable `owner` to identify if an entry is a owner (=1) or company (=0)

In [40]:
#Loop over all possible dependent variables
for y in dependent_variables:
    df_did = df_merged_companies[(df_merged_companies["jahr"] == 2008) | (df_merged_companies["jahr"] == 2020)]
    df_did = df_did[["companyid", "jahr", y, "is_ownerid_change_2009_2017", "bges", "branche"]]
    df_did["owner"] = 0
    df_did = pd.concat([df_did, df_merged_owners[['companyid', 'jahr', y, 'is_ownerid_change_2009_2017' ,'bges', 'branche']]], ignore_index=True)
    df_did["owner"].fillna(1, inplace=True)
    df_did["jahr_2020"] = np.where(df_did["jahr"] == 2020, 1, 0)
    df_did["is_ownerid_change_2009_2017"] = np.where(df_did["is_ownerid_change_2009_2017"] == True, 1, 0)

    # Set up the difference-in-differences model
    model_did = smf.ols(formula=y + ' ~ is_ownerid_change_2009_2017 + jahr_2020 + owner + jahr_2020:owner + is_ownerid_change_2009_2017:jahr_2020 + is_ownerid_change_2009_2017:jahr_2020:owner', data=df_did)

    # Fit the model
    results_did = model_did.fit()

    # Print the model summary
    print("Regression results for " + y + " in the period 2009-2017 for owners and companies:\n")
    print(results_did.summary())
    print("\n\n")

Regression results for oekpz_avg in the period 2009-2017 for owners and companies:

                            OLS Regression Results                            
Dep. Variable:              oekpz_avg   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     20.71
Date:                Sat, 06 Jan 2024   Prob (F-statistic):           2.59e-24
Time:                        18:15:39   Log-Likelihood:                -11998.
No. Observations:               15262   AIC:                         2.401e+04
Df Residuals:                   15255   BIC:                         2.406e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------

Save the data

In [41]:
df_merged_companies.to_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_companies_reganalysis.csv")
df_merged_owners.to_csv(r"C:\Users\lucas\OneDrive\BA\Data\outputs\merged_owners_reganalysis.csv")