In [1]:
from pathlib import Path

import pandas as pd

# Single place to point the notebook at the input file; a relative path is
# resolved against the kernel's current working directory.
DATA_PATH = Path("final_data.csv")

if not DATA_PATH.is_file():
    # Report the absolute location that was tried so a wrong working directory
    # is immediately visible.
    raise FileNotFoundError(
        f"Input data not found at {DATA_PATH.resolve()}; "
        "start the kernel next to the file or adjust DATA_PATH"
    )

df = pd.read_csv(DATA_PATH)
# Preview the first ten rows.
df.head(10)

FileNotFoundError: [Errno 2] No such file or directory: 'final_data.csv'

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column labels available in the loaded frame.
df.columns

## total companies

In [None]:
# Number of distinct company identifiers (gvkey) in the data.
df["gvkey"].nunique()

## companies per year

In [None]:
# For every fiscal year, count how many distinct companies (gvkey) appear.
companies_per_year = df.groupby("fyear")["gvkey"].nunique()
companies_per_year

In [None]:
# Line chart of the distinct-company count, with fyear on the x axis.
companies_per_year.plot.line(ylabel="n companies")

## years per company

In [None]:
# For every company, count the distinct fiscal years it appears in,
# then order companies from fewest to most years.
years_per_company = df.groupby("gvkey")["fyear"].nunique()
years_per_company = years_per_company.sort_values()
years_per_company

In [None]:
# Distribution of the number of observed years per company (25 bins).
years_per_company.plot.hist(bins=25, xlabel="Years", ylabel="n companies")

## staggered board?
### only staggered, never staggered, both

In [None]:
# One row per company; the "set" column holds the list of distinct sbf values
# that company has taken across all of its rows.
df_sbf = df.groupby("gvkey")["sbf"].agg(set=lambda sbf_values: [*set(sbf_values)])
df_sbf

In [None]:
def _sbf_category(values):
    """Name the staggered-board history described by a company's distinct sbf values."""
    observed = set(values)
    if observed == {1}:
        return "staggered"
    if observed == {0}:
        return "not staggered"
    if observed == {0, 1}:
        return "both"
    # Anything else (e.g. missing values) is kept visible rather than mislabeled.
    return "other"


# value_counts() orders bars by frequency, so the label must travel with each
# category instead of being assigned by position afterwards. Mapping to strings
# first also avoids counting unhashable lists.
sbf_category_counts = df_sbf["set"].map(_sbf_category).value_counts()
ax = sbf_category_counts.plot(kind="bar", ylabel="n companies", xlabel="category", rot=30)


## kld index per company

In [None]:
# Average kld_index per company, stored in a one-column frame named "mean".
kld_per_company = df.groupby("gvkey")["kld_index"].mean().to_frame("mean")
kld_per_company

In [None]:
# Plot one marker per company against its row position. The index is reset on
# a temporary copy so kld_per_company itself keeps its gvkey index.
ax = kld_per_company.reset_index(drop=True).plot(
    kind="line", ylabel="kld index", xlabel="gvkey", style='o'
)
# The x positions are just row numbers, so hide the x ticks and their labels.
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

## kld per sbf & company

In [None]:
# Average kld_index for every (company, sbf value) pair; the result is a
# one-column frame named "mean" indexed by (gvkey, sbf).
kld_per_sbf_and_company = (
    df.groupby(["gvkey", "sbf"])["kld_index"]
    .mean()
    .to_frame("mean")
)
kld_per_sbf_and_company

In [None]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(figsize=(10, 10))

# Level 1 of the (gvkey, sbf) index carries the sbf flag; split the
# per-company means into the sbf == 1 and sbf == 0 groups.
sbf_flag = kld_per_sbf_and_company.index.get_level_values(1)
kld_sbf = kld_per_sbf_and_company.loc[sbf_flag == 1, "mean"].to_numpy()
kld_no_sbf = kld_per_sbf_and_company.loc[sbf_flag == 0, "mean"].to_numpy()

axes.violinplot(dataset=[kld_sbf, kld_no_sbf], showextrema=False, showmeans=True)
axes.set_ylabel("kld index")
axes.set_xticks([1, 2])
# Sample sizes are computed from the plotted arrays so the labels cannot go
# stale when the underlying data changes.
axes.set_xticklabels([
    f"Staggered ({len(kld_sbf)} data points)",
    f"Not Staggered ({len(kld_no_sbf)} data points)",
])

## Fixed Effects
### reproducing Julie's results with PanelOLS

In [None]:
# Convert fyear to a datetime (parsed with format '%Y') and move gvkey/fyear
# into a two-level index. Done as one chained expression so that a failure in
# either step leaves df untouched instead of half-converted.
# NOTE(review): presumably this (entity, time) index is what the panel model
# in the following cell expects - confirm against the linearmodels docs.
df = df.assign(fyear=pd.to_datetime(df["fyear"], format='%Y')).set_index(["gvkey", "fyear"])
df

In [None]:
from linearmodels.panel import PanelOLS

# Regress kld_index on sbf with entity (company) fixed effects.
mod = PanelOLS(df["kld_index"], df["sbf"], entity_effects=True)
# Covariance is clustered by entity.
res = mod.fit(cov_type="clustered", cluster_entity=True)
print(res)

### reproducing Julie's results with OLS (demeaned)

In [None]:
import statsmodels.api as sm

# Per-company average of sbf as a two-column frame (gvkey, sbf_mean),
# ready to be merged back onto the observations.
sbf_mean = (
    df.groupby("gvkey")["sbf"]
    .mean()
    .rename("sbf_mean")
    .reset_index()
)
sbf_mean

In [None]:
# Attach each company's mean sbf to its rows, then subtract it to get the
# within-company deviation.
df = df.merge(sbf_mean, how="left", on="gvkey")
df["sbf_demeaned"] = df["sbf"].sub(df["sbf_mean"])
df

In [None]:
# Per-company average of kld_index as a two-column frame (gvkey, kld_index_mean).
esg_score_mean = (
    df.groupby("gvkey")["kld_index"]
    .mean()
    .rename("kld_index_mean")
    .reset_index()
)
esg_score_mean

In [None]:
# Attach each company's mean kld_index to its rows, then subtract it to get
# the within-company deviation.
df = df.merge(esg_score_mean, how="left", on="gvkey")
df["kld_index_demeaned"] = df["kld_index"].sub(df["kld_index_mean"])
df

In [None]:
# Plain OLS of demeaned kld_index on demeaned sbf. No constant is added:
# the only regressor passed is sbf_demeaned.
model = sm.OLS(endog=df["kld_index_demeaned"], exog=df["sbf_demeaned"])
results = model.fit()
results.summary()

### reproducing Julie's results with OLS (dummy variables)

In [None]:
# Design matrix: sbf plus one indicator column per company (gvkey).
# sbf is not listed in `columns`, so it is passed through unchanged.
dummy_source = df.loc[:, ["sbf", "gvkey"]]
df_dummies = pd.get_dummies(dummy_source, columns=["gvkey"])
df_dummies

In [None]:
# OLS of kld_index on sbf and the company indicator columns; the design
# matrix is cast to float before fitting.
model = sm.OLS(endog=df[["kld_index"]], exog=df_dummies.astype(float))
results = model.fit()
# First fitted parameter, i.e. the first column of the design matrix.
results.params.iloc[0]