In [19]:
import pandas as pd
import itertools
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [43]:
df = pd.read_csv("../3) Data/main_df_no_NAN_99p.csv")

In [44]:
# Identfying ESG changes in companies and years
# How much are changes more than 10%
# Looking at just ESG Score changes

# A standardized df is needed, where each ISIN has 
# the same number of years next to it
years = list(range(2000, 2023))
isin = df['ISIN'].unique()
combinations = list(itertools.product(isin, years))
df_combinations = pd.DataFrame(combinations, columns=['ISIN', 'Year'])

df_ESGS_YoY = pd.merge(df_combinations, df, on=['ISIN', 'Year'], how='left'
                    ).reset_index().drop(columns="index")

# Calculating one-period differences: (Xt/Xt-1) - 1
df_ESGS_YoY['ESG Score YoY Change'] = (df_ESGS_YoY["ESG Score"] / df_ESGS_YoY["ESG Score"].shift(1)) - 1

# Including one-period lag for y variable
df_ESGS_YoY["Total Return+1"] = df_ESGS_YoY["Total Return"].shift(-1)

# Dropping the first year of every company, because it is compared to the last
# ESG Score of the previous company, not the previous year and already calculated returns
df_ESG_YoY = df_ESGS_YoY[df_ESGS_YoY["Year"].isin([2000, 2022]) == False]

# Deleting NAN values
df_ESG_YoY_cleaned = df_ESG_YoY.dropna().drop(columns="Unnamed: 0")

In [45]:
df_ESG_YoY_cleaned[df_ESG_YoY_cleaned["ESG Score YoY Change"] >= 0.1].count().head(1)

ISIN    12029
dtype: int64

In [46]:
df_ESG_YoY_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34280 entries, 3 to 159744
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ISIN                        34280 non-null  object 
 1   Year                        34280 non-null  int64  
 2   Total Return                34280 non-null  float64
 3   ESG Score                   34280 non-null  float64
 4   ESG Combined Score          34280 non-null  float64
 5   ESG Controversies Score     34280 non-null  float64
 6   Social Pillar Score         34280 non-null  float64
 7   Governance Pillar Score     34280 non-null  float64
 8   Environmental Pillar Score  34280 non-null  float64
 9   Revenue - Mean              34280 non-null  float64
 10  Earnings Per Share - Mean   34280 non-null  float64
 11  Return On Equity - Mean     34280 non-null  float64
 12  Volume                      34280 non-null  float64
 13  Company Market Cap          34

In [47]:
df_ESG_YoY_cleaned

Unnamed: 0,ISIN,Year,Total Return,ESG Score,ESG Combined Score,ESG Controversies Score,Social Pillar Score,Governance Pillar Score,Environmental Pillar Score,Revenue - Mean,...,Company Market Cap,EBITDA - Mean,Country of Headquarters,Region of Headquarters,TRBC Industry Name,TRBC Industry Group Name,TRBC Business Sector Name,TRBC Economic Sector Name,ESG Score YoY Change,Total Return+1
3,DK0010244508,2003,78.725725,16.933543,16.933543,100.0,19.033531,34.523810,0.000000,2.600826e+10,...,1.828570e+11,6.631311e+09,Denmark,Europe,Marine Freight & Logistics,Freight & Logistics Services,Transportation,Industrials,0.577767,6.899061
4,DK0010244508,2004,6.899061,17.855846,17.855846,100.0,20.465337,26.862745,7.523148,3.097855e+10,...,1.980218e+11,8.512478e+09,Denmark,Europe,Marine Freight & Logistics,Freight & Logistics Services,Transportation,Industrials,0.054466,45.463359
5,DK0010244508,2005,45.463359,16.395483,16.395483,100.0,20.152244,17.166667,11.683007,3.250909e+10,...,2.821975e+11,8.136144e+09,Denmark,Europe,Marine Freight & Logistics,Freight & Logistics Services,Transportation,Industrials,-0.081786,-17.550102
6,DK0010244508,2006,-17.550102,18.732474,18.732474,60.0,20.633013,25.500000,11.033951,4.769541e+10,...,2.292305e+11,9.496474e+09,Denmark,Europe,Marine Freight & Logistics,Freight & Logistics Services,Transportation,Industrials,0.142539,3.167156
7,DK0010244508,2007,3.167156,31.193987,31.193987,100.0,28.724879,23.602484,40.195106,5.364527e+10,...,2.384613e+11,1.149992e+10,Denmark,Europe,Marine Freight & Logistics,Freight & Logistics Services,Transportation,Industrials,0.665236,-47.670028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159729,TW0001216000,2017,28.109428,85.702375,85.702375,100.0,85.789523,82.835427,88.182496,4.062900e+11,...,3.750130e+11,4.523707e+10,Taiwan,Asia,Food Processing,Food & Tobacco,Food & Beverages,Consumer Non-Cyclicals,0.041572,13.628573
159741,FR0013326246,2006,69.575372,24.458032,24.458032,100.0,31.540128,40.189255,5.274725,5.237316e+08,...,8.498573e+09,4.554353e+08,France,Europe,Commercial REITs,Residential & Commercial REITs,Real Estate,Real Estate,-0.220994,-16.521731
159742,FR0013326246,2007,-16.521731,73.652381,73.652381,100.0,70.885382,79.110106,71.201105,7.449527e+08,...,1.224555e+10,7.267190e+08,France,Europe,Commercial REITs,Residential & Commercial REITs,Real Estate,Real Estate,2.011378,-25.198536
159743,FR0013326246,2008,-25.198536,84.186496,84.186496,100.0,83.021801,83.587662,85.637085,1.421309e+09,...,8.731300e+09,1.167809e+09,France,Europe,Commercial REITs,Residential & Commercial REITs,Real Estate,Real Estate,0.143025,52.077456


In [48]:
def model_ESG(n, conditioning_feature):
    
    results = pd.DataFrame({'number': n, 'mse': [0]*len(n), 'r2': [0]*len(n)})
    for i in range (len(n)):
        results['number'][i] = n[i]
        top_n_df = df_ESG_YoY_cleaned.groupby('Region of Headquarters').apply(lambda x: x.nlargest(n[i], conditioning_feature)).reset_index(drop=True)
        top_n_df_reg = top_n_df[["Total Return+1",
                                 "ESG Score",
                                 "ESG Combined Score",
                                 "ESG Controversies Score",
                                 "Social Pillar Score",
                                 "Governance Pillar Score",
                                 "Environmental Pillar Score",
                                 "ESG Score YoY Change"]]
        X = top_n_df_reg.drop('Total Return+1',axis =1)
        y = top_n_df_reg['Total Return+1']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 101)
        regr = RandomForestRegressor(n_estimators = 10, max_depth = 10, random_state = 101)
        regr.fit(X_train, y_train.values.ravel())
        predictions = regr.predict(X_test)
        # Mean squared error (MSE)
        results['mse'][i] = mean_squared_error(y_test.values.ravel(), predictions)
        # R-squared scores
        results['r2'][i] = r2_score(y_test.values.ravel(), predictions)
        
    return results

In [49]:
n = [100, 1000, 5000, 10000, 15000, 20000]
conditioning_feature = 'ESG Score YoY Change'

results_ = model_ESG(n, conditioning_feature)
results_

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['r2'][i] = r2_score(y_test.values.ravel(), predictions)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['number'][i] = n[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['number'][i] = n[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['number'][i] = n[i]
A value is trying to

Unnamed: 0,number,mse,r2
0,100,1666.942752,-0.050098
1,1000,1604.318011,-0.045527
2,5000,1455.813324,-0.011637
3,10000,1398.243578,-0.005443
4,15000,1386.616659,8.5e-05
5,20000,1386.616659,8.5e-05
