In [87]:
import pandas as pd
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [88]:
# Level data set. The "Unnamed: 0" column is dropped right after reading
# (presumably an index column written by an earlier export - verify).
df = pd.read_csv("../3) Data/main_df_no_NAN_99p.csv")
df = df.drop(columns=["Unnamed: 0"])

# Year-over-year variant of the same data set, handled the same way.
df_change = pd.read_csv("../3) Data/main_df_no_NAN_99p_YoY.csv")
df_change = df_change.drop(columns=["Unnamed: 0"])

In [89]:
# Identifying year-over-year ESG Score changes per company and year.
# Only changes in the ESG Score itself are considered here.
# Motivating question: how many of these changes exceed 10%?
# (That share is not computed in this cell.)

# A standardized frame is needed in which every ISIN carries the same set of
# years (2000-2022), so that a row offset corresponds to a year offset.
years = list(range(2000, 2023))
isin = df['ISIN'].unique()
combinations = list(itertools.product(isin, years))
df_combinations = pd.DataFrame(combinations, columns=['ISIN', 'Year'])

# The left merge keeps every (ISIN, Year) pair; pairs missing from df become
# rows of NaN.
# NOTE(review): assumes (ISIN, Year) is unique in df - duplicate keys would add
# rows and break the one-row-per-year alignment the shifts rely on. Verify.
df_ESGS_YoY = pd.merge(
    df_combinations, df, on=['ISIN', 'Year'], how='left'
).reset_index(drop=True)

# One-period relative change: (X_t / X_{t-1}) - 1.
# The shift is done within each ISIN so that the first year of a company is
# NaN instead of being compared with the last year of the previous company.
df_ESGS_YoY['ESG Score YoY Change'] = (
    df_ESGS_YoY["ESG Score"]
    / df_ESGS_YoY.groupby('ISIN', sort=False)["ESG Score"].shift(1)
) - 1

# Forward-looking target: the Total Return found two rows (= two years) later
# for the same company.
# NOTE(review): the column is called "Total Return+1" and was described as a
# one-period lag, but the offset used is -2. The offset is kept as it was;
# confirm which horizon is intended.
df_ESGS_YoY["Total Return+1"] = (
    df_ESGS_YoY.groupby('ISIN', sort=False)["Total Return"].shift(-2)
)

# Years without a valid predecessor (2000) or without a value two years ahead
# (2021, 2022) cannot be used and are removed explicitly.
df_ESG_YoY = df_ESGS_YoY[~df_ESGS_YoY["Year"].isin([2000, 2021, 2022])]

# Deleting rows with NaN values. The explicit copy makes the result an
# independent frame, so later column assignments do not trigger pandas'
# SettingWithCopyWarning.
df_ESG_YoY_cleaned = df_ESG_YoY.dropna().copy()

In [92]:
# Take an explicit copy first: the frame may still be flagged by pandas as
# derived from a slice of another frame, and assigning into it would then
# raise SettingWithCopyWarning.
df_ESG_YoY_cleaned = df_ESG_YoY_cleaned.copy()

# Select only the float64 columns in the dataframe
float_cols = df_ESG_YoY_cleaned.select_dtypes(include=['float64']).columns

# Instantiate the StandardScaler object
scaler = StandardScaler()

# Fit on, and transform, the float columns; the scaled values replace the
# originals in place.
# NOTE(review): the scaler is fit on all rows (and on every float column,
# which includes the regression target if it is float64) before any
# train/test split, so test rows influence the scaling - confirm this is
# acceptable for the analysis.
df_ESG_YoY_cleaned[float_cols] = scaler.fit_transform(df_ESG_YoY_cleaned[float_cols])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ESG_YoY_cleaned[float_cols] = scaler.fit_transform(df_ESG_YoY_cleaned[float_cols])


In [93]:
def CustomRegression_upper(df, size_list, condition_list):
    """Evaluate a random-forest regression on the top-N rows per ranking column.

    For every column name in ``condition_list`` and every ``N`` in
    ``size_list``, the ``N`` rows of ``df`` with the largest values in that
    column are selected with ``DataFrame.nlargest``. Those rows are split
    80/20 into train and test sets, a ``RandomForestRegressor`` is fitted to
    predict ``"Total Return+1"`` from the ESG columns listed in
    ``feature_cols``, and the test-set MSE and R-squared are recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"Total Return+1"``, every column in ``feature_cols``
        and every column named in ``condition_list``.
    size_list : sequence of int
        Numbers of top-ranked rows to evaluate.
    condition_list : sequence of str
        Column names used to rank the rows.

    Returns
    -------
    dict
        Maps each entry of ``condition_list`` to a DataFrame with the columns
        ``number`` (the requested N), ``mse`` and ``r2``, one row per entry of
        ``size_list`` and in the same order.
    """
    target_col = "Total Return+1"
    # Order matters: it fixes the column order of X and therefore the
    # feature order seen by the seeded random forest.
    feature_cols = ["ESG Score",
                    "ESG Combined Score",
                    "ESG Controversies Score",
                    "Social Pillar Score",
                    "Governance Pillar Score",
                    "Environmental Pillar Score",
                    "ESG Score YoY Change"]
    result_columns = ['number', 'mse', 'r2']

    # dictionary of results corresponding to each condition
    perCondition = {}

    for condition in condition_list:
        # Collect one record per sample size and build the frame once at the
        # end. Writing cell by cell through chained indexing
        # (results['mse'][idx] = ...) triggers SettingWithCopyWarning and no
        # longer updates the frame under pandas Copy-on-Write.
        records = []
        for size in size_list:
            top_n_df = df.nlargest(size, condition)
            X = top_n_df[feature_cols]
            y = top_n_df[target_col]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.20, random_state=101)

            regr = RandomForestRegressor(n_estimators=10, max_depth=10, random_state=101)
            regr.fit(X_train, y_train.to_numpy())
            predictions = regr.predict(X_test)

            y_true = y_test.to_numpy()
            records.append({
                'number': size,
                # Mean squared error (MSE)
                'mse': mean_squared_error(y_true, predictions),
                # R-squared score
                'r2': r2_score(y_true, predictions),
            })

        # Passing the column list keeps the three columns even when size_list is empty.
        perCondition[condition] = pd.DataFrame(records, columns=result_columns)

    return perCondition

In [94]:
# Sample sizes to evaluate; the last entry is the row count of the cleaned
# frame, i.e. the run on all rows.
n = [100, 1000, 5000, 10000, 15000, 20000]
n.append(len(df_ESG_YoY_cleaned["ISIN"]))

# Columns used to rank the rows before taking the top-N subset.
conditioning_feature = ["Company Market Cap", "ESG Score YoY Change"]

results_ = CustomRegression_upper(
    df_ESG_YoY_cleaned,
    n,
    conditioning_feature,
)
results_

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['r2'][idx] = r2_score(y_test.values.ravel(), predictions)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['number'][idx] = size_list[idx]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['number'][idx] = size_list[idx]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['number'][idx]

{'Company Market Cap':    number       mse        r2
 0     100  1.719673  0.112594
 1    1000  0.889970 -0.131357
 2    5000  0.792850 -0.039866
 3   10000  0.858472 -0.040123
 4   15000  0.861715 -0.015365
 5   20000  0.863767 -0.020339
 6   29457  1.037300 -0.010816,
 'ESG Score YoY Change':    number       mse        r2
 0     100  1.632991 -0.638773
 1    1000  1.389302 -0.225535
 2    5000  1.236417 -0.054755
 3   10000  1.118262 -0.030548
 4   15000  1.058257 -0.018185
 5   20000  0.973131 -0.022055
 6   29457  1.033568 -0.021810}