# Imports

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import warnings
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Report each distinct warning only the first time it is raised.
warnings.filterwarnings(action='once')

# Input and output folders are resolved against the parent of the current
# working directory.
project_root = str(Path(os.getcwd()).parent.absolute())
data_path = project_root + "/data"
figures_path = project_root + "/reports/figures"

# Load Data

In [None]:
# Read the interim factor and portfolio excess-return tables; the first CSV
# column of each file becomes the row index.
factors_csv = data_path + "/interim/factors.csv"
returns_csv = data_path + "/interim/portfolio_excess_returns.csv"
factors = pd.read_csv(factors_csv, index_col=0)
portfolio_excess_returns = pd.read_csv(returns_csv, index_col=0)

## Function

In [None]:
# Build the LASSO inputs: penalty-scaled covariances (X) and mean excess returns (y)
def get_betas_estimates(factors, portfolio_excess_returns, penalty):
    """Return the design matrix and target used by the LASSO regressions.

    Parameters
    ----------
    factors : DataFrame with one column per factor and one row per observation.
    portfolio_excess_returns : DataFrame with one column per portfolio and the
        same number of rows as ``factors``.
    penalty : scalar or Series indexed by factor name; multiplied into the
        covariance columns.

    Returns
    -------
    (X, y) where X is the portfolio-by-factor sample covariance matrix times
    ``penalty`` and y is a one-column frame ('Average Excess Returns') holding
    each portfolio's mean excess return.
    """
    n_factors = len(factors.columns)

    # np.cov stacks the factor rows above the portfolio rows, so the
    # lower-left block of the joint matrix is cov(portfolio, factor).
    joint_cov = np.cov(factors.transpose(), portfolio_excess_returns.transpose())
    cross_block = joint_cov[n_factors:, :n_factors]
    cov_h = pd.DataFrame(
        data=cross_block,
        index=portfolio_excess_returns.columns,
        columns=factors.columns,
    )

    # Mean excess return of every portfolio over the supplied rows
    mean_returns = portfolio_excess_returns.mean()
    y = pd.DataFrame({'Average Excess Returns': mean_returns})

    X = cov_h * penalty
    return X, y



## First selection

In [None]:
# First selection: repeated K-fold cross-validation of the LASSO penalty for
# the regression of average excess returns on penalty-scaled covariances.
# Produces X, y (full-sample inputs), penalty, and alpha_search (one row per
# alpha / fold / repeat with the out-of-fold scores).
warnings.filterwarnings(action='ignore')

alphas = np.logspace(0,-35,100,base=np.e)  # e^0 down to e^-35, largest first
random_state = 1  # not read below: each repeat seeds KFold with its own index
cv_repeats = 20
folds = 10
score_columns = ['alpha', 'MSE','R-Squared','# Coefs',"fold","repeat"]

# Covariances between factors and excess returns (rows: portfolios,
# columns: factors). A unit penalty makes the helper return them unscaled.
cov_h = get_betas_estimates(factors, portfolio_excess_returns, 1)[0]

# Beta estimates: each covariance column divided by its factor's variance.
# ddof=0 reproduces the np.var default that was applied per column before.
factors_betas = cov_h / factors.var(ddof=0)

# Penalty for the lasso: mean squared beta per factor, normalised to mean 1
penalty = (factors_betas*factors_betas).mean()
penalty = penalty/penalty.mean()

X, y = get_betas_estimates(factors, portfolio_excess_returns, penalty)

# Scores are collected in a plain list and turned into a frame once at the
# end; growing a DataFrame one row at a time copies it on every insertion.
cv_rows = []
for cv_repeat in range(0,cv_repeats):
    kf = KFold(n_splits=folds, shuffle=True, random_state=cv_repeat)
    print('\n------------ CV Repeat number:', cv_repeat+1)

    for fold, (train_index, test_index) in enumerate(kf.split(factors)):
        print('\n------ Fold Number:',fold+1)

        # Inputs are rebuilt from the observations of each split; the penalty
        # is the full-sample one computed above.
        X_test, y_test = get_betas_estimates(factors.iloc[test_index], portfolio_excess_returns.iloc[test_index], penalty)
        X_train, y_train = get_betas_estimates(factors.iloc[train_index], portfolio_excess_returns.iloc[train_index], penalty)

        for alpha in alphas:
            model = Lasso(alpha=alpha, fit_intercept=True).fit(X_train,y_train)
            y_pred = model.predict(X_test)

            cv_rows.append({
                'alpha': alpha,
                'MSE': mean_squared_error(y_test, y_pred),
                'R-Squared': r2_score(y_test,y_pred),
                '# Coefs': np.count_nonzero(model.coef_),
                "fold": fold,
                "repeat": cv_repeat,
            })

alpha_search = pd.DataFrame(cv_rows, columns=score_columns)




In [None]:
# Calculate k-fold mean and se: one row per (repeat, alpha) holding the mean
# MSE, mean R-Squared, mean number of non-zero coefficients and the standard
# deviation of the fold MSEs.
#
# The number of repeats is read from alpha_search instead of being hard-coded:
# a count larger than what was actually run yields all-NaN rows, and the
# best-alpha selection then fails on an empty match.
alphas = np.logspace(0,-35,100,base=np.e)
cv_scores = alpha_search.astype(float)
cv_repeats = int(cv_scores['repeat'].nunique())

per_setting = cv_scores.groupby(['repeat', 'alpha'])
alpha_search_new = per_setting[['MSE', 'R-Squared', '# Coefs']].mean()
# ddof=0 matches np.std, which was used for this column before
alpha_search_new['SE'] = per_setting['MSE'].std(ddof=0)

# Keep the row order the selection step relies on: repeats ascending and,
# within a repeat, alphas from largest to smallest.
alpha_search_new = (
    alpha_search_new.reset_index()
    .sort_values(['repeat', 'alpha'], ascending=[True, False])
    .reset_index(drop=True)
)
alpha_search_new = alpha_search_new[['alpha', 'MSE', 'R-Squared', '# Coefs', 'repeat', 'SE']]

In [None]:
# For every repetition keep the alpha whose fold-averaged MSE is the lowest
alpha_sel = pd.DataFrame()
for cv_repeat in range(0,cv_repeats):
    in_repeat = alpha_search_new['repeat'] == cv_repeat
    repeat_mse = alpha_search_new.loc[in_repeat, 'MSE']
    at_minimum = repeat_mse == repeat_mse.min()
    best_rows = alpha_search_new.loc[in_repeat & at_minimum]

    # Column 0 is 'alpha'. When the first row at the minimum has alpha == 1,
    # the last row at the minimum is used instead; otherwise the first row is.
    first_alpha = best_rows.iloc[0,0]
    chosen_alpha = best_rows.iloc[-1,0] if first_alpha == 1 else first_alpha
    alpha_sel.loc[cv_repeat,'alpha'] = chosen_alpha


In [None]:
# Display the alpha chosen for each repetition
alpha_sel

In [None]:
# Final alpha: geometric mean of the per-repetition choices, i.e. the
# exponential of the average log alpha
log_alphas = alpha_sel['alpha'].apply(np.log)
alpha_sel['log_alpha'] = log_alphas
best_alpha = np.exp(alpha_sel['log_alpha'].mean())


In [None]:
# Refit on the full-sample inputs with the selected alpha
model = Lasso(alpha=best_alpha, fit_intercept=True).fit(X,y)

# Rank factors by absolute coefficient and keep as many as there are
# non-zero coefficients; the selected alpha is stored as the first entry.
coef_magnitudes = pd.DataFrame(data=abs(model.coef_), index=X.columns)
ranked = coef_magnitudes.sort_values(by=[0],ascending=False)
n_selected = np.count_nonzero(model.coef_)
sel1_results = [best_alpha] + list(ranked.iloc[0:n_selected].index)

pd.DataFrame(data=sel1_results)


## Second selection

In [None]:
# Reload the inputs restricted to one shared sample window. The factor frame
# previously had no end bound while the returns stopped at 2017-10-31; any
# later factor rows would give the two frames different lengths, which breaks
# the joint covariance and the positional fold indexing further down.
sample_window = slice('2007-02-28', '2017-10-31')
factors = pd.read_csv(data_path+"/interim/factors.csv", index_col=0).loc[sample_window,:]
portfolio_excess_returns = pd.read_csv(data_path+"/interim/portfolio_excess_returns.csv", index_col=0).loc[sample_window,:]

In [None]:
# Green factor series between 2007-02-28 and 2017-10-31, restricted to the
# five listed columns. NOTE: this file is read relative to the working
# directory rather than from data_path.
green_columns = ['ESG_ADJ', 'E_V_IND_W', 'S_V_IND_W', 'G_V_IND_W', 'CO2_V_IND_W']
green_factors = pd.read_csv("green_factors.csv", index_col=0)
green_factors = green_factors.loc['2007-02-28':'2017-10-31', green_columns]

In [None]:
# Second selection: for every green characteristic, cross-validate the LASSO
# penalty for the regression of cov(portfolio returns, green factor) on the
# penalty-scaled cov(portfolio returns, factors), refit on the full sample
# with the selected alpha and write the selected factors to
# <data_path>/processed/<characteristic>_sel2_results.csv.
warnings.filterwarnings(action='ignore')

alphas = np.logspace(0,-35,100,base=np.e)  # e^0 down to e^-35, largest first
random_state = 1  # not read below: each repeat seeds KFold with its own index
cv_repeats = 20
folds = 10
score_columns = ['alpha', 'MSE','R-Squared','# Coefs',"fold","repeat"]

# Everything derived only from the factors and the returns is the same for
# all green characteristics, so it is computed once before the loop.

# Covariances between factors and excess returns (rows: portfolios,
# columns: factors). A unit penalty makes the helper return them unscaled.
cov_h = get_betas_estimates(factors, portfolio_excess_returns, 1)[0]

# Beta estimates: each covariance column divided by its factor's variance.
# ddof=0 reproduces the np.var default that was applied per column before.
factors_betas = cov_h / factors.var(ddof=0)

# Penalty for the lasso: mean squared beta per factor, normalised to mean 1
penalty = (factors_betas*factors_betas).mean()
penalty = penalty/penalty.mean()

# Full-sample design matrix, shared by every characteristic
X = get_betas_estimates(factors, portfolio_excess_returns, penalty)[0]

for green_characteristic in green_factors.columns:
    green_factor = pd.DataFrame(data = green_factors.loc[:, green_characteristic])

    # Covariances between the green factor and the excess returns; this is
    # also the full-sample regression target.
    green_cov_h = get_betas_estimates(green_factor, portfolio_excess_returns, 1)[0]
    y = green_cov_h

    # Green-factor betas. Not read by the selection below; still computed so
    # the name stays available after the cell has run.
    green_factor_betas = green_cov_h / green_factor.var(ddof=0)

    # Scores are collected per characteristic. Sharing one table across
    # characteristics would mix the scores of earlier characteristics into
    # the averages of later ones.
    cv_rows = []
    for cv_repeat in range(0,cv_repeats):
        kf = KFold(n_splits=folds, shuffle=True, random_state=cv_repeat)
        print('\n------------ CV Repeat number:', cv_repeat+1)

        for fold, (train_index, test_index) in enumerate(kf.split(factors)):
            print('\n------ Fold Number:',fold+1)

            returns_test = portfolio_excess_returns.iloc[test_index]
            returns_train = portfolio_excess_returns.iloc[train_index]

            # Inputs are rebuilt from the observations of each split; the
            # penalty is the full-sample one computed above.
            X_test = get_betas_estimates(factors.iloc[test_index], returns_test, penalty)[0]
            y_test = get_betas_estimates(green_factor.iloc[test_index], returns_test, 1)[0]

            X_train = get_betas_estimates(factors.iloc[train_index], returns_train, penalty)[0]
            y_train = get_betas_estimates(green_factor.iloc[train_index], returns_train, 1)[0]

            for alpha in alphas:
                model = Lasso(alpha=alpha, fit_intercept=True, tol=0.000000001).fit(X_train,y_train)
                y_pred = model.predict(X_test)

                cv_rows.append({
                    'alpha': alpha,
                    'MSE': mean_squared_error(y_test, y_pred),
                    'R-Squared': r2_score(y_test,y_pred),
                    '# Coefs': np.count_nonzero(model.coef_),
                    "fold": fold,
                    "repeat": cv_repeat,
                })

    alpha_search = pd.DataFrame(cv_rows, columns=score_columns)

    # Calculate k-fold mean and se: one row per (repeat, alpha)
    per_setting = alpha_search.groupby(['repeat', 'alpha'])
    alpha_search_new = per_setting[['MSE', 'R-Squared', '# Coefs']].mean()
    # ddof=0 matches np.std, which was used for this column before
    alpha_search_new['SE'] = per_setting['MSE'].std(ddof=0)
    # Repeats ascending and, within a repeat, alphas from largest to smallest:
    # the tie-break below depends on this order.
    alpha_search_new = (
        alpha_search_new.reset_index()
        .sort_values(['repeat', 'alpha'], ascending=[True, False])
        .reset_index(drop=True)
    )
    alpha_search_new = alpha_search_new[['alpha', 'MSE', 'R-Squared', '# Coefs', 'repeat', 'SE']]

    # Choose best alpha for each repetition
    alpha_sel = pd.DataFrame()
    for cv_repeat in range(0,cv_repeats):
        repeat_rows = alpha_search_new.loc[alpha_search_new['repeat'] == cv_repeat]
        best_rows = repeat_rows.loc[repeat_rows['MSE'] == repeat_rows['MSE'].min()]
        # When the largest grid value (alpha == 1) is among the minimisers,
        # take the smallest tied alpha; otherwise take the largest tied alpha.
        if best_rows['alpha'].iloc[0] == 1:
            alpha_temp = best_rows['alpha'].iloc[-1]
        else:
            alpha_temp = best_rows['alpha'].iloc[0]
        alpha_sel.loc[cv_repeat,'alpha'] = alpha_temp

    # Final alpha: exponential of the average log alpha over the repetitions
    alpha_sel.loc[:,'log_alpha'] = alpha_sel.loc[:,'alpha'].apply(np.log)
    best_alpha = np.exp(alpha_sel.loc[:,'log_alpha'].mean())

    # NOTE(review): the CV fits use tol=1e-9 while this final fit uses the
    # default tolerance - confirm that is intended.
    model = Lasso(alpha=best_alpha, fit_intercept=True).fit(X,y)

    # Factors ranked by absolute coefficient, keeping as many as there are
    # non-zero coefficients; the selected alpha is stored as the first entry.
    sel2_results = list(pd.DataFrame(data=abs(model.coef_), index=X.columns).sort_values(by=[0],ascending=False).iloc[0:np.count_nonzero(model.coef_)].index)
    sel2_results = [best_alpha] + sel2_results

    pd.DataFrame(data=sel2_results).transpose().to_csv(data_path + '/processed/'+green_characteristic+'_sel2_results.csv')



## Selection for inference

In [None]:
# Reload the inputs restricted to one shared sample window. The factor frame
# previously had no end bound while the returns stopped at 2017-10-31; any
# later factor rows would make the positional fold indices drawn from the
# factors run past the end of the shorter frames.
sample_window = slice('2007-02-28', '2017-10-31')
factors = pd.read_csv(data_path+"/interim/factors.csv", index_col=0).loc[sample_window,:]
portfolio_excess_returns = pd.read_csv(data_path+"/interim/portfolio_excess_returns.csv", index_col=0).loc[sample_window,:]

In [None]:
# Green factor series between 2007-02-28 and 2017-10-31, restricted to the
# five listed columns. NOTE: this file is read relative to the working
# directory rather than from data_path.
green_columns = ['ESG_ADJ', 'E_V_IND_W', 'S_V_IND_W', 'G_V_IND_W', 'CO2_V_IND_W']
green_factors = pd.read_csv("green_factors.csv", index_col=0)
green_factors = green_factors.loc['2007-02-28':'2017-10-31', green_columns]

In [None]:
# Selection for inference: for every green characteristic, cross-validate the
# LASSO penalty for the time-series regression of the green factor on the
# rescaled factors, refit on the full sample with the selected alpha and write
# the selected factors to
# <data_path>/processed/<characteristic>_sel3_results.csv.
warnings.filterwarnings(action='ignore')

alphas = np.logspace(0,-35,100,base=np.e)  # e^0 down to e^-35, largest first
random_state = 1  # not read below: each repeat seeds KFold with its own index
cv_repeats = 20
folds = 10
score_columns = ['alpha', 'MSE','R-Squared','# Coefs',"fold","repeat"]


for green_characteristic in green_factors.columns:
    green_factor = pd.DataFrame(data = green_factors.loc[:, green_characteristic])

    # Scores are collected per characteristic. Sharing one table across
    # characteristics would mix the scores of earlier characteristics into
    # the averages of later ones.
    cv_rows = []
    for cv_repeat in range(0,cv_repeats):
        kf = KFold(n_splits=folds, shuffle=True, random_state=cv_repeat)
        print('\n------------ CV Repeat number:', cv_repeat+1)

        for fold, (train_index, test_index) in enumerate(kf.split(factors)):
            print('\n------ Fold Number:',fold+1)

            # Rescale X by the square root of the number of training rows
            # (a plain rescale, not a standardisation).
            X_train = factors.iloc[train_index]
            train_scale = np.sqrt(len(X_train.index))
            X_train = X_train / train_scale
            y_train = green_factor.iloc[train_index]

            # The held-out rows get the SAME factor as the training rows.
            # Dividing them by sqrt(n_test) instead would put the test
            # features on a different scale from the one the coefficients
            # were fitted on and distort every out-of-fold prediction.
            X_test = factors.iloc[test_index] / train_scale
            y_test = green_factor.iloc[test_index]

            for alpha in alphas:
                model = Lasso(alpha=alpha, fit_intercept=False).fit(X_train,y_train)
                y_pred = model.predict(X_test)

                cv_rows.append({
                    'alpha': alpha,
                    'MSE': mean_squared_error(y_test, y_pred),
                    'R-Squared': r2_score(y_test,y_pred),
                    '# Coefs': np.count_nonzero(model.coef_),
                    "fold": fold,
                    "repeat": cv_repeat,
                })

    alpha_search = pd.DataFrame(cv_rows, columns=score_columns)

    # Calculate k-fold mean and se: one row per (repeat, alpha)
    per_setting = alpha_search.groupby(['repeat', 'alpha'])
    alpha_search_new = per_setting[['MSE', 'R-Squared', '# Coefs']].mean()
    # ddof=0 matches np.std, which was used for this column before
    alpha_search_new['SE'] = per_setting['MSE'].std(ddof=0)
    # Repeats ascending and, within a repeat, alphas from largest to smallest:
    # the tie-break below depends on this order.
    alpha_search_new = (
        alpha_search_new.reset_index()
        .sort_values(['repeat', 'alpha'], ascending=[True, False])
        .reset_index(drop=True)
    )
    alpha_search_new = alpha_search_new[['alpha', 'MSE', 'R-Squared', '# Coefs', 'repeat', 'SE']]

    # Choose best alpha for each repetition
    alpha_sel = pd.DataFrame()
    for cv_repeat in range(0,cv_repeats):
        repeat_rows = alpha_search_new.loc[alpha_search_new['repeat'] == cv_repeat]
        best_rows = repeat_rows.loc[repeat_rows['MSE'] == repeat_rows['MSE'].min()]
        # When the largest grid value (alpha == 1) is among the minimisers,
        # take the smallest tied alpha; otherwise take the largest tied alpha.
        if best_rows['alpha'].iloc[0] == 1:
            alpha_temp = best_rows['alpha'].iloc[-1]
        else:
            alpha_temp = best_rows['alpha'].iloc[0]
        alpha_sel.loc[cv_repeat,'alpha'] = alpha_temp

    # Select the best alpha ie: the (log) average of the best alphas of each repetition
    alpha_sel.loc[:,'log_alpha'] = alpha_sel.loc[:,'alpha'].apply(np.log)
    best_alpha = np.exp(alpha_sel.loc[:,'log_alpha'].mean())

    # Full-sample inputs, rescaled by the square root of the sample size
    X = factors
    X = X / np.sqrt(len(X.index))
    y = green_factor
    # NOTE(review): the CV fits above use fit_intercept=False while this
    # final fit uses fit_intercept=True - confirm which one is intended.
    model = Lasso(alpha=best_alpha, fit_intercept=True).fit(X,y)

    # Factors ranked by absolute coefficient, keeping as many as there are
    # non-zero coefficients; the selected alpha is stored as the first entry.
    sel3_results = list(pd.DataFrame(data=abs(model.coef_), index=X.columns).sort_values(by=[0],ascending=False).iloc[0:np.count_nonzero(model.coef_)].index)
    sel3_results = [best_alpha] + sel3_results

    pd.DataFrame(data=sel3_results).transpose().to_csv(data_path + '/processed/'+green_characteristic+'_sel3_results.csv')

