In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn import datasets, linear_model
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

Read the CSV into a DataFrame and remove the spaces from the column names.

In [None]:
# Load the raw sample; the path is resolved relative to the working directory.
df = pd.read_csv('Final-Sample-External-with-ISINs.csv')

# Drop every space from the header names so that later cells can address
# columns by compact keys such as 'TotalEnvironmentalCost'.
column_list = [name.replace(' ', '') for name in df.columns]
df.columns = column_list

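Keep only the columns needed for the analysis and convert their string-formatted values (percentages and parenthesised amounts) to numbers.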

In [None]:
# Keep only the columns used by the analysis. The explicit .copy() makes `df`
# an independent frame, so the column assignments later in this cell write to
# it directly instead of to a slice of the loaded frame (which is what makes
# pandas emit SettingWithCopyWarning).
df = df[['Year', 'Country', 'Industry(Exiobase)', 'EnvironmentalIntensity(Sales)', 'TotalEnvironmentalCost']].copy()

def percent_to_float(s):
    """Convert a percentage string to a fraction.

    '12.5%' becomes 0.125. Whitespace around the value or the percent sign
    and thousands separators are tolerated, so ' 1,250% ' becomes 12.5.

    Parameters
    ----------
    s : str
        Text holding a number, optionally followed by '%'.

    Returns
    -------
    float
        The numeric value divided by 100.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # Trim whitespace first so a trailing space cannot shield the '%' from
    # being stripped; float() itself ignores any whitespace left next to the
    # digits.
    cleaned = s.strip().strip('%').replace(',', '')
    return float(cleaned) / 100.0

# Parse each percentage string in the intensity column into a fractional float.
df['EnvironmentalIntensity(Sales)'] = df['EnvironmentalIntensity(Sales)'].map(percent_to_float)

# Characters that are pure formatting in accounting-style number strings.
replace_dict = {'(':'',')':'', ' ' : '', ',' : ''}
def paranthesis_to_minus(value):
    """Parse an accounting-style number string into an int.

    A value wrapped in parentheses is negative: '(1,234)' becomes -1234.
    A value without parentheses keeps its own sign: '1,234' becomes 1234
    and '-5' becomes -5. (Every value used to be negated unconditionally,
    which flipped the sign of unparenthesised values and raised on values
    that already carried a minus sign.)

    Parameters
    ----------
    value : str
        Number text; may contain parentheses, spaces and thousands commas.

    Returns
    -------
    int
        The parsed integer.

    Raises
    ------
    ValueError
        If the text left after removing the formatting characters is not an
        integer.
    """
    # Decide the sign before the parentheses are stripped out below.
    trimmed = value.strip()
    is_negative = trimmed.startswith('(') and trimmed.endswith(')')
    for old, new in replace_dict.items():
        value = value.replace(old, new)
    number = int(value)
    return -number if is_negative else number

# Parse each accounting-style string in the cost column into an integer.
df['TotalEnvironmentalCost'] = df['TotalEnvironmentalCost'].map(paranthesis_to_minus)

Create dummy variables for industry and country.

In [None]:
# One-hot encode the two categorical columns; all other columns pass through.
df = pd.get_dummies(df, columns=['Industry(Exiobase)', 'Country'])

# Target: the environmental-intensity column.
y = df['EnvironmentalIntensity(Sales)']
# Features: everything except the year and the two outcome columns.
x = df.drop(['Year', 'EnvironmentalIntensity(Sales)', 'TotalEnvironmentalCost'], axis=1)

Split the data into training and test sets, then fit and evaluate the models.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

# Ordinary least squares fitted with statsmodels on the training rows.
OLS_VS = sm.OLS(y_train, x_train).fit() # author's note: 47 percent of the variation can be explained with this model

y_pred = OLS_VS.predict(x_test)
# Only the last expression of a cell is displayed, so a bare r2_score(...)
# here would be computed and silently discarded; print it instead.
print('OLS (statsmodels) test R2:', r2_score(y_test, y_pred))

# scikit-learn linear models, all with their default hyper-parameters.
OLS_CV = linear_model.LinearRegression()
lasso = linear_model.Lasso()
ridge = linear_model.Ridge()

# Held-out score for plain least squares.
OLS_CV.fit(x_train, y_train)
y_pred = OLS_CV.predict(x_test)
# Only the last expression of a cell is displayed, so a bare r2_score(...)
# here would be computed and silently discarded; print it instead.
print('OLS test R2:', r2_score(y_test, y_pred))

# Mean 3-fold cross-validated R2 on the training rows for each linear model.
# NOTE(review): the recorded OLS score is hugely negative -- presumably the
# complete sets of dummy columns are collinear and some categories are missing
# from individual folds; confirm before relying on the unregularised fit.
for name, estimator in (('OLS:', OLS_CV), ('Lasso:', lasso), ('Ridge:', ridge)):
    print(name, cross_val_score(estimator, x_train, y_train, cv=3, scoring='r2').mean())

# XGBoost
dtrain = xgb.DMatrix(x_train, label = y_train)
dtest = xgb.DMatrix(x_test, label = y_test)

param = {'max_depth': 2, 'eta': 1}
param['nthread'] = 4
# The target is continuous, so monitor RMSE. AUC is a ranking metric for
# classification labels and is not meaningful for this regression target.
param['eval_metric'] = 'rmse'

evallist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 10
# Train the booster (previously commented out, so the model was configured but
# never fitted). `evals` is passed by keyword and per-round logging is
# silenced to keep the cell output short.
bst = xgb.train(param, dtrain, num_round, evals=evallist, verbose_eval=False)

# Score on the held-out rows with the same metric as the other models.
y_pred_xgb = bst.predict(dtest)
print('XGBoost:', r2_score(y_test, y_pred_xgb))


#Random Forest
# The forest is fitted on the real training split. The earlier
# `X, y = make_regression(...)` call (copied from the scikit-learn example) is
# removed: it replaced the real target `y` with synthetic data, so re-running
# the split afterwards would pair the features with the wrong targets, and it
# defined an `X` that nothing used.
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(x_train, y_train)

# Score on the held-out rows.
y_pred = regr.predict(x_test)
print('Random Forest:', r2_score(y_test, y_pred))


OLS: -2.0933314439874914e+23
Lasso: -0.0008139589559728044
Ridge: 0.4298000289561223
Random Forest: 0.21765651623480808
