## Model Evaluation #3

What is the average GHG for Energy vs. Utilities?

GHG = a + b1*Sales + b2*Assets + b3*Util + b4*TimeTrend + e

150 observations (firms, years)


Util = 1 if Utilities, = 0 otherwise
a = average GHG for an energy company (when Sales, Assets, and TimeTrend = 0)
b3 = additional GHG for a utility above that of an energy company, holding Sales, Assets, and TimeTrend fixed
a + b3 → average GHG for a utility (when Sales, Assets, and TimeTrend = 0)
Predicted GHG = a + b1*Sales + b2*Assets + b3 + b4*TimeTrend  (utilities)
Predicted GHG = a + b1*Sales + b2*Assets + b4*TimeTrend  (energy)


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_val_predict
from sklearn import metrics

In [2]:
# Firm-year panel and the ticker -> sector lookup, both read from the
# local Datasets/ folder (paths are relative to the notebook's working dir).
company_data_path = 'Datasets/company_data.csv'
sector_map_path = 'Datasets/52_tickers_sectors.csv'

stocks = pd.read_csv(company_data_path)
sectors = pd.read_csv(sector_map_path)

In [3]:
# Report (rows, columns) for each freshly loaded table.
for frame in (stocks, sectors):
    print(frame.shape)

(780, 27)
(52, 3)


### Create the Missing_GHG column

In [4]:
# Flag rows where Scope 1 emissions were not reported (1 = missing, 0 = present).
ghg_missing = stocks['GHG Scope 1'].isna()
stocks['Missing_GHG'] = ghg_missing.astype(int)

# Preview the flagged rows to confirm the indicator lines up with the NaNs.
stocks.loc[ghg_missing, ['GHG Scope 1', 'Missing_GHG']]

Unnamed: 0,GHG Scope 1,Missing_GHG
0,,1
1,,1
2,,1
3,,1
4,,1
...,...,...
754,,1
755,,1
756,,1
757,,1


### Add the sector column to the dataset

In [5]:
# Attach each firm's sector via an inner join on Ticker, then discard the
# company-name column. `df` keeps the merged frame; `stocks` continues from
# an independent copy of it.
df = stocks.merge(sectors, how='inner', on='Ticker').drop(columns='Name')
stocks = df.copy()

### Create the Utility column

In [6]:
# Dummy variable: 1 for firms in the Utilities sector, 0 for everything else.
is_utility = stocks['Sector'] == 'Utilities'
stocks['Utility'] = is_utility.astype(int)

# Preview the utility rows.
stocks.loc[is_utility].head()

Unnamed: 0,Year,Ticker,Environmental Disclosure Score,GHG Scope 1,Total Energy Consumption,Change_in_EDS,Change_in_GHG,Change_in_TEC,Ratio of GHG Emissions to Total Assets,Ratio of GHG Emissions to Total Sales,...,Change_in_NI,ROA,Profit_Margin,Annual Return,Logarithm_Total_Assets,Logarithm_Total_Sales,Profitable,Missing_GHG,Sector,Utility
0,2005,AEE,,,,0.0,0.0,0.0,,,...,0.0,3.408215,9.129794,,9.807087,8.821732,True,1,Utilities,1
1,2006,AEE,,,,0.0,0.0,0.0,,,...,-9.854604,2.850138,8.110465,,9.882162,8.836374,True,1,Utilities,1
2,2007,AEE,9.7,,,0.0,0.0,0.0,,,...,12.724014,3.034543,8.335542,,9.939241,8.928773,True,1,Utilities,1
3,2008,AEE,9.7,,,0.0,0.0,0.0,,,...,-2.225755,2.714393,7.845388,,10.028224,8.966867,True,1,Utilities,1
4,2009,AEE,12.4,,,27.835052,0.0,0.0,,,...,-0.487805,2.572509,8.631876,,10.077021,8.866441,True,1,Utilities,1


In [7]:
# Preview the rows whose sector is anything other than Utilities.
stocks[stocks['Sector'].ne('Utilities')].head()

Unnamed: 0,Year,Ticker,Environmental Disclosure Score,GHG Scope 1,Total Energy Consumption,Change_in_EDS,Change_in_GHG,Change_in_TEC,Ratio of GHG Emissions to Total Assets,Ratio of GHG Emissions to Total Sales,...,Change_in_NI,ROA,Profit_Margin,Annual Return,Logarithm_Total_Assets,Logarithm_Total_Sales,Profitable,Missing_GHG,Sector,Utility
45,2005,APA,,,,0.0,0.0,0.0,,,...,0.0,13.614351,35.18342,,9.866398,8.916947,True,1,Energy,0
46,2006,APA,,,,0.0,0.0,0.0,,,...,-2.716705,10.500381,31.612225,,10.098568,8.996436,True,1,Energy,0
47,2007,APA,1.7,,,0.0,0.0,0.0,,,...,10.182644,9.82152,28.230908,,10.262373,9.206531,True,1,Energy,0
48,2008,APA,7.4,90999.8,,335.294118,0.0,0.0,3117.874592,7381.650588,...,-74.684802,2.439328,5.775173,,10.281461,9.419615,True,0,Energy,0
49,2009,APA,18.2,10985.0,29536.2,145.945946,-87.928545,0.0,389.736045,1281.209882,...,-139.94612,-1.009014,-3.31701,,10.246572,9.056481,False,0,Energy,0


### Time trend columns for each company

In [8]:
# Number each firm's observations 0, 1, 2, ... in chronological order.
# Sorting by (Ticker, Year) before counting makes the trend independent of
# the row order in the source file; the resulting Series keeps the original
# index, so the assignment aligns it back onto `stocks` row by row.
stocks['time_trend'] = (
    stocks.sort_values(['Ticker', 'Year'])
          .groupby('Ticker')
          .cumcount()
)

# Spot-check a single firm: time_trend should increase with Year.
stocks.loc[stocks['Ticker'] == 'XOM'].head(20)

Unnamed: 0,Year,Ticker,Environmental Disclosure Score,GHG Scope 1,Total Energy Consumption,Change_in_EDS,Change_in_GHG,Change_in_TEC,Ratio of GHG Emissions to Total Assets,Ratio of GHG Emissions to Total Sales,...,ROA,Profit_Margin,Annual Return,Logarithm_Total_Assets,Logarithm_Total_Sales,Profitable,Missing_GHG,Sector,Utility,time_trend
765,2005,XOM,,,,0.0,0.0,0.0,,,...,17.342261,11.008095,,12.246903,12.701418,True,1,Energy,0,0
766,2006,XOM,43.8,145500.0,433650.0,0.0,0.0,0.0,664.338059,434.216888,...,18.035294,11.788019,,12.296895,12.722142,True,0,Energy,0,1
767,2007,XOM,46.3,141000.0,430556.0,5.707763,-3.092784,-0.713479,582.44727,393.195761,...,16.775308,11.324596,,12.397032,12.789963,True,0,Energy,0,2
768,2008,XOM,46.3,126000.0,416667.0,0.0,-10.638298,-3.225829,552.505569,296.421068,...,19.828811,10.638223,,12.337329,12.960011,True,0,Energy,0,3
769,2009,XOM,52.9,128000.0,405639.0,14.25486,1.587302,-2.646718,548.595724,464.501894,...,8.263223,6.99656,,12.360179,12.526575,True,0,Energy,0,4
770,2010,XOM,54.6,132000.0,408333.0,3.213611,3.125,0.664137,436.349212,386.44175,...,10.069089,8.917436,,12.61987,12.741331,True,0,Energy,0,5
771,2011,XOM,54.6,136000.0,430556.0,0.0,3.030303,5.442372,410.811595,313.706675,...,12.402885,9.471174,,12.710031,12.979707,True,0,Energy,0,6
772,2012,XOM,55.4,132000.0,416667.0,1.465201,-2.941176,-3.225829,395.452299,313.752335,...,13.445378,10.667579,0.19827,12.718282,12.949709,True,0,Energy,0,7
773,2013,XOM,55.4,135000.0,416667.0,0.0,2.272727,0.0,389.264377,345.934754,...,9.394247,8.348559,0.120157,12.756527,12.874535,True,0,Energy,0,8
774,2014,XOM,55.4,121000.0,444444.0,0.0,-10.37037,6.666475,346.215804,331.722242,...,9.304907,8.915378,0.352835,12.764239,12.807003,True,0,Energy,0,9


### Simple Linear Regression using Scikit-Learn

In [21]:
# Keep only rows where the target and both size regressors are present.
required_cols = ['Total_Assets', 'Total_Sales', 'GHG Scope 1']
stocks = stocks.dropna(subset=required_cols)
stocks.shape

In [23]:
# Design matrix (sales, assets, utility dummy, time trend) and target.
feature_cols = ['Total_Sales', 'Total_Assets', 'Utility', 'time_trend']
target_col = 'GHG Scope 1'

X = stocks[feature_cols]
y = stocks[target_col]

# Sanity check: X and y must have the same number of rows.
for part in (X, y):
    print(part.shape)

(398, 4)
(398,)


In [24]:
# Disabled zero-imputation, kept for reference only. Rows missing
# Total_Assets, Total_Sales or GHG Scope 1 are already removed by the
# dropna step above, so nothing is filled here.
#y.fillna(value=0, inplace=True)
#X.fillna(value=0, inplace=True)

In [25]:
# Ordinary least squares with an intercept (fit_intercept defaults to True).
model = LinearRegression()
model

LinearRegression()

In [26]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Fit on the training split, then predict the held-out rows.
# (`fit` returns the estimator itself, so the calls can be chained.)
y_model = model.fit(Xtrain, ytrain).predict(Xtest)

In [28]:
# Fitted slopes, in the column order of X:
# Total_Sales, Total_Assets, Utility, time_trend.
fitted_coefs = model.coef_
print('Coefficients: \n', fitted_coefs)

Coefficients: 
 [ 1.27217988e-01  2.83677679e-01  2.23421275e+04 -5.58358640e+02]


In [29]:
# Test-set error: compute the MSE once and derive the RMSE from it.
mse = metrics.mean_squared_error(ytest, y_model)
print('Mean squared error: %.2f' % mse)
print(f'Root Mean Squared Error: {np.sqrt(mse)}')

Mean squared error: 442891100.67
Root Mean Squared Error: 21044.978039190788


In [30]:
# R^2 on the held-out rows (1.0 would be a perfect prediction).
r2 = metrics.r2_score(ytest, y_model)
print('Coefficient of determination: %.2f' % r2)

Coefficient of determination: 0.66


In [32]:
# Side-by-side table of actual vs. predicted GHG for the test rows,
# indexed like ytest.
linear_reg = pd.DataFrame({'Observed': ytest}).assign(Prediction=y_model)
linear_reg.head()

Unnamed: 0,Observed,Prediction
418,3577.3,-905.917272
778,119000.0,125953.752659
58,3410.0,-735.413566
464,14296.5,19195.057407
206,32000.0,26110.10819


### Simple Linear regression with k-fold cross validation

In [None]:
# 10-fold cross-validation of the linear model (shuffled, fixed seed).
cv = KFold(n_splits=10, random_state=833, shuffle=True)
model = LinearRegression()

# scikit-learn scorers are "higher is better", so this scorer returns the
# NEGATED mean squared error for each fold.
cv_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error', n_jobs=1)

# Flip the sign once so everything reported below is a true (positive) MSE.
mse_per_fold = -cv_scores
mean_mse = mse_per_fold.mean()

print(f'MSE per fold: {mse_per_fold}')
print(f'The MSE is {mean_mse}')
print(f'RMSE is {np.sqrt(mean_mse)}')

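With `scoring='neg_mean_squared_error'`, `sklearn.model_selection.cross_val_score` returns the *negated* mean squared error: scikit-learn scorers follow a "higher is better" convention, so the raw scores are never positive and must be multiplied by -1 to recover the MSE.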

### Ridge regression 

In [None]:
# Trace how the Ridge coefficients change over a log-spaced grid of
# regularization strengths.
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)
# NOTE(review): this grid spans very small penalties; with unscaled
# regressors it may barely move the coefficients -- confirm the range.

# Fit with an intercept so the path describes the same model as the other
# fits in this notebook (GHG = a + ...); without it the slopes would have to
# absorb the baseline GHG level.
coefs = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha, fit_intercept=True)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)

In [None]:
# Plot one coefficient path per feature against alpha on a log scale.
ax = plt.gca()
ax.plot(alphas, coefs)  # drawn once; a second identical call only overplots
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()

https://scikit-learn.org/stable/auto_examples/linear_model/plot_ridge_path.html

#### Validation set approach

#### K-fold approach

In [None]:
# 10-fold cross-validation of a Ridge model (shuffled, fixed seed).
model = Ridge(alpha=1.0)
cv = KFold(n_splits=10, random_state=833, shuffle=True)

# For a regressor the default cross_val_score metric is R^2, not accuracy;
# request it explicitly so the reported number is unambiguous.
scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
print(scores)
print('R^2: %0.2f (+- %0.2f)' % (scores.mean(), scores.std() * 2))

In [None]:
# Refit on the full data set and predict those same rows, so `yhat` holds
# in-sample fitted values. (`fit` returns the estimator, so the calls chain.)
yhat = model.fit(X, y).predict(X)