Load necessary libraries. 

In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import os

Load the tickers data. 

In [None]:
# Read the ticker table; later cells use its 'Ticker' column.
tickers = pd.read_csv('Companydescription.csv') # load Companydescription.csv
tickers

Unnamed: 0,Ticker,Description
0,1COV,
1,A,
2,AAL,
3,AAL.L,
4,AAP,
...,...,...
863,ZBH,
864,ZBRA,
865,ZION,
866,ZTS,


Check if we have duplicates

In [None]:
# True if any ticker symbol appears more than once in the table.
boolean = tickers.duplicated(subset='Ticker').any()
boolean

False

Loop through the tickers and, for each one, build its profile URL and fetch the business description. Note that, for example purposes, the `ticker` variable is limited to the first 5 items (via `.head()`); remove that call if you want to run the loop for the whole sample. The loop over all observations might take 30+ minutes.

In [None]:
# Build the Yahoo Finance profile URL of every ticker and scrape its business description.
# NOTE(review): the recorded run of this loop produced None for every ticker, so the
# CSS class used below may no longer match the page markup — verify before a full run.
URL = [] # profile-page URLs, one per ticker
DES = [] # description tags, one per ticker (None when nothing was found or fetched)
ticker = tickers['Ticker'].head() # example subset; drop .head() to process every ticker
for i in ticker:
  url = 'https://finance.yahoo.com/quote/' + i + '/profile'
  URL.append(url)
  try:
    # The timeout keeps a single unresponsive request from stalling the whole loop.
    page = requests.get(url, timeout=30) # visits the URL
    page.raise_for_status()
  except requests.RequestException as err:
    # Record a placeholder so URL, DES and ticker stay the same length.
    print('Could not fetch %s: %s' % (url, err))
    DES.append(None)
    continue
  htmldata = BeautifulSoup(page.content, 'html.parser')
  # finds the business description part in the HTML code
  Business_Description = htmldata.find('p', {'class': 'Mt(15px) Lh(1.6)'})
  DES.append(Business_Description)

In [None]:
# Inspect the scraped results; the loop above stores whatever `find` returned for each ticker.
# print(URL)
print(DES) # check the descriptions

[None, None, None, None, None]


Because of technical issues with Yahoo Finance, we could not download the descriptions from the website, so we looked for other websites and combined their results with the descriptions we had collected previously.

Convert the results to pandas dataframe. 

In [None]:
# Pair every ticker with the description scraped for it.
company_des = pd.DataFrame(
    {
        'ticker': ticker,
        'description': DES,
    }
)
company_des.head()

Unnamed: 0,ticker,description
0,1COV,
1,A,
2,AAL,
3,AAL.L,
4,AAP,


Drop tickers with no descriptions. Convert the `description` variable to string. 

Clean the data: remove NAs, convert to string and remove HTML code attributes. 

In [None]:
# Drop the stocks that do not have Yahoo Finance company profiles
company_des = company_des.dropna()

# Strip the wrapping <p ...> / </p> markup left over from the scraped tag.
# One pattern covers every data-reactid value, instead of one literal
# replacement per id in a fixed 1-299 range (ids outside it were left in).
opening_tag = r'<p class="Mt\(15px\) Lh\(1\.6\)" data-reactid="\d+">'
company_des = company_des.assign(
    description=company_des['description']
    .astype(str)
    .str.replace(opening_tag, '', regex=True)
    .str.replace('</p>', '', regex=False)
)

In [None]:
# company_des.head()

Export the data

In [None]:
# Write company_des to disk with a header row and without the index column.
company_des.to_csv('154stock_des.csv', index=False)

Define a function that converts the percentage strings of the environmental intensity (EI) into numbers

In [None]:
def p2f(x):
    """Convert a percentage string such as ``'12.5%'`` to its fraction (``0.125``).

    Parameters
    ----------
    x : str
        Text holding a number with an optional percent sign; surrounding
        whitespace is ignored.

    Returns
    -------
    float
        The numeric value divided by 100.

    Raises
    ------
    ValueError
        If the text left after removing whitespace and '%' is not a number.
    """
    # Trim whitespace first so a padded value like ' 5% ' still loses its '%'.
    return float(x.strip().strip('%')) / 100

Test whether the Environmental Intensity is stationary or non-stationary in each year

In [None]:
# Load the environmental-impact table and convert the sales-intensity
# percentage strings to fractions with p2f.
ind = pd.read_csv('Environmental_impact_cleaned.csv')
ind['EnvironmentalIntensity(Sales)'] = ind['EnvironmentalIntensity(Sales)'].map(p2f)
# 2018 values as a fresh Series indexed from 0.
y2018 = pd.Series(ind.loc[ind['Year'] == 2018, 'EnvironmentalIntensity(Sales)'].tolist())

In [None]:
# Augmented Dickey-Fuller test on y2018, plus a first-half / second-half
# comparison of mean and variance.
# NOTE(review): y2018 holds one year's values across rows of `ind` in file order,
# not a time series — confirm that an ADF test is meaningful for this ordering.
from statsmodels.tsa.stattools import adfuller

X = y2018.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))


mean1=-0.129262, mean2=-0.101468
variance1=0.094017, variance2=0.052920
ADF Statistic: -24.980342
p-value: 0.000000
Critical Values:
	1%: -3.434
	5%: -2.863
	10%: -2.568


In [None]:
# ADF test and split-half mean/variance check for the 2017 sales-intensity values.
from statsmodels.tsa.stattools import adfuller

y2017 = pd.Series(ind.loc[ind['Year'] == 2017, 'EnvironmentalIntensity(Sales)'].tolist())
X = y2017.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.123985, mean2=-0.108138
variance1=0.089221, variance2=0.063000
ADF Statistic: -26.071015
p-value: 0.000000
Critical Values:
	1%: -3.434
	5%: -2.863
	10%: -2.568


In [None]:
# ADF test and split-half mean/variance check for the 2016 sales-intensity values.
# NOTE(review): the values are one year's rows of `ind` in file order, not a
# time series — confirm that an ADF test is meaningful for this ordering.
from statsmodels.tsa.stattools import adfuller

y2016 = pd.Series(list(ind[ind['Year'] == 2016]['EnvironmentalIntensity(Sales)']))
X = y2016.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1, X2 = X[0:split], X[split:]
mean1, mean2 = X1.mean(), X2.mean()
var1, var2 = X1.var(), X2.var()

# Mean/variance lines are printed first so that the 'Critical Values:' header
# is followed directly by the critical values, as in the other yearly cells.
print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

ADF Statistic: -38.998411
p-value: 0.000000
Critical Values:
mean1=-0.128953, mean2=-0.113759
variance1=0.084427, variance2=0.066553
	1%: -3.434
	5%: -2.863
	10%: -2.568


In [None]:
# ADF test and split-half mean/variance check for the 2015 sales-intensity values.
from statsmodels.tsa.stattools import adfuller

y2015 = pd.Series(ind.loc[ind['Year'] == 2015, 'EnvironmentalIntensity(Sales)'].tolist())
X = y2015.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.128073, mean2=-0.112539
variance1=0.072975, variance2=0.066954
ADF Statistic: -25.147693
p-value: 0.000000
Critical Values:
	1%: -3.435
	5%: -2.863
	10%: -2.568


In [None]:
# ADF test and split-half mean/variance check for the 2014 sales-intensity values.
from statsmodels.tsa.stattools import adfuller

y2014 = pd.Series(ind.loc[ind['Year'] == 2014, 'EnvironmentalIntensity(Sales)'].tolist())
X = y2014.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.113915, mean2=-0.100781
variance1=0.062142, variance2=0.055520
ADF Statistic: -15.509563
p-value: 0.000000
Critical Values:
	1%: -3.435
	5%: -2.864
	10%: -2.568


Summary: For each year from 2014 to 2018, the environmental intensity data is stationary.

Next, let's examine the industry average for each year to see whether it is stationary.

In [None]:
# Print the column names, non-null counts and dtypes of `ind`.
ind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14515 entries, 0 to 14514
Data columns (total 39 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   ISIN                                                    14515 non-null  object 
 1   Year                                                    14515 non-null  int64  
 2   CompanyName                                             14515 non-null  object 
 3   Country                                                 14515 non-null  object 
 4   Industry(Exiobase)                                      14515 non-null  object 
 5   EnvironmentalIntensity(Sales)                           14515 non-null  float64
 6   EnvironmentalIntensity(OpInc)                           13700 non-null  object 
 7   TotalEnvironmentalCost                                  14515 non-null  object 
 8   WorkingCapacity                     

Test whether the industry average we used is stationary for each year:


In [None]:
# ADF test and split-half mean/variance check for the 2018 industry_avg_year values.
from statsmodels.tsa.stattools import adfuller

y2018 = pd.Series(ind.loc[ind['Year'] == 2018, 'industry_avg_year'].tolist())
X = y2018.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.121867, mean2=-0.108862
variance1=0.032617, variance2=0.026903
ADF Statistic: -10.463661
p-value: 0.000000
Critical Values:
	1%: -3.434
	5%: -2.863
	10%: -2.568


In [None]:
# ADF test and split-half mean/variance check for the 2017 industry_avg_year values.
from statsmodels.tsa.stattools import adfuller

y2017 = pd.Series(ind.loc[ind['Year'] == 2017, 'industry_avg_year'].tolist())
X = y2017.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.117744, mean2=-0.114378
variance1=0.030029, variance2=0.028428
ADF Statistic: -27.227921
p-value: 0.000000
Critical Values:
	1%: -3.434
	5%: -2.863
	10%: -2.568


In [None]:
# ADF test and split-half mean/variance check for the 2016 industry_avg_year values.
from statsmodels.tsa.stattools import adfuller

y2016 = pd.Series(ind.loc[ind['Year'] == 2016, 'industry_avg_year'].tolist())
X = y2016.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.125713, mean2=-0.117000
variance1=0.035538, variance2=0.033986
ADF Statistic: -38.943780
p-value: 0.000000
Critical Values:
	1%: -3.434
	5%: -2.863
	10%: -2.568


In [None]:
# ADF test and split-half mean/variance check for the 2015 industry_avg_year values.
from statsmodels.tsa.stattools import adfuller

y2015 = pd.Series(ind.loc[ind['Year'] == 2015, 'industry_avg_year'].tolist())
X = y2015.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.125579, mean2=-0.115032
variance1=0.038191, variance2=0.033880
ADF Statistic: -20.655671
p-value: 0.000000
Critical Values:
	1%: -3.435
	5%: -2.863
	10%: -2.568


Summary: For each year from 2015 to 2018, the industry average is stationary, and the model we used, which includes the industry average, is predictable.

Let's take a look at top five industries

In [None]:
# Non-null sales-intensity observations per industry, sorted ascending
# (the largest industries end up at the bottom of the listing).
(
    ind
    .groupby('Industry(Exiobase)')['EnvironmentalIntensity(Sales)']
    .count()
    .sort_values()
)

Industry(Exiobase)
Cultivation of cereal grains nec                                                                                          1
Forestry, logging and related service activities (02)                                                                     5
Sea and coastal water transport                                                                                           6
Education (80)                                                                                                            6
Production of electricity by petroleum and other oil derivatives                                                         12
Mining of coal and lignite; extraction of peat (10)                                                                      15
Manufacture of tobacco products (16)                                                                                     22
Copper production                                                                                                

Top five industries:

Retail trade, except of motor vehicles and motorcycles; repair of personal and household goods (52)                    
Real estate activities(70)                                                                                    
Construction (45)                                                 
Manufacture of electrical machinery and apparatus n.e.c. (31)                                                 
Financial intermediation, except insurance and pension funding (65) 

In [None]:
# Industries kept for the sub-sample analysis.
# NOTE(review): 'Real estate activities(70)' has no space before '(70)', unlike the
# other labels — confirm it matches the spelling used in ind['Industry(Exiobase)'],
# otherwise that industry is silently excluded.
listind = [
    'Retail trade, except of motor vehicles and motorcycles; repair of personal and household goods (52)',
    'Real estate activities(70)',
    'Construction (45)',
    'Manufacture of electrical machinery and apparatus n.e.c. (31)',
    'Financial intermediation, except insurance and pension funding (65)',
]
# A single membership test replaces five chained equality comparisons, so the
# filter can no longer drift out of sync with listind.
num_order_new = ind[ind['Industry(Exiobase)'].isin(listind)]
num_order_new



Unnamed: 0,ISIN,Year,CompanyName,Country,Industry(Exiobase),EnvironmentalIntensity(Sales),EnvironmentalIntensity(OpInc),TotalEnvironmentalCost,WorkingCapacity,FishProductionCapacity,CropProductionCapacity,MeatProductionCapacity,Biodiversity,AbioticResources,Waterproductioncapacity(Drinkingwater&IrrigationWater),WoodProductionCapacity,SDG1.5,SDG2.1,SDG2.2,SDG2.3,SDG2.4,SDG3.3,SDG3.4,SDG3.9,SDG6,SDG12.2,SDG14.1,SDG14.2,SDG14.3,SDG14.c,SDG15.1,SDG15.2,SDG15.5,%Imputed,Env_intensity,industry_avg,industry_avg_year,Industry_indicator_year,Environmental_Growth
1,GB00B1YW4409,2010,3I GROUP PLC,United Kingdom,"Financial intermediation, except insurance and...",-0.0012,-0.11%,-1055812,-1032103,-277,-13751,-3221,-47,-562,-5953,102,-463300,-295103,-294949,-3438,-3438,-47957,59044,-74,-5953,-562,-4,0,-133,-4,51,51,-43,10%,-0.0012,-0.028537,-0.006402,1,
2,GB00B1YW4409,2011,3I GROUP PLC,United Kingdom,"Financial intermediation, except insurance and...",-0.0016,-0.16%,-961875,-940402,-246,-12525,-2935,-42,-424,-5378,77,-421928,-264714,-264579,-3131,-3131,-42961,44515,-56,-5378,-424,-3,0,-119,-3,38,38,-39,9%,-0.0016,-0.028537,-0.009838,1,33.333333
3,GB00B1YW4409,2012,3I GROUP PLC,United Kingdom,"Financial intermediation, except insurance and...",-0.0015,,-722999,-706893,-183,-9414,-2206,-32,-295,-4030,54,-317104,-197859,-197760,-2354,-2354,-32095,30960,-39,-4030,-295,-2,0,-89,-2,27,27,-30,8%,-0.0015,-0.028537,-0.024437,1,-6.250000
50,DE0005408116,2012,AAREAL BANK AG,Germany,"Financial intermediation, except insurance and...",-0.0010,-0.69%,-1615657,-1540246,-504,-20570,-4810,-79,-1932,-47899,383,-693698,-498714,-498400,-5143,-5143,-81852,217411,-341,-47899,-1932,-13,-2,-233,-16,192,192,-66,20%,-0.0010,-0.028537,-0.024437,1,
51,DE0005408116,2013,AAREAL BANK AG,Germany,"Financial intermediation, except insurance and...",-0.0011,-0.56%,-1561584,-1483364,-469,-19802,-4632,-74,-1655,-51916,328,-667625,-469884,-469596,-4950,-4950,-76994,186258,-292,-51916,-1655,-11,-2,-219,-13,164,164,-63,19%,-0.0011,-0.028537,-0.025627,1,10.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14485,KYG989221000,2016,ZHEN DING TECHNOLOGY HOLDING LIMITED,Taiwan,Manufacture of electrical machinery and appara...,-0.0873,-155.74%,-222083216,-204825973,-47125,-2713781,-635954,-8532,-8129,-13843967,246,-91295951,-53183570,-53158871,-678445,-678445,-8572045,-630018,-1955,-13843967,-8129,-27,-61,-23440,-32,123,123,-8504,1%,-0.0873,-0.067427,-0.073788,-1,-5.825243
14486,KYG989221000,2017,ZHEN DING TECHNOLOGY HOLDING LIMITED,Taiwan,Manufacture of electrical machinery and appara...,-0.0603,-76.11%,-221429493,-203149289,-46856,-2686271,-629599,-8463,-11727,-14897562,273,-90384665,-52689302,-52665410,-671568,-671568,-8493012,-910285,-2820,-14897562,-11727,-36,-88,-23254,-43,136,136,-8426,1%,-0.0603,-0.067427,-0.068294,1,-30.927835
14487,KYG989221000,2018,ZHEN DING TECHNOLOGY HOLDING LIMITED,Taiwan,Manufacture of electrical machinery and appara...,-0.0615,-48.85%,-236938822,-217968519,-51373,-2886977,-676611,-9212,-12305,-15337583,3756,-97158892,-57405552,-57379583,-721744,-721744,-9265219,1097855,-2959,-15337583,-12305,-155,-92,-25366,-185,1878,1878,-9054,3%,-0.0615,-0.067427,-0.076812,1,1.990050
14488,KYG989221000,2019,ZHEN DING TECHNOLOGY HOLDING LIMITED,Taiwan,Manufacture of electrical machinery and appara...,-0.0602,-49.99%,-241827228,-223793936,-52466,-2966025,-694995,-9454,-12822,-14301445,3914,-99793897,-58835287,-58808848,-741506,-741506,-9494000,936891,-3084,-14301445,-12822,-162,-96,-25898,-193,1957,1957,-9290,3%,-0.0602,-0.067427,-0.075113,1,-2.113821


In [None]:
# ADF test and split-half mean/variance check for the 2018 sales-intensity
# values of the industries selected in num_order_new.
from statsmodels.tsa.stattools import adfuller

y2018 = pd.Series(
    num_order_new.loc[num_order_new['Year'] == 2018, 'EnvironmentalIntensity(Sales)'].tolist()
)
X = y2018.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.041328, mean2=-0.039877
variance1=0.010222, variance2=0.005468
ADF Statistic: -21.045903
p-value: 0.000000
Critical Values:
	1%: -3.445
	5%: -2.868
	10%: -2.570


In [None]:
# ADF test and split-half mean/variance check for the 2017 sales-intensity
# values of the industries selected in num_order_new.
from statsmodels.tsa.stattools import adfuller

y2017 = pd.Series(
    num_order_new.loc[num_order_new['Year'] == 2017, 'EnvironmentalIntensity(Sales)'].tolist()
)
X = y2017.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.041998, mean2=-0.035402
variance1=0.010698, variance2=0.002860
ADF Statistic: -5.443552
p-value: 0.000003
Critical Values:
	1%: -3.446
	5%: -2.868
	10%: -2.570


In [None]:
# ADF test and split-half mean/variance check for the 2016 sales-intensity
# values of the industries selected in num_order_new.
from statsmodels.tsa.stattools import adfuller

y2016 = pd.Series(
    num_order_new.loc[num_order_new['Year'] == 2016, 'EnvironmentalIntensity(Sales)'].tolist()
)
X = y2016.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.044473, mean2=-0.037712
variance1=0.011978, variance2=0.002783
ADF Statistic: -20.528414
p-value: 0.000000
Critical Values:
	1%: -3.446
	5%: -2.868
	10%: -2.570


In [None]:
# ADF test and split-half mean/variance check for the 2015 sales-intensity
# values of the industries selected in num_order_new.
from statsmodels.tsa.stattools import adfuller

y2015 = pd.Series(
    num_order_new.loc[num_order_new['Year'] == 2015, 'EnvironmentalIntensity(Sales)'].tolist()
)
X = y2015.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.043269, mean2=-0.028244
variance1=0.012533, variance2=0.019730
ADF Statistic: -18.508807
p-value: 0.000000
Critical Values:
	1%: -3.449
	5%: -2.870
	10%: -2.571


Let's check the top 3 industries

In [None]:
# Narrow the sample to three industries for the cells that follow.
selected_industries = [
    'Construction (45)',
    'Financial intermediation, except insurance and pension funding (65)',
    'Manufacture of electrical machinery and apparatus n.e.c. (31)',
]
num_order_new = ind[ind['Industry(Exiobase)'].isin(selected_industries)]

In [None]:
# ADF test and split-half mean/variance check for the 2018 sales-intensity
# values of the industries currently held in num_order_new.
from statsmodels.tsa.stattools import adfuller

y2018 = pd.Series(
    num_order_new.loc[num_order_new['Year'] == 2018, 'EnvironmentalIntensity(Sales)'].tolist()
)
X = y2018.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.045647, mean2=-0.045448
variance1=0.012042, variance2=0.006497
ADF Statistic: -19.134100
p-value: 0.000000
Critical Values:
	1%: -3.449
	5%: -2.870
	10%: -2.571


In [None]:
# ADF test and split-half mean/variance check for the 2017 sales-intensity
# values of the industries currently held in num_order_new.
from statsmodels.tsa.stattools import adfuller

y2017 = pd.Series(
    num_order_new.loc[num_order_new['Year'] == 2017, 'EnvironmentalIntensity(Sales)'].tolist()
)
X = y2017.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.039995, mean2=-0.040480
variance1=0.003754, variance2=0.003352
ADF Statistic: -19.354295
p-value: 0.000000
Critical Values:
	1%: -3.448
	5%: -2.870
	10%: -2.571


In [None]:
# ADF test and split-half mean/variance check for the 2016 sales-intensity
# values of the industries currently held in num_order_new.
from statsmodels.tsa.stattools import adfuller

y2016 = pd.Series(
    num_order_new.loc[num_order_new['Year'] == 2016, 'EnvironmentalIntensity(Sales)'].tolist()
)
X = y2016.values
result = adfuller(X)

# Split the sample in the middle and summarise each half.
split = round(len(X) / 2)
X1 = X[:split]
X2 = X[split:]
mean1, var1 = X1.mean(), X1.var()
mean2, var2 = X2.mean(), X2.var()

print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

mean1=-0.048769, mean2=-0.043516
variance1=0.014048, variance2=0.003285
ADF Statistic: -19.089448
p-value: 0.000000
Critical Values:
	1%: -3.449
	5%: -2.870
	10%: -2.571


Summary: All of the data that we used is stationary.

# DistilBERT

Load, merge and clean the data

In [None]:
# Read the three input tables: stock_des, df and df1 (in that order).
stock_des, df, df1 = (
    pd.read_csv(csv_path)
    for csv_path in ('202 companies_des.csv', 'IEV_holdings-1.csv', 'EE-ISIN_merged.csv')
)

In [None]:
# Drop the 'Unnamed: 0' column (presumably an index saved with the CSV — confirm).
# The target is passed by keyword because pandas 2.x no longer accepts a positional
# axis; errors='ignore' lets the cell be re-run after the column is already gone.
df1 = df1.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Keep only the rows whose Year is 2019.
is_2019 = df1['Year'].eq(2019)
df1 = df1.loc[is_2019]
df1

Unnamed: 0,ISIN,Year,CompanyName,Country,Industry(Exiobase),EnvironmentalIntensity(Sales),EnvironmentalIntensity(OpInc),TotalEnvironmentalCost,WorkingCapacity,FishProductionCapacity,CropProductionCapacity,MeatProductionCapacity,Biodiversity,AbioticResources,Waterproductioncapacity(Drinkingwater&IrrigationWater),WoodProductionCapacity,SDG1.5,SDG2.1,SDG2.2,SDG2.3,SDG2.4,SDG3.3,SDG3.4,SDG3.9,SDG6,SDG12.2,SDG14.1,SDG14.2,SDG14.3,SDG14.c,SDG15.1,SDG15.2,SDG15.5,%Imputed,gvkey,fyear,datadate,at,isin,conm,fic,sic
0,MYL1818OO003,2019,BURSA MALAYSIA BHD,Malaysia,Activities auxiliary to financial intermediati...,-0.017,-3.47%,-1968379,-1924910,-451,-25349,-5938,-81,-168,-11502,20,-852646,-502708,-502460,-6337,-6337,-81118,-4791,-27,-11502,-168,-1,-1,-222,-2,10,10,-79,4%,272691,2019,20191231,2321.040,MYL1818OO003,BURSA MALAYSIA BHD,MYS,6200.0
1,GB0031638363,2019,INTERTEK GROUP PLC,United Kingdom,Activities auxiliary to financial intermediati...,-0.015,-9.49%,-60599272,-59281663,-13774,-788289,-184802,-2487,-3804,-324960,508,-26533166,-15557810,-15550827,-197072,-197072,-2509207,284215,-703,-324960,-3804,-17,-4,-6861,-20,254,254,-2470,1%,252384,2019,20191231,2818.400,GB0031638363,INTERTEK GROUP PLC,GBR,8700.0
2,ZAE000079711,2019,JSE LIMITED,South Africa,Activities auxiliary to financial intermediati...,-0.015,,-2290124,-2239814,-510,-29662,-6938,-93,-901,-12200,-6,-995881,-576811,-576488,-7415,-7415,-92910,-19470,-277,-12200,-901,0,-1,-253,0,-3,-3,-93,2%,278391,2019,20191231,40227.215,ZAE000079711,JSE LIMITED,ZAF,6211.0
3,FR0006174348,2019,BUREAU VERITAS SA,France,Activities auxiliary to financial intermediati...,-0.007,-5.10%,-39978650,-39107612,-9330,-520701,-121953,-1671,-4116,-214438,1172,-17514837,-10430409,-10425281,-130175,-130175,-1684676,561195,-577,-214438,-4116,-38,-9,-4607,-45,586,586,-1633,3%,286961,2019,20191231,7049.100,FR0006174348,BUREAU VERITAS SA,FRA,8700.0
4,GB0007370074,2019,RICARDO PLC,United Kingdom,Activities auxiliary to financial intermediati...,-0.007,-7.27%,-3247235,-3176408,-753,-42228,-9899,-135,-468,-17406,63,-1421576,-842731,-842343,-10557,-10557,-136060,34998,-87,-17406,-468,-2,0,-373,-2,31,31,-133,3%,221859,2019,20190630,371.900,GB0007370074,RICARDO PLC,GBR,8711.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1057,ZAE000216537,2019,BID CORPORATION LTD,South Africa,"Wholesale trade and commission trade, except o...",-0.014,-28.02%,-124047261,-121344740,-28962,-1603616,-374519,-5177,-35468,-658523,3743,-53787360,-32057343,-32036952,-400904,-400904,-5177518,533888,-10313,-658523,-35468,-167,-103,-14129,-200,1871,1871,-5007,6%,321670,2019,20190630,64931.978,ZAE000216537,BID CORP(NEW),ZAF,6799.0
1058,ZAE000058517,2019,SPAR GROUP LIMITED,South Africa,"Wholesale trade and commission trade, except o...",-0.003,-49.56%,-24127362,-23594642,-6350,-305744,-70543,-1082,-27940,-124010,2948,-10145768,-6518174,-6510343,-76436,-76436,-1059431,420567,-8124,-124010,-27940,-132,-81,-2898,-157,1474,1474,-948,20%,271087,2019,20190930,34052.900,ZAE000058517,SPAR GROUP LTD,ZAF,5140.0
1059,CNE100000FN7,2019,SINOPHARM GROUP CO LTD,China,"Wholesale trade and commission trade, except o...",-0.001,-1.85%,-46263408,-42158085,-10358,-562862,-131780,-1839,-16404,-3384365,2285,-18933076,-11481810,-11476273,-140715,-140715,-1857658,1173028,-694,-3384365,-16404,-77,-13,-5068,-92,1142,1142,-1761,1%,292783,2019,20191231,269888.371,CNE100000FN7,SINOPHARM GROUP CO,CHN,5122.0
1060,TW0005902001,2019,"Tait Marketing & Distribution Co., Ltd.",Taiwan,"Wholesale trade and commission trade, except o...",-0.001,-1.61%,-35015,-34239,-11,-456,-105,-2,-25,-185,8,-15168,-10607,-10595,-114,-114,-1736,3531,-3,-185,-25,0,0,-5,0,4,4,-1,22%,279003,2019,20191231,1076.614,TW0005902001,TAIT MARKETING & DIST CO LTD,TWN,5140.0


In [None]:
# Join df with df1 on their shared ISIN column (default inner join).
df2 = df.merge(df1, on='ISIN')
df2

Unnamed: 0,Ticker,Name,Sector,Asset Class,Market Value,Weight (%),Notional Value,Shares,CUSIP,ISIN,SEDOL,Price,Location,Exchange,Currency,FX Rate,Market Currency,Accrual Date,Year,CompanyName,Country,Industry(Exiobase),EnvironmentalIntensity(Sales),EnvironmentalIntensity(OpInc),TotalEnvironmentalCost,WorkingCapacity,FishProductionCapacity,CropProductionCapacity,MeatProductionCapacity,Biodiversity,AbioticResources,Waterproductioncapacity(Drinkingwater&IrrigationWater),WoodProductionCapacity,SDG1.5,SDG2.1,SDG2.2,SDG2.3,SDG2.4,SDG3.3,SDG3.4,SDG3.9,SDG6,SDG12.2,SDG14.1,SDG14.2,SDG14.3,SDG14.c,SDG15.1,SDG15.2,SDG15.5,%Imputed,gvkey,fyear,datadate,at,isin,conm,fic,sic
0,NESN,NESTLE SA,Consumer Staples,Equity,64181630.45,3.32,64181630.45,505570.00,S71238703,CH0038863350,7123870,126.95,Switzerland,SIX Swiss Exchange,USD,0.90,CHF,-,2019,NESTLE S.A.,Switzerland,Processing of Food products nec,-0.016,-9.75%,-1527139399,-1405885897,-416410,-18578077,-4368093,-65196,-113252,-97865932,153457,-628976294,-420810968,-420563568,-4644519,-4644519,-68675754,119281774,-6492,-97865932,-113252,-5213,-135,-201854,-6213,76729,76729,-59918,0%,16603,2019,20191231,127940.000,CH0038863350,NESTLE SA/AG,CHE,2000.0
1,ROG,ROCHE HOLDING PAR AG,Health Care,Equity,46859542.94,2.42,46859542.94,123210.00,S71103881,CH0012032048,7110388,380.32,Switzerland,SIX Swiss Exchange,USD,0.90,CHF,-,2019,ROCHE HOLDING AKTIENGESELLSCHAFT,Switzerland,"Manufacture of medical, precision and optical ...",-0.003,-0.84%,-160446737,-152869527,-35079,-1974118,-461147,-6303,-1149997,-3950618,52,-66193394,-38441169,-38420015,-493529,-493529,-6193341,-4364957,-722554,-3950618,-1149997,-139,-317,-16901,-165,26,26,-6163,6%,25648,2019,20191231,83091.000,CH0012032048,ROCHE HOLDING AG,CHE,2834.0
2,NOVN,NOVARTIS AG,Health Care,Equity,40415079.41,2.09,40415079.41,433340.00,S71030654,CH0012005267,7103065,93.26,Switzerland,SIX Swiss Exchange,USD,0.90,CHF,-,2019,NOVARTIS AG,Switzerland,"Manufacture of medical, precision and optical ...",-0.007,-3.18%,-348286772,-314529861,-74014,-4143676,-967262,-13198,-903525,-27659340,4103,-138890514,-81935724,-81878242,-1035919,-1035919,-13219836,-1114082,-567694,-27659340,-903525,-246,-249,-36343,-294,2051,2051,-12948,3%,101310,2019,20191231,118370.000,CH0012005267,NOVARTIS AG,CHE,2834.0
3,AZN,ASTRAZENECA PLC,Health Care,Equity,26955662.18,1.39,26955662.18,230130.00,S09895293,GB0009895292,989529,117.13,United Kingdom,London Stock Exchange,USD,0.71,GBP,-,2019,ASTRAZENECA PLC,United Kingdom,"Manufacture of medical, precision and optical ...",-0.005,-3.98%,-134985600,-129784288,-39655,-1688455,-393446,-6368,-573484,-2526476,26573,-56694491,-39535401,-39506085,-422114,-422114,-6472569,11948529,-781929,-2526476,-573484,-1050,-312,-18122,-1251,13286,13286,-5305,22%,28272,2019,20191231,61377.000,GB0009895292,ASTRAZENECA PLC,GBR,2834.0
4,SIE,SIEMENS N AG,Industrials,Equity,23306443.99,1.21,23306443.99,141729.00,S57279739,DE0007236101,5727973,164.44,Germany,Xetra,USD,0.82,EUR,-,2019,SIEMENS AG,Germany,Activities of membership organisation n.e.c. (91),-0.005,-5.49%,-431933738,-405755192,-97827,-5290306,-1228970,-16859,-600004,-18945339,758,-176454055,-102930381,-102840656,-1322577,-1322577,-16586392,-10700011,-167303,-18945339,-600004,-206,-1140,-46960,-246,379,379,-16650,7%,19349,2019,20190930,150248.000,DE0007236101,SIEMENS AG,DEU,9997.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,GALP,GALP ENERGIA SGPS SA,Energy,Equity,939164.58,0.05,939164.58,80006.00,-,PTGAL0AM0009,B1FW751,11.74,Portugal,Nyse Euronext - Euronext Lisbon,USD,0.82,EUR,-,2019,"GALP ENERGIA SGPS, S.A.",Portugal,Extraction of crude petroleum and services rel...,-0.057,-79.38%,-1089402909,-975645972,-256476,-12901913,-3033548,-42690,-,-97567302,44991,-436137236,-272357396,-272229158,-3225478,-3225478,-44186427,39651504,-,-97567302,-,-1522,-,-126443,-1814,22496,22496,-41149,0%,279448,2019,20191231,13770.000,PTGAL0AM0009,GALP ENERGIA SGPS SA,PRT,2911.0
150,DLG,DIRECT LINE INSURANCE PLC,Financials,Equity,919839.00,0.05,919839.00,223040.00,-,GB00BY9D0Y18,BY9D0Y1,4.12,United Kingdom,London Stock Exchange,USD,0.71,GBP,-,2019,DIRECT LINE INSURANCE GROUP PLC,United Kingdom,"Insurance and pension funding, except compulso...",-0.001,-0.55%,-3981590,-3892996,-1032,-52067,-12193,-177,-2084,-21437,395,-1753332,-1111209,-1110653,-13017,-13017,-180503,224126,-168,-21437,-2084,-13,-1,-498,-16,197,197,-164,10%,268159,2019,20191231,9434.200,GB00BY9D0Y18,DIRECT LINE INSURANCE GRP,GBR,6331.0
151,SECU B,SECURITAS B,Industrials,Equity,911513.62,0.05,911513.62,56773.00,S55540413,SE0000163594,5554041,16.06,Sweden,Nasdaq Omx Nordic,USD,8.26,SEK,-,2019,SECURITAS AB,Sweden,Other service activities (93),-0.004,-9.11%,-50877031,-49705407,-11746,-660208,-154717,-2105,-7830,-335967,949,-22217949,-13159217,-13152988,-165052,-165052,-2124359,460741,-2332,-335967,-7830,-35,-9,-5817,-42,475,475,-2069,3%,104981,2019,20191231,62190.000,SE0000163594,SECURITAS AB,SWE,7381.0
152,CLN,CLARIANT AG,Materials,Equity,862542.17,0.04,862542.17,39804.00,S71139901,CH0012142631,7113990,21.67,Switzerland,SIX Swiss Exchange,USD,0.90,CHF,-,2019,CLARIANT AG,Switzerland,Chemicals nec,-0.067,-72.91%,-303725335,-244189831,-61797,-3279867,-755897,-10775,-4931,-55439989,17754,-108643559,-67205147,-67103952,-819967,-819967,-10884662,7221119,-400,-55439989,-4931,-639,-81,-30023,-762,8877,8877,-10128,2%,206489,2019,20191231,7979.000,CH0012142631,CLARIANT AG,CHE,2860.0


In [None]:
# Step 1: combine the two company-level tables on their shared ISIN column.
# Step 2: attach the text descriptions by ticker symbol, keeping only rows
#         present on both sides (inner join).
# NOTE(review): in the displayed result the 'Ticker' join pairs some rows with a
# different company's description (e.g. ROG / Roche shows the Rogers Corporation
# text, AI / Air Liquide shows C3.ai, and both SAN rows show Banco Santander).
# Ticker symbols do not appear to be unique across exchanges here — confirm and
# consider a stricter join key before relying on `description`.
df2 = df.merge(df1, on='ISIN')
df2 = df2.merge(stock_des, how='inner', on='Ticker')
df2

Unnamed: 0,Ticker,Name,Sector,Asset Class,Market Value,Weight (%),Notional Value,Shares,CUSIP,ISIN,SEDOL,Price,Location,Exchange,Currency,FX Rate,Market Currency,Accrual Date,Year,CompanyName,Country,Industry(Exiobase),EnvironmentalIntensity(Sales),EnvironmentalIntensity(OpInc),TotalEnvironmentalCost,WorkingCapacity,FishProductionCapacity,CropProductionCapacity,MeatProductionCapacity,Biodiversity,AbioticResources,Waterproductioncapacity(Drinkingwater&IrrigationWater),WoodProductionCapacity,SDG1.5,SDG2.1,SDG2.2,SDG2.3,SDG2.4,SDG3.3,SDG3.4,SDG3.9,SDG6,SDG12.2,SDG14.1,SDG14.2,SDG14.3,SDG14.c,SDG15.1,SDG15.2,SDG15.5,%Imputed,gvkey,fyear,datadate,at,isin,conm,fic,sic,description
0,ROG,ROCHE HOLDING PAR AG,Health Care,Equity,46859542.94,2.42,46859542.94,123210.0,S71103881,CH0012032048,7110388,380.32,Switzerland,SIX Swiss Exchange,USD,0.9,CHF,-,2019,ROCHE HOLDING AKTIENGESELLSCHAFT,Switzerland,"Manufacture of medical, precision and optical ...",-0.003,-0.84%,-160446737,-152869527,-35079,-1974118,-461147,-6303,-1149997,-3950618,52,-66193394,-38441169,-38420015,-493529,-493529,-6193341,-4364957,-722554,-3950618,-1149997,-139,-317,-16901,-165,26,26,-6163,6%,25648,2019,20191231,83091.0,CH0012032048,ROCHE HOLDING AG,CHE,2834.0,"Rogers Corporation designs, develops, manufact..."
1,NOVN,NOVARTIS AG,Health Care,Equity,40415079.41,2.09,40415079.41,433340.0,S71030654,CH0012005267,7103065,93.26,Switzerland,SIX Swiss Exchange,USD,0.9,CHF,-,2019,NOVARTIS AG,Switzerland,"Manufacture of medical, precision and optical ...",-0.007,-3.18%,-348286772,-314529861,-74014,-4143676,-967262,-13198,-903525,-27659340,4103,-138890514,-81935724,-81878242,-1035919,-1035919,-13219836,-1114082,-567694,-27659340,-903525,-246,-249,-36343,-294,2051,2051,-12948,3%,101310,2019,20191231,118370.0,CH0012005267,NOVARTIS AG,CHE,2834.0,"Novan, Inc., a clinical development-stage biot..."
2,AZN,ASTRAZENECA PLC,Health Care,Equity,26955662.18,1.39,26955662.18,230130.0,S09895293,GB0009895292,989529,117.13,United Kingdom,London Stock Exchange,USD,0.71,GBP,-,2019,ASTRAZENECA PLC,United Kingdom,"Manufacture of medical, precision and optical ...",-0.005,-3.98%,-134985600,-129784288,-39655,-1688455,-393446,-6368,-573484,-2526476,26573,-56694491,-39535401,-39506085,-422114,-422114,-6472569,11948529,-781929,-2526476,-573484,-1050,-312,-18122,-1251,13286,13286,-5305,22%,28272,2019,20191231,61377.0,GB0009895292,ASTRAZENECA PLC,GBR,2834.0,"AstraZeneca PLC discovers, develops, manufactu..."
3,SAN,SANOFI SA,Health Care,Equity,21623837.19,1.12,21623837.19,201397.0,S56717358,FR0000120578,5671735,107.37,France,Nyse Euronext - Euronext Paris,USD,0.82,EUR,-,2019,SANOFI S.A.,France,"Manufacture of medical, precision and optical ...",-0.01,-6.09%,-434457155,-393211644,-77490,-4399917,-1003233,-13907,-451590,-35245079,-54294,-143654775,-75667077,-75484826,-1099979,-1099979,-12050491,-89489470,-108652,-35245079,-451590,-461,-3419,-33072,-549,-27147,-27147,-13441,25%,101204,2019,20191231,112736.0,FR0000120578,SANOFI,FRA,2834.0,"Banco Santander, S.A., together with its subsi..."
4,SAN,BANCO SANTANDER SA,Financials,Equity,12744703.6,0.66,12744703.6,3038453.0,S57059461,ES0113900J37,5705946,4.19,Spain,Bolsa De Madrid,USD,0.82,EUR,-,2019,BANCO SANTANDER SA,Spain,"Financial intermediation, except insurance and...",-0.001,-0.70%,-99333328,-92801728,-25942,-1243952,-290468,-4367,-52104,-4928087,13320,-41800811,-27420552,-27402198,-310988,-310988,-4466925,7367610,-4289,-4928087,-52104,-461,-47,-12357,-549,6660,6660,-3901,13%,14140,2019,20191231,1522695.0,ES0113900J37,BANCO SANTANDER SA,ESP,6020.0,"Banco Santander, S.A., together with its subsi..."
5,TTE,TOTALENERGIES,Energy,Equity,21247071.1,1.1,21247071.1,438365.0,-,FR0000120271,B15C557,48.47,France,Nyse Euronext - Euronext Paris,USD,0.82,EUR,-,2019,TOTAL SA,France,Extraction of crude petroleum and services rel...,-0.084,-86.39%,-14772813804,-14478449101,-3769054,-169082300,-38943242,-606970,-13813139,-68456247,306249,-5597249540,-3534954350,-3529245416,-42270575,-42270575,-573515105,-1365534096,-3370723,-68456247,-13813139,-75065,-102420,-1642356,-89472,153125,153125,-530974,16%,24625,2019,20191231,273294.0,FR0000120271,TOTAL SE,FRA,2911.0,TotalEnergies SE operates as an integrated oil...
6,GSK,GLAXOSMITHKLINE PLC,Health Care,Equity,17365446.14,0.9,17365446.14,879169.0,S09252883,GB0009252882,925288,19.75,United Kingdom,London Stock Exchange,USD,0.71,GBP,-,2019,GLAXOSMITHKLINE PLC,United Kingdom,"Manufacture of medical, precision and optical ...",-0.009,-3.33%,-385312247,-370745733,-102404,-4865743,-1135847,-17137,-1015460,-7476974,47052,-163432220,-106636776,-106565263,-1216436,-1216436,-17363406,21016285,-1384550,-7476974,-1015460,-1859,-552,-48182,-2215,23526,23526,-15256,16%,5180,2019,20191231,79692.0,GB0009252882,GLAXOSMITHKLINE PLC,GBR,2834.0,"GlaxoSmithKline plc, together with its subsidi..."
7,RIO,RIO TINTO PLC,Materials,Equity,16138658.65,0.83,16138658.65,189144.0,S07188758,GB0007188757,718875,85.33,United Kingdom,London Stock Exchange,USD,0.71,GBP,-,2019,RIO TINTO PLC,United Kingdom,Quarrying of sand and clay,-0.144,-43.41%,-6462466711,-5904798809,-2407048,-73707062,-17463797,-317956,-30954637,-434498443,1681042,-2528247789,-2094113630,-2092789063,-18426766,-18426766,-347251425,1104934479,-2859512,-434498443,-30954637,-67037,-14558,-1102616,-79904,840521,840521,-250087,4%,19565,2019,20191231,87802.0,GB0007188757,RIO TINTO GROUP,GBR,1000.0,"Rio Tinto Group engages in exploring, mining, ..."
8,AI,LAIR LIQUIDE SOCIETE ANONYME POUR,Materials,Equity,14292701.76,0.74,14292701.76,83090.0,-,FR0000120073,B1YXBJ7,172.01,France,Nyse Euronext - Euronext Paris,USD,0.82,EUR,-,2019,AIR LIQUIDE,France,Chemicals nec,-0.366,-224.97%,-9004051487,-8426185675,-2002880,-111825588,-26225271,-356440,-1022461,-436571184,138011,-3766240396,-2234999726,-2233985494,-27956397,-27956397,-360882027,86867711,-85778,-436571184,-1022461,-5255,-1060,-993652,-6263,69006,69006,-351120,1%,101202,2019,20191231,43666.5,FR0000120073,L'AIR LIQUIDE SA,FRA,2810.0,"C3.ai, Inc. operates as an enterprise artifici..."
9,DTE,DEUTSCHE TELEKOM N AG,Communication,Equity,12240959.85,0.63,12240959.85,568301.0,S58423591,DE0005557508,5842359,21.54,Germany,Xetra,USD,0.82,EUR,-,2019,DEUTSCHE TELEKOM AG,Germany,Post and telecommunications (64),-0.008,-6.45%,-678082340,-662403185,-157402,-8812866,-2064549,-28179,-105742,-4526646,16228,-296493439,-176025217,-175942301,-2203216,-2203216,-28422975,7948136,-17155,-4526646,-105742,-574,-168,-77772,-684,8114,8114,-27598,3%,221616,2019,20191231,170672.0,DE0005557508,DEUTSCHE TELEKOM,DEU,4813.0,DTE Energy Company engages in the utility oper...


Create a binary variable that is 1 if the company's environmental intensity (scaled by sales) is above the sample median and 0 otherwise.

This is the dependent variable (label) that we'll try to predict.

In [None]:
# Label = 1 when EnvironmentalIntensity(Sales) is strictly greater than its median, else 0.
# NOTE(review): the values shown for this column are negative, so "greater than the
# median" selects the less-negative half — confirm this matches the intended
# meaning of HIGH_EI.
ei_sales = df2['EnvironmentalIntensity(Sales)']
df2['HIGH_EI'] = (ei_sales > ei_sales.median()).astype(int)

In [None]:
!pip install transformers
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 3.0MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 16.7MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |█████

Load a pre-trained DistilBERT model and its tokenizer.

In [None]:
# Which checkpoint to use and which transformers classes load it.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

# Instantiate the tokenizer and the model from the named pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Tokenize the textual data for DistilBERT.

In [None]:
# Encode every description into a list of DistilBERT token ids, including the
# special tokens the model expects.
# truncation=True caps each sequence at the tokenizer's maximum model length, so
# an unusually long description cannot exceed what the model can process;
# descriptions already within the limit are encoded exactly as before.
tokenized = df2['description'].apply(
    lambda text: tokenizer.encode(text, add_special_tokens=True, truncation=True)
)

Pad all lists of tokenized values to the same size

In [None]:
# Longest token sequence in the sample (0 when there are no sequences).
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Right-pad every sequence with 0s up to max_len and stack into a 2-D array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [None]:
# (number of descriptions, padded sequence length)
padded.shape

(41, 463)

Create attention mask variable for BERT to ignore (mask) the padding when it's processing its input.

In [None]:
# 1 where `padded` holds a non-zero token id, 0 where it holds the padding value 0.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(41, 463)

We run the pretrained DistilBERT model on the prepared predictor and keep the result in last_hidden_states variable.

In [None]:
# Convert the NumPy arrays into tensors for the model.
attention_mask = torch.tensor(attention_mask)
input_ids = torch.tensor(padded)

# Forward pass only: gradients are not needed because the model is not being trained.
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

Keep the hidden state of the first token (the `[CLS]` position) of each description as its feature vector, and assign the outcome variable to labels.

In [None]:
# Element 0 of the model output is indexed as (row, token position, hidden dim);
# keep only token position 0 of every row and convert to a NumPy array.
first_token_states = last_hidden_states[0][:, 0, :]
features = first_token_states.numpy()

# Target variable: the binary HIGH_EI column.
labels = df2['HIGH_EI']

Split the data in train and test subsets, train the Logistic Regression on train set and evaluate its accuracy on the test set.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Fit a logistic regression on the training rows, then report accuracy on the held-out rows.
lr_clf = LogisticRegression(max_iter=5000).fit(train_features, train_labels)
test_accuracy = lr_clf.score(test_features, test_labels)
print(test_accuracy)

0.6666666666666666


Summary: On the held-out test set, our model correctly classifies about 67% of companies as high or low environmental intensity.

In [None]:
# Display the held-out labels produced by train_test_split (a slice of df2['HIGH_EI']).
test_labels

24    0
13    0
8     0
25    1
4     1
40    0
19    1
39    0
29    1
Name: HIGH_EI, dtype: int64