Load necessary libraries. 

In [2]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import os

Load the tickers data. 

In [5]:
tickers = pd.read_csv('Companydescription.csv') # load tickers.csv 
tickers

Unnamed: 0,Ticker,Description
0,1COV,
1,A,
2,AAL,
3,AAL.L,
4,AAP,
...,...,...
863,ZBH,
864,ZBRA,
865,ZION,
866,ZTS,


In [8]:
boolean = tickers['Ticker'].duplicated().any() 
boolean

False

Loop through the tickers and get for each ticker a respective URL and business description. Note that for example purposes `ticker` variable is limited to the first 5 (`[:5]`) items, remove it if you want to run the loop for the whole sample. The loop for all observations might take 30+ minutes.

In [9]:
# Create a loop to store URLs of all stocks' description page
URL = [] # empty list for URLs
DES = [] # empty list for descriptions 
ticker = tickers['Ticker'] .head()
for i in ticker: 
  url ='https://finance.yahoo.com/quote/'+i+'/profile' 
  URL.append(url)
  page = requests.get(url) # visits the URL 
  htmldata = BeautifulSoup(page.content, 'html.parser')
  Business_Description = htmldata.find('p',{'class':'Mt(15px) Lh(1.6)'}) # finds the business description part in the HTML code
  DES.append(Business_Description)

In [10]:
# print(URL)
print(DES) # check the descriptions

[None, None, None, None, None]


Convert the results to pandas dataframe. 

In [11]:
# Create new data frame that stores ticker, description of corresponding tickers 
company_des = pd.DataFrame({'ticker':ticker,'description':DES})
company_des.head()

Unnamed: 0,ticker,description
0,1COV,
1,A,
2,AAL,
3,AAL.L,
4,AAP,


Drop tickers with no descriptions. Convert the `description` variable to string. 

Clean the data: remove NAs, convert to string and remove HTML code attributes. 

In [None]:
# Drop the stocks that do not have Yahoo Finance company profiles 
company_des.dropna(inplace=True)
company_des['description'] = company_des['description'].astype(str)

# Remove regex text from description using loop 
a = np.arange(1,300)
a = a.astype(str)
for i in a:
  company_des['description']=company_des['description'].str.replace('<p class="Mt(15px) Lh(1.6)" data-reactid="'+i+'">','',regex=False)

company_des['description']=company_des['description'].str.replace('</p>','',regex=False)

In [None]:
# company_des.head()

Export the data

In [None]:
# Export company_des into a csv file
company_des.to_csv(r'154stock_des.csv', index = False, header=True)

Define the function to transform the percentage into number in EI

In [13]:
def p2f(x):
    return float(x.strip('%'))/100

In [14]:
ind=pd.read_csv('Environmental_impact_cleaned.csv')
ind['EnvironmentalIntensity(Sales)'] = ind['EnvironmentalIntensity(Sales)'].apply(p2f)
y2018=list(ind[ind['Year'] == 2018]['EnvironmentalIntensity(Sales)'])
y2018=pd.Series(y2018)

In [26]:
from statsmodels.tsa.stattools import adfuller
X = y2018.values
result = adfuller(X)
split = round(len(X) / 2)
X1, X2 = X[0:split], X[split:]
mean1, mean2 = X1.mean(), X2.mean()
var1, var2 = X1.var(), X2.var()
print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))


mean1=-0.129262, mean2=-0.101468
variance1=0.094017, variance2=0.052920
ADF Statistic: -24.980342
p-value: 0.000000
Critical Values:
	1%: -3.434
	5%: -2.863
	10%: -2.568


In [25]:
y2017=list(ind[ind['Year'] == 2017]['EnvironmentalIntensity(Sales)'])
y2017=pd.Series(y2017)
from statsmodels.tsa.stattools import adfuller
X = y2017.values
result = adfuller(X)
split = round(len(X) / 2)
X1, X2 = X[0:split], X[split:]
mean1, mean2 = X1.mean(), X2.mean()
var1, var2 = X1.var(), X2.var()
print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

mean1=-0.123985, mean2=-0.108138
variance1=0.089221, variance2=0.063000
ADF Statistic: -26.071015
p-value: 0.000000
Critical Values:
	1%: -3.434
	5%: -2.863
	10%: -2.568


In [19]:
y2016=list(ind[ind['Year'] == 2016]['EnvironmentalIntensity(Sales)'])
y2016=pd.Series(y2016)
from statsmodels.tsa.stattools import adfuller
X = y2016.values
result = adfuller(X)
split = round(len(X) / 2)
X1, X2 = X[0:split], X[split:]
mean1, mean2 = X1.mean(), X2.mean()
var1, var2 = X1.var(), X2.var()
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

ADF Statistic: -38.998411
p-value: 0.000000
Critical Values:
mean1=-0.128953, mean2=-0.113759
variance1=0.084427, variance2=0.066553
	1%: -3.434
	5%: -2.863
	10%: -2.568


In [24]:
y2015=list(ind[ind['Year'] == 2015]['EnvironmentalIntensity(Sales)'])
y2015=pd.Series(y2015)
from statsmodels.tsa.stattools import adfuller
X = y2015.values
result = adfuller(X)
split = round(len(X) / 2)
X1, X2 = X[0:split], X[split:]
mean1, mean2 = X1.mean(), X2.mean()
var1, var2 = X1.var(), X2.var()
print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

mean1=-0.128073, mean2=-0.112539
variance1=0.072975, variance2=0.066954
ADF Statistic: -25.147693
p-value: 0.000000
Critical Values:
	1%: -3.435
	5%: -2.863
	10%: -2.568


In [23]:
y2014=list(ind[ind['Year'] == 2014]['EnvironmentalIntensity(Sales)'])
y2014=pd.Series(y2014)
from statsmodels.tsa.stattools import adfuller
X = y2014.values
result = adfuller(X)
split = round(len(X) / 2)
X1, X2 = X[0:split], X[split:]
mean1, mean2 = X1.mean(), X2.mean()
var1, var2 = X1.var(), X2.var()
print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

mean1=-0.113915, mean2=-0.100781
variance1=0.062142, variance2=0.055520
ADF Statistic: -15.509563
p-value: 0.000000
Critical Values:
	1%: -3.435
	5%: -2.864
	10%: -2.568


In [27]:
ind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14515 entries, 0 to 14514
Data columns (total 39 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   ISIN                                                    14515 non-null  object 
 1   Year                                                    14515 non-null  int64  
 2   CompanyName                                             14515 non-null  object 
 3   Country                                                 14515 non-null  object 
 4   Industry(Exiobase)                                      14515 non-null  object 
 5   EnvironmentalIntensity(Sales)                           14515 non-null  float64
 6   EnvironmentalIntensity(OpInc)                           13700 non-null  object 
 7   TotalEnvironmentalCost                                  14515 non-null  object 
 8   WorkingCapacity                     

In [28]:
ind.head()

Unnamed: 0,ISIN,Year,CompanyName,Country,Industry(Exiobase),EnvironmentalIntensity(Sales),EnvironmentalIntensity(OpInc),TotalEnvironmentalCost,WorkingCapacity,FishProductionCapacity,CropProductionCapacity,MeatProductionCapacity,Biodiversity,AbioticResources,Waterproductioncapacity(Drinkingwater&IrrigationWater),WoodProductionCapacity,SDG1.5,SDG2.1,SDG2.2,SDG2.3,SDG2.4,SDG3.3,SDG3.4,SDG3.9,SDG6,SDG12.2,SDG14.1,SDG14.2,SDG14.3,SDG14.c,SDG15.1,SDG15.2,SDG15.5,%Imputed,Env_intensity,industry_avg,industry_avg_year,Industry_indicator_year,Environmental_Growth
0,DE0005545503,2016,1&1 DRILLISCH AG,Germany,Post and telecommunications (64),-0.0007,-0.82%,-539318,-525027,-169,-7009,-1630,-27,-878,-4714,135,-234989,-166914,-166795,-1752,-1752,-27366,65960,-142,-4714,-878,-5,-1,-77,-6,67,67,-22,23%,-0.0007,-0.020506,-0.02074,1,
1,GB00B1YW4409,2010,3I GROUP PLC,United Kingdom,"Financial intermediation, except insurance and...",-0.0012,-0.11%,-1055812,-1032103,-277,-13751,-3221,-47,-562,-5953,102,-463300,-295103,-294949,-3438,-3438,-47957,59044,-74,-5953,-562,-4,0,-133,-4,51,51,-43,10%,-0.0012,-0.028537,-0.006402,1,
2,GB00B1YW4409,2011,3I GROUP PLC,United Kingdom,"Financial intermediation, except insurance and...",-0.0016,-0.16%,-961875,-940402,-246,-12525,-2935,-42,-424,-5378,77,-421928,-264714,-264579,-3131,-3131,-42961,44515,-56,-5378,-424,-3,0,-119,-3,38,38,-39,9%,-0.0016,-0.028537,-0.009838,1,33.333333
3,GB00B1YW4409,2012,3I GROUP PLC,United Kingdom,"Financial intermediation, except insurance and...",-0.0015,,-722999,-706893,-183,-9414,-2206,-32,-295,-4030,54,-317104,-197859,-197760,-2354,-2354,-32095,30960,-39,-4030,-295,-2,0,-89,-2,27,27,-30,8%,-0.0015,-0.028537,-0.024437,1,-6.25
4,US88579Y1010,2010,3M COMPANY,United States,Activities of membership organisation n.e.c. (91),-0.079,-35.45%,-2105919763,-1924672080,-439506,-25698273,-5971786,-80081,-24605,-149040978,7545,-857263302,-498663505,-498158998,-6424568,-6424568,-80328786,-9295475,-2771,-149040978,-24605,-355,-164,-219089,-423,3772,3772,-79722,1%,-0.079,-0.175838,-0.084583,1,


In [34]:
y2018=list(ind[ind['Year'] == 2018][['industry_avg','Env_intensity']])
y2018=pd.Series(y2018)
from statsmodels.tsa.stattools import adfuller
X = y2018.values
result = adfuller(X)
split = round(len(X) / 2)
X1, X2 = X[0:split], X[split:]
mean1, mean2 = X1.mean(), X2.mean()
var1, var2 = X1.var(), X2.var()
print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

ValueError: ignored

In [31]:
ind.groupby('Industry(Exiobase)')['EnvironmentalIntensity(Sales)'].count().sort_values()

Industry(Exiobase)
Cultivation of cereal grains nec                                                                                          1
Forestry, logging and related service activities (02)                                                                     5
Sea and coastal water transport                                                                                           6
Education (80)                                                                                                            6
Production of electricity by petroleum and other oil derivatives                                                         12
Mining of coal and lignite; extraction of peat (10)                                                                      15
Manufacture of tobacco products (16)                                                                                     22
Copper production                                                                                                

Top five industries:

Retail trade, except of motor vehicles and motorcycles; repair of personal and household goods (52)                    
Real estate activities(70)                                                                                    
Construction (45)                                                 
Manufacture of electrical machinery and apparatus n.e.c. (31)                                                 
Financial intermediation, except insurance and pension funding (65) 