# 1. Mount your Google Drive and establish the working directory

Mounting allows to access files on your Google Drive. You'll need to allow the Google Drive for desktop's access to your Google Account and copying the sign in code into the authorization code field. 

In [90]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


Set working directory to a Google Drive folder: change the `root_dir` to the folder on your Google Drive. 

`root_dir = "/content/gdrive/MyDrive/...`

In [91]:
import os

# Set your working directory to a folder in your Google Drive. This way, if your notebook times out,
# your files will be saved in your Google Drive!

# the base Google Drive directory
root_dir = "/content/gdrive/MyDrive/BU/Year1/Summer/"

# choose where you want your project files to be saved
project_folder = "capstone/"

def create_and_set_working_directory(project_folder):
  # check if your project folder exists. if not, it will be created.
  if os.path.isdir(root_dir + project_folder) == False:
    os.mkdir(root_dir + project_folder)
    print(root_dir + project_folder + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(root_dir + project_folder)

  # create a test file to make sure it shows up in the right place
  !touch 'new_file_in_working_directory.txt'
  print('\nYour working directory was changed to ' + root_dir + project_folder + \
        "\n\nAn empty text file was created there. You can also run !pwd to confirm the current working directory." )

create_and_set_working_directory(project_folder)


Your working directory was changed to /content/gdrive/MyDrive/BU/Year1/Summer/capstone/

An empty text file was created there. You can also run !pwd to confirm the current working directory.


Check if the function worked by listing the files in the project folder. During the very first run it should contain only `new_file_in_working_directory.txt`. If you upload other files to your `project_folder`, other files will be listed too. 

In [92]:
!ls

 154stock_des.csv		   EE-ISIN_merged.csv
'52 company names.csv'		   Environmental_Impact_dataset1.csv
 52ticker_companydescription.csv   IEV_holdings-1.csv
 df2.csv			   new_file_in_working_directory.txt
'df2tickers - Sheet1.csv'	   stock_des.csv


# 2. Install, load the libraries

In [93]:
!pip install transformers



In [94]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# 2. Load, merge, clean the data

In [95]:
# load the csv files
stock_des = pd.read_csv('154stock_des.csv')
df = pd.read_csv('IEV_holdings-1.csv')
df1=pd.read_csv('EE-ISIN_merged.csv') 

In [96]:
df.head()

Unnamed: 0,Ticker,Name,Sector,Asset Class,Market Value,Weight (%),Notional Value,Shares,CUSIP,ISIN,SEDOL,Price,Location,Exchange,Currency,FX Rate,Market Currency,Accrual Date
0,NESN,NESTLE SA,Consumer Staples,Equity,64181630.45,3.32,64181630.45,505570.0,S71238703,CH0038863350,7123870,126.95,Switzerland,SIX Swiss Exchange,USD,0.9,CHF,-
1,ASML,ASML HOLDING NV,Information Technology,Equity,51205545.47,2.65,51205545.47,73649.0,-,NL0010273215,B929F46,695.26,Netherlands,Euronext Amsterdam,USD,0.82,EUR,-
2,ROG,ROCHE HOLDING PAR AG,Health Care,Equity,46859542.94,2.42,46859542.94,123210.0,S71103881,CH0012032048,7110388,380.32,Switzerland,SIX Swiss Exchange,USD,0.9,CHF,-
3,NOVN,NOVARTIS AG,Health Care,Equity,40415079.41,2.09,40415079.41,433340.0,S71030654,CH0012005267,7103065,93.26,Switzerland,SIX Swiss Exchange,USD,0.9,CHF,-
4,MC,LVMH,Consumer Discretionary,Equity,37598745.9,1.95,37598745.9,46966.0,S40614125,FR0000121014,4061412,800.55,France,Nyse Euronext - Euronext Paris,USD,0.82,EUR,-


In [97]:
df1=df1.drop('Unnamed: 0',1)
df1.head()

Unnamed: 0,ISIN,Year,CompanyName,Country,Industry(Exiobase),EnvironmentalIntensity(Sales),EnvironmentalIntensity(OpInc),TotalEnvironmentalCost,WorkingCapacity,FishProductionCapacity,CropProductionCapacity,MeatProductionCapacity,Biodiversity,AbioticResources,Waterproductioncapacity(Drinkingwater&IrrigationWater),WoodProductionCapacity,SDG1.5,SDG2.1,SDG2.2,SDG2.3,SDG2.4,SDG3.3,SDG3.4,SDG3.9,SDG6,SDG12.2,SDG14.1,SDG14.2,SDG14.3,SDG14.c,SDG15.1,SDG15.2,SDG15.5,%Imputed,gvkey,fyear,datadate,at,isin,conm,fic,sic
0,MYL1818OO003,2019,BURSA MALAYSIA BHD,Malaysia,Activities auxiliary to financial intermediati...,-0.017,-3.47%,-1968379,-1924910,-451,-25349,-5938,-81,-168,-11502,20,-852646,-502708,-502460,-6337,-6337,-81118,-4791,-27,-11502,-168,-1,-1,-222,-2,10,10,-79,4%,272691,2019,20191231,2321.04,MYL1818OO003,BURSA MALAYSIA BHD,MYS,6200.0
1,GB0031638363,2019,INTERTEK GROUP PLC,United Kingdom,Activities auxiliary to financial intermediati...,-0.015,-9.49%,-60599272,-59281663,-13774,-788289,-184802,-2487,-3804,-324960,508,-26533166,-15557810,-15550827,-197072,-197072,-2509207,284215,-703,-324960,-3804,-17,-4,-6861,-20,254,254,-2470,1%,252384,2019,20191231,2818.4,GB0031638363,INTERTEK GROUP PLC,GBR,8700.0
2,ZAE000079711,2019,JSE LIMITED,South Africa,Activities auxiliary to financial intermediati...,-0.015,,-2290124,-2239814,-510,-29662,-6938,-93,-901,-12200,-6,-995881,-576811,-576488,-7415,-7415,-92910,-19470,-277,-12200,-901,0,-1,-253,0,-3,-3,-93,2%,278391,2019,20191231,40227.215,ZAE000079711,JSE LIMITED,ZAF,6211.0
3,FR0006174348,2019,BUREAU VERITAS SA,France,Activities auxiliary to financial intermediati...,-0.007,-5.10%,-39978650,-39107612,-9330,-520701,-121953,-1671,-4116,-214438,1172,-17514837,-10430409,-10425281,-130175,-130175,-1684676,561195,-577,-214438,-4116,-38,-9,-4607,-45,586,586,-1633,3%,286961,2019,20191231,7049.1,FR0006174348,BUREAU VERITAS SA,FRA,8700.0
4,GB0007370074,2019,RICARDO PLC,United Kingdom,Activities auxiliary to financial intermediati...,-0.007,-7.27%,-3247235,-3176408,-753,-42228,-9899,-135,-468,-17406,63,-1421576,-842731,-842343,-10557,-10557,-136060,34998,-87,-17406,-468,-2,0,-373,-2,31,31,-133,3%,221859,2019,20190630,371.9,GB0007370074,RICARDO PLC,GBR,8711.0


In [98]:
df1=df1[df1['Year']==2019]
df1.head()

Unnamed: 0,ISIN,Year,CompanyName,Country,Industry(Exiobase),EnvironmentalIntensity(Sales),EnvironmentalIntensity(OpInc),TotalEnvironmentalCost,WorkingCapacity,FishProductionCapacity,CropProductionCapacity,MeatProductionCapacity,Biodiversity,AbioticResources,Waterproductioncapacity(Drinkingwater&IrrigationWater),WoodProductionCapacity,SDG1.5,SDG2.1,SDG2.2,SDG2.3,SDG2.4,SDG3.3,SDG3.4,SDG3.9,SDG6,SDG12.2,SDG14.1,SDG14.2,SDG14.3,SDG14.c,SDG15.1,SDG15.2,SDG15.5,%Imputed,gvkey,fyear,datadate,at,isin,conm,fic,sic
0,MYL1818OO003,2019,BURSA MALAYSIA BHD,Malaysia,Activities auxiliary to financial intermediati...,-0.017,-3.47%,-1968379,-1924910,-451,-25349,-5938,-81,-168,-11502,20,-852646,-502708,-502460,-6337,-6337,-81118,-4791,-27,-11502,-168,-1,-1,-222,-2,10,10,-79,4%,272691,2019,20191231,2321.04,MYL1818OO003,BURSA MALAYSIA BHD,MYS,6200.0
1,GB0031638363,2019,INTERTEK GROUP PLC,United Kingdom,Activities auxiliary to financial intermediati...,-0.015,-9.49%,-60599272,-59281663,-13774,-788289,-184802,-2487,-3804,-324960,508,-26533166,-15557810,-15550827,-197072,-197072,-2509207,284215,-703,-324960,-3804,-17,-4,-6861,-20,254,254,-2470,1%,252384,2019,20191231,2818.4,GB0031638363,INTERTEK GROUP PLC,GBR,8700.0
2,ZAE000079711,2019,JSE LIMITED,South Africa,Activities auxiliary to financial intermediati...,-0.015,,-2290124,-2239814,-510,-29662,-6938,-93,-901,-12200,-6,-995881,-576811,-576488,-7415,-7415,-92910,-19470,-277,-12200,-901,0,-1,-253,0,-3,-3,-93,2%,278391,2019,20191231,40227.215,ZAE000079711,JSE LIMITED,ZAF,6211.0
3,FR0006174348,2019,BUREAU VERITAS SA,France,Activities auxiliary to financial intermediati...,-0.007,-5.10%,-39978650,-39107612,-9330,-520701,-121953,-1671,-4116,-214438,1172,-17514837,-10430409,-10425281,-130175,-130175,-1684676,561195,-577,-214438,-4116,-38,-9,-4607,-45,586,586,-1633,3%,286961,2019,20191231,7049.1,FR0006174348,BUREAU VERITAS SA,FRA,8700.0
4,GB0007370074,2019,RICARDO PLC,United Kingdom,Activities auxiliary to financial intermediati...,-0.007,-7.27%,-3247235,-3176408,-753,-42228,-9899,-135,-468,-17406,63,-1421576,-842731,-842343,-10557,-10557,-136060,34998,-87,-17406,-468,-2,0,-373,-2,31,31,-133,3%,221859,2019,20190630,371.9,GB0007370074,RICARDO PLC,GBR,8711.0


In [99]:
df2=pd.merge(df,df1,on='ISIN')
df2.head()

Unnamed: 0,Ticker,Name,Sector,Asset Class,Market Value,Weight (%),Notional Value,Shares,CUSIP,ISIN,SEDOL,Price,Location,Exchange,Currency,FX Rate,Market Currency,Accrual Date,Year,CompanyName,Country,Industry(Exiobase),EnvironmentalIntensity(Sales),EnvironmentalIntensity(OpInc),TotalEnvironmentalCost,WorkingCapacity,FishProductionCapacity,CropProductionCapacity,MeatProductionCapacity,Biodiversity,AbioticResources,Waterproductioncapacity(Drinkingwater&IrrigationWater),WoodProductionCapacity,SDG1.5,SDG2.1,SDG2.2,SDG2.3,SDG2.4,SDG3.3,SDG3.4,SDG3.9,SDG6,SDG12.2,SDG14.1,SDG14.2,SDG14.3,SDG14.c,SDG15.1,SDG15.2,SDG15.5,%Imputed,gvkey,fyear,datadate,at,isin,conm,fic,sic
0,NESN,NESTLE SA,Consumer Staples,Equity,64181630.45,3.32,64181630.45,505570.0,S71238703,CH0038863350,7123870,126.95,Switzerland,SIX Swiss Exchange,USD,0.9,CHF,-,2019,NESTLE S.A.,Switzerland,Processing of Food products nec,-0.016,-9.75%,-1527139399,-1405885897,-416410,-18578077,-4368093,-65196,-113252,-97865932,153457,-628976294,-420810968,-420563568,-4644519,-4644519,-68675754,119281774,-6492,-97865932,-113252,-5213,-135,-201854,-6213,76729,76729,-59918,0%,16603,2019,20191231,127940.0,CH0038863350,NESTLE SA/AG,CHE,2000.0
1,ROG,ROCHE HOLDING PAR AG,Health Care,Equity,46859542.94,2.42,46859542.94,123210.0,S71103881,CH0012032048,7110388,380.32,Switzerland,SIX Swiss Exchange,USD,0.9,CHF,-,2019,ROCHE HOLDING AKTIENGESELLSCHAFT,Switzerland,"Manufacture of medical, precision and optical ...",-0.003,-0.84%,-160446737,-152869527,-35079,-1974118,-461147,-6303,-1149997,-3950618,52,-66193394,-38441169,-38420015,-493529,-493529,-6193341,-4364957,-722554,-3950618,-1149997,-139,-317,-16901,-165,26,26,-6163,6%,25648,2019,20191231,83091.0,CH0012032048,ROCHE HOLDING AG,CHE,2834.0
2,NOVN,NOVARTIS AG,Health Care,Equity,40415079.41,2.09,40415079.41,433340.0,S71030654,CH0012005267,7103065,93.26,Switzerland,SIX Swiss Exchange,USD,0.9,CHF,-,2019,NOVARTIS AG,Switzerland,"Manufacture of medical, precision and optical ...",-0.007,-3.18%,-348286772,-314529861,-74014,-4143676,-967262,-13198,-903525,-27659340,4103,-138890514,-81935724,-81878242,-1035919,-1035919,-13219836,-1114082,-567694,-27659340,-903525,-246,-249,-36343,-294,2051,2051,-12948,3%,101310,2019,20191231,118370.0,CH0012005267,NOVARTIS AG,CHE,2834.0
3,AZN,ASTRAZENECA PLC,Health Care,Equity,26955662.18,1.39,26955662.18,230130.0,S09895293,GB0009895292,989529,117.13,United Kingdom,London Stock Exchange,USD,0.71,GBP,-,2019,ASTRAZENECA PLC,United Kingdom,"Manufacture of medical, precision and optical ...",-0.005,-3.98%,-134985600,-129784288,-39655,-1688455,-393446,-6368,-573484,-2526476,26573,-56694491,-39535401,-39506085,-422114,-422114,-6472569,11948529,-781929,-2526476,-573484,-1050,-312,-18122,-1251,13286,13286,-5305,22%,28272,2019,20191231,61377.0,GB0009895292,ASTRAZENECA PLC,GBR,2834.0
4,SIE,SIEMENS N AG,Industrials,Equity,23306443.99,1.21,23306443.99,141729.0,S57279739,DE0007236101,5727973,164.44,Germany,Xetra,USD,0.82,EUR,-,2019,SIEMENS AG,Germany,Activities of membership organisation n.e.c. (91),-0.005,-5.49%,-431933738,-405755192,-97827,-5290306,-1228970,-16859,-600004,-18945339,758,-176454055,-102930381,-102840656,-1322577,-1322577,-16586392,-10700011,-167303,-18945339,-600004,-206,-1140,-46960,-246,379,379,-16650,7%,19349,2019,20190930,150248.0,DE0007236101,SIEMENS AG,DEU,9997.0


In [100]:
# merge the dataframes into one
df2 = pd.merge(df2, stock_des, on='Ticker')
df2.head()


Unnamed: 0,Ticker,Name,Sector,Asset Class,Market Value,Weight (%),Notional Value,Shares,CUSIP,ISIN,SEDOL,Price,Location,Exchange,Currency,FX Rate,Market Currency,Accrual Date,Year,CompanyName,Country,Industry(Exiobase),EnvironmentalIntensity(Sales),EnvironmentalIntensity(OpInc),TotalEnvironmentalCost,WorkingCapacity,FishProductionCapacity,CropProductionCapacity,MeatProductionCapacity,Biodiversity,AbioticResources,Waterproductioncapacity(Drinkingwater&IrrigationWater),WoodProductionCapacity,SDG1.5,SDG2.1,SDG2.2,SDG2.3,SDG2.4,SDG3.3,SDG3.4,SDG3.9,SDG6,SDG12.2,SDG14.1,SDG14.2,SDG14.3,SDG14.c,SDG15.1,SDG15.2,SDG15.5,%Imputed,gvkey,fyear,datadate,at,isin,conm,fic,sic,description
0,ROG,ROCHE HOLDING PAR AG,Health Care,Equity,46859542.94,2.42,46859542.94,123210.0,S71103881,CH0012032048,7110388,380.32,Switzerland,SIX Swiss Exchange,USD,0.9,CHF,-,2019,ROCHE HOLDING AKTIENGESELLSCHAFT,Switzerland,"Manufacture of medical, precision and optical ...",-0.003,-0.84%,-160446737,-152869527,-35079,-1974118,-461147,-6303,-1149997,-3950618,52,-66193394,-38441169,-38420015,-493529,-493529,-6193341,-4364957,-722554,-3950618,-1149997,-139,-317,-16901,-165,26,26,-6163,6%,25648,2019,20191231,83091.0,CH0012032048,ROCHE HOLDING AG,CHE,2834.0,"Rogers Corporation designs, develops, manufact..."
1,NOVN,NOVARTIS AG,Health Care,Equity,40415079.41,2.09,40415079.41,433340.0,S71030654,CH0012005267,7103065,93.26,Switzerland,SIX Swiss Exchange,USD,0.9,CHF,-,2019,NOVARTIS AG,Switzerland,"Manufacture of medical, precision and optical ...",-0.007,-3.18%,-348286772,-314529861,-74014,-4143676,-967262,-13198,-903525,-27659340,4103,-138890514,-81935724,-81878242,-1035919,-1035919,-13219836,-1114082,-567694,-27659340,-903525,-246,-249,-36343,-294,2051,2051,-12948,3%,101310,2019,20191231,118370.0,CH0012005267,NOVARTIS AG,CHE,2834.0,"Novan, Inc., a clinical development-stage biot..."
2,AZN,ASTRAZENECA PLC,Health Care,Equity,26955662.18,1.39,26955662.18,230130.0,S09895293,GB0009895292,989529,117.13,United Kingdom,London Stock Exchange,USD,0.71,GBP,-,2019,ASTRAZENECA PLC,United Kingdom,"Manufacture of medical, precision and optical ...",-0.005,-3.98%,-134985600,-129784288,-39655,-1688455,-393446,-6368,-573484,-2526476,26573,-56694491,-39535401,-39506085,-422114,-422114,-6472569,11948529,-781929,-2526476,-573484,-1050,-312,-18122,-1251,13286,13286,-5305,22%,28272,2019,20191231,61377.0,GB0009895292,ASTRAZENECA PLC,GBR,2834.0,"AstraZeneca PLC discovers, develops, manufactu..."
3,SAN,SANOFI SA,Health Care,Equity,21623837.19,1.12,21623837.19,201397.0,S56717358,FR0000120578,5671735,107.37,France,Nyse Euronext - Euronext Paris,USD,0.82,EUR,-,2019,SANOFI S.A.,France,"Manufacture of medical, precision and optical ...",-0.01,-6.09%,-434457155,-393211644,-77490,-4399917,-1003233,-13907,-451590,-35245079,-54294,-143654775,-75667077,-75484826,-1099979,-1099979,-12050491,-89489470,-108652,-35245079,-451590,-461,-3419,-33072,-549,-27147,-27147,-13441,25%,101204,2019,20191231,112736.0,FR0000120578,SANOFI,FRA,2834.0,"Banco Santander, S.A., together with its subsi..."
4,SAN,SANOFI SA,Health Care,Equity,21623837.19,1.12,21623837.19,201397.0,S56717358,FR0000120578,5671735,107.37,France,Nyse Euronext - Euronext Paris,USD,0.82,EUR,-,2019,SANOFI S.A.,France,"Manufacture of medical, precision and optical ...",-0.01,-6.09%,-434457155,-393211644,-77490,-4399917,-1003233,-13907,-451590,-35245079,-54294,-143654775,-75667077,-75484826,-1099979,-1099979,-12050491,-89489470,-108652,-35245079,-451590,-461,-3419,-33072,-549,-27147,-27147,-13441,25%,101204,2019,20191231,112736.0,FR0000120578,SANOFI,FRA,2834.0,"Banco Santander, S.A., together with its subsi..."


In [102]:
df2['description'] = df2['description'].str.slice(0,350)

Create a binary variable that is 1 if the assets to revenue ratio is above its median and 0 otherwise. 

This is the **dependent variable** (label) that we'll try to predict. 

In [103]:
df2['HIGH_EI'] = (df2['EnvironmentalIntensity(Sales)'].gt(df2['EnvironmentalIntensity(Sales)'].median())).astype(int)

# Preparing the predictor and DistilBERT model

**Note**. Please enable GPU in Edit > Notebook settings > Hardware accelerator. 

Load a pre-trained BERT model.

In [104]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Tokenize the textual data for DistilBERT. 

In [106]:
tokenized = df2['description'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

Pad all lists of tokenized values to the same size. 

In [107]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [108]:
np.array(padded).shape

(42, 90)

Create attention mask variable for BERT to ignore (mask) the padding when it's processing its input.

In [109]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(42, 90)

# DistilBERT model

We run the pretrained DistilBERT model on the prepared predictor and keep the result in `last_hidden_states` variable. 

In [110]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# Logistic regression model 



Keep the first layer of the hidden states and assign the outcome variable to `labels`. 

In [111]:
features = last_hidden_states[0][:,0,:].numpy()
labels = df2['HIGH_EI']

Split the data in train and test subsets, train the Logistic Regression on train set and evaluate its accuracy on the test set. 

In [112]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels,test_size=0.25,random_state=42)
lr_clf = LogisticRegression(max_iter=5000)
lr_clf.fit(train_features, train_labels)
print(lr_clf.score(test_features, test_labels))

0.6363636363636364


Check if this approach works better than a random guess (1.0 > 0.5). 

In [None]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))