# 1. Mount your Google Drive and establish the working directory

Mounting allows to access files on your Google Drive. You'll need to allow the Google Drive for desktop's access to your Google Account and copying the sign in code into the authorization code field. 

In [119]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


Set working directory to a Google Drive folder: change the `root_dir` to the folder on your Google Drive. 

`root_dir = "/content/gdrive/MyDrive/...`

In [120]:
import os

# Set your working directory to a folder in your Google Drive. This way, if your notebook times out,
# your files will be saved in your Google Drive!

# the base Google Drive directory
root_dir = "/content/gdrive/MyDrive/BU/Year1/Summer/"

# choose where you want your project files to be saved
project_folder = "capstone/"

def create_and_set_working_directory(project_folder):
  # check if your project folder exists. if not, it will be created.
  if os.path.isdir(root_dir + project_folder) == False:
    os.mkdir(root_dir + project_folder)
    print(root_dir + project_folder + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(root_dir + project_folder)

  # create a test file to make sure it shows up in the right place
  !touch 'new_file_in_working_directory.txt'
  print('\nYour working directory was changed to ' + root_dir + project_folder + \
        "\n\nAn empty text file was created there. You can also run !pwd to confirm the current working directory." )

create_and_set_working_directory(project_folder)


Your working directory was changed to /content/gdrive/MyDrive/BU/Year1/Summer/capstone/

An empty text file was created there. You can also run !pwd to confirm the current working directory.


Check if the function worked by listing the files in the project folder. During the very first run it should contain only `new_file_in_working_directory.txt`. If you upload other files to your `project_folder`, other files will be listed too. 

In [121]:
!ls

'52 company names.csv'		   Environmental_Impact_dataset1.csv
 52ticker_companydescription.csv   new_file_in_working_directory.txt


# 2. Install, load the libraries

In [122]:
!pip install transformers



In [123]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# 2. Load, merge, clean the data

In [124]:
# load the csv files
stock_des = pd.read_csv('52ticker_companydescription.csv')
df = pd.read_csv('Environmental_Impact_dataset1.csv')
df1=pd.read_csv('52 company names.csv') 

In [125]:
df=df.drop('Unnamed: 0',1)

In [126]:
df.head()

Unnamed: 0,Year,CompanyName,Country,Industry(Exiobase),EnvironmentalIntensity(Sales),Env_intensity,industry_avg,Industry_indicator,Environmental_Growth,Ind_Yearavg
0,2016,1&1 DRILLISCH AG,Germany,Post and telecommunications (64),-0.0007,-0.0007,-0.018382,1,,-0.01164
1,2016,AFK SISTEMA PAO,Russia,Post and telecommunications (64),0.154,0.154,-0.018382,1,6.722107,-0.01164
2,2016,AMERICA MOVIL S.A.B. DE C.V.,Mexico,Post and telecommunications (64),-0.0128,-0.0128,-0.018382,1,13.274336,-0.01164
3,2016,AT&T INC.,United States,Post and telecommunications (64),-0.017,-0.017,-0.018382,1,-8.108108,-0.01164
4,2016,CHORUS LIMITED,New Zealand,Post and telecommunications (64),-0.0114,-0.0114,-0.018382,1,-12.977099,-0.01164


In [127]:
# merge the dataframes into one
df2 = pd.merge(df1, stock_des, on='Ticker')
df2 = df2[df2['fyear']==2018]
df2.head()

Unnamed: 0,fyear,Ticker,CompanyName,conml,Description
8,2018,PNW,PINNACLE WEST CAPITAL CORP,Pinnacle West Capital Corp,"Pinnacle West Capital Corporation, through its..."
18,2018,HES,HESS CORP,Hess Corp,"Hess Corporation, an exploration and productio..."
28,2018,AEP,AMERICAN ELECTRIC POWER CO,American Electric Power Co Inc,"American Electric Power Company, Inc., an elec..."
38,2018,APA,APA CORP,APA Corporation,"APA Corporation, through its subsidiaries, exp..."
48,2018,CVX,CHEVRON CORP,Chevron Corp,"Chevron Corporation, through its subsidiaries,..."


In [128]:
df2=pd.merge(df,df2,on='CompanyName')
df2.head()

Unnamed: 0,Year,CompanyName,Country,Industry(Exiobase),EnvironmentalIntensity(Sales),Env_intensity,industry_avg,Industry_indicator,Environmental_Growth,Ind_Yearavg,fyear,Ticker,conml,Description
0,2016,DOMINION ENERGY INC,United States,Production of electricity nec,-1.3176,-1.3176,-0.821631,-1,,-0.860912,2018,D,Dominion Energy Inc,"Dominion Energy, Inc. produces and distributes..."
1,2017,DOMINION ENERGY INC,United States,Production of electricity nec,-1.0128,-1.0128,-0.821631,-1,-23.132969,-0.853385,2018,D,Dominion Energy Inc,"Dominion Energy, Inc. produces and distributes..."
2,2018,DOMINION ENERGY INC,United States,Production of electricity nec,-0.7632,-0.7632,-0.821631,1,-24.64455,-0.786849,2018,D,Dominion Energy Inc,"Dominion Energy, Inc. produces and distributes..."
3,2019,DOMINION ENERGY INC,United States,Production of electricity nec,-0.8474,-0.8474,-0.821631,-1,11.032495,-0.794171,2018,D,Dominion Energy Inc,"Dominion Energy, Inc. produces and distributes..."
4,2016,EDISON INTERNATIONAL,United States,Production of electricity nec,-0.3127,-0.3127,-0.821631,1,,-0.860912,2018,EIX,Edison International,"Edison International, through its subsidiaries..."


In [129]:
df2['Description'] = df2['Description'].str.slice(0,350)

Create a binary variable that is 1 if the assets to revenue ratio is above its median and 0 otherwise. 

This is the **dependent variable** (label) that we'll try to predict. 

In [130]:
df2['HIGH_EI'] = (df2['Env_intensity'].gt(df2['Env_intensity'].median())).astype(int)

# Preparing the predictor and DistilBERT model

**Note**. Please enable GPU in Edit > Notebook settings > Hardware accelerator. 

Load a pre-trained BERT model.

In [131]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Tokenize the textual data for DistilBERT. 

In [132]:
tokenized = df2['Description'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

Pad all lists of tokenized values to the same size. 

In [133]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [134]:
np.array(padded).shape

(32, 75)

Create attention mask variable for BERT to ignore (mask) the padding when it's processing its input.

In [135]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(32, 75)

# DistilBERT model

We run the pretrained DistilBERT model on the prepared predictor and keep the result in `last_hidden_states` variable. 

In [136]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# Logistic regression model 



Keep the first layer of the hidden states and assign the outcome variable to `labels`. 

In [137]:
features = last_hidden_states[0][:,0,:].numpy()
labels = df2['HIGH_EI']

Split the data in train and test subsets, train the Logistic Regression on train set and evaluate its accuracy on the test set. 

In [138]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels,test_size=0.33, random_state=42)
lr_clf = LogisticRegression(max_iter=5000)
lr_clf.fit(train_features, train_labels)
print(lr_clf.score(test_features, test_labels))

1.0


Check if this approach works better than a random guess (1.0 > 0.5). 

In [139]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.510 (+/- 0.47)


