### Import Packages

In [1]:
import json
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from preprocessing_helper import *
from machine_learning_helper import * 
from table_extraction import *

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import ensemble
from sklearn.svm import SVC
from crop_jpg import *
from pdf_to_jpg import *
from cosine_similarity import * 

# I. Data Input

### Train Input

In [2]:
# Load the company list; every row carries the URL of that company's report
df = pd.read_csv("Processed_Corp_List.csv")

# Pull the PDF behind each URL and keep its processed sentences next to it,
# then run the shared sentence clean-up over the whole frame
df['Processed_Sentences'] = df['Report URL'].apply(get_processed_sentences_from_url)
df = sentenceCleaning(df)

# Load the labelled answers and discard the rows whose Q3 holds one of the
# error markers ('NA_2021' or 'PDF_Error')
df2 = pd.read_csv('Labelled_Answers.csv')
df2 = df2[~df2.Q3.isin(['NA_2021', 'PDF_Error'])]

# Keep only the companies that appear in both tables, matched on IssuerName
df3 = df.merge(df2, how='inner', on='IssuerName')
df3.head()

Unnamed: 0.1,Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q3,Q4,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14
0,0,Repsol SA,ES0173516115,REP,Spain,Energy,Integrated Oil & Gas,2021,https://www.repsol.com/content/dam/repsol-corp...,[2 0 2 1 REPSOL Group Integrated Management Re...,,Yes,Yes,Established Carbon Transition Plan,Yes,,Yes,Yes,Yes,Yes
1,1,OMV AG,AT0000743059,OMV,Austria,Energy,Integrated Oil & Gas,2021,https://www.omv.com/services/downloads/00/omv....,[Sustainability Report 2021 Non-Financial Repo...,,Yes,Yes,Established Carbon Transition Plan,Yes,Yes,Yes,Yes,,Yes
2,2,TotalEnergies SE,FR0000120271,TTE,France,Energy,Integrated Oil & Gas,2021,https://totalenergies.com/system/files/documen...,[Universal Registration Document 2021including...,,Yes,Yes,Established Carbon Transition Plan,,,Yes,Yes,Yes,Yes
3,4,Eni SpA,IT0003132476,ENI,Italy,Energy,Integrated Oil & Gas,2021,https://www.eni.com/assets/documents/eng/just-...,[Eni for 2021 A just transition ##PAGE_BREAK##...,Yes,Yes,Yes,Plans to Transition to Low Carbon Environment,Yes,Yes,Yes,Yes,Yes,Yes
4,5,Woodside Energy Group Ltd.,AU0000224040,WDS,Australia,Energy,Oil & Gas Exploration & Production,2021,https://www.woodside.com/docs/default-source/i...,[SUSTAINABLE DEVELOPMENT REPORT ##PAGE_BREAK##...,,Yes,Yes,Plans to Transition to Low Carbon Environment,,Yes,,Yes,,Yes


In [3]:
# Keep the company metadata, the labelled answers and the cleaned sentences
df3 = df3[[
    'IssuerName', 'ISIN', 'Ticker', 'CountryOfIncorporation', 'GICSSector',
    'GICSSubIndustry', 'Year', 'Report URL',
    'Q3', 'Q4', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14',
    'Processed_Sentences',
]]

# Encode the answers as integers for the classifiers.
# Yes/no questions: missing or "NA " -> 0, 'Yes' -> 1.
yes_no_codes = {np.nan: 0, 'Yes': 1, "NA ": 0}
# Q8 is a three-level scale: missing -> 0, "plans to transition" -> 1,
# "established plan" -> 2.
q8_codes = {np.nan: 0, 'Plans to Transition to Low Carbon Environment': 1, 'Established Carbon Transition Plan': 2}

encoded_columns = {
    column: df3[column].replace(yes_no_codes)
    for column in ['Q3', 'Q4', 'Q7', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14']
}
encoded_columns['Q8'] = df3['Q8'].replace(q8_codes)

# assign() swaps the encoded columns in at their existing positions
df3 = df3.assign(**encoded_columns)

In [4]:
# Load question : keyword_list mappings
# Load the mapping from question name (e.g. 'Q3') to its keyword lists.
# The with-block guarantees the file handle is closed even when reading or
# JSON parsing raises.
with open("question_keywords.json", "r") as f:
    question_keywords = json.load(f)

### New Data (uncomment and run the cell below only when using new URLs that have not been processed yet)

In [5]:
# One-off preprocessing for new report URLs, disabled by default: uncomment to
# process every report listed in New_URLs.csv and cache the result in
# New_Corp_List.csv, which is the file the following cell reads.
# new_data = pd.read_csv("New_URLs.csv")
# new_data['Processed_Sentences'] = new_data['Report URL'].apply(lambda x: get_processed_sentences_from_url(x))
# new_data.to_csv("New_Corp_List.csv")

In [6]:
# Read the cached new-company list and run the shared sentence clean-up on it
new_data = sentenceCleaning(pd.read_csv("New_Corp_List.csv"))

# Discard the leftover "Unnamed: 0.1" column that comes with the cached CSV
new_data = new_data.drop(columns=["Unnamed: 0.1"])

In [7]:
# Merge train data with new test data
df3 = df3.append(new_data)
del df3["Unnamed: 0"]
del new_data["Unnamed: 0"]
df3

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Q3,Q4,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Processed_Sentences
0,Repsol SA,ES0173516115,REP,Spain,Energy,Integrated Oil & Gas,2021,https://www.repsol.com/content/dam/repsol-corp...,0.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,[2 0 2 1 REPSOL Group Integrated Management Re...
1,OMV AG,AT0000743059,OMV,Austria,Energy,Integrated Oil & Gas,2021,https://www.omv.com/services/downloads/00/omv....,0.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,0.0,1.0,[Sustainability Report 2021 Non-Financial Repo...
2,TotalEnergies SE,FR0000120271,TTE,France,Energy,Integrated Oil & Gas,2021,https://totalenergies.com/system/files/documen...,0.0,1.0,1.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,[Universal Registration Document 2021including...
3,Eni SpA,IT0003132476,ENI,Italy,Energy,Integrated Oil & Gas,2021,https://www.eni.com/assets/documents/eng/just-...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[Eni for 2021 A just transition ##PAGE_BREAK##...
4,Woodside Energy Group Ltd.,AU0000224040,WDS,Australia,Energy,Oil & Gas Exploration & Production,2021,https://www.woodside.com/docs/default-source/i...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,[SUSTAINABLE DEVELOPMENT REPORT ##PAGE_BREAK##...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,,,,,,,,,,,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr..."
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,,,,,,,,,,,[ Despite the continued challenges of the pand...
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,,,,,,,,,,,[2021-22 ESG Report Prologis Eindhoven Distrib...
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,,,,,,,,,,,[ None of the owners of these trademarks has a...


# II. Generating New CSV

### Questions 1 & 2

In [8]:
# Update Q1 and Q2 columns containing extracted page numbers
new_data = extract_page_numbers(new_data, 1)
new_data = extract_page_numbers(new_data, 2)
new_data

Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.


Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,Q2
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,Error
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],[]
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],[46]
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],[]
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],[]


In [9]:
# Uncomment to upload cropped table images to Firebase Storage
# convert_to_jpg_qn_1(new_data)
# convert_to_jpg_qn_2(new_data)
# crop_images()

### Question 3

In [10]:
# Q3: k-nearest-neighbours (k=9) on the features built by getTfidfVaderDf
# from the Q3 keyword lists.
qn_name = 'Q3'
feature_engineering = getTfidfVaderDf
model = KNeighborsClassifier(n_neighbors=9)
kw_lists = question_keywords[qn_name]

# df3 holds both the labelled rows and the new rows; the helper's result is
# stored as the Q3 column of new_data.
q3_predictions = binaryModelPredict(qn_name, kw_lists, feature_engineering, model, df3)
new_data[qn_name] = q3_predictions
new_data

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,Q2,Q3
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,Error,0.0
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],[],1.0
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],[46],0.0
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],[],1.0
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],[],0.0


### Question 4

In [11]:
# Q4: gradient boosting (exponential loss, 50 estimators, fixed seed) on the
# features built by getTfidfVaderDf from the Q4 keyword lists.
qn_name = 'Q4'
feature_engineering = getTfidfVaderDf
model = ensemble.GradientBoostingClassifier(
    loss='exponential',
    n_estimators=50,
    random_state=123,
)
kw_lists = question_keywords[qn_name]

# Store the helper's result as the Q4 column of new_data
q4_predictions = binaryModelPredict(qn_name, kw_lists, feature_engineering, model, df3)
new_data[qn_name] = q4_predictions
new_data

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,Q2,Q3,Q4
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,Error,0.0,1.0
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],[],1.0,0.0
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],[46],0.0,1.0
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],[],1.0,1.0
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],[],0.0,0.0


### Questions 5 & 6

In [12]:
# Add the Q5 and Q6 columns holding the relevant sentences found in each report
for question_number in (5, 6):
    new_data = extract_relevant_sentences(new_data, question_number)
new_data

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,Q2,Q3,Q4,Q5,Q6
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,Error,0.0,1.0,As we work towards our Science based target1 (...,As we work towards our Science based target1 (...
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],[],1.0,0.0,Reduce Scope 1 and 2 greenhouse gas (GHG) emis...,
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],[46],0.0,1.0,SCOPE Scope 1 and 2 2025 TARGET (2016 Baseline...,homes in a year 195MSF of certified sustainabl...
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],[],1.0,1.0,,
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],[],0.0,0.0,,


### Question 7

In [13]:
# Q7: extra-trees classifier (default settings, fixed seed) on the features
# built by getCountVectDf from the Q7 keyword lists.
qn_name = 'Q7'
feature_engineering = getCountVectDf
model = ensemble.ExtraTreesClassifier(random_state=123)
kw_lists = question_keywords[qn_name]

# Store the helper's result as the Q7 column of new_data
q7_predictions = binaryModelPredict(qn_name, kw_lists, feature_engineering, model, df3)
new_data[qn_name] = q7_predictions
new_data

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,Q2,Q3,Q4,Q5,Q6,Q7
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,Error,0.0,1.0,As we work towards our Science based target1 (...,As we work towards our Science based target1 (...,0.0
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],[],1.0,0.0,Reduce Scope 1 and 2 greenhouse gas (GHG) emis...,,0.0
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],[46],0.0,1.0,SCOPE Scope 1 and 2 2025 TARGET (2016 Baseline...,homes in a year 195MSF of certified sustainabl...,0.0
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],[],1.0,1.0,,,0.0
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],[],0.0,0.0,,,0.0


### Question 8 (Scale)

In [14]:
# Q8 (scale): random forest (50 trees, fixed seed) on the features built by
# getCountVectVaderDf from the Q8 keyword lists.
# NOTE(review): Q8 labels are encoded with three values (0/1/2) yet go through
# binaryModelPredict — confirm the helper copes with more than two classes.
qn_name = 'Q8'
feature_engineering = getCountVectVaderDf
model = ensemble.RandomForestClassifier(n_estimators=50, random_state=123)
kw_lists = question_keywords[qn_name]

# Store the helper's result as the Q8 column of new_data
q8_predictions = binaryModelPredict(qn_name, kw_lists, feature_engineering, model, df3)
new_data[qn_name] = q8_predictions
new_data

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,Error,0.0,1.0,As we work towards our Science based target1 (...,As we work towards our Science based target1 (...,0.0,0.0
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],[],1.0,0.0,Reduce Scope 1 and 2 greenhouse gas (GHG) emis...,,0.0,0.0
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],[46],0.0,1.0,SCOPE Scope 1 and 2 2025 TARGET (2016 Baseline...,homes in a year 195MSF of certified sustainabl...,0.0,0.0
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],[],1.0,1.0,,,0.0,1.0
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],[],0.0,0.0,,,0.0,0.0


### Question 9

In [15]:
# Q9: logistic regression (fixed seed) on the features built by
# getCountVectDf from the Q9 keyword lists.
qn_name = 'Q9'
feature_engineering = getCountVectDf
model = LogisticRegression(random_state=123)
kw_lists = question_keywords[qn_name]

# Store the helper's result as the Q9 column of new_data
q9_predictions = binaryModelPredict(qn_name, kw_lists, feature_engineering, model, df3)
new_data[qn_name] = q9_predictions
new_data

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,Error,0.0,1.0,As we work towards our Science based target1 (...,As we work towards our Science based target1 (...,0.0,0.0,0.0
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],[],1.0,0.0,Reduce Scope 1 and 2 greenhouse gas (GHG) emis...,,0.0,0.0,0.0
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],[46],0.0,1.0,SCOPE Scope 1 and 2 2025 TARGET (2016 Baseline...,homes in a year 195MSF of certified sustainabl...,0.0,0.0,0.0
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],[],1.0,1.0,,,0.0,1.0,0.0
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],[],0.0,0.0,,,0.0,0.0,0.0


### Question 10

In [16]:
# Q10: shallow decision tree (max depth 3, fixed seed) on the features built
# by getCountVectVaderDf from the Q10 keyword lists.
qn_name = 'Q10'
feature_engineering = getCountVectVaderDf
model = DecisionTreeClassifier(max_depth=3, random_state=123)
kw_lists = question_keywords[qn_name]

# Store the helper's result as the Q10 column of new_data
q10_predictions = binaryModelPredict(qn_name, kw_lists, feature_engineering, model, df3)
new_data[qn_name] = q10_predictions
new_data

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,Error,0.0,1.0,As we work towards our Science based target1 (...,As we work towards our Science based target1 (...,0.0,0.0,0.0,0.0
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],[],1.0,0.0,Reduce Scope 1 and 2 greenhouse gas (GHG) emis...,,0.0,0.0,0.0,0.0
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],[46],0.0,1.0,SCOPE Scope 1 and 2 2025 TARGET (2016 Baseline...,homes in a year 195MSF of certified sustainabl...,0.0,0.0,0.0,0.0
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],[],1.0,1.0,,,0.0,1.0,0.0,0.0
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],[],0.0,0.0,,,0.0,0.0,0.0,0.0


### Question 11

In [17]:
# Q11: gradient boosting (exponential loss, 150 estimators, fixed seed) on the
# features built by getTfidfVaderDf from the Q11 keyword lists.
qn_name = 'Q11'
feature_engineering = getTfidfVaderDf
model = ensemble.GradientBoostingClassifier(
    loss="exponential",
    n_estimators=150,
    random_state=123,
)
kw_lists = question_keywords[qn_name]

# Store the helper's result as the Q11 column of new_data
q11_predictions = binaryModelPredict(qn_name, kw_lists, feature_engineering, model, df3)
new_data[qn_name] = q11_predictions
new_data

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,Error,0.0,1.0,As we work towards our Science based target1 (...,As we work towards our Science based target1 (...,0.0,0.0,0.0,0.0,0.0
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],[],1.0,0.0,Reduce Scope 1 and 2 greenhouse gas (GHG) emis...,,0.0,0.0,0.0,0.0,0.0
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],[46],0.0,1.0,SCOPE Scope 1 and 2 2025 TARGET (2016 Baseline...,homes in a year 195MSF of certified sustainabl...,0.0,0.0,0.0,0.0,0.0
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],[],1.0,1.0,,,0.0,1.0,0.0,0.0,1.0
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],[],0.0,0.0,,,0.0,0.0,0.0,0.0,1.0


### Question 12

In [18]:
# Q12: extra-trees classifier (entropy criterion, 50 trees, fixed seed) on the
# features built by getTfidfVaderDf from the Q12 keyword lists.
qn_name = 'Q12'
feature_engineering = getTfidfVaderDf
model = ensemble.ExtraTreesClassifier(
    criterion='entropy',
    n_estimators=50,
    random_state=123,
)
kw_lists = question_keywords[qn_name]

# Store the helper's result as the Q12 column of new_data
q12_predictions = binaryModelPredict(qn_name, kw_lists, feature_engineering, model, df3)
new_data[qn_name] = q12_predictions
new_data

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,...,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,...,0.0,1.0,As we work towards our Science based target1 (...,As we work towards our Science based target1 (...,0.0,0.0,0.0,0.0,0.0,0.0
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],...,1.0,0.0,Reduce Scope 1 and 2 greenhouse gas (GHG) emis...,,0.0,0.0,0.0,0.0,0.0,0.0
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],...,0.0,1.0,SCOPE Scope 1 and 2 2025 TARGET (2016 Baseline...,homes in a year 195MSF of certified sustainabl...,0.0,0.0,0.0,0.0,0.0,1.0
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],...,1.0,1.0,,,0.0,1.0,0.0,0.0,1.0,0.0
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],...,0.0,0.0,,,0.0,0.0,0.0,0.0,1.0,0.0


### Question 13

In [19]:
# Q13: random forest (default settings, fixed seed) on the features built by
# getCountVectDf from the Q13 keyword lists.
qn_name = 'Q13'
feature_engineering = getCountVectDf
model = ensemble.RandomForestClassifier(random_state=123)
kw_lists = question_keywords[qn_name]

# Store the helper's result as the Q13 column of new_data
q13_predictions = binaryModelPredict(qn_name, kw_lists, feature_engineering, model, df3)
new_data[qn_name] = q13_predictions
new_data

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,...,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,...,1.0,As we work towards our Science based target1 (...,As we work towards our Science based target1 (...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],...,0.0,Reduce Scope 1 and 2 greenhouse gas (GHG) emis...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],...,1.0,SCOPE Scope 1 and 2 2025 TARGET (2016 Baseline...,homes in a year 195MSF of certified sustainabl...,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],...,1.0,,,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],...,0.0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Question 14

In [20]:
# Q14: extra-trees classifier (default settings, fixed seed) on the features
# built by getCountVectVaderDf from the Q14 keyword lists.
qn_name = 'Q14'
feature_engineering = getCountVectVaderDf
model = ensemble.ExtraTreesClassifier(random_state=123)
kw_lists = question_keywords[qn_name]

# Store the helper's result as the Q14 column of new_data
q14_predictions = binaryModelPredict(qn_name, kw_lists, feature_engineering, model, df3)
new_data[qn_name] = q14_predictions
new_data

Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Processed_Sentences,Q1,...,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14
0,Kilroy Realty Corporation,US49427F1084,KRC,USA,Real Estate,Office REITs,2021,https://kilroyrealty.com/wp-content/uploads/20...,"[S., EPA GREEN POWER PARTNERSHIP In 2021, Kilr...",Error,...,As we work towards our Science based target1 (...,As we work towards our Science based target1 (...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Highwoods Properties, Inc.",US4312841087,HIW,USA,Real Estate,Office REITs,2021,https://www.responsibilityreports.com/Click/1335,[ Despite the continued challenges of the pand...,[],...,Reduce Scope 1 and 2 greenhouse gas (GHG) emis...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Prologis, Inc.",US74340W1036,PLD,USA,Real Estate,Industrial REITs,2021,https://prologis.getbynder.com/m/40de827ea131c...,[2021-22 ESG Report Prologis Eindhoven Distrib...,[46],...,SCOPE Scope 1 and 2 2025 TARGET (2016 Baseline...,homes in a year 195MSF of certified sustainabl...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,"Host Hotels & Resorts, Inc.",US44107P1049,HST,USA,Real Estate,Hotel & Resort REITs,2021,https://www.hosthotels.com/-/media/HostHotels/...,[ None of the owners of these trademarks has a...,[],...,,,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,DiamondRock Hospitality Company,US2527843013,DRH,USA,Real Estate,Hotel & Resort REITs,2021,https://investor.drhc.com/static-files/52de73f...,[2021 CORPORATE RESPONSIBILITY REPORT ##PAGE_B...,[33],...,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# III. Output CSV

In [21]:
# Keep only the rows of df3 that carry a Q3 label; the new companies were
# appended to df3 without labels, so this selects the labelled training rows.
labelled_rows = df3[df3['Q3'].notna()]

# Stack the labelled rows on top of the new companies and their predictions.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in
# 2.0); ignore_index=True gives the combined frame a fresh 0..n-1 index.
final_output = pd.concat([labelled_rows, new_data], ignore_index=True)

# Decode the numeric codes back into the text labels used in the labelled CSV
for question in ['Q3', 'Q4', 'Q7', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14']:
    final_output[question] = final_output[question].replace({1: 'Yes', 0: "NA"})
final_output['Q8'] = final_output['Q8'].replace({0: "NA", 1:'Plans to Transition to Low Carbon Environment', 2:'Established Carbon Transition Plan'})

# NOTE(review): the row index is written as an extra unnamed first column;
# pass index=False here if consumers of the file do not need it.
final_output.to_csv('consolidated_output.csv')

In [24]:
# Sanity check: read the consolidated file back and preview the first rows.
# With read_csv's default NA handling the "NA" strings written above come back
# as missing values, which is why those cells show up empty in the preview.
df = pd.read_csv(filepath_or_buffer='consolidated_output.csv')
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,IssuerName,ISIN,Ticker,CountryOfIncorporation,GICSSector,GICSSubIndustry,Year,Report URL,Q3,...,Q10,Q11,Q12,Q13,Q14,Processed_Sentences,Q1,Q2,Q5,Q6
0,0,Repsol SA,ES0173516115,REP,Spain,Energy,Integrated Oil & Gas,2021,https://www.repsol.com/content/dam/repsol-corp...,,...,,Yes,Yes,Yes,Yes,['2 0 2 1 REPSOL Group Integrated Management R...,,,,
1,1,OMV AG,AT0000743059,OMV,Austria,Energy,Integrated Oil & Gas,2021,https://www.omv.com/services/downloads/00/omv....,,...,Yes,Yes,Yes,,Yes,['Sustainability Report 2021 Non-Financial Rep...,,,,
2,2,TotalEnergies SE,FR0000120271,TTE,France,Energy,Integrated Oil & Gas,2021,https://totalenergies.com/system/files/documen...,,...,,Yes,Yes,Yes,Yes,['Universal Registration Document 2021includin...,,,,
3,3,Eni SpA,IT0003132476,ENI,Italy,Energy,Integrated Oil & Gas,2021,https://www.eni.com/assets/documents/eng/just-...,Yes,...,Yes,Yes,Yes,Yes,Yes,['Eni for 2021 A just transition ##PAGE_BREAK#...,,,,
4,4,Woodside Energy Group Ltd.,AU0000224040,WDS,Australia,Energy,Oil & Gas Exploration & Production,2021,https://www.woodside.com/docs/default-source/i...,,...,Yes,,Yes,,Yes,['SUSTAINABLE DEVELOPMENT REPORT ##PAGE_BREAK#...,,,,
5,5,Equinor ASA,NO0010096985,EQNR,Norway,Energy,Integrated Oil & Gas,2021,https://cdn.sanity.io/files/h61q9gi9/global/d4...,,...,,Yes,Yes,Yes,Yes,['2021 Sustainability report ##PAGE_BREAK##INT...,,,,
6,6,OMV Petrom SA,ROSNPPACNOR9,SNP,Romania,Energy,Integrated Oil & Gas,2021,https://www.omvpetrom.com/services/downloads/0...,Yes,...,,Yes,Yes,Yes,Yes,"['1938/2016, as amended.', 'The sustainability...",,,,
7,7,MOL Hungarian Oil & Gas Plc,HU0000153937,MOL,Hungary,Energy,Integrated Oil & Gas,2021,https://molgroup.info/storage/documents/public...,,...,,Yes,Yes,,Yes,"[""MOL GROUP INTEGRATED ANNUAL REPORT 2021 ##PA...",,,,
8,8,Galp Energia SGPS SA,PTGAL0AM0009,GALP,Portugal,Energy,Integrated Oil & Gas,2021,https://www.galp.com/corp/Portals/0/Recursos/I...,Yes,...,,Yes,Yes,,,['##PAGE_BREAK##Strategic execution Financial ...,,,,
9,9,Harbour Energy Plc,GB00BMBVGQ36,HBR,United Kingdom,Energy,Oil & Gas Exploration & Production,2021,https://www.harbourenergy.com/media/jyof20ez/3...,,...,,,,,,['#WeAreHarbourEnergy ESG Report 2021 Harbour ...,,,,
