In [233]:
!pip install python-docx



In [234]:
from docx import Document
import re
import pandas as pd
import os
import numpy as np

In [235]:
def extract_words_from_docx(file_path):
    """Return the words contained in the paragraphs of a .docx document.

    Args:
        file_path: Path of the document; passed straight to ``Document``.

    Returns:
        list[str]: Every maximal run of word characters (letters, digits,
        underscore) in reading order. Empty when the document has no text.
    """
    doc = Document(file_path)
    # Join with a newline so the last word of one paragraph cannot fuse with
    # the first word of the next one.
    text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
    # The previous pattern r'b[w/&&[^/]]+b' had lost its backslashes and relied
    # on Java-style character-class intersection, which `re` does not support,
    # so it never matched ordinary words.
    return re.findall(r"\w+", text)

In [236]:
from collections import OrderedDict
from transformers import MPNetPreTrainedModel, MPNetModel, AutoTokenizer
import torch

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Tensor of token embeddings; the attention mask is
            broadcast to this tensor's size and the reduction runs over dim 1.
        attention_mask: Mask with one entry per token; positions with 0 do not
            contribute to the average.

    Returns:
        Tensor with dim 1 reduced away: the masked sum divided by the number of
        unmasked positions (clamped to at least 1e-9 to avoid division by zero).
    """
    mask = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
    masked_sum = torch.sum(model_output * mask, 1)
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_count

# ESGify is defined as a custom class because it needs a sentence-transformers-style mean pooling step and a custom classifier head.
class ESGify(MPNetPreTrainedModel):
    """MPNet encoder followed by mean pooling and an MLP head that scores ESG risk labels."""

    def __init__(self, config):
        """Build the encoder and the classification head.

        Args:
            config: Model configuration; ``id2label`` and ``label2id`` are read from it.
        """
        super().__init__(config)
        # Encoder without its built-in pooling layer: pooling is done by
        # mean_pooling() in forward().
        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        self.id2label = config.id2label
        self.label2id = config.label2id
        # Head: 768 -> 512 -> 47. Layer names and sizes are left untouched;
        # presumably they must match the parameter names stored in the
        # pretrained checkpoint -- verify before renaming or resizing.
        self.classifier = torch.nn.Sequential(OrderedDict([('norm', torch.nn.BatchNorm1d(768)),
                                                ('linear', torch.nn.Linear(768, 512)),
                                                ('act', torch.nn.ReLU()),
                                                ('batch_n', torch.nn.BatchNorm1d(512)),
                                                ('drop_class', torch.nn.Dropout(0.2)),
                                                ('class_l', torch.nn.Linear(512, 47))]))

    def forward(self, input_ids, attention_mask):
        """Score a batch of tokenized texts.

        Args:
            input_ids: Token ids, forwarded to the MPNet encoder.
            attention_mask: Attention mask, forwarded to the encoder and used
                to exclude masked positions from the mean pooling.

        Returns:
            Tensor of independent per-label scores in [0, 1] (sigmoid of the
            classifier output), with 47 values per input text.
        """
        # Feed input to the MPNet encoder.
        outputs = self.mpnet(input_ids=input_ids,
                             attention_mask=attention_mask)

        # Mean-pool the token embeddings and feed them to the classifier head.
        pooled = mean_pooling(outputs['last_hidden_state'], attention_mask)
        logits = self.classifier(pooled)

        # torch.sigmoid is the numerically stable form of 1 / (1 + exp(-x)):
        # the hand-written expression overflows inside exp() for large
        # negative logits.
        return torch.sigmoid(logits)

In [237]:
# Load the ESGify weights and the matching tokenizer from the same checkpoint id.
checkpoint_id = 'ai-lab/ESGify'
model = ESGify.from_pretrained(checkpoint_id)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_id)

In [238]:
# Show the label names the classifier scores (the values of the id -> label mapping).
print(model.id2label.values())

dict_values(['Legal Proceedings & Law Violations', 'Biodiversity', 'Communities Health and Safety', 'Land Acquisition and Resettlement (S)', 'Emergencies (Social)', 'Corporate Governance', 'Responsible Investment & Greenwashing', 'Not Relevant to ESG', 'Economic Crime', 'Emergencies (Environmental)', 'Hazardous Materials Management', 'Environmental Management', 'Landscape Transformation', 'Human Rights', 'Climate Risks', 'Labor Relations Management', 'Freedom of Association and Right to Organise', 'Employee Health and Safety', 'Surface Water Pollution', 'Animal Welfare', 'Water Consumption', 'Disclosure', 'Product Safety and Quality', 'Greenhouse Gas Emissions', 'Indigenous People', 'Cultural Heritage', 'Air Pollution', 'Waste Management', 'Soil and Groundwater Impact', 'Forced Labour', 'Wastewater Management', 'Natural Resources', 'Physical Impacts', 'Values and Ethics', 'Risk Management and Internal Control', 'Supply Chain (Environmental)', 'Supply Chain (Social)', 'Discrimination', 

In [239]:
# One row per ESG label. Column '10' (filled with the constant 10) is only a
# placeholder so the frame is not empty; a later cell drops it again.
label_names = model.id2label.values()
df1 = pd.DataFrame(index=label_names, columns=['10']).fillna(10)
df1

Unnamed: 0,10
Legal Proceedings & Law Violations,10
Biodiversity,10
Communities Health and Safety,10
Land Acquisition and Resettlement (S),10
Emergencies (Social),10
Corporate Governance,10
Responsible Investment & Greenwashing,10
Not Relevant to ESG,10
Economic Crime,10
Emergencies (Environmental),10


In [240]:
folder_path = '/content/data/'

# Inference only: make sure dropout is off and BatchNorm uses its running
# statistics (batches here contain a single text).
model.eval()
n_labels = len(model.id2label)

for root, _dirs, file_names in os.walk(folder_path):
    for file_name in file_names:
        # python-docx can only open .docx files; also skip Word lock files ("~$name.docx").
        if not file_name.lower().endswith('.docx') or file_name.startswith('~$'):
            continue
        file_path = os.path.join(root, file_name)

        # Score every paragraph separately and keep the per-label maximum.
        # The previous version joined the extracted words with spaces and then
        # split on the literal string '/n', so the whole document was always a
        # single chunk and everything past the first 512 tokens was truncated.
        paragraphs = [p.text.strip() for p in Document(file_path).paragraphs]

        max_res = np.zeros(n_labels)
        for paragraph in paragraphs:
            if not paragraph:
                # Empty paragraphs carry no information.
                continue
            to_model = tokenizer(
                [paragraph],
                add_special_tokens=True,
                max_length=512,
                return_token_type_ids=False,
                padding="longest",
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            # No gradients are needed for scoring; this avoids building the autograd graph.
            with torch.no_grad():
                results = model(**to_model)
            max_res = np.maximum(max_res, results.numpy()[0])

        # One column per document, keyed by its full path; stays all-zero if
        # the document has no non-empty paragraph.
        df1[file_path] = max_res

In [241]:
# Remove the placeholder column '10'; only the per-document score columns remain.
df1 = df1.drop(columns=['10'])
df1

Legal Proceedings & Law Violations
Biodiversity
Communities Health and Safety
Land Acquisition and Resettlement (S)
Emergencies (Social)
Corporate Governance
Responsible Investment & Greenwashing
Not Relevant to ESG
Economic Crime
Emergencies (Environmental)
Hazardous Materials Management


In [242]:
# (number of labels, number of scored documents). A second value of 0 means no
# document column was added by the scoring loop.
df1.shape

(47, 0)

In [243]:
# Persist the label x document score table (the label index is written as the first, unnamed column).
df1.to_csv("Weights_with_paragraphs.csv")

In [244]:
from google.colab import files

#files.download('Weights_with_paragraphs.csv')

In [245]:
# Reload the saved label x document scores and turn them into one row per
# document with a 'Company_name' column.
# NOTE(review): this reads "Weights.csv", while the cell above writes
# "Weights_with_paragraphs.csv" -- confirm which file is intended.
weights_by_label = (
    pd.read_csv("Weights.csv")
    .rename(columns={'Unnamed: 0': 'index'})
    .set_index('index')
)
df1 = (
    weights_by_label.T                       # documents become rows
    .reset_index()                           # document path moves into a column named 'index'
    .rename(columns={'index': 'Company_name'})
    .reset_index(drop=True)
)
# Strip the leading data-folder prefix from the stored path (plain slice by prefix length).
df1['Company_name'] = df1['Company_name'].map(lambda name: name[len('/content/data/'):])
df1

index,Company_name,Legal Proceedings & Law Violations,Biodiversity,Communities Health and Safety,Land Acquisition and Resettlement (S),Emergencies (Social),Corporate Governance,Responsible Investment & Greenwashing,Not Relevant to ESG,Economic Crime,...,Discrimination,Minimum Age and Child Labour,Planning Limitations,Data Safety,Strategy Implementation,Energy Efficiency and Renewables,Land Acquisition and Resettlement (E),Supply Chain (Economic / Governance),Land Rehabilitation,Retrenchment
0,ЮНИПРО инфа.docx,0.039817,0.045126,0.014498,0.052833,0.01118,0.728921,0.273245,0.412475,0.316775,...,0.535917,0.024024,0.153431,0.475766,0.02686,0.040551,0.030685,0.21403,0.038574,0.027505
1,Московская биржа инфа.docx,0.033245,0.042686,0.030259,0.049912,0.014704,0.775577,0.354382,0.479726,0.308021,...,0.555876,0.022171,0.124238,0.418576,0.024781,0.049906,0.057707,0.247029,0.077427,0.04376
2,ГЛОБАЛТРАНС инфа.docx,0.02916,0.039368,0.020696,0.048926,0.015981,0.745445,0.369496,0.497546,0.274858,...,0.580218,0.024193,0.107708,0.314998,0.021801,0.020834,0.023195,0.226007,0.047028,0.028427
3,БАНК САНКТ-ПЕТЕРБУРГ инфа.docx,0.024268,0.050859,0.017729,0.062759,0.014744,0.728921,0.276244,0.456638,0.382314,...,0.535917,0.027335,0.142273,0.381222,0.029315,0.052609,0.039289,0.187325,0.040687,0.041224
4,СПАРК_Отчет_ПАО_СУРГУТНЕФТЕГАЗ_8602060555_2024...,0.024834,0.056892,0.014822,0.058694,0.014046,0.728921,0.253048,0.440045,0.293951,...,0.535917,0.024159,0.113753,0.461641,0.029707,0.051025,0.033375,0.240988,0.077732,0.036344
5,Русал инфа.docx,0.022855,0.02608,0.025891,0.024671,0.016048,0.573294,0.306282,0.315058,0.218307,...,0.302483,0.025327,0.068539,0.190838,0.020921,0.022835,0.026355,0.140769,0.047283,0.031649
6,МЕЧЕЛ инфа.docx,0.025845,0.045481,0.019843,0.055948,0.012421,0.728921,0.298488,0.403745,0.328289,...,0.535917,0.022971,0.17089,0.368367,0.023805,0.040792,0.03065,0.196451,0.037485,0.040574
7,СПАРК_Отчет_ПАО_ПИК_СЗ_7713011336_20240423_183...,0.029449,0.051023,0.015,0.066875,0.013677,0.728921,0.267154,0.431812,0.35951,...,0.535917,0.023259,0.108205,0.42911,0.025506,0.038868,0.02465,0.254428,0.05507,0.026118
8,СПАРК_Отчет_ПАО_ГМК_НОРИЛЬСКИЙ_НИКЕЛЬ_84010057...,0.020183,0.033084,0.016274,0.045485,0.01969,0.728921,0.281097,0.402394,0.164439,...,0.535917,0.017793,0.086595,0.244967,0.027904,0.027738,0.015486,0.114982,0.026955,0.024516
9,СПАРК_Отчет_ПАО_ЛУКОЙЛ_7708004767_20240423_180...,0.033401,0.047469,0.015526,0.066717,0.014471,0.728921,0.244315,0.403949,0.322691,...,0.535917,0.02038,0.10759,0.426639,0.026977,0.038817,0.030884,0.217002,0.057481,0.037126


In [246]:
# Hand-entered targets, one entry per row of df1 (43 values each).
# Values are assigned by position, so they rely on df1's row order matching the
# order in which these lists were written -- TODO confirm against Company_name.
# Ratings are stored as numerators over 18 (1/6 == 3/18, 1/2 == 9/18).
esg_rating_numerators = [
    7, 13, 3, 1, 5, 11, 3, 3, 13, 13,
    1, 11, 13, 9, 7, 9, 11, 15, 17, 15,
    1, 15, 9, 1, 7, 11, 11, 17, 9, 11,
    11, 1, 5, 11, 11, 15, 9, 1, 11, 5,
    9, 9, 11,
]
raex_ranks = [
    56, 16, 106, 155, 72, 21, 119, 107, 6, 11,
    156, 30, 7, 39, 60, 51, 21, 7, 1, 3,
    94, 4, 38, 79, 55, 28, 23, 2, 37, 19,
    26, 157, 78, 34, 30, 5, 52, 158, 15, 67,
    47, 45, 24,
]
df1['ESG rating'] = [numerator / 18 for numerator in esg_rating_numerators]
df1['RAEX rank'] = raex_ranks

In [247]:
# Display the table with the two target columns ('ESG rating', 'RAEX rank') appended.
df1

index,Company_name,Legal Proceedings & Law Violations,Biodiversity,Communities Health and Safety,Land Acquisition and Resettlement (S),Emergencies (Social),Corporate Governance,Responsible Investment & Greenwashing,Not Relevant to ESG,Economic Crime,...,Planning Limitations,Data Safety,Strategy Implementation,Energy Efficiency and Renewables,Land Acquisition and Resettlement (E),Supply Chain (Economic / Governance),Land Rehabilitation,Retrenchment,ESG rating,RAEX rank
0,ЮНИПРО инфа.docx,0.039817,0.045126,0.014498,0.052833,0.01118,0.728921,0.273245,0.412475,0.316775,...,0.153431,0.475766,0.02686,0.040551,0.030685,0.21403,0.038574,0.027505,0.388889,56
1,Московская биржа инфа.docx,0.033245,0.042686,0.030259,0.049912,0.014704,0.775577,0.354382,0.479726,0.308021,...,0.124238,0.418576,0.024781,0.049906,0.057707,0.247029,0.077427,0.04376,0.722222,16
2,ГЛОБАЛТРАНС инфа.docx,0.02916,0.039368,0.020696,0.048926,0.015981,0.745445,0.369496,0.497546,0.274858,...,0.107708,0.314998,0.021801,0.020834,0.023195,0.226007,0.047028,0.028427,0.166667,106
3,БАНК САНКТ-ПЕТЕРБУРГ инфа.docx,0.024268,0.050859,0.017729,0.062759,0.014744,0.728921,0.276244,0.456638,0.382314,...,0.142273,0.381222,0.029315,0.052609,0.039289,0.187325,0.040687,0.041224,0.055556,155
4,СПАРК_Отчет_ПАО_СУРГУТНЕФТЕГАЗ_8602060555_2024...,0.024834,0.056892,0.014822,0.058694,0.014046,0.728921,0.253048,0.440045,0.293951,...,0.113753,0.461641,0.029707,0.051025,0.033375,0.240988,0.077732,0.036344,0.277778,72
5,Русал инфа.docx,0.022855,0.02608,0.025891,0.024671,0.016048,0.573294,0.306282,0.315058,0.218307,...,0.068539,0.190838,0.020921,0.022835,0.026355,0.140769,0.047283,0.031649,0.611111,21
6,МЕЧЕЛ инфа.docx,0.025845,0.045481,0.019843,0.055948,0.012421,0.728921,0.298488,0.403745,0.328289,...,0.17089,0.368367,0.023805,0.040792,0.03065,0.196451,0.037485,0.040574,0.166667,119
7,СПАРК_Отчет_ПАО_ПИК_СЗ_7713011336_20240423_183...,0.029449,0.051023,0.015,0.066875,0.013677,0.728921,0.267154,0.431812,0.35951,...,0.108205,0.42911,0.025506,0.038868,0.02465,0.254428,0.05507,0.026118,0.166667,107
8,СПАРК_Отчет_ПАО_ГМК_НОРИЛЬСКИЙ_НИКЕЛЬ_84010057...,0.020183,0.033084,0.016274,0.045485,0.01969,0.728921,0.281097,0.402394,0.164439,...,0.086595,0.244967,0.027904,0.027738,0.015486,0.114982,0.026955,0.024516,0.722222,6
9,СПАРК_Отчет_ПАО_ЛУКОЙЛ_7708004767_20240423_180...,0.033401,0.047469,0.015526,0.066717,0.014471,0.728921,0.244315,0.403949,0.322691,...,0.10759,0.426639,0.026977,0.038817,0.030884,0.217002,0.057481,0.037126,0.722222,11


In [248]:
# Persist the per-document scores together with the rating and rank columns.
df1.to_csv("weights_with_rating.csv")

In [249]:
from google.colab import files

#files.download('weights_with_rating.csv')

In [250]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import (ConfusionMatrixDisplay, accuracy_score,
                             classification_report, confusion_matrix, f1_score,
                             make_scorer, precision_score, recall_score,
                             roc_auc_score)
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [251]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [252]:
# Features are the per-label scores; the rating is the regression target.
target = 'ESG rating'
non_feature_columns = [target, 'RAEX rank', 'Company_name']

X = df1.drop(columns=non_feature_columns)
y = df1[target]

# 70/30 split with a fixed seed; stratifying on y keeps the distribution of
# rating values similar in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=200,
    shuffle=True,
    stratify=y,
)
y_test.values

array([0.16666667, 0.05555556, 0.38888889, 0.27777778, 0.5       ,
       0.72222222, 0.61111111, 0.83333333, 0.94444444, 0.5       ,
       0.61111111, 0.05555556, 0.61111111])

In [253]:
# Build one summary table for all results: model name, two metrics, and one
# column per test sample (named "0", "1", ...).
sample_columns = [str(i) for i in range(len(y_test))]
compare_table = pd.DataFrame(columns=['Model', 'MSE', 'R^2 score'] + sample_columns)

# First row holds the true test targets as the reference to compare against
# (its metric cells are placeholders set to 0).
reference_row = {'Model': 'RAEX values', 'MSE': 0, 'R^2 score': 0}
reference_row.update(zip(compare_table.columns[3:], y_test.values))
new_row = pd.DataFrame(reference_row, index=[0])

compare_table = pd.concat([compare_table, new_row], ignore_index=True)
compare_table


Unnamed: 0,Model,MSE,R^2 score,0,1,2,3,4,5,6,7,8,9,10,11,12
0,RAEX values,0,0,0.166667,0.055556,0.388889,0.277778,0.5,0.722222,0.611111,0.833333,0.944444,0.5,0.611111,0.055556,0.611111


In [254]:
def add_row_res(model, mse, rsq, y_pred):
    """Build a one-row DataFrame describing a model's test results.

    Args:
        model: Label stored in the 'Model' column.
        mse: Value stored in the 'MSE' column.
        rsq: Value stored in the 'R^2 score' column.
        y_pred: Predictions; paired in order with the columns of the global
            ``compare_table`` from the fourth column onward (extra items on
            either side are dropped by ``zip``).

    Returns:
        pandas.DataFrame with a single row (index 0).
    """
    row = {'Model': model, 'MSE': mse, 'R^2 score': rsq}
    for column, prediction in zip(compare_table.columns[3:], y_pred):
        row[column] = prediction
    return pd.DataFrame(row, index=[0])

In [255]:
# Baseline: ordinary least squares on all score columns.
model = LinearRegression(fit_intercept=True).fit(X_train, y_train)
y_pred = model.predict(X_test)

print(y_pred)
print(y_test)

# Evaluate on the held-out samples.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Append this model's metrics and predictions to the summary table.
new_row = add_row_res('LinReg', mse, r2, y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

[ 0.37289195  0.79825942  1.55405062  0.65329678 -0.56619618  0.70162459
  1.16970087 -1.33483768  1.2052337  -0.60622123 -0.12724337  0.78110657
  1.4670269 ]
2     0.166667
3     0.055556
14    0.388889
32    0.277778
22    0.500000
12    0.722222
11    0.611111
35    0.833333
27    0.944444
28    0.500000
33    0.611111
10    0.055556
29    0.611111
Name: ESG rating, dtype: float64
Mean Squared Error: 0.8722200604728843
R^2 Score: -10.821604364171597


In [256]:
%time

model = RandomForestRegressor(random_state=42)
#model.fit(X_train, y_train)
param_grid = {
    'n_estimators': [90],
    'max_depth': [None],
    'min_samples_split': [10],
    'min_samples_leaf': [1],
    'bootstrap': [True],
    'criterion': ['friedman_mse'],
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Лучшие параметры: ", grid_search.best_params_)
#Лучшие параметры:  {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
y_pred = grid_search.predict(X_test)

# Оценка модели
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
print(y_test)
print(y_pred)

new_row = add_row_res('RandomForest', mse, r2, y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.06 µs
Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 90}
Mean Squared Error: 0.07478768033193954
R^2 Score: -0.013632233726653675
2     0.166667
3     0.055556
14    0.388889
32    0.277778
22    0.500000
12    0.722222
11    0.611111
35    0.833333
27    0.944444
28    0.500000
33    0.611111
10    0.055556
29    0.611111
Name: ESG rating, dtype: float64
[0.52600117 0.5072669  0.63666114 0.48440279 0.60665973 0.67639796
 0.69956635 0.58066196 0.54413933 0.24789729 0.46800782 0.50429027
 0.54625927]


In [257]:
# Single-point grid for the k-nearest-neighbours regressor.
param_grid = {
    'n_neighbors': [20],
    'weights': ['distance'],
    'metric': ['manhattan']
}
# No separate model.fit() here: GridSearchCV clones the estimator and fits the
# clones itself, so a fit on `model` beforehand would be discarded work.
model = KNeighborsRegressor()

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Train the model and search for the best parameters.
grid_search.fit(X_train, y_train)

# Print the best parameters.
print("Лучшие параметры: ", grid_search.best_params_)
y_pred = grid_search.predict(X_test)

# Evaluate the model on the held-out samples.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Append this model's metrics and predictions to the summary table.
new_row = add_row_res('KNN', mse, r2, y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Лучшие параметры:  {'metric': 'manhattan', 'n_neighbors': 20, 'weights': 'distance'}
Mean Squared Error: 0.07296451599230064
R^2 Score: 0.01107796097167979


In [258]:
# Define the (single-point) parameter grid for the search.
param_grid = {
    'max_depth': [None],
    'min_samples_split': [5],
    'min_samples_leaf': [1],
    'max_features': ['log2']
}

# Create the model.
model = DecisionTreeRegressor(random_state=42)

# Create the GridSearchCV object.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Train the model and search for the best parameters.
grid_search.fit(X_train, y_train)

# Print the best parameters.
print("Лучшие параметры: ", grid_search.best_params_)

# Predict on the test data with the best model.
y_pred = grid_search.predict(X_test)

# Evaluate the model on the held-out samples.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
print(y_pred)
# (A bare `y_test` expression used to sit here; it was not the last statement
# of the cell, so it displayed nothing and has been removed.)

# Append this model's metrics and predictions to the summary table.
new_row = add_row_res('DecisionTree', mse, r2, y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Лучшие параметры:  {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5}
Mean Squared Error: 0.08736942070275405
R^2 Score: -0.18415841584158432
[0.11111111 0.5        0.55555556 0.61111111 0.61111111 0.72222222
 0.72222222 0.61111111 0.61111111 0.5        0.11111111 0.38888889
 0.11111111]


In [259]:
# Sanity check: what error does a constant prediction give
# (predicting 0.5 for every test sample)?
y_pred = [0.5] * len(y_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Append the constant baseline to the summary table.
new_row = add_row_res('CoolMonkey', mse, r2, y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

In [260]:
# Final comparison: one row per model with its MSE, R^2 and per-sample test predictions.
compare_table

Unnamed: 0,Model,MSE,R^2 score,0,1,2,3,4,5,6,7,8,9,10,11,12
0,RAEX values,0.0,0.0,0.166667,0.055556,0.388889,0.277778,0.5,0.722222,0.611111,0.833333,0.944444,0.5,0.611111,0.055556,0.611111
1,LinReg,0.87222,-10.821604,0.372892,0.798259,1.554051,0.653297,-0.566196,0.701625,1.169701,-1.334838,1.205234,-0.606221,-0.127243,0.781107,1.467027
2,RandomForest,0.074788,-0.013632,0.526001,0.507267,0.636661,0.484403,0.60666,0.676398,0.699566,0.580662,0.544139,0.247897,0.468008,0.50429,0.546259
3,KNN,0.072965,0.011078,0.414771,0.465161,0.557107,0.446383,0.561817,0.481967,0.582058,0.488739,0.457263,0.44368,0.412505,0.482128,0.471747
4,DecisionTree,0.087369,-0.184158,0.111111,0.5,0.555556,0.611111,0.611111,0.722222,0.722222,0.611111,0.611111,0.5,0.111111,0.388889,0.111111
5,CoolMonkey,0.074074,-0.00396,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
