In [1]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/244.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


In [2]:
from docx import Document
import re
import pandas as pd
import os
import numpy as np

In [3]:
def extract_words_from_docx(file_path):
    with open(file_path, 'r') as file:
      text = file.read()
    return text

In [4]:
from collections import OrderedDict
from transformers import MPNetPreTrainedModel, MPNetModel, AutoTokenizer
import torch

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Definition of ESGify class because of custom,sentence-transformers like, mean pooling function and classifier head
class ESGify(MPNetPreTrainedModel):
    """Model for Classification ESG risks from text."""

    def __init__(self,config): #tuning only the head
        """
        """
        super().__init__(config)
        # Instantiate Parts of model
        self.mpnet = MPNetModel(config,add_pooling_layer=False)
        self.id2label =  config.id2label
        self.label2id =  config.label2id
        self.classifier = torch.nn.Sequential(OrderedDict([('norm',torch.nn.BatchNorm1d(768)),
                                                ('linear',torch.nn.Linear(768,512)),
                                                ('act',torch.nn.ReLU()),
                                                ('batch_n',torch.nn.BatchNorm1d(512)),
                                                ('drop_class', torch.nn.Dropout(0.2)),
                                                ('class_l',torch.nn.Linear(512 ,47))]))


    def forward(self, input_ids, attention_mask):
         # Feed input to mpnet model
        outputs = self.mpnet(input_ids=input_ids,
                             attention_mask=attention_mask)

        # mean pooling dataset and eed input to classifier to compute logits
        logits = self.classifier( mean_pooling(outputs['last_hidden_state'],attention_mask))

        # apply sigmoid
        logits  = 1.0 / (1.0 + torch.exp(-logits))
        return logits

In [5]:
model = ESGify.from_pretrained('ai-lab/ESGify')
tokenizer = AutoTokenizer.from_pretrained('ai-lab/ESGify')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/5.48k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/437M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

In [6]:
print(model.id2label.values())

dict_values(['Legal Proceedings & Law Violations', 'Biodiversity', 'Communities Health and Safety', 'Land Acquisition and Resettlement (S)', 'Emergencies (Social)', 'Corporate Governance', 'Responsible Investment & Greenwashing', 'Not Relevant to ESG', 'Economic Crime', 'Emergencies (Environmental)', 'Hazardous Materials Management', 'Environmental Management', 'Landscape Transformation', 'Human Rights', 'Climate Risks', 'Labor Relations Management', 'Freedom of Association and Right to Organise', 'Employee Health and Safety', 'Surface Water Pollution', 'Animal Welfare', 'Water Consumption', 'Disclosure', 'Product Safety and Quality', 'Greenhouse Gas Emissions', 'Indigenous People', 'Cultural Heritage', 'Air Pollution', 'Waste Management', 'Soil and Groundwater Impact', 'Forced Labour', 'Wastewater Management', 'Natural Resources', 'Physical Impacts', 'Values and Ethics', 'Risk Management and Internal Control', 'Supply Chain (Environmental)', 'Supply Chain (Social)', 'Discrimination', 

In [7]:
df1 = pd.DataFrame(index = model.id2label.values(), columns = ['10'])
df1 = df1.fillna(10)
df1

Unnamed: 0,10
Legal Proceedings & Law Violations,10
Biodiversity,10
Communities Health and Safety,10
Land Acquisition and Resettlement (S),10
Emergencies (Social),10
Corporate Governance,10
Responsible Investment & Greenwashing,10
Not Relevant to ESG,10
Economic Crime,10
Emergencies (Environmental),10


In [8]:
folder_path = '/content/data/'
for root, dirs, files in os.walk(folder_path):
    for file_name in files:
        file_path = os.path.join(root, file_name)
        print(file_path)
        text = extract_words_from_docx(file_path)
        paragraphs = text.split('/n')
        max_res = np.zeros(47)
        for paragraph in paragraphs:
          texts = [paragraph]
          to_model = tokenizer.batch_encode_plus(
                  texts,
                  add_special_tokens=True,
                  max_length= 512,
                  return_token_type_ids=False,
                  padding="longest",
                  truncation=True,
                  return_attention_mask=True,
                  return_tensors='pt',
									         )
          results = model(**to_model)
          ress_1 = results.detach().numpy()
          max_res = np.maximum(max_res, ress_1[0])
        df1[file_path] = max_res

In [9]:
df1 = df1.drop(columns = {'10'})
df1

Legal Proceedings & Law Violations
Biodiversity
Communities Health and Safety
Land Acquisition and Resettlement (S)
Emergencies (Social)
Corporate Governance
Responsible Investment & Greenwashing
Not Relevant to ESG
Economic Crime
Emergencies (Environmental)
Hazardous Materials Management


In [10]:
df1.shape

(47, 0)

In [11]:
#df1.to_csv("Weights_with_paragraphs_new.csv")

In [12]:
from google.colab import files

#files.download('Weights_with_paragraphs.csv')

In [13]:
df1 = pd.read_csv("Weights_with_paragraphs_new.csv")
df1.rename(columns={ 'Unnamed: 0' :  'index' }, inplace=True)
df1.set_index('index', inplace=True)
# df1.drop(columns = df1.columns[0])
df1 = df1.T
df1.reset_index(inplace=True)
df1.rename(columns={ 'index' :  'Company_name' }, inplace=True)
df1.reset_index(drop=True, inplace=True)
df1['Company_name'] = df1['Company_name'].apply(lambda name : name[len('/content/data/'):])
df1

index,Company_name,Legal Proceedings & Law Violations,Biodiversity,Communities Health and Safety,Land Acquisition and Resettlement (S),Emergencies (Social),Corporate Governance,Responsible Investment & Greenwashing,Not Relevant to ESG,Economic Crime,...,Discrimination,Minimum Age and Child Labour,Planning Limitations,Data Safety,Strategy Implementation,Energy Efficiency and Renewables,Land Acquisition and Resettlement (E),Supply Chain (Economic / Governance),Land Rehabilitation,Retrenchment
0,СЕГЕЖА инфа.txt,0.045125,0.094551,0.025876,0.040868,0.075061,0.566426,0.514729,0.495201,0.235149,...,0.374118,0.069972,0.078965,0.180511,0.02408,0.094233,0.108541,0.07587,0.059261,0.117523
1,САМОЛЁТ инфа.txt,0.025009,0.061903,0.020453,0.03526,0.04385,0.630309,0.440806,0.493231,0.149466,...,0.43785,0.055911,0.074751,0.171938,0.026611,0.093256,0.072581,0.081097,0.064717,0.087164
2,Лукойл инфа.txt,0.027974,0.062846,0.015628,0.032729,0.055892,0.549041,0.380366,0.544024,0.172322,...,0.348759,0.066491,0.073473,0.16308,0.023239,0.110797,0.07863,0.069863,0.062046,0.099246
3,ЭН+ ГРУП инфа.txt,0.022794,0.090075,0.01561,0.030713,0.056106,0.607297,0.400765,0.582579,0.172775,...,0.484575,0.074694,0.078245,0.199,0.031634,0.094207,0.079795,0.062724,0.055306,0.084947
4,МЕЧЕЛ инфа.txt,0.023837,0.086233,0.019279,0.031943,0.066247,0.544013,0.451916,0.606429,0.192228,...,0.431164,0.094119,0.066355,0.230393,0.028222,0.146423,0.110235,0.071762,0.064342,0.084335
5,Аэрофлот инфа.txt,0.023541,0.055277,0.020064,0.027003,0.037927,0.473231,0.330453,0.537024,0.119924,...,0.371841,0.039103,0.054184,0.103961,0.024212,0.05672,0.074787,0.089417,0.048368,0.047178
6,Полюс инфа.txt,0.029651,0.061544,0.030379,0.026903,0.039365,0.552986,0.454973,0.570482,0.148577,...,0.340641,0.060605,0.064306,0.155836,0.019436,0.081202,0.063464,0.067147,0.043119,0.062543
7,Татнефть инфа.txt,0.027104,0.139767,0.016457,0.033366,0.105035,0.462689,0.375903,0.516913,0.17896,...,0.403494,0.112418,0.054712,0.251177,0.035685,0.155086,0.136433,0.07595,0.084478,0.120358
8,Транснефть инфа.txt,0.025848,0.066365,0.016379,0.02766,0.056751,0.473676,0.337067,0.646416,0.151827,...,0.43783,0.076602,0.052765,0.134401,0.026969,0.072367,0.064111,0.072693,0.049438,0.054992
9,ПИК инфа.txt,0.028719,0.093617,0.020887,0.037927,0.063723,0.563013,0.455897,0.580182,0.158557,...,0.415579,0.077204,0.088287,0.198868,0.028311,0.099311,0.087027,0.094098,0.050251,0.103554


In [14]:
df1 = df1.rename(index=lambda x: df1['Company_name'][x].lower().replace(' инфа.txt', '')).drop(columns = {'Company_name'})
df1

index,Legal Proceedings & Law Violations,Biodiversity,Communities Health and Safety,Land Acquisition and Resettlement (S),Emergencies (Social),Corporate Governance,Responsible Investment & Greenwashing,Not Relevant to ESG,Economic Crime,Emergencies (Environmental),...,Discrimination,Minimum Age and Child Labour,Planning Limitations,Data Safety,Strategy Implementation,Energy Efficiency and Renewables,Land Acquisition and Resettlement (E),Supply Chain (Economic / Governance),Land Rehabilitation,Retrenchment
сегежа,0.045125,0.094551,0.025876,0.040868,0.075061,0.566426,0.514729,0.495201,0.235149,0.217873,...,0.374118,0.069972,0.078965,0.180511,0.02408,0.094233,0.108541,0.07587,0.059261,0.117523
самолёт,0.025009,0.061903,0.020453,0.03526,0.04385,0.630309,0.440806,0.493231,0.149466,0.172215,...,0.43785,0.055911,0.074751,0.171938,0.026611,0.093256,0.072581,0.081097,0.064717,0.087164
лукойл,0.027974,0.062846,0.015628,0.032729,0.055892,0.549041,0.380366,0.544024,0.172322,0.196258,...,0.348759,0.066491,0.073473,0.16308,0.023239,0.110797,0.07863,0.069863,0.062046,0.099246
эн+ груп,0.022794,0.090075,0.01561,0.030713,0.056106,0.607297,0.400765,0.582579,0.172775,0.172992,...,0.484575,0.074694,0.078245,0.199,0.031634,0.094207,0.079795,0.062724,0.055306,0.084947
мечел,0.023837,0.086233,0.019279,0.031943,0.066247,0.544013,0.451916,0.606429,0.192228,0.211642,...,0.431164,0.094119,0.066355,0.230393,0.028222,0.146423,0.110235,0.071762,0.064342,0.084335
аэрофлот,0.023541,0.055277,0.020064,0.027003,0.037927,0.473231,0.330453,0.537024,0.119924,0.103425,...,0.371841,0.039103,0.054184,0.103961,0.024212,0.05672,0.074787,0.089417,0.048368,0.047178
полюс,0.029651,0.061544,0.030379,0.026903,0.039365,0.552986,0.454973,0.570482,0.148577,0.129249,...,0.340641,0.060605,0.064306,0.155836,0.019436,0.081202,0.063464,0.067147,0.043119,0.062543
татнефть,0.027104,0.139767,0.016457,0.033366,0.105035,0.462689,0.375903,0.516913,0.17896,0.378207,...,0.403494,0.112418,0.054712,0.251177,0.035685,0.155086,0.136433,0.07595,0.084478,0.120358
транснефть,0.025848,0.066365,0.016379,0.02766,0.056751,0.473676,0.337067,0.646416,0.151827,0.200072,...,0.43783,0.076602,0.052765,0.134401,0.026969,0.072367,0.064111,0.072693,0.049438,0.054992
пик,0.028719,0.093617,0.020887,0.037927,0.063723,0.563013,0.455897,0.580182,0.158557,0.188869,...,0.415579,0.077204,0.088287,0.198868,0.028311,0.099311,0.087027,0.094098,0.050251,0.103554


In [15]:
df2 =  pd.read_excel("/content/RAEX_rating.xlsx").drop(columns = ['Код MOEX', 'Подотрасль', 'Год последней оцененной отчетности']).rename(columns = {'Название' : 'index'})
df2 = df2.set_index('index')
df2.index = df2.index.str.strip('«')
df2.index = df2.index.str.strip('»')
df2.index = df2.index.str.split('»').str[0]
df2 = df2.rename(index = lambda x: x.lower())
df2

Unnamed: 0_level_0,Y,ESG-рейтинг,E Rank,E-рейтинг,S Rank,S-рейтинг,G Rank,G-рейтинг
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
сбербанк,1,AA,2,AA,1,AAA,4,AAA
фосагро,2,AA,5,AA,3,AA,2,AAA
полюс,3,AA,1,AA,2,AA,14,AA
татнефть,4,AA,13,A,5,AA,6,AA
роснефть,5,AA,4,AA,22,A,5,AAA
...,...,...,...,...,...,...,...,...
новикомбанк,156,C,114,C,145,C,155-156,C
квадра,157-160,M,157-160,M,157-160,M,157-160,M
русская медная компания,157-160,M,157-160,M,157-160,M,157-160,M
угмк,157-160,M,157-160,M,157-160,M,157-160,M


In [16]:
df1.index

Index(['сегежа', 'самолёт', 'лукойл', 'эн+ груп', 'мечел', 'аэрофлот', 'полюс',
       'татнефть', 'транснефть', 'пик', 'сургутнефтегаз', 'ткс-холдинг',
       'россети', 'гмк норильский никель', 'икс 5 ритейл', 'газпром',
       'русагро', 'нлмк', 'вк', 'втб', 'сбербанк', 'русгидро', 'совкомфлот',
       'глобалтранс', 'магнит', 'роснефть', 'селигдар', 'группа позитив',
       'ммк', 'русал', 'фосагро', 'мтс', 'алроса', 'мосэнерго', 'северсталь',
       'интер рао', 'юнипро', 'банк санкт-петербург', 'мкб', 'новатэк',
       'московская биржа', 'афк система'],
      dtype='object')

In [17]:
df1 = df1.rename(index = {'икс 5 ритейл' : 'x5 group', 'вк' : 'vk', 'гмк норильский никель' : 'норильский никель', 'группа позитив' : 'positive technologies'})

In [18]:
df2 = df2.rename(index = {'афк «система' : 'афк система', 'аэрофлот - российские авиалинии' : 'аэрофлот', 'банк «санкт-петербург' : 'банк санкт-петербург', 'банк втб' : 'втб', 'globaltrans' : 'глобалтранс', 'московский кредитный банк' : 'мкб', 'магнитогорский металлургический комбинат (ммк)' : 'ммк', 'пао "мтс"' : 'мтс', 'группа компаний пик' : 'пик', 'сегежа групп': 'сегежа', 'гк «самолет' : 'самолёт', 'акционерная компания «алроса' : 'алроса', 'пао «селигдар' : 'селигдар'})
df2

Unnamed: 0_level_0,Y,ESG-рейтинг,E Rank,E-рейтинг,S Rank,S-рейтинг,G Rank,G-рейтинг
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
сбербанк,1,AA,2,AA,1,AAA,4,AAA
фосагро,2,AA,5,AA,3,AA,2,AAA
полюс,3,AA,1,AA,2,AA,14,AA
татнефть,4,AA,13,A,5,AA,6,AA
роснефть,5,AA,4,AA,22,A,5,AAA
...,...,...,...,...,...,...,...,...
новикомбанк,156,C,114,C,145,C,155-156,C
квадра,157-160,M,157-160,M,157-160,M,157-160,M
русская медная компания,157-160,M,157-160,M,157-160,M,157-160,M
угмк,157-160,M,157-160,M,157-160,M,157-160,M


In [19]:
for word in sorted(df2.index):
  print(word)

"каустик" (г. волгоград)
"нефтиса", нефтяная компания
"соликамскбумпром"
"титан", группа компаний (деревообработка)
en+ group
global ports
highland gold mining
nordgold
positive technologies
s7 airlines
vk
x5 group
аб «россия
агрохолдинг «степь
азбука вкуса
ак барс
акрон
алмазэргиэнбанк
алроса
альфа-банк
архангельский цбк
афк система
ашинский металлургический завод
аэрофлот
банк санкт-петербург
башкирская содовая компания
башнефть
вбрр
вкусвилл
втб
вымпелком
высочайший
газ
газпром
газпром нефть
газпромбанк
гиперглобус
глобалтранс
группа «эталон
группа компаний «азот
группа лср
группа полипластик
группа синара
группа черкизово
детский мир
евраз
зарубежнефть
золоторудная компания павлик
илим
инарктика
интер рао
интернет решения
иркутская нефтяная компания
камаз
карелия палп
квадра
киви банк
кордиант
корпорация всмпо-ависма
красноярский завод цветных металлов имени в.н. гулидова
кузбассразрезуголь
куйбышевазот
лента
лукойл
м.видео-эльдорадо
магнит
мегафон
международный аэропорт шереметьев

In [20]:
merged_df = df1.merge(df2, left_index=True, right_index=True)

In [21]:
merged_df

Unnamed: 0,Legal Proceedings & Law Violations,Biodiversity,Communities Health and Safety,Land Acquisition and Resettlement (S),Emergencies (Social),Corporate Governance,Responsible Investment & Greenwashing,Not Relevant to ESG,Economic Crime,Emergencies (Environmental),...,Land Rehabilitation,Retrenchment,Y,ESG-рейтинг,E Rank,E-рейтинг,S Rank,S-рейтинг,G Rank,G-рейтинг
сегежа,0.045125,0.094551,0.025876,0.040868,0.075061,0.566426,0.514729,0.495201,0.235149,0.217873,...,0.059261,0.117523,55,B,56,CCC,50,BB,60,BB
самолёт,0.025009,0.061903,0.020453,0.03526,0.04385,0.630309,0.440806,0.493231,0.149466,0.172215,...,0.064717,0.087164,52,BB,60,CCC,47,BB,43,BBB
лукойл,0.027974,0.062846,0.015628,0.032729,0.055892,0.549041,0.380366,0.544024,0.172322,0.196258,...,0.062046,0.099246,17,A,19,BBB,17,A,12,AA
мечел,0.023837,0.086233,0.019279,0.031943,0.066247,0.544013,0.451916,0.606429,0.192228,0.211642,...,0.064342,0.084335,112,CC,117,C,123,CC,95,B
аэрофлот,0.023541,0.055277,0.020064,0.027003,0.037927,0.473231,0.330453,0.537024,0.119924,0.103425,...,0.048368,0.047178,30,BBB,37,B,23,A,28,A
полюс,0.029651,0.061544,0.030379,0.026903,0.039365,0.552986,0.454973,0.570482,0.148577,0.129249,...,0.043119,0.062543,3,AA,1,AA,2,AA,14,AA
татнефть,0.027104,0.139767,0.016457,0.033366,0.105035,0.462689,0.375903,0.516913,0.17896,0.378207,...,0.084478,0.120358,4,AA,13,A,5,AA,6,AA
транснефть,0.025848,0.066365,0.016379,0.02766,0.056751,0.473676,0.337067,0.646416,0.151827,0.200072,...,0.049438,0.054992,94,CCC,82,CC,76,CCC,115,CCC
пик,0.028719,0.093617,0.020887,0.037927,0.063723,0.563013,0.455897,0.580182,0.158557,0.188869,...,0.050251,0.103554,134,CC,133-134,C,149,C,120,CCC
сургутнефтегаз,0.031649,0.094598,0.029158,0.046756,0.081039,0.666742,0.50335,0.532149,0.23382,0.225991,...,0.069793,0.102514,72,CCC,39,B,110,CC,102,B


In [22]:
set1 = set(df1.index)
set2 = set(df2.index)
result = set1.difference(set2)
result

{'мосэнерго', 'ткс-холдинг', 'эн+ груп'}

In [23]:
#merged_df.to_csv("Weights_with_rating.csv")

In [24]:
from google.colab import files

#files.download('Weights_with_rating.csv')

In [25]:
points_dict = {'C' : 1/18, 'CC' : 1/6, 'CCC' : 5/18, 'B' : 7/18, 'BB' : 1/2, 'BBB' : 11/18, 'A' : 13/18, 'AA' : 5/6, 'AAA' : 17/18}
merged_df['ESG-рейтинг'] = merged_df['ESG-рейтинг'].replace(points_dict)
merged_df['E-рейтинг'] = merged_df['E-рейтинг'].replace(points_dict)
merged_df['S-рейтинг'] = merged_df['S-рейтинг'].replace(points_dict)
merged_df['G-рейтинг'] = merged_df['G-рейтинг'].replace(points_dict)
merged_df

Unnamed: 0,Legal Proceedings & Law Violations,Biodiversity,Communities Health and Safety,Land Acquisition and Resettlement (S),Emergencies (Social),Corporate Governance,Responsible Investment & Greenwashing,Not Relevant to ESG,Economic Crime,Emergencies (Environmental),...,Land Rehabilitation,Retrenchment,Y,ESG-рейтинг,E Rank,E-рейтинг,S Rank,S-рейтинг,G Rank,G-рейтинг
сегежа,0.045125,0.094551,0.025876,0.040868,0.075061,0.566426,0.514729,0.495201,0.235149,0.217873,...,0.059261,0.117523,55,0.388889,56,0.277778,50,0.5,60,0.5
самолёт,0.025009,0.061903,0.020453,0.03526,0.04385,0.630309,0.440806,0.493231,0.149466,0.172215,...,0.064717,0.087164,52,0.5,60,0.277778,47,0.5,43,0.611111
лукойл,0.027974,0.062846,0.015628,0.032729,0.055892,0.549041,0.380366,0.544024,0.172322,0.196258,...,0.062046,0.099246,17,0.722222,19,0.611111,17,0.722222,12,0.833333
мечел,0.023837,0.086233,0.019279,0.031943,0.066247,0.544013,0.451916,0.606429,0.192228,0.211642,...,0.064342,0.084335,112,0.166667,117,0.055556,123,0.166667,95,0.388889
аэрофлот,0.023541,0.055277,0.020064,0.027003,0.037927,0.473231,0.330453,0.537024,0.119924,0.103425,...,0.048368,0.047178,30,0.611111,37,0.388889,23,0.722222,28,0.722222
полюс,0.029651,0.061544,0.030379,0.026903,0.039365,0.552986,0.454973,0.570482,0.148577,0.129249,...,0.043119,0.062543,3,0.833333,1,0.833333,2,0.833333,14,0.833333
татнефть,0.027104,0.139767,0.016457,0.033366,0.105035,0.462689,0.375903,0.516913,0.17896,0.378207,...,0.084478,0.120358,4,0.833333,13,0.722222,5,0.833333,6,0.833333
транснефть,0.025848,0.066365,0.016379,0.02766,0.056751,0.473676,0.337067,0.646416,0.151827,0.200072,...,0.049438,0.054992,94,0.277778,82,0.166667,76,0.277778,115,0.277778
пик,0.028719,0.093617,0.020887,0.037927,0.063723,0.563013,0.455897,0.580182,0.158557,0.188869,...,0.050251,0.103554,134,0.166667,133-134,0.055556,149,0.055556,120,0.277778
сургутнефтегаз,0.031649,0.094598,0.029158,0.046756,0.081039,0.666742,0.50335,0.532149,0.23382,0.225991,...,0.069793,0.102514,72,0.277778,39,0.388889,110,0.166667,102,0.388889


In [26]:
#merged_df.to_csv("Weights_with_rating_number.csv")

In [27]:
from google.colab import files

#files.download('Weights_with_rating.csv')

In [28]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
df1

index,Legal Proceedings & Law Violations,Biodiversity,Communities Health and Safety,Land Acquisition and Resettlement (S),Emergencies (Social),Corporate Governance,Responsible Investment & Greenwashing,Not Relevant to ESG,Economic Crime,Emergencies (Environmental),...,Discrimination,Minimum Age and Child Labour,Planning Limitations,Data Safety,Strategy Implementation,Energy Efficiency and Renewables,Land Acquisition and Resettlement (E),Supply Chain (Economic / Governance),Land Rehabilitation,Retrenchment
сегежа,0.045125,0.094551,0.025876,0.040868,0.075061,0.566426,0.514729,0.495201,0.235149,0.217873,...,0.374118,0.069972,0.078965,0.180511,0.02408,0.094233,0.108541,0.07587,0.059261,0.117523
самолёт,0.025009,0.061903,0.020453,0.03526,0.04385,0.630309,0.440806,0.493231,0.149466,0.172215,...,0.43785,0.055911,0.074751,0.171938,0.026611,0.093256,0.072581,0.081097,0.064717,0.087164
лукойл,0.027974,0.062846,0.015628,0.032729,0.055892,0.549041,0.380366,0.544024,0.172322,0.196258,...,0.348759,0.066491,0.073473,0.16308,0.023239,0.110797,0.07863,0.069863,0.062046,0.099246
эн+ груп,0.022794,0.090075,0.01561,0.030713,0.056106,0.607297,0.400765,0.582579,0.172775,0.172992,...,0.484575,0.074694,0.078245,0.199,0.031634,0.094207,0.079795,0.062724,0.055306,0.084947
мечел,0.023837,0.086233,0.019279,0.031943,0.066247,0.544013,0.451916,0.606429,0.192228,0.211642,...,0.431164,0.094119,0.066355,0.230393,0.028222,0.146423,0.110235,0.071762,0.064342,0.084335
аэрофлот,0.023541,0.055277,0.020064,0.027003,0.037927,0.473231,0.330453,0.537024,0.119924,0.103425,...,0.371841,0.039103,0.054184,0.103961,0.024212,0.05672,0.074787,0.089417,0.048368,0.047178
полюс,0.029651,0.061544,0.030379,0.026903,0.039365,0.552986,0.454973,0.570482,0.148577,0.129249,...,0.340641,0.060605,0.064306,0.155836,0.019436,0.081202,0.063464,0.067147,0.043119,0.062543
татнефть,0.027104,0.139767,0.016457,0.033366,0.105035,0.462689,0.375903,0.516913,0.17896,0.378207,...,0.403494,0.112418,0.054712,0.251177,0.035685,0.155086,0.136433,0.07595,0.084478,0.120358
транснефть,0.025848,0.066365,0.016379,0.02766,0.056751,0.473676,0.337067,0.646416,0.151827,0.200072,...,0.43783,0.076602,0.052765,0.134401,0.026969,0.072367,0.064111,0.072693,0.049438,0.054992
пик,0.028719,0.093617,0.020887,0.037927,0.063723,0.563013,0.455897,0.580182,0.158557,0.188869,...,0.415579,0.077204,0.088287,0.198868,0.028311,0.099311,0.087027,0.094098,0.050251,0.103554


In [30]:
from google.colab import files

#files.download('weights_with_rating.csv')

In [31]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import (ConfusionMatrixDisplay, accuracy_score,
                             classification_report, confusion_matrix, f1_score,
                             make_scorer, precision_score, recall_score,
                             roc_auc_score)
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [32]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [33]:
merged_df

Unnamed: 0,Legal Proceedings & Law Violations,Biodiversity,Communities Health and Safety,Land Acquisition and Resettlement (S),Emergencies (Social),Corporate Governance,Responsible Investment & Greenwashing,Not Relevant to ESG,Economic Crime,Emergencies (Environmental),...,Land Rehabilitation,Retrenchment,Y,ESG-рейтинг,E Rank,E-рейтинг,S Rank,S-рейтинг,G Rank,G-рейтинг
сегежа,0.045125,0.094551,0.025876,0.040868,0.075061,0.566426,0.514729,0.495201,0.235149,0.217873,...,0.059261,0.117523,55,0.388889,56,0.277778,50,0.5,60,0.5
самолёт,0.025009,0.061903,0.020453,0.03526,0.04385,0.630309,0.440806,0.493231,0.149466,0.172215,...,0.064717,0.087164,52,0.5,60,0.277778,47,0.5,43,0.611111
лукойл,0.027974,0.062846,0.015628,0.032729,0.055892,0.549041,0.380366,0.544024,0.172322,0.196258,...,0.062046,0.099246,17,0.722222,19,0.611111,17,0.722222,12,0.833333
мечел,0.023837,0.086233,0.019279,0.031943,0.066247,0.544013,0.451916,0.606429,0.192228,0.211642,...,0.064342,0.084335,112,0.166667,117,0.055556,123,0.166667,95,0.388889
аэрофлот,0.023541,0.055277,0.020064,0.027003,0.037927,0.473231,0.330453,0.537024,0.119924,0.103425,...,0.048368,0.047178,30,0.611111,37,0.388889,23,0.722222,28,0.722222
полюс,0.029651,0.061544,0.030379,0.026903,0.039365,0.552986,0.454973,0.570482,0.148577,0.129249,...,0.043119,0.062543,3,0.833333,1,0.833333,2,0.833333,14,0.833333
татнефть,0.027104,0.139767,0.016457,0.033366,0.105035,0.462689,0.375903,0.516913,0.17896,0.378207,...,0.084478,0.120358,4,0.833333,13,0.722222,5,0.833333,6,0.833333
транснефть,0.025848,0.066365,0.016379,0.02766,0.056751,0.473676,0.337067,0.646416,0.151827,0.200072,...,0.049438,0.054992,94,0.277778,82,0.166667,76,0.277778,115,0.277778
пик,0.028719,0.093617,0.020887,0.037927,0.063723,0.563013,0.455897,0.580182,0.158557,0.188869,...,0.050251,0.103554,134,0.166667,133-134,0.055556,149,0.055556,120,0.277778
сургутнефтегаз,0.031649,0.094598,0.029158,0.046756,0.081039,0.666742,0.50335,0.532149,0.23382,0.225991,...,0.069793,0.102514,72,0.277778,39,0.388889,110,0.166667,102,0.388889


In [118]:
target = 'ESG-рейтинг'
print(merged_df.columns)
X = merged_df.drop(columns=[target, 'Y', 'E Rank', 'S Rank', 'G Rank',
                                         'E-рейтинг', 'S-рейтинг', 'G-рейтинг'])
y = merged_df[target]

rand_seed = 200
n_butches = 10

X_train = [None for _ in range(n_butches)]
X_test = [None for _ in range(n_butches)]
y_train = [None for _ in range(n_butches)]
y_test = [None for _ in range(n_butches)]

for i in range(n_butches):
    X_train[i], X_test[i], y_train[i], y_test[i] = train_test_split(X, y, test_size=0.25, random_state=rand_seed * (i + 1), shuffle=True, stratify=y)


y_test[0].values

Index(['Legal Proceedings & Law Violations', 'Biodiversity',
       'Communities Health and Safety',
       'Land Acquisition and Resettlement (S)', 'Emergencies (Social)',
       'Corporate Governance', 'Responsible Investment & Greenwashing',
       'Not Relevant to ESG', 'Economic Crime', 'Emergencies (Environmental)',
       'Hazardous Materials Management', 'Environmental Management',
       'Landscape Transformation', 'Human Rights', 'Climate Risks',
       'Labor Relations Management',
       'Freedom of Association and Right to Organise',
       'Employee Health and Safety', 'Surface Water Pollution',
       'Animal Welfare', 'Water Consumption', 'Disclosure',
       'Product Safety and Quality', 'Greenhouse Gas Emissions',
       'Indigenous People', 'Cultural Heritage', 'Air Pollution',
       'Waste Management', 'Soil and Groundwater Impact', 'Forced Labour',
       'Wastewater Management', 'Natural Resources', 'Physical Impacts',
       'Values and Ethics', 'Risk Management

array([0.38888889, 0.72222222, 0.61111111, 0.72222222, 0.16666667,
       0.27777778, 0.61111111, 0.27777778, 0.5       , 0.83333333])

In [119]:
#сделаем красивую общую табличку с резами.
compare_table = pd.DataFrame(columns=['Model', 'MSE', 'R^2 score'] + [str(i) for i in range(len(y_test[-1]))])
new_row = pd.DataFrame({'Model': 'RAEX values', 'MSE': 0, 'R^2 score': 0, **dict(zip(compare_table.columns[3:], y_test[-1].values))}, index=[0])
compare_table = pd.concat([compare_table, new_row], ignore_index=True)
compare_table


Unnamed: 0,Model,MSE,R^2 score,0,1,2,3,4,5,6,7,8,9
0,RAEX values,0,0,0.388889,0.611111,0.277778,0.5,0.611111,0.277778,0.166667,0.722222,0.833333,0.833333


In [120]:
def add_row_res(model, mse, rsq, y_pred):
    return pd.DataFrame({'Model': model, 'MSE': mse, 'R^2 score': rsq, **dict(zip(compare_table.columns[3:], y_pred))}, index=[0])

In [121]:
mse = [0 for _ in range(n_butches)]
r2 = [0 for _ in range(n_butches)]
for i in range(n_butches):
    model = LinearRegression(fit_intercept = True)
    model.fit(X_train[i], y_train[i])

    y_pred = model.predict(X_test[i])

    # print(y_pred)
    # print(y_test)

    mse[i] = mean_squared_error(y_test[i], y_pred)
    r2[i] = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('LinReg', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Mean Squared Error: 0.28298430404883124
R^2 Score: -4.932865370229651


In [122]:
%time

model = RandomForestRegressor(random_state=42)
#model.fit(X_train, y_train)
param_grid = {
    'n_estimators': [50],
    'max_depth': [None],
    'min_samples_split': [20],
    'min_samples_leaf': [1],
    'bootstrap': [True],
    'criterion': ['friedman_mse']
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

#print("Лучшие параметры: ", grid_search.best_params_)
#Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 50}
#Mean Squared Error: 0.05508845985402971
for i in range(n_butches):

    grid_search.fit(X_train[i], y_train[i])
    y_pred = grid_search.predict(X_test[i])

    # Оценка модели
    mse[i] = mean_squared_error(y_test[i], y_pred)
    r2[i] = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('RandomForest', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs
Mean Squared Error: 0.0528779256641844
R^2 Score: -0.11586787097248656


In [123]:
param_grid = {
    'n_neighbors': [19],
    'weights': ['uniform'],
    'metric': ['chebyshev'],
}
model = KNeighborsRegressor()
model.fit(X_train[-1], y_train[-1])

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Обучение модели и поиск оптимальных параметров

# Вывод лучших параметров
#print("Лучшие параметры: ", grid_search.best_params_)
# Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 19, 'weights': 'uniform'}
# Mean Squared Error: 0.04452367110107953

# Оценка модели
for i in range(n_butches):
    grid_search.fit(X_train[i], y_train[i])
    y_pred = grid_search.predict(X_test[i])

    mse = mean_squared_error(y_test[i], y_pred)
    r2 = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('KNN', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Mean Squared Error: 0.05406449847816423
R^2 Score: -0.052698167483486325


In [124]:
# Определение параметров для поиска
param_grid = {
    'max_depth': [None],
    'min_samples_split': [20],
    'min_samples_leaf': [9],
    'max_features': ['sqrt'],
}

# Создание модели
model = DecisionTreeRegressor(random_state=42)

# Создание объекта GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Обучение модели и поиск оптимальных параметров


# Вывод лучших параметров
# print("Лучшие параметры: ", grid_search.best_params_)

# Лучшие параметры:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 9, 'min_samples_split': 20}
# Mean Squared Error: 0.050617283950617285
for i in range(n_butches):
    grid_search.fit(X_train[i], y_train[i])
    # Предсказание на тестовых данных с использованием лучшей модели
    y_pred = grid_search.predict(X_test[i])
    # Оценка модели
    mse = mean_squared_error(y_test[i], y_pred)
    r2 = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")
print(y_pred)

new_row = add_row_res('DecisionTree', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Mean Squared Error: 0.06432171816787202
R^2 Score: -0.2524180700955849
[0.61111111 0.43162393 0.43162393 0.43162393 0.43162393 0.43162393
 0.43162393 0.43162393 0.43162393 0.43162393]


In [125]:
#Проверка на дурака - какая ошибка будет у константного решения (матожидание = 0.5+-eps)
y_pred = [0.54 for i in range(len(y_test[-1]))]

mse = mean_squared_error(y_test[-1], y_pred)
r2 = r2_score(y_test[-1], y_pred)

new_row = add_row_res('CoolMonkey', mse, r2, y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

In [126]:
compare_table

Unnamed: 0,Model,MSE,R^2 score,0,1,2,3,4,5,6,7,8,9
0,RAEX values,0.0,0.0,0.388889,0.611111,0.277778,0.5,0.611111,0.277778,0.166667,0.722222,0.833333,0.833333
1,LinReg,0.282984,-4.932865,0.870943,-0.026127,-0.45305,0.025419,0.565073,-0.837588,0.215785,2.08188,0.482407,1.115623
2,RandomForest,0.052878,-0.115868,0.519299,0.514601,0.509806,0.557664,0.524954,0.505151,0.562756,0.529197,0.524966,0.525025
3,KNN,0.054064,-0.052698,0.535088,0.505848,0.482456,0.55848,0.5,0.5,0.55848,0.511696,0.505848,0.523392
4,DecisionTree,0.064322,-0.252418,0.611111,0.431624,0.431624,0.431624,0.431624,0.431624,0.431624,0.431624,0.431624,0.431624
5,CoolMonkey,0.051674,-0.006154,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54


Теперь попробуем обучить для каждой из букв E, S, G

# E rating

In [177]:
target = 'E-рейтинг'
print(merged_df.columns)
X = merged_df.drop(columns=[target, 'Y', 'E Rank', 'S Rank', 'G Rank',
                                         'ESG-рейтинг', 'S-рейтинг', 'G-рейтинг'])
y = merged_df[target]

rand_seed = 200
n_butches = 10

X_train = [None for _ in range(n_butches)]
X_test = [None for _ in range(n_butches)]
y_train = [None for _ in range(n_butches)]
y_test = [None for _ in range(n_butches)]

for i in range(n_butches):
    X_train[i], X_test[i], y_train[i], y_test[i] = train_test_split(X, y, test_size=0.25, random_state=rand_seed * (i + 1), shuffle=True)


y

Index(['Legal Proceedings & Law Violations', 'Biodiversity',
       'Communities Health and Safety',
       'Land Acquisition and Resettlement (S)', 'Emergencies (Social)',
       'Corporate Governance', 'Responsible Investment & Greenwashing',
       'Not Relevant to ESG', 'Economic Crime', 'Emergencies (Environmental)',
       'Hazardous Materials Management', 'Environmental Management',
       'Landscape Transformation', 'Human Rights', 'Climate Risks',
       'Labor Relations Management',
       'Freedom of Association and Right to Organise',
       'Employee Health and Safety', 'Surface Water Pollution',
       'Animal Welfare', 'Water Consumption', 'Disclosure',
       'Product Safety and Quality', 'Greenhouse Gas Emissions',
       'Indigenous People', 'Cultural Heritage', 'Air Pollution',
       'Waste Management', 'Soil and Groundwater Impact', 'Forced Labour',
       'Wastewater Management', 'Natural Resources', 'Physical Impacts',
       'Values and Ethics', 'Risk Management

сегежа                   0.277778
самолёт                  0.277778
лукойл                   0.611111
мечел                    0.055556
аэрофлот                 0.388889
полюс                    0.833333
татнефть                 0.722222
транснефть               0.166667
пик                      0.055556
сургутнефтегаз           0.388889
россети                  0.388889
норильский никель        0.722222
x5 group                 0.611111
газпром                  0.500000
русагро                  0.055556
нлмк                     0.500000
vk                       0.388889
втб                      0.055556
сбербанк                 0.833333
русгидро                 0.500000
совкомфлот               0.166667
глобалтранс              0.166667
магнит                   0.277778
роснефть                 0.833333
селигдар                 0.277778
positive technologies    0.055556
ммк                      0.722222
русал                    0.722222
фосагро                  0.833333
мтс           

In [178]:
#сделаем красивую общую табличку с резами.
compare_table = pd.DataFrame(columns=['Model', 'MSE', 'R^2 score'] + [str(i) for i in range(len(y_test[-1]))])
new_row = pd.DataFrame({'Model': 'E values', 'MSE': 0, 'R^2 score': 0, **dict(zip(compare_table.columns[3:], y_test[-1].values))}, index=[0])
compare_table = pd.concat([compare_table, new_row], ignore_index=True)
compare_table


Unnamed: 0,Model,MSE,R^2 score,0,1,2,3,4,5,6,7,8,9
0,E values,0,0,0.833333,0.722222,0.166667,0.277778,0.833333,0.055556,0.5,0.5,0.277778,0.722222


In [179]:
mse = [0 for _ in range(n_butches)]
r2 = [0 for _ in range(n_butches)]
for i in range(n_butches):
    model = LinearRegression(fit_intercept = True)
    model.fit(X_train[i], y_train[i])

    y_pred = model.predict(X_test[i])

    # print(y_pred)
    # print(y_test)

    mse[i] = mean_squared_error(y_test[i], y_pred)
    r2[i] = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('LinReg', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Mean Squared Error: 0.6147851975885477
R^2 Score: -9.273566557213819


In [180]:
%time
print(mse)
model = RandomForestRegressor(random_state=42)
#model.fit(X_train, y_train)
param_grid_try = {
    'n_estimators': [10, 25, 50, 75, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [5, 10, 15, 20],
    'min_samples_leaf': [1, 5, 7],
    'bootstrap': [True],
    'criterion': ['friedman_mse']
}
param_grid = {
    'n_estimators': [25],
    'max_depth': [None],
    'min_samples_split': [15],
    'min_samples_leaf': [1],
    'bootstrap': [True],
    'criterion': ['friedman_mse']
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

#print("Лучшие параметры: ", grid_search.best_params_)
#Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 50}
#Mean Squared Error: 0.05508845985402971
for i in range(n_butches):

    grid_search.fit(X_train[i], y_train[i])
    print("Лучшие параметры: ", grid_search.best_params_)
    y_pred = grid_search.predict(X_test[i])
    print(mse)
    # Оценка модели
    mse[i] = mean_squared_error(y_test[i], y_pred)
    r2[i] = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('RandomForest', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.58 µs
[1.0612033537994645, 0.4931666798704956, 0.3193159926238077, 0.4836100610538841, 0.557536965483411, 0.6993635694557818, 0.8493536500958733, 0.8416563868315239, 0.34074697950353156, 0.5018983371677024]


  pid = os.fork()


Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 25}
[1.0612033537994645, 0.4931666798704956, 0.3193159926238077, 0.4836100610538841, 0.557536965483411, 0.6993635694557818, 0.8493536500958733, 0.8416563868315239, 0.34074697950353156, 0.5018983371677024]
Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 25}
[0.09519665580963295, 0.4931666798704956, 0.3193159926238077, 0.4836100610538841, 0.557536965483411, 0.6993635694557818, 0.8493536500958733, 0.8416563868315239, 0.34074697950353156, 0.5018983371677024]
Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 25}
[0.09519665580963295, 0.05023413257064234, 0.3193159926238077, 0.4836100610538841, 0.557536965483411, 0.6993635694557818, 0.84935365

In [181]:
param_grid_try = {
    'n_neighbors': [5, 10, 15, 20, 25, 30, 40, 50, 70, 100],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski', 'chebyshev', 'seuclidean', 'mahalanobis', 'wminkowski'],
}
param_grid = {
    'n_neighbors': [20],
    'weights': ['uniform'],
    'metric': ['chebyshev'],
}
model = KNeighborsRegressor()
model.fit(X_train[-1], y_train[-1])

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Обучение модели и поиск оптимальных параметров

# Вывод лучших параметров
#print("Лучшие параметры: ", grid_search.best_params_)
# Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 19, 'weights': 'uniform'}
# Mean Squared Error: 0.04452367110107953

# Оценка модели
for i in range(n_butches):
    grid_search.fit(X_train[i], y_train[i])
    print("Лучшие параметры: ", grid_search.best_params_)
    y_pred = grid_search.predict(X_test[i])

    mse = mean_squared_error(y_test[i], y_pred)
    r2 = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('KNN', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Mean Squared Error: 0.0859104938271605
R^2 Score: -0.18145161290322598


In [182]:
# Определение параметров для поиска
param_grid_try = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 5],
    'max_features': [None],
}
param_grid = {
    'max_depth': [None],
    'min_samples_split': [10],
    'min_samples_leaf': [2],
    'max_features': [None],
}

# Создание модели
model = DecisionTreeRegressor(random_state=42)

# Создание объекта GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Обучение модели и поиск оптимальных параметров


# Вывод лучших параметров
# print("Лучшие параметры: ", grid_search.best_params_)

# Лучшие параметры:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 9, 'min_samples_split': 20}
# Mean Squared Error: 0.050617283950617285
for i in range(n_butches):
    grid_search.fit(X_train[i], y_train[i])
    print("Лучшие параметры: ", grid_search.best_params_)
    # Предсказание на тестовых данных с использованием лучшей модели
    y_pred = grid_search.predict(X_test[i])
    # Оценка модели
    mse = mean_squared_error(y_test[i], y_pred)
    r2 = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")
print(y_pred)

new_row = add_row_res('DecisionTree', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  

In [183]:
#Проверка на дурака - какая ошибка будет у константного решения (матожидание = 0.5+-eps)
y_pred = [0.54 for i in range(len(y_test[-1]))]

mse = mean_squared_error(y_test[-1], y_pred)
r2 = r2_score(y_test[-1], y_pred)

new_row = add_row_res('CoolMonkey', mse, r2, y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

In [184]:
compare_table

Unnamed: 0,Model,MSE,R^2 score,0,1,2,3,4,5,6,7,8,9
0,E values,0.0,0.0,0.833333,0.722222,0.166667,0.277778,0.833333,0.055556,0.5,0.5,0.277778,0.722222
1,LinReg,0.614785,-9.273567,1.068549,-1.041066,-0.04012,0.620831,0.217229,0.982204,0.102848,0.132288,0.15477,1.106529
2,RandomForest,0.08021,-0.284653,0.386852,0.44508,0.375057,0.405353,0.460355,0.539516,0.437048,0.378653,0.47026,0.417812
3,KNN,0.08591,-0.181452,0.344444,0.416667,0.4,0.366667,0.394444,0.388889,0.422222,0.338889,0.377778,0.377778
4,DecisionTree,0.091805,-0.262519,0.527778,0.420635,0.111111,0.420635,0.111111,0.111111,0.420635,0.103175,0.420635,0.759259
5,CoolMonkey,0.075328,-0.035925,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54


# S rating

In [186]:
target = 'S-рейтинг'
print(merged_df.columns)
X = merged_df.drop(columns=[target, 'Y', 'E Rank', 'S Rank', 'G Rank',
                                         'ESG-рейтинг', 'E-рейтинг', 'G-рейтинг'])
y = merged_df[target]
rand_seed = 200
n_butches = 10

X_train = [None for _ in range(n_butches)]
X_test = [None for _ in range(n_butches)]
y_train = [None for _ in range(n_butches)]
y_test = [None for _ in range(n_butches)]

for i in range(n_butches):
    X_train[i], X_test[i], y_train[i], y_test[i] = train_test_split(X, y, test_size=0.25, random_state=rand_seed * (i + 1), shuffle=True)


y_test[-1].values

Index(['Legal Proceedings & Law Violations', 'Biodiversity',
       'Communities Health and Safety',
       'Land Acquisition and Resettlement (S)', 'Emergencies (Social)',
       'Corporate Governance', 'Responsible Investment & Greenwashing',
       'Not Relevant to ESG', 'Economic Crime', 'Emergencies (Environmental)',
       'Hazardous Materials Management', 'Environmental Management',
       'Landscape Transformation', 'Human Rights', 'Climate Risks',
       'Labor Relations Management',
       'Freedom of Association and Right to Organise',
       'Employee Health and Safety', 'Surface Water Pollution',
       'Animal Welfare', 'Water Consumption', 'Disclosure',
       'Product Safety and Quality', 'Greenhouse Gas Emissions',
       'Indigenous People', 'Cultural Heritage', 'Air Pollution',
       'Waste Management', 'Soil and Groundwater Impact', 'Forced Labour',
       'Wastewater Management', 'Natural Resources', 'Physical Impacts',
       'Values and Ethics', 'Risk Management

array([0.83333333, 0.83333333, 0.27777778, 0.72222222, 0.94444444,
       0.27777778, 0.38888889, 0.61111111, 0.61111111, 0.83333333])

In [187]:
#сделаем красивую общую табличку с резами.
compare_table = pd.DataFrame(columns=['Model', 'MSE', 'R^2 score'] + [str(i) for i in range(len(y_test[-1]))])
new_row = pd.DataFrame({'Model': 'E values', 'MSE': 0, 'R^2 score': 0, **dict(zip(compare_table.columns[3:], y_test[-1].values))}, index=[0])
compare_table = pd.concat([compare_table, new_row], ignore_index=True)
compare_table


Unnamed: 0,Model,MSE,R^2 score,0,1,2,3,4,5,6,7,8,9
0,E values,0,0,0.833333,0.833333,0.277778,0.722222,0.944444,0.277778,0.388889,0.611111,0.611111,0.833333


In [188]:
mse = [0 for _ in range(n_butches)]
r2 = [0 for _ in range(n_butches)]
for i in range(n_butches):
    model = LinearRegression(fit_intercept = True)
    model.fit(X_train[i], y_train[i])

    y_pred = model.predict(X_test[i])

    # print(y_pred)
    # print(y_test)

    mse[i] = mean_squared_error(y_test[i], y_pred)
    r2[i] = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('LinReg', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Mean Squared Error: 0.5537179492874631
R^2 Score: -10.481536425232354


In [189]:
%time
print(mse)
model = RandomForestRegressor(random_state=42)
#model.fit(X_train, y_train)
param_grid_try = {
    'n_estimators': [10, 25, 50, 75, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [5, 10, 15, 20],
    'min_samples_leaf': [1, 5, 7],
    'bootstrap': [True],
    'criterion': ['friedman_mse']
}
param_grid = {
    'n_estimators': [25],
    'max_depth': [None],
    'min_samples_split': [15],
    'min_samples_leaf': [1],
    'bootstrap': [True],
    'criterion': ['friedman_mse']
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

#print("Лучшие параметры: ", grid_search.best_params_)
#Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 50}
#Mean Squared Error: 0.05508845985402971
for i in range(n_butches):

    grid_search.fit(X_train[i], y_train[i])
    print("Лучшие параметры: ", grid_search.best_params_)
    y_pred = grid_search.predict(X_test[i])
    print(mse)
    # Оценка модели
    mse[i] = mean_squared_error(y_test[i], y_pred)
    r2[i] = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('RandomForest', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.11 µs
[1.3257937318203061, 0.3315037447141355, 0.3387809888007506, 0.4763872368196053, 0.33160030386408795, 0.3547795458087758, 0.7283360982036118, 0.5749891698202255, 0.49234356162757076, 0.5826651113955611]


  pid = os.fork()


Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 25}
[1.3257937318203061, 0.3315037447141355, 0.3387809888007506, 0.4763872368196053, 0.33160030386408795, 0.3547795458087758, 0.7283360982036118, 0.5749891698202255, 0.49234356162757076, 0.5826651113955611]
Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 25}
[0.05695521080295525, 0.3315037447141355, 0.3387809888007506, 0.4763872368196053, 0.33160030386408795, 0.3547795458087758, 0.7283360982036118, 0.5749891698202255, 0.49234356162757076, 0.5826651113955611]
Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 25}
[0.05695521080295525, 0.037396974301206885, 0.3387809888007506, 0.4763872368196053, 0.33160030386408795, 0.3547795458087758, 0.7

In [190]:
param_grid_try = {
    'n_neighbors': [5, 10, 15, 20, 25, 30, 40, 50, 70, 100],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski', 'chebyshev', 'seuclidean', 'mahalanobis', 'wminkowski'],
}
param_grid = {
    'n_neighbors': [20],
    'weights': ['uniform'],
    'metric': ['chebyshev'],
}
model = KNeighborsRegressor()
model.fit(X_train[-1], y_train[-1])

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Обучение модели и поиск оптимальных параметров

# Вывод лучших параметров
#print("Лучшие параметры: ", grid_search.best_params_)
# Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 19, 'weights': 'uniform'}
# Mean Squared Error: 0.04452367110107953

# Оценка модели
for i in range(n_butches):
    grid_search.fit(X_train[i], y_train[i])
    print("Лучшие параметры: ", grid_search.best_params_)
    y_pred = grid_search.predict(X_test[i])

    mse = mean_squared_error(y_test[i], y_pred)
    r2 = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('KNN', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Mean Squared Error: 0.07841975308641976
R^2 Score: -0.4568807339449543


In [191]:
# Определение параметров для поиска
param_grid_try = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 5],
    'max_features': [None],
}
param_grid = {
    'max_depth': [None],
    'min_samples_split': [10],
    'min_samples_leaf': [2],
    'max_features': [None],
}

# Создание модели
model = DecisionTreeRegressor(random_state=42)

# Создание объекта GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Обучение модели и поиск оптимальных параметров


# Вывод лучших параметров
# print("Лучшие параметры: ", grid_search.best_params_)

# Лучшие параметры:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 9, 'min_samples_split': 20}
# Mean Squared Error: 0.050617283950617285
for i in range(n_butches):
    grid_search.fit(X_train[i], y_train[i])
    print("Лучшие параметры: ", grid_search.best_params_)
    # Предсказание на тестовых данных с использованием лучшей модели
    y_pred = grid_search.predict(X_test[i])
    # Оценка модели
    mse = mean_squared_error(y_test[i], y_pred)
    r2 = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")
print(y_pred)

new_row = add_row_res('DecisionTree', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  

In [192]:
#Проверка на дурака - какая ошибка будет у константного решения (матожидание = 0.5+-eps)
y_pred = [0.54 for i in range(len(y_test[-1]))]

mse = mean_squared_error(y_test[-1], y_pred)
r2 = r2_score(y_test[-1], y_pred)

new_row = add_row_res('CoolMonkey', mse, r2, y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

In [193]:
compare_table

Unnamed: 0,Model,MSE,R^2 score,0,1,2,3,4,5,6,7,8,9
0,E values,0.0,0.0,0.833333,0.833333,0.277778,0.722222,0.944444,0.277778,0.388889,0.611111,0.611111,0.833333
1,LinReg,0.553718,-10.481536,0.991731,-0.79291,0.700871,0.55235,0.415192,0.554428,0.379476,-0.380485,-0.40916,1.587046
2,RandomForest,0.07727,-0.499838,0.467122,0.471644,0.466143,0.429692,0.406613,0.507345,0.561025,0.479517,0.517412,0.501363
3,KNN,0.07842,-0.456881,0.472222,0.516667,0.494444,0.461111,0.5,0.505556,0.544444,0.45,0.483333,0.483333
4,DecisionTree,0.118748,-1.206093,0.333333,0.574074,0.333333,0.416667,0.333333,0.333333,0.769841,0.333333,0.574074,0.416667
5,CoolMonkey,0.062538,-0.161835,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54


G rating

In [194]:
target = 'G-рейтинг'
print(merged_df.columns)
X = merged_df.drop(columns=[target, 'Y', 'E Rank', 'S Rank', 'G Rank',
                                         'ESG-рейтинг', 'E-рейтинг', 'S-рейтинг'])
y = merged_df[target]
rand_seed = 200
n_butches = 10

X_train = [None for _ in range(n_butches)]
X_test = [None for _ in range(n_butches)]
y_train = [None for _ in range(n_butches)]
y_test = [None for _ in range(n_butches)]

for i in range(n_butches):
    X_train[i], X_test[i], y_train[i], y_test[i] = train_test_split(X, y, test_size=0.25, random_state=rand_seed * (i + 1), shuffle=True)


y_test[-1].values

Index(['Legal Proceedings & Law Violations', 'Biodiversity',
       'Communities Health and Safety',
       'Land Acquisition and Resettlement (S)', 'Emergencies (Social)',
       'Corporate Governance', 'Responsible Investment & Greenwashing',
       'Not Relevant to ESG', 'Economic Crime', 'Emergencies (Environmental)',
       'Hazardous Materials Management', 'Environmental Management',
       'Landscape Transformation', 'Human Rights', 'Climate Risks',
       'Labor Relations Management',
       'Freedom of Association and Right to Organise',
       'Employee Health and Safety', 'Surface Water Pollution',
       'Animal Welfare', 'Water Consumption', 'Disclosure',
       'Product Safety and Quality', 'Greenhouse Gas Emissions',
       'Indigenous People', 'Cultural Heritage', 'Air Pollution',
       'Waste Management', 'Soil and Groundwater Impact', 'Forced Labour',
       'Wastewater Management', 'Natural Resources', 'Physical Impacts',
       'Values and Ethics', 'Risk Management

array([0.83333333, 0.83333333, 0.38888889, 0.83333333, 0.94444444,
       0.38888889, 0.72222222, 0.72222222, 0.61111111, 0.72222222])

In [195]:
#сделаем красивую общую табличку с резами.
compare_table = pd.DataFrame(columns=['Model', 'MSE', 'R^2 score'] + [str(i) for i in range(len(y_test[-1]))])
new_row = pd.DataFrame({'Model': 'E values', 'MSE': 0, 'R^2 score': 0, **dict(zip(compare_table.columns[3:], y_test[-1].values))}, index=[0])
compare_table = pd.concat([compare_table, new_row], ignore_index=True)
compare_table


Unnamed: 0,Model,MSE,R^2 score,0,1,2,3,4,5,6,7,8,9
0,E values,0,0,0.833333,0.833333,0.388889,0.833333,0.944444,0.388889,0.722222,0.722222,0.611111,0.722222


In [196]:
mse = [0 for _ in range(n_butches)]
r2 = [0 for _ in range(n_butches)]
for i in range(n_butches):
    model = LinearRegression(fit_intercept = True)
    model.fit(X_train[i], y_train[i])

    y_pred = model.predict(X_test[i])

    # print(y_pred)
    # print(y_test)

    mse[i] = mean_squared_error(y_test[i], y_pred)
    r2[i] = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('LinReg', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Mean Squared Error: 0.34092760993716575
R^2 Score: -7.565274601716948


In [197]:
%time
print(mse)
model = RandomForestRegressor(random_state=42)
#model.fit(X_train, y_train)
param_grid_try = {
    'n_estimators': [10, 25, 50, 75, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [5, 10, 15, 20],
    'min_samples_leaf': [1, 5, 7],
    'bootstrap': [True],
    'criterion': ['friedman_mse']
}
param_grid = {
    'n_estimators': [25],
    'max_depth': [None],
    'min_samples_split': [15],
    'min_samples_leaf': [1],
    'bootstrap': [True],
    'criterion': ['friedman_mse']
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

#print("Лучшие параметры: ", grid_search.best_params_)
#Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 50}
#Mean Squared Error: 0.05508845985402971
for i in range(n_butches):

    grid_search.fit(X_train[i], y_train[i])
    print("Лучшие параметры: ", grid_search.best_params_)
    y_pred = grid_search.predict(X_test[i])
    print(mse)
    # Оценка модели
    mse[i] = mean_squared_error(y_test[i], y_pred)
    r2[i] = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('RandomForest', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.34 µs
[1.101562613502096, 0.1268108353052831, 0.23043726994118846, 0.2328321312807607, 0.14442695890010837, 0.3111689862572564, 0.22978546304221106, 0.411899581053898, 0.20241259908727716, 0.4179396610015781]
Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 25}
[1.101562613502096, 0.1268108353052831, 0.23043726994118846, 0.2328321312807607, 0.14442695890010837, 0.3111689862572564, 0.22978546304221106, 0.411899581053898, 0.20241259908727716, 0.4179396610015781]
Лучшие параметры:  {'bootstrap': True, 'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 25}
[0.06598366813999863, 0.1268108353052831, 0.23043726994118846, 0.2328321312807607, 0.14442695890010837, 0.3111689862572564, 0.22978546304221106, 0.411899581053898, 0.20241259908727716, 0.4179396610015781]
Лучшие параметр

In [198]:
param_grid_try = {
    'n_neighbors': [5, 10, 15, 20, 25, 30, 40, 50, 70, 100],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski', 'chebyshev', 'seuclidean', 'mahalanobis', 'wminkowski'],
}
param_grid = {
    'n_neighbors': [20],
    'weights': ['uniform'],
    'metric': ['chebyshev'],
}
model = KNeighborsRegressor()
model.fit(X_train[-1], y_train[-1])

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Обучение модели и поиск оптимальных параметров

# Вывод лучших параметров
#print("Лучшие параметры: ", grid_search.best_params_)
# Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 19, 'weights': 'uniform'}
# Mean Squared Error: 0.04452367110107953

# Оценка модели
for i in range(n_butches):
    grid_search.fit(X_train[i], y_train[i])
    print("Лучшие параметры: ", grid_search.best_params_)
    y_pred = grid_search.predict(X_test[i])

    mse = mean_squared_error(y_test[i], y_pred)
    r2 = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")

new_row = add_row_res('KNN', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Лучшие параметры:  {'metric': 'chebyshev', 'n_neighbors': 20, 'weights': 'uniform'}
Mean Squared Error: 0.03701851851851852
R^2 Score: -0.17128906249999987


In [199]:
# Определение параметров для поиска
param_grid_try = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 5],
    'max_features': [None],
}
param_grid = {
    'max_depth': [None],
    'min_samples_split': [10],
    'min_samples_leaf': [2],
    'max_features': [None],
}

# Создание модели
model = DecisionTreeRegressor(random_state=42)

# Создание объекта GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Обучение модели и поиск оптимальных параметров


# Вывод лучших параметров
# print("Лучшие параметры: ", grid_search.best_params_)

# Лучшие параметры:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 9, 'min_samples_split': 20}
# Mean Squared Error: 0.050617283950617285
for i in range(n_butches):
    grid_search.fit(X_train[i], y_train[i])
    print("Лучшие параметры: ", grid_search.best_params_)
    # Предсказание на тестовых данных с использованием лучшей модели
    y_pred = grid_search.predict(X_test[i])
    # Оценка модели
    mse = mean_squared_error(y_test[i], y_pred)
    r2 = r2_score(y_test[i], y_pred)

print(f"Mean Squared Error: {np.mean(mse)}")
print(f"R^2 Score: {np.mean(r2)}")
print(y_pred)

new_row = add_row_res('DecisionTree', np.mean(mse), np.mean(r2), y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Лучшие параметры:  

In [200]:
#Проверка на дурака - какая ошибка будет у константного решения (матожидание = 0.5+-eps)
y_pred = [0.54 for i in range(len(y_test[-1]))]

mse = mean_squared_error(y_test[-1], y_pred)
r2 = r2_score(y_test[-1], y_pred)

new_row = add_row_res('CoolMonkey', mse, r2, y_pred)
compare_table = pd.concat([compare_table, new_row], ignore_index=True)

In [201]:
compare_table

Unnamed: 0,Model,MSE,R^2 score,0,1,2,3,4,5,6,7,8,9
0,E values,0.0,0.0,0.833333,0.833333,0.388889,0.833333,0.944444,0.388889,0.722222,0.722222,0.611111,0.722222
1,LinReg,0.340928,-7.565275,1.171496,-0.626145,0.696618,0.681067,0.752182,0.261629,0.493057,0.487688,-0.031907,1.837089
2,RandomForest,0.052078,-0.396718,0.709722,0.54294,0.618692,0.559932,0.636163,0.620664,0.661397,0.674003,0.534581,0.617799
3,KNN,0.037019,-0.171289,0.6,0.638889,0.627778,0.616667,0.633333,0.605556,0.65,0.611111,0.594444,0.611111
4,DecisionTree,0.051101,-0.616874,0.611111,0.888889,0.888889,0.888889,0.907407,0.5,0.5,0.5,0.5,0.438272
5,CoolMonkey,0.057205,-0.81,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54
