In [226]:
# Зависимости
import pandas as pd
import numpy as np
import random
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor 
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error

In [227]:
# Seed every random number generator used in this notebook.
# The seed is derived from the bytes of `my_code`, reduced modulo 2**32.
my_code = "Bevz"
seed_limit = 1 << 32
my_seed = int.from_bytes(my_code.encode(), byteorder="little") % seed_limit

# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only affects child processes, not this kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(my_seed)

# Seed both the standard-library and the NumPy global generators.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(my_seed)

In [228]:
# Read the training data from the CSV file
train_data = pd.read_csv("../datasets/rus_stocks_funds_train.csv")

In [229]:
# Preview the first rows of the training data
train_data.head()

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,7210000000.0,2870000000.0,110809.6875,3190.0,9.87,0.27
1,10600000000.0,1130000000.0,318000.0,28230.0,13.91,0.3
2,22490000000.0,10270000000.0,329570.0,61044.948718,4.08,0.25
3,499520000.0,196420000.0,59380.0,81.0,2.91,1.31
4,9520000000.0,130996000000.0,110809.6875,61044.948718,15.525191,0.35


In [230]:
# Size of the validation split: 20% of the rows, rounded to a whole number
n_rows = len(train_data)
val_size = round(n_rows * 0.2)
print(val_size)

66


In [231]:
# Split the data into training and validation parts with a fixed random_state
random_state = my_seed
train, val = train_test_split(
    train_data,
    test_size=val_size,
    random_state=random_state,
)
print(train.shape[0], val.shape[0])

263 66


In [232]:
# Scale every feature column to [0, 1]; the target column is passed through unchanged.
# The scaler is fitted on the training split only.
y_column = 'Debt/Assets'
x_columns = [name for name in train_data.columns if name != y_column]

feature_scaling = ('numerical', MinMaxScaler(), x_columns)
ct = ColumnTransformer(transformers=[feature_scaling], remainder='passthrough')

ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['Total issued ordinary shares',
                                  'Number of shares available for sale',
                                  'Number of employees',
                                  'Number of shareholders', 'Price/Profit'])])

In [233]:
# Apply the fitted transformer to both splits and wrap the resulting arrays in DataFrames
sc_train, sc_val = (pd.DataFrame(ct.transform(part)) for part in (train, val))

In [234]:
# Restore the column names: feature columns first, then the target column
column_names = [*x_columns, y_column]
sc_train.columns = sc_val.columns = column_names

In [235]:
# Inspect the scaled training frame
sc_train

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,0.000275,0.024408,0.231181,0.000221,0.063121,0.53
1,0.000671,0.025882,0.975855,0.111928,0.012426,0.20
2,0.000647,0.025647,0.991104,0.111928,0.012690,0.18
3,0.000444,0.024098,0.119520,0.000000,0.009309,0.51
4,0.000326,0.049084,0.231181,0.149645,0.033371,0.13
...,...,...,...,...,...,...
258,0.000010,0.049084,0.231181,0.149645,0.469044,0.19
259,0.001843,0.049084,0.231181,0.149645,0.044788,0.01
260,0.000010,0.049084,0.231181,0.149645,0.274436,0.19
261,0.000004,0.023818,0.231181,0.094595,0.030298,0.51


In [236]:
# Separate the feature matrices from the target vectors
x_train, x_val = sc_train[x_columns], sc_val[x_columns]

y_train = sc_train[y_column].to_numpy().flatten()
y_val = sc_val[y_column].to_numpy().flatten()

In [237]:
# Build a simple gradient boosting model wrapped in TransformedTargetRegressor,
# which min-max scales the target for fitting.
# random_state is passed explicitly so that re-running this cell and the fit
# reproduces the same model regardless of the current global NumPy RNG state.
model = TransformedTargetRegressor(
    regressor=GradientBoostingRegressor(random_state=my_seed),
    transformer=MinMaxScaler(),
)

In [238]:
# Fit the model on the training split
model.fit(x_train, y_train)

TransformedTargetRegressor(regressor=GradientBoostingRegressor(),
                           transformer=MinMaxScaler())

In [239]:
# Evaluate the fitted model on the validation split using mean squared error
pred_val = model.predict(x_val)
mse = mean_squared_error(y_true=y_val, y_pred=pred_val)
print(mse)

0.022996502408611376


In [240]:
# Show the raw validation predictions
pred_val

array([ 0.19956786,  0.48206141,  0.19526541,  0.37401079,  0.52380078,
        0.4682465 ,  0.23934236,  0.16446476,  0.44651677,  0.71553682,
        0.27294131,  0.25715935,  0.53963691,  0.43461838,  0.33468572,
        0.48364998,  0.13975754,  0.99160626,  0.24867657, -0.00594409,
        0.19156867,  0.21061557,  0.547719  ,  0.45473109,  0.37401079,
        0.30415349,  0.21448613,  1.03919333,  0.35764598,  0.43663736,
        0.49378838,  0.2533404 ,  0.16657038,  0.5785327 ,  0.49818626,
        0.23741273,  0.57521071,  0.36558763,  0.24416895,  0.25774371,
        0.3415079 ,  0.27194766,  0.16657038,  0.41628991,  0.1338499 ,
        0.20828321,  0.37401079,  0.11411313,  0.16233817,  0.74328571,
        0.1338499 ,  0.08160557,  0.19078485,  0.17349903,  0.15919933,
        0.1517034 ,  0.5773063 ,  0.1338499 ,  0.05202464,  0.34535406,
        0.24463357,  0.19271582,  0.27132109,  0.28415561,  0.22751842,
        0.13661244])

In [241]:
# Read the test data from the CSV file
test = pd.read_csv("../datasets/rus_stocks_funds_test.csv")

In [242]:
# Add a placeholder target column: `ct` was fitted on a frame that contains y_column
# (passed through as the remainder), so the test frame presumably needs the same
# columns before it can be transformed. The value is overwritten with predictions later.
test[y_column] = 0.0

In [243]:
# Inspect the test frame with the placeholder target column
test

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,3.644700e+11,1.017600e+11,5840.0000,306350.000000,7.860000,0.0
1,6.968900e+08,4.339700e+08,110809.6875,39210.000000,5.670000,0.0
2,1.580000e+07,3.880000e+06,110809.6875,29.000000,7.060000,0.0
3,1.295000e+08,3.573000e+07,110809.6875,25.000000,14.940000,0.0
4,2.400000e+09,1.309960e+11,110809.6875,61044.948718,15.525191,0.0
...,...,...,...,...,...,...
77,1.528600e+08,1.309960e+11,110809.6875,61044.948718,6.270000,0.0
78,1.295000e+08,2.634000e+07,110809.6875,25.000000,15.525191,0.0
79,2.246000e+10,0.000000e+00,281550.0000,61044.948718,6.340000,0.0
80,1.295000e+08,3.371000e+07,110809.6875,25.000000,6.320000,0.0


In [244]:
# Scale the test frame with the already-fitted transformer and name the columns in one step
sc_test = pd.DataFrame(ct.transform(test), columns=column_names)

In [245]:
# Keep only the feature columns of the scaled test frame
x_test = sc_test[x_columns]

In [246]:
# Replace the placeholder target values with the model's predictions
test[y_column] = model.predict(x_test)

In [247]:
# Preview the test frame with the predicted target filled in
test.head()

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,364470000000.0,101760000000.0,5840.0,306350.0,7.86,0.171426
1,696890000.0,433970000.0,110809.6875,39210.0,5.67,0.247478
2,15800000.0,3880000.0,110809.6875,29.0,7.06,0.317596
3,129500000.0,35730000.0,110809.6875,25.0,14.94,0.42719
4,2400000000.0,130996000000.0,110809.6875,61044.948718,15.525191,0.578533


In [248]:
# Save the test frame with predictions to CSV.
# The output directory is created first so the write does not fail when the
# results folder does not exist yet (e.g. on a fresh checkout).
results_path = '../task5_results/Бевз_АС.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
test.to_csv(results_path, index=False)