In [73]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor 

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [74]:
# Seed every random number generator we know about so that reruns are reproducible.
# The seed is derived from a personal code string and reduced modulo 2**32.
my_code = "Nosovaoa"
seed_limit = 1 << 32
my_seed = int.from_bytes(my_code.encode(), byteorder="little") % seed_limit

random.seed(my_seed)
np.random.seed(my_seed)

# NOTE(review): Python reads PYTHONHASHSEED at interpreter startup, so this
# assignment presumably only affects child processes, not the running kernel.
os.environ['PYTHONHASHSEED'] = str(my_seed)

In [75]:
# Load the training data from the CSV file
train_data = pd.read_csv("datasets/rus_stocks_funds_train.csv")

In [76]:
# Preview the first rows of the training data
train_data.head()

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,7210000000.0,2870000000.0,110809.6875,3190.0,9.87,0.27
1,10600000000.0,1130000000.0,318000.0,28230.0,13.91,0.3
2,22490000000.0,10270000000.0,329570.0,61044.948718,4.08,0.25
3,499520000.0,196420000.0,59380.0,81.0,2.91,1.31
4,9520000000.0,130996000000.0,110809.6875,61044.948718,15.525191,0.35


In [77]:
# NOTE(review): disabled exploratory plot. `pairplot` is not imported in any visible
# cell (presumably seaborn.pairplot) — import it before re-enabling this line.
#pairplot(train_data)

In [78]:
# Size of the validation subset: 20% of the training rows, rounded to an integer
val_size = round(len(train_data) * 0.2)
print(val_size)

66


In [79]:
# Split the data into training and validation subsets.
# The split is seeded with my_seed so it is the same on every run.
random_state = my_seed
train, val = train_test_split(
    train_data,
    test_size=val_size,
    random_state=random_state,
)
print(len(train), len(val))

263 66


In [80]:
# Rescale the numeric feature columns to the [0, 1] interval.
# The scaler is fitted on the training subset only; the target column is
# not scaled here and is passed through unchanged.
y_column = 'Debt/Assets'
x_columns = [name for name in train_data.columns if name != y_column]

ct = ColumnTransformer(
    transformers=[('numerical', MinMaxScaler(), x_columns)],
    remainder='passthrough',
)

ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['Total issued ordinary shares',
                                  'Number of shares available for sale',
                                  'Number of employees',
                                  'Number of shareholders', 'Price/Profit'])])

In [81]:
# Преобразуем значения, тип данных приводим к DataFrame
# Apply the fitted transformer to both subsets and wrap the results in DataFrames.
# No column names are passed here; they are assigned separately afterwards.
sc_train = pd.DataFrame(ct.transform(train))
sc_val = pd.DataFrame(ct.transform(val))

In [82]:
# Устанавливаем названия столбцов
# Column labels for the transformed frames: the scaled feature columns first,
# followed by the passed-through target column.
column_names = [*x_columns, y_column]
for scaled_frame in (sc_train, sc_val):
    scaled_frame.columns = column_names

In [83]:
# Inspect the scaled training frame
sc_train

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,5.030478e-05,0.049084,0.233354,0.149645,0.035567,0.63
1,2.133603e-05,0.049084,0.233354,0.000017,0.063121,0.33
2,2.119893e-05,0.049084,0.233354,0.149645,0.060025,0.68
3,6.550534e-05,0.024065,0.000406,0.000027,0.047774,0.82
4,4.437654e-04,0.024101,0.117842,0.000000,0.236673,0.64
...,...,...,...,...,...,...
258,1.272845e-02,0.049084,0.233354,0.149645,0.025995,0.21
259,5.147406e-05,0.049084,0.233354,0.149645,0.043251,0.62
260,6.466344e-04,0.025643,0.992521,0.111928,0.005137,0.18
261,3.007957e-07,0.023808,0.233354,0.000061,0.026302,0.32


In [84]:
# Separate the feature matrices from the target vectors
x_train, x_val = sc_train[x_columns], sc_val[x_columns]

# Targets as flat 1-D NumPy arrays
y_train = sc_train[y_column].values.flatten()
y_val = sc_val[y_column].values.flatten()

In [85]:
# Build a simple linear regression model (not logistic: the estimator is LinearRegression).
# TransformedTargetRegressor wraps it with a MinMaxScaler as the target transformer.
model = TransformedTargetRegressor(regressor=LinearRegression(), transformer=MinMaxScaler())

In [86]:
# Fit the model on the training features and target
model.fit(x_train, y_train)

TransformedTargetRegressor(regressor=LinearRegression(),
                           transformer=MinMaxScaler())

In [87]:
# Evaluate the fitted model on the validation subset:
# predict from the validation features and report the mean squared error
# against y_val (the target column that was passed through unscaled).
pred_val = model.predict(x_val)
mse = mean_squared_error(y_val, pred_val)
print(mse)

0.06738361973062726


In [88]:
# Inspect the validation predictions
pred_val

array([0.32860842, 0.15346352, 0.32512464, 0.37686925, 0.37291548,
       0.32489997, 0.42346685, 0.36537258, 0.36207253, 0.32629072,
       0.36387887, 0.39058154, 0.32750539, 0.32593841, 0.35325876,
       0.32857026, 0.4042688 , 0.41525901, 0.31479271, 0.32888732,
       0.34968924, 0.27948946, 0.32651536, 0.35959546, 0.32869228,
       0.32640268, 0.32996716, 0.39164902, 0.32839322, 0.32229398,
       0.32858997, 0.38737331, 0.32731571, 0.28556463, 0.22091765,
       0.36538886, 0.32595807, 0.32877294, 0.33029609, 0.3993432 ,
       0.32585784, 0.3670148 , 0.21600953, 0.36983118, 0.1571319 ,
       0.32773708, 0.32171423, 0.32891289, 0.39671388, 0.32347693,
       0.36373549, 0.36503513, 0.31559093, 0.32885993, 0.32853441,
       0.15754737, 0.32770124, 0.3883064 , 0.32261022, 0.32774312,
       0.36647789, 0.32901624, 0.32595918, 0.18106173, 0.32828302,
       0.15567477])

In [89]:
# Load the test data from the CSV file
test = pd.read_csv("datasets/rus_stocks_funds_test.csv")

In [90]:
# Add a placeholder target column filled with zeros.
# NOTE(review): presumably needed because `ct` was fitted on frames that contain
# the target column (passthrough remainder), so transform expects it to be present.
# The placeholder values are overwritten with model predictions later.
test[y_column] = 0.0

In [91]:
# Inspect the test frame, including the placeholder target column
test

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,3.644700e+11,1.017600e+11,5840.0000,306350.000000,7.860000,0.0
1,6.968900e+08,4.339700e+08,110809.6875,39210.000000,5.670000,0.0
2,1.580000e+07,3.880000e+06,110809.6875,29.000000,7.060000,0.0
3,1.295000e+08,3.573000e+07,110809.6875,25.000000,14.940000,0.0
4,2.400000e+09,1.309960e+11,110809.6875,61044.948718,15.525191,0.0
...,...,...,...,...,...,...
77,1.528600e+08,1.309960e+11,110809.6875,61044.948718,6.270000,0.0
78,1.295000e+08,2.634000e+07,110809.6875,25.000000,15.525191,0.0
79,2.246000e+10,0.000000e+00,281550.0000,61044.948718,6.340000,0.0
80,1.295000e+08,3.371000e+07,110809.6875,25.000000,6.320000,0.0


In [92]:
# Scale the test frame with the transformer fitted on the training subset,
# labelling the resulting columns in a single step.
sc_test = pd.DataFrame(ct.transform(test), columns=column_names)

In [93]:
# Keep only the feature columns of the scaled test frame
x_test = sc_test[x_columns]

In [94]:
# Replace the placeholder target values in the original (unscaled) test frame
# with the model's predictions
test[y_column] = model.predict(x_test)

In [95]:
# Preview the test frame with the predicted target column
test.head()

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,364470000000.0,101760000000.0,5840.0,306350.0,7.86,0.216086
1,696890000.0,433970000.0,110809.6875,39210.0,5.67,0.342954
2,15800000.0,3880000.0,110809.6875,29.0,7.06,0.368515
3,129500000.0,35730000.0,110809.6875,25.0,14.94,0.365693
4,2400000000.0,130996000000.0,110809.6875,61044.948718,15.525191,0.325943


In [96]:
# Save the test frame, including the predicted target column, as a CSV file
# without the index column.
# to_csv does not create missing parent folders, so make sure the output
# directory exists first; otherwise a fresh run fails on this last cell.
output_path = 'task5_results/nosova.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
test.to_csv(output_path, index=False)