In [1]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor 

from sklearn.linear_model import LinearRegression,Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Инициализируем все известные генераторы случаынйх чисел / Setting all known random seeds
my_code = "Волков Н."
seed_limit = 2 ** 32
my_seed = int.from_bytes(my_code.encode(), "little") % seed_limit

os.environ['PYTHONHASHSEED']=str(my_seed)

random.seed(my_seed)

np.random.seed(my_seed)

In [3]:
# Читаем данные из файла
train_data = pd.read_csv("datasets/rus_stocks_funds_train.csv")

In [4]:
train_data.head()

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,7210000000.0,2870000000.0,110809.6875,3190.0,9.87,0.27
1,10600000000.0,1130000000.0,318000.0,28230.0,13.91,0.3
2,22490000000.0,10270000000.0,329570.0,61044.948718,4.08,0.25
3,499520000.0,196420000.0,59380.0,81.0,2.91,1.31
4,9520000000.0,130996000000.0,110809.6875,61044.948718,15.525191,0.35


In [5]:
# Определим размер валидационной выборки
val_size = round(0.2*len(train_data))
print(val_size)

66


In [6]:
# Создадим обучающую и валидационную выборки
random_state = my_seed
train, val = train_test_split(train_data, test_size=val_size, random_state=random_state)
print(len(train), len(val))

263 66


In [7]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
y_column = 'Debt/Assets'
x_columns = [e for e in train_data.columns if e != y_column]

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), x_columns)], remainder='passthrough')

ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['Total issued ordinary shares',
                                  'Number of shares available for sale',
                                  'Number of employees',
                                  'Number of shareholders', 'Price/Profit'])])

In [8]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_val = pd.DataFrame(ct.transform(val))

In [9]:
# Устанавливаем названия столбцов
column_names = list(x_columns) + [y_column]
sc_train.columns = column_names
sc_val.columns = column_names

In [10]:
sc_train

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,0.000364,0.023841,0.231278,0.020338,0.016686,0.21
1,0.004865,0.049084,0.231278,0.149645,0.045534,0.01
2,0.000012,0.023818,0.231278,0.094595,0.042900,0.55
3,0.000056,0.049084,0.231278,0.000017,0.063121,0.37
4,0.097309,0.069533,0.231278,1.000000,0.063121,0.28
...,...,...,...,...,...,...
258,0.032600,0.039494,0.143901,0.751021,0.196452,0.22
259,0.000861,0.049084,0.231278,0.149645,0.067884,0.14
260,0.000734,0.049084,0.231278,0.149645,0.063121,0.35
261,0.033603,0.049084,0.231278,0.149645,0.033371,0.20


In [11]:
# Отберем необходимые параметры
x_train = sc_train[x_columns]
x_val = sc_val[x_columns]

y_train = (sc_train[y_column].values).flatten()
y_val = (sc_val[y_column].values).flatten()

In [12]:
# Создадим простую модель логистической регрессии
r_models = []

r_models.append(ElasticNet(alpha=0.5))
r_models.append(ElasticNet(alpha=0.5, l1_ratio=0.75))
r_models.append(LinearRegression())
r_models.append(Lasso())

r_models.append(DecisionTreeRegressor())

r_models.append(SVR())

r_models.append(Ridge())

r_models.append(KNeighborsRegressor(n_neighbors=5))
r_models.append(KNeighborsRegressor(n_neighbors=10))
r_models.append(KNeighborsRegressor(n_neighbors=15))

r_models.append(SVR(kernel='linear'))
r_models.append(SVR(kernel='poly'))
r_models.append(SVR(kernel='rbf'))
r_models.append(SVR(kernel='sigmoid'))

r_models.append(DecisionTreeRegressor(criterion='squared_error'))
r_models.append(DecisionTreeRegressor(criterion='friedman_mse'))
r_models.append(DecisionTreeRegressor(criterion='absolute_error'))

In [13]:
r_models

[ElasticNet(alpha=0.5),
 ElasticNet(alpha=0.5, l1_ratio=0.75),
 LinearRegression(),
 Lasso(),
 DecisionTreeRegressor(),
 SVR(),
 Ridge(),
 KNeighborsRegressor(),
 KNeighborsRegressor(n_neighbors=10),
 KNeighborsRegressor(n_neighbors=15),
 SVR(kernel='linear'),
 SVR(kernel='poly'),
 SVR(),
 SVR(kernel='sigmoid'),
 DecisionTreeRegressor(),
 DecisionTreeRegressor(criterion='friedman_mse'),
 DecisionTreeRegressor(criterion='absolute_error')]

In [14]:
# Обучим модель
for model in r_models:
    print(model)
    model.fit(x_train, y_train)

ElasticNet(alpha=0.5)
ElasticNet(alpha=0.5, l1_ratio=0.75)
LinearRegression()
Lasso()
DecisionTreeRegressor()
SVR()
Ridge()
KNeighborsRegressor()
KNeighborsRegressor(n_neighbors=10)
KNeighborsRegressor(n_neighbors=15)
SVR(kernel='linear')
SVR(kernel='poly')
SVR()
SVR(kernel='sigmoid')
DecisionTreeRegressor()
DecisionTreeRegressor(criterion='friedman_mse')
DecisionTreeRegressor(criterion='absolute_error')


In [15]:
# Проверим работу обученной модели на валидационной выборке
mses = []
for model in r_models:
    val_pred = model.predict(x_val)
    mse = mean_squared_error(y_val, val_pred)
    mses.append(mse)
    print(model, '\t', mse)

ElasticNet(alpha=0.5) 	 0.048327045492896285
ElasticNet(alpha=0.5, l1_ratio=0.75) 	 0.048327045492896285
LinearRegression() 	 0.05151465832984994
Lasso() 	 0.048327045492896285
DecisionTreeRegressor() 	 0.013281591073484951
SVR() 	 0.046996217288608706
Ridge() 	 0.05015988729637438
KNeighborsRegressor() 	 0.049855166991560014
KNeighborsRegressor(n_neighbors=10) 	 0.05094792533602958
KNeighborsRegressor(n_neighbors=15) 	 0.0524791587990908
SVR(kernel='linear') 	 0.04802011675772887
SVR(kernel='poly') 	 2.1817568995543324
SVR() 	 0.046996217288608706
SVR(kernel='sigmoid') 	 54.51421120585733
DecisionTreeRegressor() 	 0.017011399266394975
DecisionTreeRegressor(criterion='friedman_mse') 	 0.017932611387607097
DecisionTreeRegressor(criterion='absolute_error') 	 0.01408412653912225


In [16]:
i_min = mses.index(min(mses))
best_r_model = r_models[i_min]
best_r_model

DecisionTreeRegressor()

In [17]:
val_pred

array([0.04      , 0.        , 0.04      , 0.53      , 0.06      ,
       0.62      , 0.13      , 0.2       , 0.41      , 0.36      ,
       0.5       , 0.62      , 0.21      , 0.61      , 0.37      ,
       0.61      , 0.82      , 0.12      , 0.21      , 0.62      ,
       0.21      , 0.22      , 0.22      , 0.07      , 0.19      ,
       0.09      , 0.22      , 0.15      , 0.65      , 0.19      ,
       0.19      , 0.29      , 0.65      , 0.01      , 0.41      ,
       0.05      , 0.27      , 0.07      , 0.28      , 0.21      ,
       0.1       , 0.16      , 0.26      , 0.62      , 0.22      ,
       0.06      , 0.53      , 0.51      , 0.1       , 0.18      ,
       0.84      , 0.61      , 0.1       , 0.32      , 0.53      ,
       0.2       , 0.48612469, 0.11      , 0.07      , 0.        ,
       0.12      , 0.61      , 0.01      , 0.09      , 0.27      ,
       0.29      ])

In [18]:
test = pd.read_csv("datasets/rus_stocks_funds_test.csv")

In [19]:
test[y_column] = 0.0

In [20]:
test

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,3.644700e+11,1.017600e+11,5840.0000,306350.000000,7.860000,0.0
1,6.968900e+08,4.339700e+08,110809.6875,39210.000000,5.670000,0.0
2,1.580000e+07,3.880000e+06,110809.6875,29.000000,7.060000,0.0
3,1.295000e+08,3.573000e+07,110809.6875,25.000000,14.940000,0.0
4,2.400000e+09,1.309960e+11,110809.6875,61044.948718,15.525191,0.0
...,...,...,...,...,...,...
77,1.528600e+08,1.309960e+11,110809.6875,61044.948718,6.270000,0.0
78,1.295000e+08,2.634000e+07,110809.6875,25.000000,15.525191,0.0
79,2.246000e+10,0.000000e+00,281550.0000,61044.948718,6.340000,0.0
80,1.295000e+08,3.371000e+07,110809.6875,25.000000,6.320000,0.0


In [21]:
sc_test = pd.DataFrame(ct.transform(test))
sc_test.columns = column_names

In [22]:
x_test = sc_test[x_columns]

In [23]:
test[y_column] = model.predict(x_test)

In [24]:
test.head()

Unnamed: 0,Total issued ordinary shares,Number of shares available for sale,Number of employees,Number of shareholders,Price/Profit,Debt/Assets
0,364470000000.0,101760000000.0,5840.0,306350.0,7.86,0.2
1,696890000.0,433970000.0,110809.6875,39210.0,5.67,0.12
2,15800000.0,3880000.0,110809.6875,29.0,7.06,0.27
3,129500000.0,35730000.0,110809.6875,25.0,14.94,0.48
4,2400000000.0,130996000000.0,110809.6875,61044.948718,15.525191,0.65


In [25]:
test.to_csv('task5_results/Волков Н..csv', index=False)