In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Классификация тренда акции различными можелями, основанное на разнице в ценах за n прошедших часов



Для предсказания цен акций используется следующая техника: по данным о разнице в цене за предыдущие 10 часов строится предположение о тренде в следующий час




In [2]:
import math
import numpy as np
import os 
import pandas as pd

from datetime import datetime, timedelta

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

### Загрзка данных: почасовые данные с диска

In [3]:
df = pd.read_csv("/content/drive/MyDrive/MTS.Project/DataSet/GazNeft_H1/GAZP_H1_HLOCV_MTS_Project_4_8_2021___5-41-42.csv", sep=";")

### Подготовка датасета: Используем колонки с датой, ценой закрытия

In [4]:
df = df[['TIMEDATE','Close']]
df['TIMEDATE'] = pd.to_datetime(df['TIMEDATE'])

In [5]:
#df.head, df.info()

Создаём датасет из разниц цен

In [6]:
df_dif = df[1:].reset_index(drop = True)
df_dif['Close'] = df[1:]['Close'].reset_index(drop = True,) - df[0:len(df)-1]['Close']

In [7]:
df_dif

Unnamed: 0,TIMEDATE,Close
0,2017-01-03 13:00:00,-0.22
1,2017-01-03 14:00:00,0.19
2,2017-01-03 15:00:00,0.44
3,2017-01-03 16:00:00,-0.01
4,2017-01-03 17:00:00,0.07
...,...,...
12918,2021-08-03 17:00:00,0.93
12919,2021-08-03 18:00:00,0.93
12920,2021-08-03 19:04:00,-0.14
12921,2021-08-03 20:00:00,-0.28


Для использования техники создадим новый датасет, рамещая в его i-ой строчке даннные по объёму и цене о (i+10)-ти часах и цене (i+11)-го часа, всего 11 смысловых столбцов; следующая строчка начинается с i+12 часа. Для этого написана следующая функция

In [8]:
def make_dataset (data, mode='train', hours = 10, vol_column = 'Volume', cost_column = 'Close', offset = 0):
  
  #offset - показатель смещения, прдсказываем час посел hours + offset
  d={}
  step = hours+1 + offset
  initial_start = (len(data))%(step) #чтобы избежать конфликта рамеров столбцов, уберем первые несколько строк из датасета


  d['start_datatime'] = list(data[initial_start::step]['TIMEDATE']) # взять 0 день, 11 день
  for i in range(hours):
    # В новом датасете будут колонки 1vol, 1cost, 2vol, 2cost, ..., 10vol, 10cost
    # Количество столбцов = hours + 1(таргет) +2(дата первого дня и полследнего)
    end = len(data) - hours + i - offset     
    #d[str(i+1)+'vol'] = list(data[initial_start + i:end:step][vol_column]) # от вида среза (slice) зависит количесвто строк
    d[str(i+1)+'dif'] = list(data[initial_start + i:end:step][cost_column]) # каждая итерация цикла передаёт данные в один столбец

  
  d['end_datatime'] = list(data[initial_start + hours-1::step]['TIMEDATE'])  # дата последнего столбца в датасете
  d['target'] = list(data[initial_start + offset + hours::step][cost_column]) # таргет
  d['target_datetime'] = list(data[initial_start + offset + hours::step]['TIMEDATE']) # таргет дата
  
  return pd.DataFrame(d)

In [9]:
df_new = make_dataset(df_dif)
df_new['target'] = (df_new ['target'] > 0).astype(int)

### Разделение выборки

In [10]:
from sklearn.model_selection import train_test_split, cross_val_score

In [11]:
df_new

Unnamed: 0,start_datatime,1dif,2dif,3dif,4dif,5dif,6dif,7dif,8dif,9dif,10dif,end_datatime,target,target_datetime
0,2017-01-04 12:00:00,0.14,-0.13,-0.22,0.10,-0.22,0.19,1.04,0.00,0.30,-1.14,2017-01-05 11:00:00,0,2017-01-05 12:00:00
1,2017-01-05 13:00:00,-0.29,-0.73,-0.34,0.37,-0.53,0.30,0.15,0.08,-1.42,0.11,2017-01-06 12:00:00,1,2017-01-06 13:00:00
2,2017-01-06 14:00:00,0.02,0.25,-0.05,0.50,0.44,0.10,0.57,0.56,-1.02,0.00,2017-01-09 13:00:00,0,2017-01-09 14:00:00
3,2017-01-09 15:00:00,-0.15,-0.13,0.34,0.50,-0.39,1.16,0.18,0.25,0.37,-0.02,2017-01-10 14:00:00,1,2017-01-10 15:00:00
4,2017-01-10 16:00:00,1.28,-0.18,0.53,-0.47,-0.15,0.63,-0.27,-0.28,-0.38,-0.97,2017-01-11 15:00:00,0,2017-01-11 16:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1169,2021-07-29 12:00:00,-0.35,0.63,-0.20,-0.33,0.00,0.36,0.08,0.19,0.13,0.36,2021-07-29 21:00:00,1,2021-07-29 22:00:00
1170,2021-07-29 23:00:00,-0.60,-1.98,-0.63,-0.22,0.04,0.23,0.86,-1.05,0.68,0.74,2021-07-30 17:00:00,0,2021-07-30 18:00:00
1171,2021-07-30 19:04:00,0.36,0.09,-0.21,1.73,0.62,0.53,-0.43,1.53,-1.46,0.12,2021-08-02 13:00:00,1,2021-08-02 14:00:00
1172,2021-08-02 15:00:00,-0.66,0.30,-1.64,-0.62,-0.06,0.14,-0.02,0.11,0.49,0.00,2021-08-03 09:59:00,1,2021-08-03 10:00:00


In [12]:
X_train, X_test, y_train, y_test = train_test_split(df_new.drop(columns = ['start_datatime','end_datatime','target_datetime', 'target']), df_new['target'], test_size=0.15, shuffle=False) 

In [13]:
X_train, X_test, y_train, y_test

(     1dif  2dif  3dif  4dif  5dif  6dif  7dif  8dif  9dif  10dif
 0    0.14 -0.13 -0.22  0.10 -0.22  0.19  1.04  0.00  0.30  -1.14
 1   -0.29 -0.73 -0.34  0.37 -0.53  0.30  0.15  0.08 -1.42   0.11
 2    0.02  0.25 -0.05  0.50  0.44  0.10  0.57  0.56 -1.02   0.00
 3   -0.15 -0.13  0.34  0.50 -0.39  1.16  0.18  0.25  0.37  -0.02
 4    1.28 -0.18  0.53 -0.47 -0.15  0.63 -0.27 -0.28 -0.38  -0.97
 ..    ...   ...   ...   ...   ...   ...   ...   ...   ...    ...
 992  0.84  0.59 -0.87 -2.12  0.03  1.56 -0.37  0.01  0.90   0.69
 993 -2.16 -0.25  2.82  0.52 -0.58  0.26 -0.33 -0.03 -0.83   0.52
 994  0.20  0.35  0.15  0.00  0.69  1.26 -0.56  0.80 -0.68  -1.77
 995 -0.22 -0.03 -0.16  0.61  0.45 -0.44 -0.41 -0.10 -1.41  -0.12
 996  1.18  0.63 -0.10  0.74  0.95 -0.94 -0.59 -0.29  0.41   0.28
 
 [997 rows x 10 columns],
       1dif  2dif  3dif  4dif  5dif  6dif  7dif  8dif  9dif  10dif
 997  -0.10 -1.00 -1.41 -0.99  0.42  0.04  0.05 -0.52 -0.59  -0.16
 998  -1.12 -0.50  0.23 -0.13 -0.07  1.87 -1.0

### Нормализация данных

In [14]:
from sklearn import preprocessing

In [15]:
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Смотр моделей

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss, f1_score


In [19]:
scores = pd.DataFrame({'parametr':[],'accuracy':[], 'precision':[], 'recall':[], 'f1':[]})
columns = ['parametr', 'accuracy', 'precision', 'recall', 'f1']

In [20]:
def make_score(y_test, y_pred):
  return [accuracy_score(y_test, preds), precision_score(y_test, preds), recall_score(y_test, preds), f1_score(y_test, preds)]

### Предсказание теста линейными моделями

In [21]:
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, ElasticNet, Lars, LassoLars

In [22]:
for model in [LinearRegression(), LogisticRegression(), Ridge(alpha=.5), Lasso(alpha=0.1, ), ElasticNet(), Lars(), LassoLars()]:
  model.fit(X_train_scaled, y_train)
  preds = (model.predict(X_test_scaled)>0.5).astype(int)
  print(accuracy_score(y_test, preds), precision_score(y_test, preds), recall_score(y_test, preds) )

0.4971751412429379 0.5 0.4943820224719101
0.5028248587570622 0.5057471264367817 0.4943820224719101
0.4971751412429379 0.5 0.4943820224719101
0.4971751412429379 0.0 0.0
0.4971751412429379 0.0 0.0
0.4971751412429379 0.5 0.4943820224719101
0.4971751412429379 0.0 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Проверка векторами

In [23]:
from sklearn.svm import SVC

In [24]:
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
  model = SVC(kernel = kernel)
  model.fit(X_train_scaled, y_train, )
  preds = (model.predict(X_test_scaled)>0.5).astype(int)

  a,p,r,f = make_score(y_test, preds)
  scores = scores.append(pd.DataFrame([[kernel, a,p,r,f]], columns=columns))

scores.set_index('parametr', inplace=True)

In [25]:
scores

Unnamed: 0_level_0,accuracy,precision,recall,f1
parametr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
linear,0.508475,0.513889,0.41573,0.459627
poly,0.508475,0.508772,0.651685,0.571429
rbf,0.531073,0.533333,0.539326,0.536313
sigmoid,0.497175,0.5,0.449438,0.473373


### SGD

In [26]:
 from sklearn.linear_model import SGDClassifier

In [27]:
for loss in ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']:
  model = SGDClassifier(loss = loss)
  model.fit(X_train_scaled, y_train)
  preds = (model.predict(X_test_scaled)>0.5).astype(int)
  print(accuracy_score(y_test, preds), precision_score(y_test, preds), recall_score(y_test, preds) , f1_score(y_test, preds))

0.5423728813559322 0.5487804878048781 0.5056179775280899 0.5263157894736842
0.4858757062146893 0.4891304347826087 0.5056179775280899 0.4972375690607735
0.5141242937853108 0.5135135135135135 0.6404494382022472 0.5699999999999998
0.519774011299435 0.5285714285714286 0.4157303370786517 0.46540880503144655
0.480225988700565 0.48760330578512395 0.6629213483146067 0.5619047619047619
0.5028248587570622 0.5051546391752577 0.550561797752809 0.5268817204301075
0.5141242937853108 0.5135135135135135 0.6404494382022472 0.5699999999999998
0.4576271186440678 0.4715447154471545 0.651685393258427 0.5471698113207548
0.5028248587570622 0.5054945054945055 0.5168539325842697 0.5111111111111112


Лучшие

> **accuracy: log**

> precision: log

> recall: perception

> f-measure: perception









### Nearest Neighbors

In [28]:
from sklearn.neighbors import  KNeighborsClassifier, NearestCentroid

Исследование на количество

In [29]:
acc = 0
prec = 0
rec = 0
f1 = 0
acc_model = 0
prec_model = 0
rec_model = 0
f1_model = 0


for number in range(1,300):
  model = KNeighborsClassifier(n_neighbors = number)
  model.fit(X_train_scaled, y_train, )
  preds = (model.predict(X_test_scaled)>0.5).astype(int)

  if accuracy_score(y_test, preds) > acc: 
    acc_model = number
    acc = accuracy_score(y_test, preds)

  if precision_score(y_test, preds) > prec: 
    prec_model = number
    prec = precision_score(y_test, preds)

  if recall_score(y_test, preds) > rec: 
    rec_model = number
    rec = recall_score(y_test, preds)

  if f1_score(y_test, preds) > f1: 
    f1_model = number
    f1 = f1_score(y_test, preds)

 # print(accuracy_score(y_test, preds), precision_score(y_test, preds), recall_score(y_test, preds) , f1_score(y_test, preds))

print('\n')
print(acc, prec, rec, f1)
print (acc_model, prec_model, rec_model, f1_model)




0.5932203389830508 0.6024096385542169 0.5730337078651685 0.5813953488372093
51 51 71 51


Исследование на метрику расстояния

In [30]:
distance_metrics = ['euclidean', 'manhattan', 'chebyshev']

In [31]:
acc = 0
prec = 0
rec = 0
f1 = 0
acc_model = 0
prec_model = 0
rec_model = 0
f1_model = 0


for metric in distance_metrics:
  model = KNeighborsClassifier(n_neighbors = 51, metric=metric)
  model.fit(X_train_scaled, y_train)
  preds = (model.predict(X_test_scaled)>0.5).astype(int)
  if accuracy_score(y_test, preds) > acc: 
    acc_model = metric
    acc = accuracy_score(y_test, preds)

  if precision_score(y_test, preds) > prec: 
    prec_model = metric
    prec = precision_score(y_test, preds)

  if recall_score(y_test, preds) > rec: 
    rec_model = metric
    rec = recall_score(y_test, preds)

  if f1_score(y_test, preds) > f1: 
    f1_model = metric
    f1 = f1_score(y_test, preds)

#print(accuracy_score(y_test, preds), precision_score(y_test, preds), recall_score(y_test, preds) , f1_score(y_test, preds))print('\n\n')
print('\n')
print(acc, prec, rec, f1)
print (acc_model, prec_model, rec_model, f1_model)



0.5932203389830508 0.6024096385542169 0.5617977528089888 0.5813953488372093
euclidean euclidean euclidean euclidean



Лучшие показатели дали **n = 51**, **metric = 'euclidean'**





### Nearest Centroid Classifier

In [32]:
acc = 0
prec = 0
rec = 0
f1 = 0
acc_model = 0
prec_model = 0
rec_model = 0
f1_model = 0


for metric in distance_metrics:
  model = NearestCentroid(metric=metric)
  model.fit(X_train_scaled, y_train)
  preds = (model.predict(X_test_scaled)>0.5).astype(int)
  if accuracy_score(y_test, preds) > acc: 
    acc_model = metric
    acc = accuracy_score(y_test, preds)

  if precision_score(y_test, preds) > prec: 
    prec_model = metric
    prec = precision_score(y_test, preds)

  if recall_score(y_test, preds) > rec: 
    rec_model = metric
    rec = recall_score(y_test, preds)

  if f1_score(y_test, preds) > f1: 
    f1_model = metric
    f1 = f1_score(y_test, preds)

#print(accuracy_score(y_test, preds), precision_score(y_test, preds), recall_score(y_test, preds) , f1_score(y_test, preds))print('\n\n')
print('\n')
print(acc, prec, rec, f1)
print (acc_model, prec_model, rec_model, f1_model)



0.4971751412429379 0.5 0.5393258426966292 0.5082872928176796
euclidean euclidean manhattan euclidean




### Gaussian Process

In [33]:
from sklearn.gaussian_process import GaussianProcessClassifier

In [34]:
model = GaussianProcessClassifier()
model.fit(X_train_scaled, y_train)
preds = (model.predict(X_test_scaled)>0.5).astype(int)
print(accuracy_score(y_test, preds), precision_score(y_test, preds), recall_score(y_test, preds) , f1_score(y_test, preds))

0.5254237288135594 0.5301204819277109 0.4943820224719101 0.5116279069767442


### Naive Bayes

In [35]:
from sklearn.naive_bayes import GaussianNB

In [36]:
model = GaussianNB()
model.fit(X_train_scaled, y_train)
preds = (model.predict(X_test_scaled)>0.5).astype(int)
print(accuracy_score(y_test, preds), precision_score(y_test, preds), recall_score(y_test, preds) , f1_score(y_test, preds))

0.480225988700565 0.4745762711864407 0.3146067415730337 0.3783783783783784


### Decision Trees

In [37]:
from sklearn import tree


In [38]:
for max_depth in range(1,10):
  model = tree.DecisionTreeClassifier()
  model.fit(X_train_scaled, y_train)
  preds = (model.predict(X_test_scaled)>0.5).astype(int)
  print(accuracy_score(y_test, preds), precision_score(y_test, preds), recall_score(y_test, preds) , f1_score(y_test, preds))



0.4519774011299435 0.45454545454545453 0.449438202247191 0.45197740112994345
0.4971751412429379 0.5 0.5056179775280899 0.5027932960893854
0.5310734463276836 0.5348837209302325 0.5168539325842697 0.5257142857142857
0.4745762711864407 0.4777777777777778 0.48314606741573035 0.4804469273743017
0.4632768361581921 0.4625 0.4157303370786517 0.4378698224852071
0.5028248587570622 0.5054945054945055 0.5168539325842697 0.5111111111111112
0.4915254237288136 0.4946236559139785 0.5168539325842697 0.5054945054945056
0.4858757062146893 0.4888888888888889 0.4943820224719101 0.4916201117318436
0.5028248587570622 0.5057471264367817 0.4943820224719101 0.5


### Neural network from scikit

In [39]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [40]:
for learning_rate in ['constant']:
  model = MLPClassifier(hidden_layer_sizes = (200,), learning_rate = learning_rate, random_state = 42, max_iter = 1000)
  model.fit(X_train_scaled, y_train)
  preds = (model.predict(X_test_scaled)>0.5).astype(int)
  print(accuracy_score(y_test, preds), precision_score(y_test, preds), recall_score(y_test, preds) , f1_score(y_test, preds))



0.519774011299435 0.5232558139534884 0.5056179775280899 0.5142857142857142




## Более сложные модели

### Случайный лес

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
model = RandomForestClassifier(n_estimators = 35)
model.fit(X_train_scaled, y_train)
preds = (model.predict(X_test_scaled)>0.5).astype(int)
print (make_score(y_test, preds))


[0.480225988700565, 0.4835164835164835, 0.4943820224719101, 0.48888888888888893]


In [43]:
params = {
   'n_estimators': [10, 100, 500, 1000],
   'criterion': ['gini', 'entropy'],
   'max_depth': [None, 4, 16, 64, 256],
   'max_features': ['auto', 'sqrt', 'log2', 5, 8],
   'max_leaf_nodes': [None, 2, 5, 10, 20, 50],
   'class_weight': ['balanced', 'balanced_subsample']
   }
 

In [44]:
scoring={'accuracy': 'accuracy', 
        'f1': 'f1',
         'roc_auc': 'roc_auc',
         'recall': 'recall',
         'precision':'precision'
}
         

In [45]:
param_grid_test  = {
    'n_estimators': [10, 100],
   'criterion': ['gini', 'entropy']
  }                    


In [46]:
from sklearn.model_selection import GridSearchCV

In [47]:
#gs = GridSearchCV(RandomForestClassifier(random_state=42, n_jobs= -1,),
                  #param_grid=params,
                  #scoring=scoring,
                  #refit='accuracy',
                  #return_train_score=False)

In [48]:
#gs.fit(X_train_scaled, y_train)

In [49]:
#gs.best_params_

In [50]:
#gs.best_score_

In [51]:
#gs.cv_results_

In [52]:
#preds = gs.predict(X_test_scaled)
#make_score(y_test, preds)

### AdaBoost

In [53]:
from sklearn.ensemble import AdaBoostClassifier

In [56]:
accuracy_list = []
max_accuracy = 0
max_precision = 0
max_recall = 0
max_f1 = 0
max_accuracy_n = 0
max_precision_n = 0
max_recall_n = 0
max_f1_n = 0


model = AdaBoostClassifier(n_estimators = 88)
model.fit(X_train_scaled, y_train)
preds = (model.predict(X_test_scaled)>0.5).astype(int)

if make_score(y_test, preds)[0]>max_accuracy:
  max_accuracy = make_score(y_test, preds)[0]
  max_accuracy_n = n_estimators = 88
if make_score(y_test, preds)[1]>max_precision:
  max_precision = make_score(y_test, preds)[1]
  max_precision_n = n_estimators
if make_score(y_test, preds)[2]>max_recall:
  max_recall = make_score(y_test, preds)[2]
  max_recall_n = n_estimators
if make_score(y_test, preds)[3]>max_f1:
  max_f1 = make_score(y_test, preds)[3]
  max_f1_n = n_estimators

  #print (make_score(y_test, preds))
  accuracy_list.append(make_score(y_test, preds))

In [58]:
print(max_accuracy,
max_precision,
max_recall,
max_f1,
max_accuracy_n,
max_precision_n,
max_recall_n,
max_f1_n,)

0.5875706214689266 0.58 0.651685393258427 0.6137566137566138 88 88 88 88


In [59]:
model = AdaBoostClassifier(n_estimators = 88)
model.fit(X_train_scaled, y_train)
preds = (model.predict(X_test_scaled)>0.5).astype(int)
print (make_score(y_test, preds))

[0.5875706214689266, 0.58, 0.651685393258427, 0.6137566137566138]


In [60]:
np.array(accuracy_list).max ## после (1-400)

<function ndarray.max>

In [61]:
max(accuracy_list) ## после (400-800)

[0.5875706214689266, 0.58, 0.651685393258427, 0.6137566137566138]

In [62]:
# plt.figure(figsize=(20, 10))
# plt.plot(np.arange(1,len(accuracy_list)+1),accuracy_list)
# plt.suptitle("The dependence of quality metrics on the n_estimators parameter ", fontsize = 20)
# plt.xlabel("n_estimators", fontsize = 20)
# plt.ylabel("value of metric", fontsize = 20)
# #plt.minorticks_on()
# plt.grid(which='major', 
#         color = 'k', 
#         linewidth = 1)
# plt.grid(which='minor', 
#         color = 'k', 
#         linestyle = ':')
# plt.legend(['acc','prec','rec', 'f1'], prop={'size': 20})
# plt.show
# #plt.savefig('Dependence of methics on the n_estimators parametr (1-400)')

### Gradient Tree Boosting

In [63]:
from sklearn.ensemble import GradientBoostingClassifier

In [64]:
model = GradientBoostingClassifier(n_estimators=88, learning_rate=1.0, max_depth=1)
model.fit(X_train_scaled, y_train)
preds = (model.predict(X_test_scaled)>0.5).astype(int)
print (make_score(y_test, preds))

[0.5932203389830508, 0.5825242718446602, 0.6741573033707865, 0.625]


### Histogram-Based Gradient Boosting

In [65]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

In [66]:
model = HistGradientBoostingClassifier(max_iter=10000)
model.fit(X_train_scaled, y_train)
preds = (model.predict(X_test_scaled)>0.5).astype(int)
print (make_score(y_test, preds))

[0.5084745762711864, 0.51, 0.5730337078651685, 0.5396825396825395]


### Voting Classifier

In [67]:
from sklearn.ensemble import VotingClassifier

In [68]:
clf1 = KNeighborsClassifier(n_neighbors = 51, metric='euclidean')
clf2 = MLPClassifier(hidden_layer_sizes = (200,),
                     learning_rate = 'constant',
                     random_state = 42,
                     max_iter = 200)
clf3 = AdaBoostClassifier(n_estimators = 88)
clf4 = GradientBoostingClassifier(n_estimators=88, learning_rate=1.0, max_depth=1)

In [69]:
model  = VotingClassifier(estimators=[('KNN', clf1), ('MLP', clf2), ('Ada', clf3), ('GB', clf4)], voting='hard')
model.fit(X_train_scaled, y_train)
preds = (model.predict(X_test_scaled)>0.5).astype(int)
print (make_score(y_test, preds))



[0.632768361581921, 0.6666666666666666, 0.5393258426966292, 0.5962732919254659]


In [71]:
for clf, label in zip([clf1, clf2, clf3, clf4,], ['Nearest Neighbors', 'Neural Network', 'AdaBoost', 'Gradient Boosting',]):
     scores = cross_val_score(clf, X_test_scaled, y_test, scoring='accuracy', cv=5)
     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.48 (+/- 0.12) [Nearest Neighbors]




Accuracy: 0.50 (+/- 0.08) [Neural Network]
Accuracy: 0.46 (+/- 0.09) [AdaBoost]
Accuracy: 0.47 (+/- 0.06) [Gradient Boosting]


## Скачивание модели

In [72]:
import joblib

In [73]:
joblib.dump(model, '/content/drive/MyDrive/MTS.Project/Strategy/Trend_classification/trend_classification_model.pkl')

['/content/drive/MyDrive/MTS.Project/Strategy/Trend_classification/trend_classification_model.pkl']

# ОСТАЛЬНЫЕ МОДЕЛИ

In [77]:
symbols = ['LKOH', 'NVTK', 'ROSN', 'SIBN', 'SNGS']

In [130]:
clf1 = KNeighborsClassifier(n_neighbors = 51, metric='euclidean')
clf2 = MLPClassifier(hidden_layer_sizes = (200,),
                     learning_rate = 'constant',
                     random_state = 42,
                     max_iter = 200)
clf3 = AdaBoostClassifier(n_estimators = 88)
clf4 = GradientBoostingClassifier(n_estimators=88, learning_rate=1.0, max_depth=1)

In [136]:
symbols = ['LKOH', 'NVTK', 'ROSN', 'SIBN', 'SNGS', 'GAZP']
for symbol in symbols:
  df = pd.read_csv('/content/'+symbol+'_170101_210829.csv')
  df['<TIME>'][df['<TIME>']==0]='000000'
  df ['<DATE>'] = df['<DATE>'].astype(str)+df['<TIME>'].astype(str)
  df['<DATE>'] = pd.to_datetime(df['<DATE>'], format='%Y%m%d%H%M%S') 
  df.drop(columns=['<TIME>'], inplace=True)
  df_dif = df[1:].reset_index(drop = True)
  df_dif['<CLOSE>'] = df[1:]['<CLOSE>'].reset_index(drop = True,) - df[0:len(df)-1]['<CLOSE>']
  df_dif = df_dif[['<DATE>','<CLOSE>']]
  df_dif.columns = ['TIMEDATE', 'Close']
  #print (df_dif.head(1))
  df_new = make_dataset(df_dif)
  df_new['target'] = (df_new ['target'] > 0).astype(int)

  X_train = df_new.drop(columns = ['start_datatime','end_datatime', 'target', 'target_datetime'])
  y_train = df_new['target']
  scaler = preprocessing.StandardScaler().fit(X_train)
  X_train_scaled = scaler.transform(X_train)

  #model  = VotingClassifier(estimators=[('KNN', clf1), ('MLP', clf2), ('Ada', clf3), ('GB', clf4)], voting='hard')
  #model.fit(X_train_scaled, y_train)

  joblib.dump(scaler, '/content/drive/MyDrive/MTS.Project/Strategy/Trend_classification/trend_classification_normalizer_'+symbol+'.pkl')

  #preds = (model.predict(X_train)>0.5).astype(int)
  #print (make_score(y_train, preds))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is tryin

In [133]:
# df = pd.read_csv("/content/LKOH_170101_210829.csv", )
# df['<TIME>'][df['<TIME>']==0]='000000'
# df ['<DATE>'] = df['<DATE>'].astype(str)+df['<TIME>'].astype(str)
# df['<DATE>'] = pd.to_datetime(df['<DATE>'], format='%Y%m%d%H%M%S') 
# df.drop(columns=['<TIME>'], inplace=True)


In [134]:
# df = pd.read_csv("/content/LKOH_170101_210829.csv", )
# #df['<DATE>'] = pd.to_datetime(df['<DATE>'] )
# df