In [None]:
import pandas as pd
import numpy as np
import torch

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LinearRegression

## Classification

In [None]:
df = pd.read_csv('researches/data/marked_stock_news_df.csv')
df

Unnamed: 0,day,open,news,difference,color
0,2009-01-01,619.53,[BOS][EOS],,1
1,2009-01-02,619.53,"[BOS]Да, да, уже наступил 2009 год...[EOS]",0.000000,1
2,2009-01-03,619.53,[BOS]Будем дружить?[EOS],0.000000,1
3,2009-01-04,619.53,"[BOS]Новый год - Новый Опыт, Новые знания[EOS]",0.000000,1
4,2009-01-05,619.53,"[BOS]Обзоры и идеи: ""Кирпич"" для G7""[SEP]Новый...",0.000000,1
...,...,...,...,...,...
5568,2024-03-31,3313.17,[BOS][EOS],0.000000,1
5569,2024-04-01,3343.66,[BOS]Рынок ждет дивидендов от MD Medical Group...,30.489990,2
5570,2024-04-02,3369.32,[BOS]Индекс МосБиржи продолжает движение вверх...,25.660156,2
5571,2024-04-03,3381.95,[BOS]После резкого взлета ожидается мягкая пос...,12.629883,1


In [None]:
df = pd.read_csv('researches/data/every_piece_of_news.csv')
df

Unnamed: 0,day,open,news,difference,color
0,2009-01-01,619.53,,,1
1,2009-01-02,619.53,"Да, да, уже наступил 2009 год...",0.000000,1
2,2009-01-03,619.53,Будем дружить?,0.000000,1
3,2009-01-04,619.53,"Новый год - Новый Опыт, Новые знания",0.000000,1
4,2009-01-05,619.53,"Обзоры и идеи: ""Кирпич"" для G7""",0.000000,1
...,...,...,...,...,...
101262,2024-04-04,3399.81,Что делать во времена рыночной неопределенности,17.860107,2
101263,2024-04-04,3399.81,Спрос в России не готов к резкому замедлению,17.860107,2
101264,2024-04-04,3399.81,«Самолет» - масштабирование любой ценой,17.860107,2
101265,2024-04-04,3399.81,Ускорение экономической динамики негативно пов...,17.860107,2


In [None]:
df = df.dropna()

In [None]:
df.isnull().sum()

day           0
open          0
news          0
difference    0
color         0
dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.news, df.color, random_state=42)
X_train.shape

(75943,)

In [None]:
def train_pipeline(pipeline: Pipeline):
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
sgd_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', SGDClassifier())
])

train_pipeline(sgd_pipeline)

Accuracy: 0.5595101718348805
Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.02      0.04      4967
           1       0.58      0.96      0.72     14444
           2       0.28      0.04      0.07      5904

    accuracy                           0.56     25315
   macro avg       0.37      0.34      0.28     25315
weighted avg       0.44      0.56      0.43     25315



In [None]:
sgd_tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('sgd', SGDClassifier())
])

train_pipeline(sgd_tfidf_pipeline)

Accuracy: 0.5704523010073079
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.00      0.00      4967
           1       0.57      1.00      0.73     14444
           2       0.27      0.00      0.00      5904

    accuracy                           0.57     25315
   macro avg       0.41      0.33      0.24     25315
weighted avg       0.47      0.57      0.41     25315



In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/380.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m327.7/380.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collec

In [None]:
import optuna
from sklearn.model_selection import cross_val_score

In [None]:
def objective(trial):
    params = {
        'loss': trial.suggest_categorical('loss', ['hinge', 'log_loss', 'modified_huber']),
        'penalty': trial.suggest_categorical('penalty', ['l2', 'l1', 'elasticnet']),
        'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1)
    }

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('sgd', SGDClassifier(random_state=42, **params))
    ])
    score = cross_val_score(pipeline, df.news, df.color, cv=5, scoring='accuracy').mean()

    return score


In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective)

best_params = study.best_params
print("Best Hyperparameters:", best_params)


[I 2024-04-08 17:58:17,607] A new study created in memory with name: no-name-6ee84517-7055-4423-8dd6-7e78e5eaef49
  'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1)
[I 2024-04-08 17:58:27,111] Trial 0 finished with value: 0.561130875847049 and parameters: {'loss': 'log', 'penalty': 'l2', 'alpha': 5.036690791215055e-05}. Best is trial 0 with value: 0.561130875847049.
  'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1)
[I 2024-04-08 17:58:35,994] Trial 1 finished with value: 0.5653380403801148 and parameters: {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.0001474906657758604}. Best is trial 1 with value: 0.5653380403801148.
  'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1)
[I 2024-04-08 17:58:43,994] Trial 2 finished with value: 0.5655849417677006 and parameters: {'loss': 'log', 'penalty': 'l1', 'alpha': 0.004425900607079905}. Best is trial 2 with value: 0.5655849417677006.
  'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1)
[I 2024-04-08 17:58:52,425] Trial 3 finis

Best Hyperparameters: {'loss': 'modified_huber', 'penalty': 'l1', 'alpha': 0.0013328923120878445}


In [None]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('sgd', SGDClassifier(loss='modified_huber', penalty='l1', alpha=0.013))
])

train_pipeline(pipeline)

Accuracy: 0.5705708078214498
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4967
           1       0.57      1.00      0.73     14444
           2       0.00      0.00      0.00      5904

    accuracy                           0.57     25315
   macro avg       0.19      0.33      0.24     25315
weighted avg       0.33      0.57      0.41     25315



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.linear_model import LogisticRegression

log_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression())
])

train_pipeline(log_pipeline)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.5122654552636776
Classification Report:
              precision    recall  f1-score   support

           0       0.24      0.10      0.14      4967
           1       0.59      0.80      0.67     14444
           2       0.27      0.16      0.20      5904

    accuracy                           0.51     25315
   macro avg       0.36      0.35      0.34     25315
weighted avg       0.44      0.51      0.46     25315



In [None]:
!pip install transformers



In [None]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

encoded_data_train = tokenizer.batch_encode_plus(
    X_train.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=64,
    return_tensors='pt'
)

encoded_data_test = tokenizer.batch_encode_plus(
    X_test.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=64,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_train.values)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(y_test.values)

train_data = TensorDataset(input_ids_train, attention_masks_train, labels_train)
train_dataloader = DataLoader(train_data, batch_size=16)

test_data = TensorDataset(input_ids_test, attention_masks_test, labels_test)
test_dataloader = DataLoader(test_data, batch_size=16)

model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
for epoch in range(10):
    model.train()
    print(f'Current epoch: {epoch}')
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, attention_masks, labels = [data.to(device) for data in batch]
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

Current epoch: 0
Current epoch: 1
Current epoch: 2


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score, classification_report

model.eval()
predictions = []
true_labels = []
for batch in test_dataloader:
    input_ids, attention_masks, labels = [data.to(device) for data in batch]
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
    logits = outputs.logits
    predictions.extend(logits.argmax(dim=1).tolist())
    true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5705313055500691


## Regression

In [None]:
df = pd.read_csv('researches/data/data.tsv', sep='\t')
df.head()

Unnamed: 0,title,score,link,summary,published,tickers
0,Электромобильный стартап Arrival экс-главы Yot...,-0.583333,https://www.rbc.ru/technology_and_media/12/05/...,"Британский электромобильный стартап Arrival, к...","Thu, 12 May 2022 05:10:01 +0300",['ARVL']
1,Экс-глава НМТП рассказал о «напряженных отноше...,-0.314286,https://www.rbc.ru/society/16/11/2020/5fb2709d...,Экс-председатель совета директоров Новороссийс...,"Fri, 20 May 2022 19:13:18 +0300",['NMTP']
2,Шрёдер отклонил предложение войти в совет дире...,-0.333333,https://www.rbc.ru/business/20/05/2022/628772b...,Его кандидатуру выдвинули в начале февраля. Ка...,"Tue, 24 May 2022 22:12:05 +0300",['GAZP']
3,Шельф берут в разработку // Генподрядчиком «Га...,0.7,https://www.kommersant.ru/doc/5482398,"Как стало известно “Ъ”, «Аврора» Андрея Патруш...","Fri, 29 Jul 2022 00:28:00 +0300",['GAZP']
4,"Чистый убыток ""Юнипро"" в 1 полугодии 2022 года...",-0.611111,https://www.finam.ru/analysis/newsitem/chistyi...,"Чистый убыток ""Юнипро"" в 1 полугодии 2022 года...","Thu, 28 Jul 2022 12:43:00 +0300",['UPRO']


In [None]:
df.isnull().sum()

title         0
score         0
link          0
summary      12
published     0
tickers       0
dtype: int64

In [None]:
train, test = train_test_split(df, random_state=42)

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('model', LinearRegression())
])

pipeline.fit(train['title'], train['score'])

print(mean_squared_error(test['score'], pipeline.predict(test['title']), squared=False))

print(pipeline.predict(["Яндекс увеличил прибыль на $1 млрд"]))

0.4945632000811333
[0.48365427]


In [None]:
import joblib
joblib.dump(pipeline, 'model.joblib')

In [None]:
finam_df = pd.read_csv('researches/data/every_piece_of_news.csv')
finam_df.head()

Unnamed: 0,day,open,news,difference,color
0,2009-01-01,619.53,,,1
1,2009-01-02,619.53,"Да, да, уже наступил 2009 год...",0.0,1
2,2009-01-03,619.53,Будем дружить?,0.0,1
3,2009-01-04,619.53,"Новый год - Новый Опыт, Новые знания",0.0,1
4,2009-01-05,619.53,"Обзоры и идеи: ""Кирпич"" для G7""",0.0,1


In [None]:
finam_df.isnull().sum()

day           0
open          0
news          9
difference    1
color         0
dtype: int64

In [None]:
finam_df.news = finam_df.news.fillna('')
finam_df.difference = finam_df.difference.fillna(0)

In [None]:
pred = pipeline.predict(finam_df.news)
finam_df['score'] = pred
finam_df.head()

Unnamed: 0,day,open,news,difference,color,score
0,2009-01-01,619.53,,0.0,1,0.020985
1,2009-01-02,619.53,"Да, да, уже наступил 2009 год...",0.0,1,0.020985
2,2009-01-03,619.53,Будем дружить?,0.0,1,0.020985
3,2009-01-04,619.53,"Новый год - Новый Опыт, Новые знания",0.0,1,-0.059655
4,2009-01-05,619.53,"Обзоры и идеи: ""Кирпич"" для G7""",0.0,1,0.123659


In [None]:
finam_df.to_csv('researches/data/finam_news_scored.csv')