In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, accuracy_score
from scipy.sparse import hstack, vstack
# from matplotlib import pyplot as plt

## Загрузка данных

In [2]:
issuers_df = pd.read_excel('../data/issuers.xlsx', index_col = 0)
issuers_additional_df = pd.read_excel('../data/names and synonyms.xlsx')
mentions_texts_df = pd.read_pickle('../data/mentions texts.pickle')
sentiment_texts_df = pd.read_pickle('../data/sentiment_texts.pickle')

## Предобработка данных

In [3]:
# добавим похожие названия
cols  =['EMITENT_FULL_NAME', 'BGTicker', 'BGTicker.1'] + list(issuers_additional_df.loc[:,'Unnamed: 5':].columns)
issuers_additional_df['EMITENT_ADDITIONAL_NAME'] = issuers_additional_df[cols].apply(lambda x: ', '.join(x[x.notnull()]), axis = 1)
issuers_df= issuers_df.merge(issuers_additional_df[['issuerid', 'EMITENT_ADDITIONAL_NAME']], on = 'issuerid', how = 'inner', validate = '1:1')

In [4]:
# приведем к виду сентимента
mentions_texts_df['SentimentScore'] = np.nan
keep_cols = ['ChannelID', 'MessageID', 'issuerid', 'DatePosted', 'MessageText', 'SentimentScore']
mentions_texts_df[keep_cols].head(3)

Unnamed: 0,ChannelID,MessageID,issuerid,DatePosted,MessageText,SentimentScore
0,1197210433,5408,90,2020-04-29 07:29:01,?? Фокус недели #ФН Сегодня ????? ММК опублик...,
1,1203560567,64803,57,2020-01-21 12:51:42,??#LSRG ЛСР - операционные результаты (2019г)...,
2,1197210433,23389,152,2021-07-21 11:15:46,#CHMF Северсталь (CHMF) впервые поставила в Бр...,


In [5]:
# объединим дубли
sentiment_texts_df = sentiment_texts_df.groupby(['ChannelID', 'MessageID', 'issuerid', 'MessageText', 'DatePosted'], as_index = False)['SentimentScore'].mean()
sentiment_texts_df['SentimentScore'] = sentiment_texts_df['SentimentScore'].apply(np.ceil)
sentiment_texts_df[keep_cols].head(3)

Unnamed: 0,ChannelID,MessageID,issuerid,DatePosted,MessageText,SentimentScore
0,1001029560,12638,235,2022-12-26 13:53:40,❓ На чем можно заработать в последнюю неделю г...,0.0
1,1001029560,12745,235,2023-01-11 17:35:21,🥇 Прогноз по золоту на 2023 В среднесрочной п...,0.0
2,1001029560,12818,235,2023-01-20 15:22:33,Три интересные бумаги на следующую неделю Вну...,0.0


In [6]:
# объединим
df = pd.concat([mentions_texts_df[keep_cols], sentiment_texts_df[keep_cols]])
df = df.groupby(['ChannelID', 'MessageID', 'issuerid', 'MessageText', 'DatePosted'], as_index = False)['SentimentScore'].max()
df.reset_index(drop = True, inplace = True)

# оставим только валидные компании
df = df[df['issuerid'].isin(issuers_df['issuerid'])].copy()

# # оставим только одно сообщение = одна компания
# df = df[~df.duplicated(['ChannelID', 'MessageID'], keep = 'first')].copy()
# df.reset_index(drop = True, inplace = True)

# добавим названия компаний
df = df.merge(issuers_df[['issuerid', 'EMITENT_FULL_NAME']], on = ['issuerid'], how = 'left', validate = 'm:1')

df.head(3)

Unnamed: 0,ChannelID,MessageID,issuerid,MessageText,DatePosted,SentimentScore,EMITENT_FULL_NAME
0,1001029560,1113,32,У «Ростелекома» вышло приложение «Аллё». Через...,2017-01-25 14:53:12,,"Публичное акционерное общество ""Аэрофлот – рос..."
1,1001029560,1177,62,Встречаем! Новая бумага на российском рынке ак...,2017-02-09 10:01:09,,"Публичное акционерное общество ""Детский мир"""
2,1001029560,1501,26,Несколько мыслей о текущей ситуации на рынке и...,2017-05-04 06:02:56,,"Публичное акционерное общество ""Акционерная фи..."


**Соберем каркас для обучения**

In [7]:
# collect all possible combinations of messages AND issuerid
df['target'] = 1
df['key'] = df['ChannelID'].astype('str') + '_' + df['MessageID'].astype('str')
df1 = df[['key']].drop_duplicates().copy()
df2 = issuers_df[['issuerid']].drop_duplicates().copy()
df1['key2'] = 1
df2['key2'] = 1
base_df = pd.merge(df1, df2, on ='key2').drop("key2", axis = 1)

# add basic info
base_df= base_df.merge(df[['key', 'ChannelID', 'MessageID', 'DatePosted']].drop_duplicates(), on = ['key'], how = 'left', validate = 'm:1')

# add target
base_df = base_df.merge(df[['key', 'issuerid', 'target']].drop_duplicates(), on = ['key', 'issuerid'], how = 'left', validate = 'm:1')
base_df['target'].fillna(0, inplace = True)

base_df.head(3)

Unnamed: 0,key,issuerid,ChannelID,MessageID,DatePosted,target
0,1001029560_1113,1,1001029560,1113,2017-01-25 14:53:12,0.0
1,1001029560_1113,2,1001029560,1113,2017-01-25 14:53:12,0.0
2,1001029560_1113,3,1001029560,1113,2017-01-25 14:53:12,0.0


**обработаем тексты**

In [8]:
import re
import pymorphy2
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

morph = pymorphy2.MorphAnalyzer()

In [9]:
words_regex = re.compile('\w+')

def find_words(text, regex = words_regex):
    tokens =  regex.findall(text.lower())
    return [w for w in tokens if w.isalpha() and len(w) >= 2]

with open('../data/russian.txt') as f:
    stopwords = f.readlines()
stopwords = [x.replace('\n', '') for x in stopwords]
stopwords = [x for x in stopwords if 'не' not in x]

def lemmatize(words, lemmer = morph, stopwords = stopwords):
    lemmas = [lemmer.parse(w)[0].normal_form for w in words]
    return [w for w in lemmas if not w in stopwords
            and w.isalpha()]

def preprocess(text):
    return (lemmatize(find_words(text)))

In [10]:
issuers_df['issuer_text'] = issuers_df['EMITENT_ADDITIONAL_NAME'].apply(preprocess)
df['message_text'] = df['MessageText'].apply(preprocess)

df['message_len'] = df['message_text'].apply(len)
issuers_df['issuer_len'] = issuers_df['issuer_text'].apply(len)

In [11]:
# векторизуем текст
vectorizer = TfidfVectorizer(max_features=100, stop_words=stopwords, ngram_range=(1, 1))
# vectorizer.fit(pd.concat([df['message_text'].map(lambda x: ', '.join(x)), issuers_df['issuer_text'].map(lambda x: ', '.join(x))]));
vectorizer.fit(issuers_df['issuer_text'].map(lambda x: ', '.join(x)));
tfidf_cols = ['TFIDF_' + x for x in vectorizer.get_feature_names_out()]

In [12]:
# добавим tfidf
df = pd.concat([df, pd.DataFrame(vectorizer.transform(df['message_text'].map(lambda x: ', '.join(x))).todense(), columns = tfidf_cols)], axis = 1)
issuers_df = pd.concat([issuers_df, pd.DataFrame(vectorizer.transform(issuers_df['issuer_text'].map(lambda x: ', '.join(x))).todense(), columns = tfidf_cols)], axis = 1)

**добавим признаки**

In [17]:
base_df = base_df.merge(df[~df.duplicated(['key'], keep = 'first')][['key', 'message_text', 'message_len'] + tfidf_cols], on = ['key'], how = 'left', validate = 'm:1')
base_df = base_df.merge(issuers_df[['issuerid', 'issuer_text', 'issuer_len']], on = ['issuerid'], how = 'left', validate = 'm:1')

In [18]:
base_df['str_intersection'] = [len(set(a).intersection(b)) for a, b in zip(base_df['message_text'], base_df['issuer_text'])]
base_df['dayofweek'] = base_df['DatePosted'].dt.dayofweek
base_df['hour'] = base_df['DatePosted'].dt.hour

In [19]:
base_df.head(3)

Unnamed: 0,key,issuerid,ChannelID,MessageID,DatePosted,target,message_text,message_len,TFIDF_bank,TFIDF_group,...,TFIDF_энергетический,TFIDF_энергия,TFIDF_энерго,TFIDF_энергосбытовый,TFIDF_южный,issuer_text,issuer_len,str_intersection,dayofweek,hour
0,1001029560_1113,1,1001029560,1113,2017-01-25 14:53:12,0.0,"[ростелеком, выйти, приложение, аллё, звонить,...",31,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,"[акционерный, коммерческий, банк, держава, пуб...",11,0,2,14
1,1001029560_1113,2,1001029560,1113,2017-01-25 14:53:12,0.0,"[ростелеком, выйти, приложение, аллё, звонить,...",31,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,"[московский, кредитный, банк, публичный, акцио...",18,0,2,14
2,1001029560_1113,3,1001029560,1113,2017-01-25 14:53:12,0.0,"[ростелеком, выйти, приложение, аллё, звонить,...",31,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,"[российский, акционерный, коммерческий, дорожн...",28,0,2,14


In [21]:
basic_features =['message_len', 'issuer_len', 'str_intersection']
basic_categorical_features = ['ChannelID', 'dayofweek', 'hour'] # add DatePosted
target_col = 'target'

features = basic_features + tfidf_cols
categorical_features = basic_categorical_features

print(f'Всего используем {len(features + categorical_features)} признаков, {len(categorical_features)} из которых категориальные')

Всего используем 106 признаков, 3 из которых категориальные


## Catboost

In [22]:
import catboost as ctb
from catboost import CatBoostClassifier, cv, Pool
from sklearn.model_selection import TimeSeriesSplit, train_test_split

In [23]:
base_df.sort_values(['DatePosted'], inplace = True)
base_df.reset_index(inplace = True, drop = True)
split_date = base_df['DatePosted'].quantile(0.9) # ~2к для теста

# general train / test
train_df = base_df[base_df['DatePosted'] <= split_date].copy()
test_df = base_df[base_df['DatePosted'] > split_date].copy()

X_train, y_train = train_df[features+categorical_features], train_df[target_col]
X_test, y_test = test_df[features+categorical_features], test_df[target_col]

print('1st task:\nTrain:', train_df.shape[0], '\nTest:', test_df.shape[0])

train_df.head(3)

1st task:
Train: 3830100 
Test: 425340


Unnamed: 0,key,issuerid,ChannelID,MessageID,DatePosted,target,message_text,message_len,TFIDF_bank,TFIDF_group,...,TFIDF_энергетический,TFIDF_энергия,TFIDF_энерго,TFIDF_энергосбытовый,TFIDF_южный,issuer_text,issuer_len,str_intersection,dayofweek,hour
0,1041482399_97,1,1041482399,97,2016-03-23 21:33:16,0.0,"[дивиденд, черкизовый, большой, доля, вероятно...",13,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,"[акционерный, коммерческий, банк, держава, пуб...",11,0,2,21
1,1041482399_97,163,1041482399,97,2016-03-23 21:33:16,0.0,"[дивиденд, черкизовый, большой, доля, вероятно...",13,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,"[публичный, акционерный, общество, татнефть, ш...",9,0,2,21
2,1041482399_97,164,1041482399,97,2016-03-23 21:33:16,0.0,"[дивиденд, черкизовый, большой, доля, вероятно...",13,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,"[публичный, акционерный, общество, таттелек, t...",7,0,2,21


In [24]:
# переводим данные в формат катбус
data_train = ctb.Pool(X_train, y_train, cat_features=categorical_features)
data_test = ctb.Pool(X_test, y_test, cat_features=categorical_features)

n_fold = 5
tscv = TimeSeriesSplit(n_splits=n_fold, test_size=test_df.shape[0])
params = {
    'loss_function': 'Logloss',
    'iterations': 10000,
    'learning_rate': 0.5,
    'auto_class_weights': 'Balanced',
    'random_seed': 13
}
cv_res = ctb.cv(
    data_train, 
    params, 
    folds=tscv,
    early_stopping_rounds=25, 
    plot=False, 
    # logging_level = 'Silent',
)

Training on fold [0/5]
0:	learn: 0.3160691	test: 0.3311547	best: 0.3311547 (0)	total: 903ms	remaining: 2h 30m 31s
1:	learn: 0.2411719	test: 0.2550359	best: 0.2550359 (1)	total: 1.58s	remaining: 2h 11m 52s
2:	learn: 0.2159290	test: 0.2326714	best: 0.2326714 (2)	total: 2.33s	remaining: 2h 9m 25s
3:	learn: 0.2090073	test: 0.2252785	best: 0.2252785 (3)	total: 3.13s	remaining: 2h 10m 17s
4:	learn: 0.2040851	test: 0.2194172	best: 0.2194172 (4)	total: 3.99s	remaining: 2h 12m 50s
5:	learn: 0.1995905	test: 0.2161810	best: 0.2161810 (5)	total: 4.77s	remaining: 2h 12m 27s
6:	learn: 0.1931963	test: 0.2099375	best: 0.2099375 (6)	total: 5.68s	remaining: 2h 15m 9s
7:	learn: 0.1874857	test: 0.2067986	best: 0.2067986 (7)	total: 6.57s	remaining: 2h 16m 41s
8:	learn: 0.1866590	test: 0.2065262	best: 0.2065262 (8)	total: 7.3s	remaining: 2h 15m 3s
9:	learn: 0.1854089	test: 0.2055427	best: 0.2055427 (9)	total: 8.09s	remaining: 2h 14m 46s
10:	learn: 0.1851473	test: 0.2051414	best: 0.2051414 (10)	total: 8.44s	

In [25]:
best_it = cv_res['test-Logloss-mean'].idxmin()
params['iterations'] = best_it

clf = CatBoostClassifier(**params)
clf.fit(
    data_train,
    verbose=False,
    plot=False
);

In [33]:
check_df = test_df.copy()
# check_df['predict'] = clf.predict(data_test)

thresh = 0.94
check_df['predict'] = (clf.predict_proba(data_test)[:, 1] > thresh).astype(int)#.sum()

test_target = check_df.pivot_table(index = 'key', columns = 'issuerid', values='target')
test_predict = check_df.pivot_table(index = 'key', columns = 'issuerid', values='predict')

print('Score: ', f1_score(test_target, test_predict, average = 'macro'))

Score:  0.16315512697942805


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


+100tf-idf: Score:  0.15-0.16315512697942805
+thresh: 0.12257053058151901 (Score:  0.10617851209359845 based on train)
+dow/hour: 0.09805562123337348
base: Score:  0.09596176962736463
