In [1]:
import pandas as pd
import numpy as np
import re
import pymorphy2
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score, accuracy_score

# Load the Russian stop-word list (one word per line).
# The encoding is pinned so the Cyrillic file is decoded identically on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): assumes ../data/russian.txt is stored as UTF-8 — confirm.
with open('../data/russian.txt', encoding='utf-8') as f:
    stopwords = [line.replace('\n', '') for line in f]

# Remove every entry containing 'не' from the stop-word list, presumably so that
# negations stay in the vocabulary. This is a substring test, so any listed word
# that merely contains 'не' is removed from the stop list as well.
stopwords = [word for word in stopwords if 'не' not in word]

## Загрузка данных

In [3]:
# Issuer reference tables (Excel); the first sheet column becomes the index.
issuers_df = pd.read_excel('../data/issuers.xlsx', index_col=0)
issuers_plus_df = pd.read_excel('../data/issuers_plus.xlsx', index_col=0)

# Additional names / synonyms per issuer, read with the default integer index.
issuers_additional_df = pd.read_excel('../data/names and synonyms.xlsx')

# Pickled message tables: plain mentions and sentiment-labelled messages.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
mentions_texts_df, sentiment_texts_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('../data/mentions texts.pickle', '../data/sentiment_texts.pickle')
)

## Предобработка данных

In [5]:
# Replace EMITENT_FULL_NAME with the version from issuers_plus_df (found synonyms).
# A column left over from a previous run of this cell is dropped first, so that
# re-running the cell does not produce EMITENT_ADDITIONAL_NAME_x / _y duplicates
# in the merge further down.
issuers_df = (
    issuers_df
    .drop(columns=['EMITENT_FULL_NAME'])
    .drop(columns=['EMITENT_ADDITIONAL_NAME'], errors='ignore')
    .merge(issuers_plus_df[['issuerid', 'EMITENT_FULL_NAME']], on='issuerid')
)

# Build one comma-separated string of similar names per issuer.
# The derived column is excluded from its own sources: it is appended after
# 'Unnamed: 5', so on a re-run the label slice would otherwise pick it up.
extra_name_cols = [
    col for col in issuers_additional_df.loc[:, 'Unnamed: 5':].columns
    if col != 'EMITENT_ADDITIONAL_NAME'
]
cols = ['EMITENT_FULL_NAME', 'BGTicker', 'BGTicker.1'] + extra_name_cols
issuers_additional_df['EMITENT_ADDITIONAL_NAME'] = issuers_additional_df[cols].apply(
    # Nulls are skipped; astype(str) keeps join() from raising on non-string cells.
    lambda row: ', '.join(row[row.notnull()].astype(str)),
    axis=1,
)

# Attach the combined names; validate='1:1' raises if issuerid is duplicated
# on either side of the merge.
issuers_df = issuers_df.merge(
    issuers_additional_df[['issuerid', 'EMITENT_ADDITIONAL_NAME']],
    on='issuerid',
    how='inner',
    validate='1:1',
)

# Give the mentions table the same layout as the sentiment table (no score known).
mentions_texts_df['SentimentScore'] = np.nan
keep_cols = ['ChannelID', 'MessageID', 'issuerid', 'DatePosted', 'MessageText', 'SentimentScore']
message_keys = ['ChannelID', 'MessageID', 'issuerid', 'MessageText', 'DatePosted']

# Collapse duplicated sentiment rows: rows scored 0 are discarded (noted as bad
# scores), the remaining scores are averaged per message/issuer and rounded up.
nonzero_score = sentiment_texts_df['SentimentScore'] != 0
sentiment_texts_df = (
    sentiment_texts_df[nonzero_score]
    .groupby(message_keys, as_index=False)['SentimentScore']
    .mean()
)
sentiment_texts_df['SentimentScore'] = sentiment_texts_df['SentimentScore'].apply(np.ceil)

# Stack mentions and sentiments, then keep one row per message/issuer.
# max() skips NaN, so a scored row wins over its unscored duplicate.
df = (
    pd.concat([mentions_texts_df[keep_cols], sentiment_texts_df[keep_cols]])
    .groupby(message_keys, as_index=False)['SentimentScore']
    .max()
    .reset_index(drop=True)
)

# Keep only messages whose issuer appears in the issuer reference table.
known_issuer = df['issuerid'].isin(issuers_df['issuerid'])
df = df[known_issuer].copy()

# Add the company names to every message row (many messages per issuer).
issuer_names = issuers_df[['issuerid', 'EMITENT_FULL_NAME', 'EMITENT_ADDITIONAL_NAME']]
df = df.merge(issuer_names, on=['issuerid'], how='left', validate='m:1')

df.head(3)

Unnamed: 0,ChannelID,MessageID,issuerid,MessageText,DatePosted,SentimentScore,EMITENT_FULL_NAME,EMITENT_ADDITIONAL_NAME
0,1001029560,1113,32,У «Ростелекома» вышло приложение «Аллё». Через...,2017-01-25 14:53:12,,"Публичное акционерное общество ""Аэрофлот – рос...","Публичное акционерное общество ""Аэрофлот – рос..."
1,1001029560,1177,62,Встречаем! Новая бумага на российском рынке ак...,2017-02-09 10:01:09,,"Публичное акционерное общество ""Детский мир""","Публичное акционерное общество ""Детский мир"", ..."
2,1001029560,1501,26,Несколько мыслей о текущей ситуации на рынке и...,2017-05-04 06:02:56,,"""Акционерная финансовая корпорация ""Система"" А...","Публичное акционерное общество ""Акционерная фи..."


## Evaluation

In [6]:
# Chronological split: order rows by posting time and cut at the 90th percentile.
df = df.sort_values(['DatePosted']).reset_index(drop=True)
split_date = df['DatePosted'].quantile(0.9)  # ~2k rows for the test part

# Task 1 (issuer detection): everything up to the cut is train, the rest is test.
train_df = df.loc[df['DatePosted'] <= split_date].copy()
test_df = df.loc[df['DatePosted'] > split_date].copy()

# Task 2 (sentiment): only rows that carry a score.
train_sentiment_df = train_df.loc[train_df['SentimentScore'].notna()].copy()
test_sentiment_df = test_df.loc[test_df['SentimentScore'].notna()].copy()

# Multi-label ground truth for F1: one row per message with the list of its issuers.
test_df = test_df.groupby(['ChannelID', 'MessageID', 'MessageText'], as_index=False)['issuerid'].apply(list)

# Sizes of both splits.
print('1st task:\nTrain:', len(train_df), '\nTest:', len(test_df))
print('\n2nd task:\nTrain:', len(train_sentiment_df), '\nTest:', len(test_sentiment_df))

train_df.head(3)

1st task:
Train: 17416 
Test: 1446

2nd task:
Train: 7147 
Test: 1927


Unnamed: 0,ChannelID,MessageID,issuerid,MessageText,DatePosted,SentimentScore,EMITENT_FULL_NAME,EMITENT_ADDITIONAL_NAME
0,1041482399,97,58,#Дивиденды Черкизово с большой долей вероятнос...,2016-03-23 21:33:16,,"Публичное акционерное общество ""Группа Черкизово""","Публичное акционерное общество ""Группа Черкизо..."
1,1041482399,1207,152,"Выручка Северстали в $. Славный бум 2011, 5 ле...",2016-07-21 11:41:06,,"Публичное акционерное общество ""Северсталь""","Публичное акционерное общество ""Северсталь"", C..."
2,1041482399,1387,24,"Итоги недели. Все те же Башнефть, Магнит и Лен...",2016-08-05 19:23:45,,"Публичное акционерное общество ""Акрон""","Публичное акционерное общество ""Акрон"", AKRN R..."


In [7]:
# Logistic regression for the issuer of a message.
# The TF-IDF vocabulary is learned from the issuer names only; message texts are
# then projected onto that vocabulary.
vectorizer = TfidfVectorizer(max_features=5000, stop_words=stopwords, ngram_range=(1, 1))
vectorizer.fit(issuers_df['EMITENT_ADDITIONAL_NAME'])

# Training set: train messages plus the issuer names themselves as labelled rows.
train_texts = pd.concat([train_df['MessageText'], issuers_df['EMITENT_ADDITIONAL_NAME']])
train_vectors = vectorizer.transform(train_texts)
train_target = pd.concat([train_df['issuerid'], issuers_df['issuerid']])

# Fit the classifier with balanced class weights.
clf_company = (
    LogisticRegression(random_state=0, class_weight='balanced', n_jobs=8)
    .fit(train_vectors, train_target)
)

In [8]:
# Multi-hot encode the true issuer lists of the test messages over all known issuers.
mlb = MultiLabelBinarizer(classes=issuers_df['issuerid'].values)
test_vectors = vectorizer.transform(test_df['MessageText'])
test_target = mlb.fit_transform(test_df['issuerid'])

# The classifier returns a single issuer per message; wrap each prediction in a
# one-element list so it can be encoded the same way as the targets.
predicted_issuers = clf_company.predict(test_vectors)
predict = mlb.transform([[issuer_id] for issuer_id in predicted_issuers])
print('One class: ', f1_score(test_target, predict, average='macro'))

One class:  0.1972517054489638


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Logistic regression for the sentiment score; here TF-IDF is fit on the
# scored training messages themselves.
vectorizer = TfidfVectorizer(max_features=10000, stop_words=stopwords, ngram_range=(1, 1))

train_vectors = vectorizer.fit_transform(train_sentiment_df['MessageText'])
train_target = train_sentiment_df['SentimentScore']

test_vectors = vectorizer.transform(test_sentiment_df['MessageText'])
test_target = test_sentiment_df['SentimentScore']

clf_sentiment = (
    LogisticRegression(random_state=0, n_jobs=8)
    .fit(train_vectors, train_target)
)

# Accuracy on the held-out scored messages.
test_predictions = clf_sentiment.predict(test_vectors)
accuracy_score(test_target, test_predictions)

0.5879605604566684

## Train final artifacts

**company**

In [10]:
# Final issuer classifier, trained on all messages (train and test periods).
# NOTE(review): max_features is 10000 here, while the evaluated company model
# above uses 5000 — confirm which setting is intended for the saved artifact.
vectorizer = TfidfVectorizer(max_features=10000, stop_words=stopwords, ngram_range=(1, 1))
vectorizer.fit(issuers_df['EMITENT_ADDITIONAL_NAME'])

# All messages plus the issuer names themselves as labelled rows.
train_texts = pd.concat([df['MessageText'], issuers_df['EMITENT_ADDITIONAL_NAME']])
train_vectors = vectorizer.transform(train_texts)
train_target = pd.concat([df['issuerid'], issuers_df['issuerid']])

clf_company = (
    LogisticRegression(random_state=0, class_weight='balanced', n_jobs=8)
    .fit(train_vectors, train_target)
)

# Persist the vectorizer together with the model it belongs to.
joblib.dump(vectorizer, '../model/vectorizer_company.pkl')
joblib.dump(clf_company, '../model/clf_company.pkl')

['../model/clf_company.pkl']

**sentiment**

In [12]:
# Final sentiment classifier: every message that has a score, from all periods.
df_sentiment = df.loc[df['SentimentScore'].notna()].copy()

vectorizer = TfidfVectorizer(max_features=10000, stop_words=stopwords, ngram_range=(1, 1))
train_vectors = vectorizer.fit_transform(df_sentiment['MessageText'])
train_target = df_sentiment['SentimentScore']

# Fit on the full scored set.
clf_sentiment = (
    LogisticRegression(random_state=0, n_jobs=8)
    .fit(train_vectors, train_target)
)

# Persist the vectorizer together with the model it belongs to.
joblib.dump(vectorizer, '../model/vectorizer_sentiment.pkl')
joblib.dump(clf_sentiment, '../model/clf_sentiment.pkl')

['../model/clf_sentiment.pkl']

**check prod**

In [15]:
import json

# Smoke test of the saved artifacts: reload them from disk and score sample texts.
with open('../data/test_texts.json', "r", encoding="utf-8") as f:
    data = json.load(f)

# Issuer prediction, cast to plain int.
company_model = joblib.load('../model/clf_company.pkl')
company_vectorizer = joblib.load('../model/vectorizer_company.pkl')
companies = [int(label) for label in company_model.predict(company_vectorizer.transform(data))]

# Sentiment prediction, cast to plain float.
sentiment_model = joblib.load('../model/clf_sentiment.pkl')
sentiment_vectorizer = joblib.load('../model/vectorizer_sentiment.pkl')
sentiments = [float(score) for score in sentiment_model.predict(sentiment_vectorizer.transform(data))]

# One single-element list per text, holding its (issuer, sentiment) tuple.
result = [[(company, sentiment)] for company, sentiment in zip(companies, sentiments)]
result