In [3]:
import pandas as pd
import numpy as np
import time as tm
import os
from selenium import webdriver
from csv import writer

In [None]:
def parse_news():
    DELAY_TO_DOWNLOAD = 2
    DELAY_TO_SCROLL = 0.1
    TIMES_TO_SCROLL = 300

    if not os.path.exists('/Users/mikezinovenkov/venv/Diploma/all_files/data/news'):
        os.mkdir('/Users/mikezinovenkov/venv/Diploma/all_files/data/news')

    COMPANIES = ["МТС", "Газпром", "Лукойл", "Роснефть", "Транснефть", "Новатэк"]

    driver = webdriver.Chrome(executable_path="chromedriver.exe")

    for company in COMPANIES:
        url = f"https://www.rbc.ru/tags/?tag={company}&category=economics%7Cfinances%7Cpolitics%7Cneweconomy%7Cmoney&project=rbcnews"
        driver.get(url)
        tm.sleep(DELAY_TO_DOWNLOAD)

        for _ in range(TIMES_TO_SCROLL):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            tm.sleep(DELAY_TO_SCROLL)

        articles = driver.find_elements(by='css selector', value='.search-item')


        with open(f"/Users/mikezinovenkov/venv/Diploma/all_files/data/news/{company}.csv", "w", encoding="utf-8", newline="") as file:
            csv_writer = writer(file)
            headers = ["Date", "Category", "Title", "Text"]
            csv_writer.writerow(headers)

            for article in articles:
                if not article.text:
                    continue

                article_text = article.text.split("\n")

                category, date, time = article_text[0].split(", ")
                title = article_text[1]
                text = "NO TEXT"
                if len(article_text) > 2:
                    text = article_text[2]

                csv_writer.writerow([f"{date} {time}", category, title, text])

    driver.quit()

# Создание датафрема обучающих данных

def tr():
    df = pd.read_csv('lenta-ru-news.csv')
    topics = ['Экономика', 'Бизнес', 'Россия', 'Интернет и СМИ', 'Наука и техника']

    tags = ['Бизнес','Техника','Политика','Госэкономика','Рынки','Деньги','Достижения','Мировой бизнес','Финансы компаний','Госрегулирование','Инновации','Киберпреступность','Экономика']
    df['flag_d'] = df['tags'].isin(tags)
    df = df[df['flag_d'] == True]
    df = df.drop(columns={'url','flag_d'})
    items = list(np.random.choice(len(df), 5000, replace=False))
    print(len(items), items[:5])
    df.reset_index(drop=True, inplace=True)
    df.reset_index(drop=False, inplace=True)
    df['flag'] = df['index'].isin(items)
    ans = df[df['flag'] == True]
    ans = ans.drop(columns={'index','flag','text'})
    ans['flag'] = ''
    ans = ans.reindex(columns=['flag','title','topic','tags','date'])
    ans.to_csv('make_classes.csv', index=False)

# Обработка данных для обучения/теста

from nltk.tokenize import WordPunctTokenizer
from razdel import tokenize, sentenize
import pymorphy2

# Лемматизация

def lemmatize(text):
    morph = pymorphy2.MorphAnalyzer()
    words = text.split()
    res = list()
    for word in words:
        p = morph.parse(word)[0]
        res.append(p.normal_form)

    return ' '.join(res)

# Удаление стоп-слов

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import WordPunctTokenizer

def remove_stopwords(text):
       tokenizer = WordPunctTokenizer()
       russian_stopwords = stopwords.words("russian")
       tokens = tokenizer.tokenize(text)
       res = [token for token in tokens if token.lower() not in russian_stopwords\
              and token not in punctuation]

       return ' '.join(res)
def rework_train_data(filename):
    df = pd.read_csv(f'data/news/{filename}.csv', sep=';')
    
    df[df['tags'].isin(['Рынки','Деньги','Финансы компаний'])]['tags'] = 'Финансы'
    df[df['tags'].isin(['Госэкономика'])]['tags'] = 'Экономика'
    df[df['tags'].isin(['Госрегулирование','Киберпреступность'])]['tags'] = 'Политика'
    df[df['tags'].isin(['Мировой бизнес','Техника','Достижения'])]['tags'] = 'Бизнес'
    df = df.fillna('nan')
    df = df[df['flag'] != 'nan']

    for row in range(len(df)):
        df['title'].iloc[row] = lemmatize(df['title'].iloc[row])

    for row in range(len(df)):
        df['title'].iloc[row] = remove_stopwords(df['title'].iloc[row])

    return df

def rework_test_data(name):
    df = pd.read_csv(f'data/news/{name}.csv')
    df = df.drop(columns=['Text'])

    for row in range(len(df)):
        df['Title'].iloc[row] = lemmatize(df['Title'].iloc[row])

    for row in range(len(df)):
        df['Title'].iloc[row] = remove_stopwords(df['Title'].iloc[row])

    return df

In [4]:
names = ['Газпром','Лукойл','МТС','Новатэк','Роснефть','Транснефть']

df_train = rework_train_data('make_classes')

# Обучение модели

x = df_train['title']
y = df_train['flag']
y=y.astype('int')

my_tags = y.unique()

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

model = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier()),
               ])
model.fit(x, y)
months = {'янв':'01','фев':'02','мар':'03','апр':'04','май':'05','июн':'06',\
    'июл':'07','авг':'08','сен':'09','окт':'10','ноя':'11','дек':'12'}
comps = {'Газпром':'GAZP','МТС':'MTSS','Лукойл':'LKOH','Новатэк':'NVTK','Роснефть':'ROSN','Транснефть':'TRNFP'}

for name in names:
    df = rework_test_data(name)
    df['flag'] = model.predict(df['Title'])

    df['true_date'] = df['Date'].apply(lambda x: x[:2] + '-' + months[x[3:6]] + '-' + ('2023' if len(x) < 17 else x[7:11]))
    df = df.drop(columns=['Date'])
    df = df.rename(columns={'true_date':'Date'})

    ans = df.groupby('Date', as_index=False).agg({'flag':'sum'})

    ans.to_csv(f'data/news/done_{comps[name]}.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['tags'].isin(['Рынки','Деньги','Финансы компаний'])]['tags'] = 'Финансы'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['tags'].isin(['Госэкономика'])]['tags'] = 'Экономика'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['tags'].isin(['Госрегулирование','Киберпреступность'])]['tag