In [11]:
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import numpy as np
import os
from imblearn.over_sampling import RandomOverSampler


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")
from pymystem3 import Mystem

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060]
[nltk_data]     Попытка установить соединение была безуспешной, т.к.
[nltk_data]     от другого компьютера за требуемое время не получен
[nltk_data]     нужный отклик, или было разорвано уже установленное
[nltk_data]     соединение из-за неверного отклика уже подключенного
[nltk_data]     компьютера>


In [12]:
mystem = Mystem()    

In [13]:
def not_so_simple_clean(s: str) -> str:
    s = str(s)
    s = s.lower()
    s = re.sub(r"[a-zA-Z\d]+", "", s)
    s = re.sub(r"[^\w\s]", "", s)
    s = re.sub(r"http\S+", "", s)
    s = s.strip()
    s = " ".join(mystem.lemmatize(s))

    return s

In [14]:
df = pd.read_csv("data/df_3000.csv")
df.shape

(22883, 5)

In [15]:
df['combined_text'] = df[['headers', 'sub_headers', 'text']].fillna('').apply(lambda x: ' '.join(x[x != '']), axis=1)

In [16]:
df = df.drop(columns= ["url", 'headers', 'sub_headers', 'text'])

In [None]:
df.head()

In [17]:
temp_batch_dir = os.path.join("data", "temp_batches")
os.makedirs(temp_batch_dir, exist_ok=True)

batch_size = 100

num_batches = np.ceil(len(df) / batch_size).astype(int)

In [18]:
for i in range(num_batches):
    batch_filename = os.path.join("data", "temp_batches", f"batch_{i}.csv")
    
    if os.path.exists(batch_filename):
        print(f"Batch {i} already processed. Skipping...")
        continue
    
    start_index = i * batch_size
    end_index = start_index + batch_size
    batch = df.iloc[start_index:end_index] 
    
    try:
        print(f"Batch {i}/{num_batches} is processening")
        batch_processed = batch['combined_text'].progress_apply(not_so_simple_clean)
        df.loc[start_index:end_index, 'combined_text'] = batch_processed
        
        batch.to_csv(batch_filename)
        print(f"Batch {i} processed and saved.")
        
    except Exception as e:
        print(f"Error processing batch {i}: {e}")
        break  

Batch 0/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 0 processed and saved.
Batch 1/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 1 processed and saved.
Batch 2/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 2 processed and saved.
Batch 3/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 3 processed and saved.
Batch 4/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 4 processed and saved.
Batch 5/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 5 processed and saved.
Batch 6/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 6 processed and saved.
Batch 7/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 7 processed and saved.
Batch 8/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 8 processed and saved.
Batch 9/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 9 processed and saved.
Batch 10/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 10 processed and saved.
Batch 11/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 11 processed and saved.
Batch 12/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 12 processed and saved.
Batch 13/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 13 processed and saved.
Batch 14/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 14 processed and saved.
Batch 15/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 15 processed and saved.
Batch 16/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 16 processed and saved.
Batch 17/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 17 processed and saved.
Batch 18/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 18 processed and saved.
Batch 19/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 19 processed and saved.
Batch 20/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 20 processed and saved.
Batch 21/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 21 processed and saved.
Batch 22/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 22 processed and saved.
Batch 23/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 23 processed and saved.
Batch 24/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 24 processed and saved.
Batch 25/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 25 processed and saved.
Batch 26/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 26 processed and saved.
Batch 27/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 27 processed and saved.
Batch 28/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 28 processed and saved.
Batch 29/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 29 processed and saved.
Batch 30/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 30 processed and saved.
Batch 31/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 31 processed and saved.
Batch 32/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 32 processed and saved.
Batch 33/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 33 processed and saved.
Batch 34/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 34 processed and saved.
Batch 35/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 35 processed and saved.
Batch 36/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 36 processed and saved.
Batch 37/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 37 processed and saved.
Batch 38/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 38 processed and saved.
Batch 39/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 39 processed and saved.
Batch 40/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 40 processed and saved.
Batch 41/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 41 processed and saved.
Batch 42/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 42 processed and saved.
Batch 43/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 43 processed and saved.
Batch 44/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 44 processed and saved.
Batch 45/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 45 processed and saved.
Batch 46/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 46 processed and saved.
Batch 47/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 47 processed and saved.
Batch 48/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 48 processed and saved.
Batch 49/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 49 processed and saved.
Batch 50/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 50 processed and saved.
Batch 51/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 51 processed and saved.
Batch 52/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 52 processed and saved.
Batch 53/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 53 processed and saved.
Batch 54/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 54 processed and saved.
Batch 55/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 55 processed and saved.
Batch 56/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 56 processed and saved.
Batch 57/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 57 processed and saved.
Batch 58/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 58 processed and saved.
Batch 59/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 59 processed and saved.
Batch 60/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 60 processed and saved.
Batch 61/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 61 processed and saved.
Batch 62/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 62 processed and saved.
Batch 63/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 63 processed and saved.
Batch 64/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 64 processed and saved.
Batch 65/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 65 processed and saved.
Batch 66/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 66 processed and saved.
Batch 67/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 67 processed and saved.
Batch 68/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 68 processed and saved.
Batch 69/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 69 processed and saved.
Batch 70/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 70 processed and saved.
Batch 71/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 71 processed and saved.
Batch 72/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 72 processed and saved.
Batch 73/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 73 processed and saved.
Batch 74/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 74 processed and saved.
Batch 75/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 75 processed and saved.
Batch 76/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 76 processed and saved.
Batch 77/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 77 processed and saved.
Batch 78/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 78 processed and saved.
Batch 79/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 79 processed and saved.
Batch 80/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 80 processed and saved.
Batch 81/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 81 processed and saved.
Batch 82/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 82 processed and saved.
Batch 83/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 83 processed and saved.
Batch 84/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 84 processed and saved.
Batch 85/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 85 processed and saved.
Batch 86/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 86 processed and saved.
Batch 87/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 87 processed and saved.
Batch 88/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 88 processed and saved.
Batch 89/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 89 processed and saved.
Batch 90/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 90 processed and saved.
Batch 91/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 91 processed and saved.
Batch 92/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 92 processed and saved.
Batch 93/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 93 processed and saved.
Batch 94/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 94 processed and saved.
Batch 95/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 95 processed and saved.
Batch 96/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 96 processed and saved.
Batch 97/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 97 processed and saved.
Batch 98/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 98 processed and saved.
Batch 99/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 99 processed and saved.
Batch 100/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 100 processed and saved.
Batch 101/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 101 processed and saved.
Batch 102/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 102 processed and saved.
Batch 103/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 103 processed and saved.
Batch 104/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 104 processed and saved.
Batch 105/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 105 processed and saved.
Batch 106/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 106 processed and saved.
Batch 107/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 107 processed and saved.
Batch 108/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 108 processed and saved.
Batch 109/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 109 processed and saved.
Batch 110/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 110 processed and saved.
Batch 111/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 111 processed and saved.
Batch 112/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 112 processed and saved.
Batch 113/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 113 processed and saved.
Batch 114/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 114 processed and saved.
Batch 115/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 115 processed and saved.
Batch 116/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 116 processed and saved.
Batch 117/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 117 processed and saved.
Batch 118/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 118 processed and saved.
Batch 119/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 119 processed and saved.
Batch 120/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 120 processed and saved.
Batch 121/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 121 processed and saved.
Batch 122/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 122 processed and saved.
Batch 123/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 123 processed and saved.
Batch 124/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 124 processed and saved.
Batch 125/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 125 processed and saved.
Batch 126/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 126 processed and saved.
Batch 127/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 127 processed and saved.
Batch 128/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 128 processed and saved.
Batch 129/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 129 processed and saved.
Batch 130/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 130 processed and saved.
Batch 131/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 131 processed and saved.
Batch 132/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 132 processed and saved.
Batch 133/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 133 processed and saved.
Batch 134/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 134 processed and saved.
Batch 135/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 135 processed and saved.
Batch 136/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 136 processed and saved.
Batch 137/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 137 processed and saved.
Batch 138/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 138 processed and saved.
Batch 139/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 139 processed and saved.
Batch 140/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 140 processed and saved.
Batch 141/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 141 processed and saved.
Batch 142/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 142 processed and saved.
Batch 143/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 143 processed and saved.
Batch 144/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 144 processed and saved.
Batch 145/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 145 processed and saved.
Batch 146/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 146 processed and saved.
Batch 147/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 147 processed and saved.
Batch 148/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 148 processed and saved.
Batch 149/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 149 processed and saved.
Batch 150/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 150 processed and saved.
Batch 151/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 151 processed and saved.
Batch 152/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 152 processed and saved.
Batch 153/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 153 processed and saved.
Batch 154/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 154 processed and saved.
Batch 155/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 155 processed and saved.
Batch 156/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 156 processed and saved.
Batch 157/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 157 processed and saved.
Batch 158/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 158 processed and saved.
Batch 159/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 159 processed and saved.
Batch 160/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 160 processed and saved.
Batch 161/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 161 processed and saved.
Batch 162/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 162 processed and saved.
Batch 163/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 163 processed and saved.
Batch 164/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 164 processed and saved.
Batch 165/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 165 processed and saved.
Batch 166/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 166 processed and saved.
Batch 167/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 167 processed and saved.
Batch 168/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 168 processed and saved.
Batch 169/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 169 processed and saved.
Batch 170/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 170 processed and saved.
Batch 171/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 171 processed and saved.
Batch 172/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 172 processed and saved.
Batch 173/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 173 processed and saved.
Batch 174/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 174 processed and saved.
Batch 175/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 175 processed and saved.
Batch 176/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 176 processed and saved.
Batch 177/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 177 processed and saved.
Batch 178/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 178 processed and saved.
Batch 179/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 179 processed and saved.
Batch 180/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 180 processed and saved.
Batch 181/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 181 processed and saved.
Batch 182/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 182 processed and saved.
Batch 183/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 183 processed and saved.
Batch 184/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 184 processed and saved.
Batch 185/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 185 processed and saved.
Batch 186/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 186 processed and saved.
Batch 187/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 187 processed and saved.
Batch 188/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 188 processed and saved.
Batch 189/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 189 processed and saved.
Batch 190/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 190 processed and saved.
Batch 191/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 191 processed and saved.
Batch 192/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 192 processed and saved.
Batch 193/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 193 processed and saved.
Batch 194/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 194 processed and saved.
Batch 195/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 195 processed and saved.
Batch 196/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 196 processed and saved.
Batch 197/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 197 processed and saved.
Batch 198/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 198 processed and saved.
Batch 199/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 199 processed and saved.
Batch 200/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 200 processed and saved.
Batch 201/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 201 processed and saved.
Batch 202/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 202 processed and saved.
Batch 203/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 203 processed and saved.
Batch 204/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 204 processed and saved.
Batch 205/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 205 processed and saved.
Batch 206/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 206 processed and saved.
Batch 207/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 207 processed and saved.
Batch 208/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 208 processed and saved.
Batch 209/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 209 processed and saved.
Batch 210/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 210 processed and saved.
Batch 211/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 211 processed and saved.
Batch 212/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 212 processed and saved.
Batch 213/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 213 processed and saved.
Batch 214/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 214 processed and saved.
Batch 215/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 215 processed and saved.
Batch 216/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 216 processed and saved.
Batch 217/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 217 processed and saved.
Batch 218/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 218 processed and saved.
Batch 219/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 219 processed and saved.
Batch 220/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 220 processed and saved.
Batch 221/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 221 processed and saved.
Batch 222/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 222 processed and saved.
Batch 223/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 223 processed and saved.
Batch 224/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 224 processed and saved.
Batch 225/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 225 processed and saved.
Batch 226/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 226 processed and saved.
Batch 227/229 is processening


  0%|          | 0/100 [00:00<?, ?it/s]

Batch 227 processed and saved.
Batch 228/229 is processening


  0%|          | 0/83 [00:00<?, ?it/s]

Batch 228 processed and saved.


In [19]:
df_combined = pd.concat([pd.read_csv(os.path.join(temp_batch_dir, f)) for f in sorted(os.listdir(temp_batch_dir))], ignore_index=True)

In [29]:
df_combined.to_csv("data/df_3000_lema.csv")

In [20]:
# Удаление временных файлов после успешной обработки, если необходимо
# for f in os.listdir(temp_batch_dir):
#     os.remove(os.path.join(temp_batch_dir, f))
# os.rmdir(temp_batch_dir)

# Обратите внимание, что после загрузки объединенного DataFrame столбцы могут потребовать повторной обработки (например, удаление индекса при чтении CSV)

In [21]:
# df['combined_text'] = df['combined_text'].progress_apply(not_so_simple_clean)

In [30]:
X = df[["combined_text"]]
y = df[["ID"]]
print(X.shape, y.shape)

(22883, 1) (22883, 1)


In [31]:
# %pip install ydata-profiling
from ydata_profiling import ProfileReport
def create_profile_report(df, name="Report"):
  report = ProfileReport(df, title='My Data')
  report.to_file(f"{name}.html")

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25, random_state=42)

In [42]:
X_train.shape, X_test.shape

((17162, 1), (5721, 1))

In [43]:
unique_names = y_train["ID"].value_counts()
over_strategy = {}
cnt = 2250
for name, count in unique_names.items():
    if count < cnt:
        over_strategy[name] = cnt

In [44]:
over_strategy

{1: 2250, 4: 2250, 2: 2250, 7: 2250, 8: 2250, 5: 2250, 6: 2250, 3: 2250}

In [45]:
ros = RandomOverSampler(random_state=42, sampling_strategy = over_strategy)

X_train, y_train = ros.fit_resample(X_train, y_train)

In [46]:
X_train.shape, X_test.shape

((20281, 1), (5721, 1))

In [47]:
vectorizer = TfidfVectorizer(
    stop_words=russian_stopwords,
    min_df=5,
    ngram_range=(1, 2),
)

In [48]:
%%time
vectorizer.fit(X_train["combined_text"])

CPU times: total: 5.3 s
Wall time: 5.49 s


In [49]:
%%time
bow_train = vectorizer.transform(X_train["combined_text"])
bow_test = vectorizer.transform(X_test["combined_text"])
bow_train.shape, bow_test.shape

CPU times: total: 5.28 s
Wall time: 5.44 s


((20281, 114104), (5721, 114104))

In [50]:
lg = LogisticRegression(max_iter=400, random_state=17, n_jobs= -1)

In [51]:
%%time
lg.fit(bow_train, y_train)

CPU times: total: 78.1 ms
Wall time: 17.4 s


In [52]:
lg_train_pred = lg.predict(bow_train)
lg_test_pred = lg.predict(bow_test)

In [53]:
print(classification_report(y_test, lg_test_pred))

              precision    recall  f1-score   support

           0       0.81      0.81      0.81       719
           1       0.89      0.91      0.90       779
           2       0.94      0.91      0.93       728
           3       0.79      0.95      0.87       315
           4       0.98      0.90      0.94       739
           5       0.78      0.86      0.82       610
           6       0.92      0.93      0.92       420
           7       0.94      0.93      0.94       714
           8       0.90      0.84      0.87       697

    accuracy                           0.89      5721
   macro avg       0.88      0.89      0.89      5721
weighted avg       0.89      0.89      0.89      5721



In [54]:
accuracy_score(y_train, lg_train_pred), accuracy_score(y_test, lg_test_pred)

(0.9451703564912973, 0.8883062401678028)

In [55]:
df_test = pd.read_csv("data/test_news_lem.csv").dropna()

In [None]:
# df_test["combined_text"] = df_test["content"].apply(not_so_simple_clean)

In [None]:
# df_test.to_csv("data/test_news_lem.csv", index = False)

In [56]:
bow_test_news = vectorizer.transform(df_test["combined_text"])

In [57]:
df_test_pred = lg.predict(bow_test_news)

In [58]:
predictions_df = pd.DataFrame(df_test_pred, columns=['topic'])
predictions_df['index'] = predictions_df.index

In [59]:
predictions_df.to_csv("data/precits/predict_5.csv", index=False)