# Задание практика №4

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups


train_data = fetch_20newsgroups(subset='train')
val_data = fetch_20newsgroups(subset='test')

len(train_data['data']), len(val_data['data'])

(11314, 7532)

## Text preprocessing

In [3]:
! pip install -qq spacy

In [4]:
from tqdm import tqdm
import re
from collections import Counter
from spacy.lang.en import stop_words


import nltk
from nltk.corpus import words
nltk.download('words')


DICT_WORDS_COUNT = 1000
stopwords = stop_words.STOP_WORDS

[nltk_data] Error loading words: <urlopen error [Errno 60] Operation
[nltk_data]     timed out>


In [5]:
english_words_corpora = words.words()
len(english_words_corpora)

236736

In [6]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [7]:
def clean_text(text):

    if 'Lines:' in text:
        start = text.index('Lines:') + 5
        text = text[start:]

    text = text.lower()

    text = re.sub(r'[\w\d.]+@[\w\d]+.[\w\d]+', ' ', text)
    text = re.sub(r'(http|https)://[\w\d/.]+', ' ', text)

    text = re.sub(r'[^a-z]', ' ', text)

    text = re.sub(r'([^\w]|[-_])+', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text)

    text = text.strip()
    
    words = [
        w for w in text.split() \
            if len(w) > 3 and \
                w not in stopwords and \
                w in english_words_corpora
    ]

    return ' '.join(words)

In [39]:
def delete_pos_tags(text, acceptable_poses: tuple | list):
    tokens = word_tokenize(text, language='english')
    tags = pos_tag(tokens, lang='eng')
    _, pos_tags = zip(*tags)

    pairs = zip(text.split(), pos_tags)
    pairs = filter(lambda x: x[1] not in acceptable_poses, pairs)
    words = [p[0] for p in pairs]

    return ' '.join(words)

In [9]:
ACCEPTABLE_POS_TAGS = [
    'NOUN',
    'ADJ',
    'VERB',
    'NUM',
]

In [10]:
from random import randint, seed
import warnings
warnings.filterwarnings('ignore')

In [11]:
seed(4242)
samples1000 = list(filter(lambda it: randint(0, 9) == 0, zip(train_data['data'], train_data['target'])))
# samples1000 = list(zip(train_data['data'], train_data['target']))
texts, targets = zip(*samples1000)

### Text cleaning

In [12]:
clean_train_texts = [
    clean_text(text) for text in tqdm(texts, desc='Train texts')
]

Train texts: 100%|██████████| 1131/1131 [01:36<00:00, 11.70it/s]


In [40]:
# NOUNS
clean_train_texts1 = [
    delete_pos_tags(text, ACCEPTABLE_POS_TAGS[:1])
    for text in tqdm(clean_train_texts, desc='Delete pos tags')
]

Delete pos tags: 100%|██████████| 1131/1131 [00:01<00:00, 618.15it/s]


In [41]:
# NOUNS, ADJ
clean_train_texts2 = [
    delete_pos_tags(text, ACCEPTABLE_POS_TAGS[:2])
    for text in tqdm(clean_train_texts, desc='Delete pos tags')
]

Delete pos tags:   0%|          | 0/1131 [00:00<?, ?it/s]

Delete pos tags: 100%|██████████| 1131/1131 [00:01<00:00, 629.70it/s]


In [42]:
# NOUNS, ADJ, VERB
clean_train_texts3 = [
    delete_pos_tags(text, ACCEPTABLE_POS_TAGS[:3])
    for text in tqdm(clean_train_texts, desc='Delete pos tags')
]

Delete pos tags: 100%|██████████| 1131/1131 [00:01<00:00, 620.23it/s]


In [43]:
# NOUNS, ADJ, VERB, NUM
clean_train_texts4 = [
    delete_pos_tags(text, ACCEPTABLE_POS_TAGS[:4])
    for text in tqdm(clean_train_texts, desc='Delete pos tags')
]

Delete pos tags: 100%|██████████| 1131/1131 [00:01<00:00, 625.79it/s]


In [44]:
clean_train_texts1[0]

'plus finally gave ghost weekend starting life market machine sooner intended looking maybe bunch hopefully somebody answer anybody know dirt round supposed summer haven access wondering anybody anybody price line like went recently impression display probably swing disk feel better display great store good solicit people worth taking disk size money active display realize real subjective question computer store figured somebody actually machine daily prove helpful perform thanks bunch advance post summary news reading time premium corner electrical engineering dangerous truth'

### Train CatBoost, RF

In [None]:
! pip install -qq catboost

In [21]:
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [58]:
def get_catboost_results(df):
    X_train, X_val, y_train, y_val = train_test_split(
        df.drop('target', axis=1),
        df['target'],
        test_size=0.2,
        shuffle=True,
        random_state=42,
    )
                
    clf = CatBoostClassifier(
        iterations=2500,
        # verbose=200,
        logging_level='Silent',
    )
    clf.fit(
        X=X_train,
        y=y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=50,
    )

    y_pred = clf.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='macro')

    return acc, f1

In [23]:
def get_rf_results(df):
    best_acc, best_f1 = 0, 0

    for md in [8, 16, 32, 64]:
        md = int(md)

        dff = df.dropna()

        X_train, X_val, y_train, y_val = train_test_split(
            dff.drop('target', axis=1),
            dff['target'],
            test_size=0.2,
            shuffle=True,
            random_state=42,
        )
                    
        clf = RandomForestClassifier(
            max_depth=md,
            random_state=42,
        )
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred, average='macro')

        best_acc = max(best_acc, acc)
        best_f1 = max(best_acc, f1)
    
    return best_acc, best_f1

## BOW, TF-IDF, LSI, LDA

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer

In [87]:
def train_bow(clean_train_texts, th=300):
    tokenized_documents = [simple_preprocess(text) for text in clean_train_texts]
    dictionary = corpora.Dictionary(tokenized_documents)
    bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

    df = pd.DataFrame(targets, columns=['target'])
    
    for i in range(th):
        df[f'{i}'] = 0
    
    for i, b in enumerate(bow_corpus):
        for (idx, count) in b:
            if idx < th:
                df.loc[i, f'{idx}'] = count

    # train catboost & RF
    cb_acc, cb_f1 = get_catboost_results(df)
    rf_acc, rf_f1 = get_rf_results(df)
    
    return cb_acc, cb_f1, rf_acc, rf_f1

In [88]:
def train_tfidf(clean_train_texts, th=300):
    tokenized_documents = [simple_preprocess(text) for text in clean_train_texts]
    dictionary = corpora.Dictionary(tokenized_documents)
    bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    df = pd.DataFrame(targets, columns=['target'])
    
    for i in range(th):
        df[f'{i}'] = 0
    
    for i, b in enumerate(corpus_tfidf):
        for (idx, val) in b:
            if idx < th:
                df.loc[i, f'{idx}'] = val

    # train catboost & RF
    cb_acc, cb_f1 = get_catboost_results(df)
    rf_acc, rf_f1 = get_rf_results(df)
    
    return cb_acc, cb_f1, rf_acc, rf_f1

In [89]:
def train_lsi(clean_train_texts):
    tokenized_documents = [simple_preprocess(text) for text in clean_train_texts]
    dictionary = corpora.Dictionary(tokenized_documents)
    bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=20)

    document_topic_vectors = []
    for doc_bow in corpus_tfidf:
        document_topic_vector = lsi_model[doc_bow]
        document_topic_vectors.append(document_topic_vector)
    
    df = pd.DataFrame(targets, columns=['target'])
    
    docs_vectors = []

    for doc_idx in tqdm(range(len(df))):
        doc_bow = corpus_tfidf[doc_idx]
        document_topic_vector = lsi_model[doc_bow]

        if document_topic_vector:
            _, vec = zip(*document_topic_vector)
        else:
            vec = [None] * 20

        docs_vectors.append(vec)
    
    df[[f'vec{i}' for i in range(20)]] = docs_vectors

    # train catboost & RF
    cb_acc, cb_f1 = get_catboost_results(df)
    rf_acc, rf_f1 = get_rf_results(df)
    
    return cb_acc, cb_f1, rf_acc, rf_f1

In [90]:
def train_lda(clean_train_texts):
    tokenized_documents = [simple_preprocess(text) for text in clean_train_texts]
    dictionary = corpora.Dictionary(tokenized_documents)
    bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    lda_model = models.LdaModel(corpus_tfidf, num_topics=100, id2word=dictionary, passes=15)

    document_topic_vectors = []

    for i, doc_bow in enumerate(bow_corpus):
        document_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
        document_topic_vector = [topic_prob for _, topic_prob in document_topics]
        document_topic_vectors.append(document_topic_vector)
    
    df = pd.DataFrame(document_topic_vectors)
    df['target'] = targets

    # train catboost & RF
    cb_acc, cb_f1 = get_catboost_results(df)
    rf_acc, rf_f1 = get_rf_results(df)
    
    return cb_acc, cb_f1, rf_acc, rf_f1

## Training

In [91]:
clean_texts = [
    clean_train_texts1,
    clean_train_texts2,
    clean_train_texts3,
    clean_train_texts4,
]

vectorizers = [
    ('BOW', train_bow),
    ('TF-IDF', train_tfidf),
    ('LSI', train_lsi),
    ('LDA', train_lda),
]

In [92]:
for i, ct in enumerate(clean_texts):
    for v_name, v in vectorizers:
        cb_acc, cb_f1, rf_acc, rf_f1 = v(ct)
        print(f'TAGS: {ACCEPTABLE_POS_TAGS[:i+1]}; vectorizer: {v_name}')
        print('RESULTS:')
        print(f'CatBoost: acc={cb_acc:.4f}, f1={cb_f1:.4f}')
        print(f'      RF: acc={rf_acc:.4f}, f1={rf_f1:.4f}')
        print()

TAGS: ['NOUN']; vectorizer: BOW
RESULTS:
CatBoost: acc=0.1982, f1=0.1708
      RF: acc=0.2203, f1=0.2203

TAGS: ['NOUN']; vectorizer: TF-IDF
RESULTS:
CatBoost: acc=0.1938, f1=0.1800
      RF: acc=0.2291, f1=0.2291



100%|██████████| 1131/1131 [00:00<00:00, 7454.66it/s]


TAGS: ['NOUN']; vectorizer: LSI
RESULTS:
CatBoost: acc=0.5066, f1=0.4870
      RF: acc=0.4802, f1=0.4802

TAGS: ['NOUN']; vectorizer: LDA
RESULTS:
CatBoost: acc=0.3128, f1=0.2856
      RF: acc=0.2643, f1=0.2643

TAGS: ['NOUN', 'ADJ']; vectorizer: BOW
RESULTS:
CatBoost: acc=0.1982, f1=0.1708
      RF: acc=0.2203, f1=0.2203

TAGS: ['NOUN', 'ADJ']; vectorizer: TF-IDF
RESULTS:
CatBoost: acc=0.1938, f1=0.1800
      RF: acc=0.2291, f1=0.2291



100%|██████████| 1131/1131 [00:00<00:00, 8954.66it/s]


TAGS: ['NOUN', 'ADJ']; vectorizer: LSI
RESULTS:
CatBoost: acc=0.4714, f1=0.4383
      RF: acc=0.4581, f1=0.4581

TAGS: ['NOUN', 'ADJ']; vectorizer: LDA
RESULTS:
CatBoost: acc=0.2247, f1=0.2009
      RF: acc=0.2731, f1=0.2731

TAGS: ['NOUN', 'ADJ', 'VERB']; vectorizer: BOW
RESULTS:
CatBoost: acc=0.1982, f1=0.1708
      RF: acc=0.2203, f1=0.2203

TAGS: ['NOUN', 'ADJ', 'VERB']; vectorizer: TF-IDF
RESULTS:
CatBoost: acc=0.1938, f1=0.1800
      RF: acc=0.2291, f1=0.2291



100%|██████████| 1131/1131 [00:00<00:00, 8596.05it/s]


TAGS: ['NOUN', 'ADJ', 'VERB']; vectorizer: LSI
RESULTS:
CatBoost: acc=0.4934, f1=0.4602
      RF: acc=0.5022, f1=0.5022

TAGS: ['NOUN', 'ADJ', 'VERB']; vectorizer: LDA
RESULTS:
CatBoost: acc=0.2467, f1=0.2262
      RF: acc=0.2203, f1=0.2203

TAGS: ['NOUN', 'ADJ', 'VERB', 'NUM']; vectorizer: BOW
RESULTS:
CatBoost: acc=0.1982, f1=0.1708
      RF: acc=0.2203, f1=0.2203

TAGS: ['NOUN', 'ADJ', 'VERB', 'NUM']; vectorizer: TF-IDF
RESULTS:
CatBoost: acc=0.1938, f1=0.1800
      RF: acc=0.2291, f1=0.2291



100%|██████████| 1131/1131 [00:00<00:00, 7254.84it/s]


TAGS: ['NOUN', 'ADJ', 'VERB', 'NUM']; vectorizer: LSI
RESULTS:
CatBoost: acc=0.4846, f1=0.4590
      RF: acc=0.4890, f1=0.4890

TAGS: ['NOUN', 'ADJ', 'VERB', 'NUM']; vectorizer: LDA
RESULTS:
CatBoost: acc=0.2555, f1=0.2190
      RF: acc=0.2643, f1=0.2643

