# Кластеризация с фильтрацией POS

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups


train_data = fetch_20newsgroups(subset='train')
val_data = fetch_20newsgroups(subset='test')

len(train_data['data']), len(val_data['data'])

(11314, 7532)

## Text preprocessing

In [3]:
! pip install -qq spacy

In [4]:
from tqdm import tqdm
import re
from collections import Counter
from spacy.lang.en import stop_words


import nltk
from nltk.corpus import words
nltk.download('words')


DICT_WORDS_COUNT = 1000
stopwords = stop_words.STOP_WORDS

[nltk_data] Downloading package words to /Users/ktann/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [5]:
english_words_corpora = words.words()
len(english_words_corpora)

236736

In [6]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [7]:
def clean_text(text):

    if 'Lines:' in text:
        start = text.index('Lines:') + 5
        text = text[start:]

    text = text.lower()

    text = re.sub(r'[\w\d.]+@[\w\d]+.[\w\d]+', ' ', text)
    text = re.sub(r'(http|https)://[\w\d/.]+', ' ', text)

    text = re.sub(r'[^a-z]', ' ', text)

    text = re.sub(r'([^\w]|[-_])+', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text)

    text = text.strip()
    
    words = [
        w for w in text.split() \
            if len(w) > 3 and \
                w not in stopwords and \
                w in english_words_corpora
    ]

    return ' '.join(words)

In [8]:
def delete_pos_tags(text, acceptable_poses: tuple | list):
    tokens = word_tokenize(text, language='english')
    tags = pos_tag(tokens, lang='eng')
    _, pos_tags = zip(*tags)

    pairs = zip(text.split(), pos_tags)
    pairs = filter(lambda x: x[1] not in acceptable_poses, pairs)
    words = [p[0] for p in pairs]

    return ' '.join(words)

In [9]:
ACCEPTABLE_POS_TAGS = [
    'NOUN',
    'ADJ',
    'VERB',
    'NUM',
]

In [10]:
from random import randint, seed
import warnings
warnings.filterwarnings('ignore')

In [11]:
seed(4242)
samples1000 = list(filter(lambda it: randint(0, 9) == 0, zip(train_data['data'], train_data['target'])))
# samples1000 = list(zip(train_data['data'], train_data['target']))
texts, targets = zip(*samples1000)

### Text cleaning

In [12]:
clean_train_texts = [
    clean_text(text) for text in tqdm(texts, desc='Train texts')
]

Train texts:   0%|          | 0/1131 [00:00<?, ?it/s]

Train texts: 100%|██████████| 1131/1131 [01:38<00:00, 11.51it/s]


In [13]:
# NOUNS
clean_train_texts1 = [
    delete_pos_tags(text, ACCEPTABLE_POS_TAGS[:1])
    for text in tqdm(clean_train_texts, desc='Delete pos tags')
]

Delete pos tags: 100%|██████████| 1131/1131 [00:01<00:00, 640.52it/s]


In [14]:
# NOUNS, ADJ
clean_train_texts2 = [
    delete_pos_tags(text, ACCEPTABLE_POS_TAGS[:2])
    for text in tqdm(clean_train_texts, desc='Delete pos tags')
]

Delete pos tags: 100%|██████████| 1131/1131 [00:01<00:00, 648.85it/s]


In [15]:
# NOUNS, ADJ, VERB
clean_train_texts3 = [
    delete_pos_tags(text, ACCEPTABLE_POS_TAGS[:3])
    for text in tqdm(clean_train_texts, desc='Delete pos tags')
]

Delete pos tags: 100%|██████████| 1131/1131 [00:01<00:00, 636.59it/s]


In [16]:
# NOUNS, ADJ, VERB, NUM
clean_train_texts4 = [
    delete_pos_tags(text, ACCEPTABLE_POS_TAGS[:4])
    for text in tqdm(clean_train_texts, desc='Delete pos tags')
]

Delete pos tags: 100%|██████████| 1131/1131 [00:01<00:00, 592.11it/s]


In [17]:
clean_train_texts1[0]

'plus finally gave ghost weekend starting life market machine sooner intended looking maybe bunch hopefully somebody answer anybody know dirt round supposed summer haven access wondering anybody anybody price line like went recently impression display probably swing disk feel better display great store good solicit people worth taking disk size money active display realize real subjective question computer store figured somebody actually machine daily prove helpful perform thanks bunch advance post summary news reading time premium corner electrical engineering dangerous truth'

### Train KMeans, DBSCAN, Agglomerative

In [18]:
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [19]:
def get_kmeans_results(df):
    dff = df.dropna()

    clusterisator = KMeans(n_clusters=20, random_state=42)
    dff['cluster'] = clusterisator.fit_predict(dff.drop('target', axis=1))

    res = silhouette_score(dff.drop(['target', 'cluster'], axis=1), dff['cluster'])
    return res

In [20]:
def get_dbscan_results(df):
    dff = df.dropna()

    clusterisator = DBSCAN(eps=0.055, min_samples=2)
    dff['cluster'] = clusterisator.fit_predict(dff.drop('target', axis=1))

    res = silhouette_score(dff.drop(['target', 'cluster'], axis=1), dff['cluster'])
    return res

In [21]:
def get_agglo_results(df):
    dff = df.dropna()

    clusterisator = AgglomerativeClustering(n_clusters=20)
    dff['cluster'] = clusterisator.fit_predict(dff.drop('target', axis=1))

    res = silhouette_score(dff.drop(['target', 'cluster'], axis=1), dff['cluster'])
    return res

## BOW, TF-IDF, LSI, LDA

In [22]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
def train_bow(clean_train_texts, th=300):
    tokenized_documents = [simple_preprocess(text) for text in clean_train_texts]
    dictionary = corpora.Dictionary(tokenized_documents)
    bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

    df = pd.DataFrame(targets, columns=['target'])
    
    for i in range(th):
        df[f'{i}'] = 0
    
    for i, b in enumerate(bow_corpus):
        for (idx, count) in b:
            if idx < th:
                df.loc[i, f'{idx}'] = count

    kmeans_res = get_kmeans_results(df)
    dbscan_res = get_dbscan_results(df)
    agglo_res = get_agglo_results(df)
    
    return kmeans_res, dbscan_res, agglo_res

In [24]:
def train_tfidf(clean_train_texts, th=300):
    tokenized_documents = [simple_preprocess(text) for text in clean_train_texts]
    dictionary = corpora.Dictionary(tokenized_documents)
    bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    df = pd.DataFrame(targets, columns=['target'])
    
    for i in range(th):
        df[f'{i}'] = 0
    
    for i, b in enumerate(corpus_tfidf):
        for (idx, val) in b:
            if idx < th:
                df.loc[i, f'{idx}'] = val

    kmeans_res = get_kmeans_results(df)
    dbscan_res = get_dbscan_results(df)
    agglo_res = get_agglo_results(df)
    
    return kmeans_res, dbscan_res, agglo_res

In [25]:
def train_lsi(clean_train_texts):
    tokenized_documents = [simple_preprocess(text) for text in clean_train_texts]
    dictionary = corpora.Dictionary(tokenized_documents)
    bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=20)

    document_topic_vectors = []
    for doc_bow in corpus_tfidf:
        document_topic_vector = lsi_model[doc_bow]
        document_topic_vectors.append(document_topic_vector)
    
    df = pd.DataFrame(targets, columns=['target'])
    
    docs_vectors = []

    for doc_idx in tqdm(range(len(df))):
        doc_bow = corpus_tfidf[doc_idx]
        document_topic_vector = lsi_model[doc_bow]

        if document_topic_vector:
            _, vec = zip(*document_topic_vector)
        else:
            vec = [None] * 20

        docs_vectors.append(vec)
    
    df[[f'vec{i}' for i in range(20)]] = docs_vectors

    kmeans_res = get_kmeans_results(df)
    dbscan_res = get_dbscan_results(df)
    agglo_res = get_agglo_results(df)
    
    return kmeans_res, dbscan_res, agglo_res

In [26]:
def train_lda(clean_train_texts):
    tokenized_documents = [simple_preprocess(text) for text in clean_train_texts]
    dictionary = corpora.Dictionary(tokenized_documents)
    bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    lda_model = models.LdaModel(corpus_tfidf, num_topics=100, id2word=dictionary, passes=15)

    document_topic_vectors = []

    for i, doc_bow in enumerate(bow_corpus):
        document_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
        document_topic_vector = [topic_prob for _, topic_prob in document_topics]
        document_topic_vectors.append(document_topic_vector)
    
    df = pd.DataFrame(document_topic_vectors)
    df['target'] = targets

    kmeans_res = get_kmeans_results(df)
    dbscan_res = get_dbscan_results(df)
    agglo_res = get_agglo_results(df)
    
    return kmeans_res, dbscan_res, agglo_res

## Training

In [27]:
clean_texts = [
    clean_train_texts1,
    clean_train_texts2,
    clean_train_texts3,
    clean_train_texts4,
]

vectorizers = [
    ('BOW', train_bow),
    ('TF-IDF', train_tfidf),
    ('LSI', train_lsi),
    ('LDA', train_lda),
]

In [28]:
for i, ct in enumerate(clean_texts):
    for v_name, v in vectorizers:
        kmeans_res, dbscan_res, agglo_res = v(ct)
        print(f'TAGS: {ACCEPTABLE_POS_TAGS[:i+1]}; vectorizer: {v_name}')
        print(f'KMeans: {kmeans_res:.4f}, DBSCAN: {dbscan_res:.4f}, Agglo: {agglo_res:.4f}')
        print()

TAGS: ['NOUN']; vectorizer: BOW
KMeans: 0.0837, DBSCAN: -0.3611, Agglo: 0.2898

TAGS: ['NOUN']; vectorizer: TF-IDF
KMeans: 0.0432, DBSCAN: -0.2949, Agglo: 0.0913



100%|██████████| 1131/1131 [00:00<00:00, 8263.00it/s]


TAGS: ['NOUN']; vectorizer: LSI
KMeans: 0.1569, DBSCAN: -0.3824, Agglo: 0.1098

TAGS: ['NOUN']; vectorizer: LDA
KMeans: 0.0499, DBSCAN: -0.2225, Agglo: 0.0184

TAGS: ['NOUN', 'ADJ']; vectorizer: BOW
KMeans: 0.0837, DBSCAN: -0.3611, Agglo: 0.2898

TAGS: ['NOUN', 'ADJ']; vectorizer: TF-IDF
KMeans: 0.0432, DBSCAN: -0.2949, Agglo: 0.0913



100%|██████████| 1131/1131 [00:00<00:00, 6500.09it/s]


TAGS: ['NOUN', 'ADJ']; vectorizer: LSI
KMeans: 0.1158, DBSCAN: -0.3808, Agglo: 0.0942

TAGS: ['NOUN', 'ADJ']; vectorizer: LDA
KMeans: 0.0709, DBSCAN: -0.2174, Agglo: 0.0650

TAGS: ['NOUN', 'ADJ', 'VERB']; vectorizer: BOW
KMeans: 0.0837, DBSCAN: -0.3611, Agglo: 0.2898

TAGS: ['NOUN', 'ADJ', 'VERB']; vectorizer: TF-IDF
KMeans: 0.0432, DBSCAN: -0.2949, Agglo: 0.0913



100%|██████████| 1131/1131 [00:00<00:00, 7728.43it/s]


TAGS: ['NOUN', 'ADJ', 'VERB']; vectorizer: LSI
KMeans: 0.1173, DBSCAN: -0.3711, Agglo: 0.1370

TAGS: ['NOUN', 'ADJ', 'VERB']; vectorizer: LDA
KMeans: 0.0782, DBSCAN: -0.2260, Agglo: 0.0522

TAGS: ['NOUN', 'ADJ', 'VERB', 'NUM']; vectorizer: BOW
KMeans: 0.0837, DBSCAN: -0.3611, Agglo: 0.2898

TAGS: ['NOUN', 'ADJ', 'VERB', 'NUM']; vectorizer: TF-IDF
KMeans: 0.0432, DBSCAN: -0.2949, Agglo: 0.0913



100%|██████████| 1131/1131 [00:00<00:00, 8694.03it/s]


TAGS: ['NOUN', 'ADJ', 'VERB', 'NUM']; vectorizer: LSI
KMeans: 0.1240, DBSCAN: -0.3743, Agglo: 0.1143

TAGS: ['NOUN', 'ADJ', 'VERB', 'NUM']; vectorizer: LDA
KMeans: 0.0887, DBSCAN: -0.1963, Agglo: 0.0495

