In [16]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MiniBatchKMeans

from collections import Counter 
from sklearn.metrics import silhouette_samples, silhouette_score
import os
import random
import re

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

Lemmatizing was used in the Preprocessing now we will do word embedding
https://stackoverflow.com/questions/23877375/word2vec-lemmatization-of-corpus-before-training

In [2]:
df = pd.read_csv('data/p_content.csv')
df1 = df[['ID_GodotObject','content','titletext']]
print(df1.shape)
df1.head()

(103, 3)


Unnamed: 0,ID_GodotObject,content,titletext
0,2000115059032,medizinisch Personal Umgang Labor müssen Probe...,Maskenpflicht medizinisch Personal Umgang Coro...
1,2000116305030,Einführung Maskenpflicht Regierung verschärfen...,schrittweise Einführung Maskenpflicht Öffentli...
2,2000116325081,Ende Sicht Regierung setzen Maske bei Einkauf ...,Regierung setzen Maske bei Einkauf Test Freist...
3,2000116346340,Supermarkt spätestens ab Montag Entscheidung M...,Maskenpflicht Supermarkt spätestens ab Montag ...
4,2000116371728,Clemens Auer italienisch spanisch Verhältnis v...,Sonderbeauftragter Clemens Auer italienisch sp...


The model produces high-dimensional vectors, where the size parameter sets the number of dimensions. The optimal number of dimensions depends on the size of the dataset. In our case, 100 dimensions seem to be working very well. min_count parameter controls the minimum frequency of words.



https://dylancastillo.co/nlp-snippets-cluster-documents-using-word2vec/


## Apply function to remove duplicates


Duplicate words can be considered as additional context for the model and can potentially improve the quality of the word embeddings. However, if you have a very large number of duplicates, it may slow down the training process and potentially lead to overfitting. In such cases, it might be beneficial to remove duplicates to speed up training and improve generalization.

Overall, whether or not to remove duplicates when using Word2vec will depend on the specific data and the goals of your word embedding task. It is a good idea to try both approaches and see which one gives better results on your data.

In [3]:
text_columns = ["content", "titletext"]
df1["merged_text"] = df1[text_columns].apply(lambda x: " | ".join(x), axis=1)
df1["tokens"] = df1["merged_text"].map(lambda x: x.split())

# Remove duplicated after preprocessing
_, idx = np.unique(df1["tokens"], return_index=True)
df1 = df1.iloc[idx, :]

print(df1.shape)
df1.head()
                

(103, 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["merged_text"] = df1[text_columns].apply(lambda x: " | ".join(x), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["tokens"] = df1["merged_text"].map(lambda x: x.split())


Unnamed: 0,ID_GodotObject,content,titletext,merged_text,tokens
84,2000128312219,50 Teilnehmer begräbnissen letzter Abschied se...,VfGH 50 Teilnehmer begräbnissen unverhältnismäßig,50 Teilnehmer begräbnissen letzter Abschied se...,"[50, Teilnehmer, begräbnissen, letzter, Abschi..."
37,2000120217222,Ampel Wien bleiben Orang acht neu Bezirk Orang...,neu Ampel Wien bleiben Orang acht neu Bezirk O...,Ampel Wien bleiben Orang acht neu Bezirk Orang...,"[Ampel, Wien, bleiben, Orang, acht, neu, Bezir..."
7,2000116717900,Anton bleiben Quarantan fast fünfte freiwillig...,Paznaun Anton bleiben Quarantan fast fünfte fr...,Anton bleiben Quarantan fast fünfte freiwillig...,"[Anton, bleiben, Quarantan, fast, fünfte, frei..."
48,2000122442386,Antwort wie vieler Mensch Weihnachten Silveste...,wie vieler Mensch Weihnachten Silvester Werkta...,Antwort wie vieler Mensch Weihnachten Silveste...,"[Antwort, wie, vieler, Mensch, Weihnachten, Si..."
67,2000124163685,Anzeige Samstag Polizei nehmen fünf Person fes...,Anzeige Samstag Wien,Anzeige Samstag Polizei nehmen fünf Person fes...,"[Anzeige, Samstag, Polizei, nehmen, fünf, Pers..."


### Check for common words

In [19]:
docs = df1["merged_text"].values
tokenized_docs = df1["tokens"].values
ids = df1["ID_GodotObject"].values
vocab = Counter()
for token in tokenized_docs:
    vocab.update(token)
    
vocab.most_common(10)


[('der', 808),
 ('werden', 246),
 ('Foto', 244),
 ('in', 221),
 ('ab', 204),
 ('Wien', 201),
 ('gelten', 198),
 ('sein', 197),
 ('Maskenpflicht', 194),
 ('mehr', 191)]

### Generate Vectors from document

In [24]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features

In [21]:
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=42)
model.wv.most_similar("Maske")

[('der', 0.9998356699943542),
 ('in', 0.9998061060905457),
 ('bei', 0.999804675579071),
 ('werden', 0.9998003840446472),
 ('mehr', 0.9997920989990234),
 ('geben', 0.9997724890708923),
 ('dürfen', 0.9997721314430237),
 ('neu', 0.9997711181640625),
 ('können', 0.9997584223747253),
 ('bleiben', 0.9997560381889343)]

In [None]:
vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])

(103, 100)

NOW TRY TO FIT KNN

In [44]:
n = 5

kmeans = KMeans(n_clusters = n)
kmeans.fit(vectorized_docs)
y_kmeans = kmeans.predict(vectorized_docs)
df2 = df1[['ID_GodotObject']].copy()
df2['cluster_value'] = y_kmeans
df2.to_csv('data/feature/knn_clustering.csv', encoding='utf-8', index=False)

df2.head()

Unnamed: 0,ID_GodotObject,cluster_value
84,2000128312219,2
37,2000120217222,0
7,2000116717900,0
48,2000122442386,1
67,2000124163685,4


now I decided to print the text for each article in each cluster and write the result to a csv file, for later extracting impiortand opinions per cluster

In [46]:
for i in range(n):
    rslt_df = df2[df2['cluster_value'] == i].copy()
    txt_df = df1[['ID_GodotObject','merged_text']].copy()
    rslt_df = pd.merge(rslt_df, txt_df, on="ID_GodotObject")
    rslt_df.to_csv('data/feature/clusters/' + str(i) + '_cluster.csv', encoding='utf-8', index=False)
    print(rslt_df.shape)




(31, 3)
(27, 3)
(5, 3)
(16, 3)
(24, 3)
