In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MiniBatchKMeans

from collections import Counter 
from sklearn.metrics import silhouette_samples, silhouette_score
import os
import random
import re

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

Lemmatizing was used in the Preprocessing now we will do word embedding
https://stackoverflow.com/questions/23877375/word2vec-lemmatization-of-corpus-before-training

In [2]:
df = pd.read_csv('../../data/p_content.csv')
df1 = df[['ID_GodotObject','merged_text','tokens']]
print(df1.shape)
print(df1.dtypes)
df1.head()

(103, 3)
ID_GodotObject     int64
merged_text       object
tokens            object
dtype: object


Unnamed: 0,ID_GodotObject,merged_text,tokens
0,2000115059032,﻿20sars-cov-2maskenpflicht medizinisch_Persona...,"['\ufeff20sars-cov-2maskenpflicht', 'medizinis..."
1,2000116305030,﻿20_Coronavirus schrittweise Einführung Masken...,"['\ufeff20_Coronavirus', 'schrittweise', 'Einf..."
2,2000116325081,﻿20kein Ende Sicht Regierung setzen Maske bei ...,"['\ufeff20kein', 'Ende', 'Sicht', 'Regierung',..."
3,2000116346340,﻿20_Coronavirus Maskenpflicht Supermarkt späte...,"['\ufeff20_Coronavirus', 'Maskenpflicht', 'Sup..."
4,2000116371728,﻿20_Coronavirus Sonderbeauftragter Clemens Aue...,"['\ufeff20_Coronavirus', 'Sonderbeauftragter',..."


The model produces high-dimensional vectors, where the size parameter sets the number of dimensions. The optimal number of dimensions depends on the size of the dataset. In our case, 100 dimensions seem to be working very well. min_count parameter controls the minimum frequency of words.



https://dylancastillo.co/nlp-snippets-cluster-documents-using-word2vec/


## Apply function to remove duplicates


Duplicate words can be considered as additional context for the model and can potentially improve the quality of the word embeddings. However, if you have a very large number of duplicates, it may slow down the training process and potentially lead to overfitting. In such cases, it might be beneficial to remove duplicates to speed up training and improve generalization.

Overall, whether or not to remove duplicates when using Word2vec will depend on the specific data and the goals of your word embedding task. It is a good idea to try both approaches and see which one gives better results on your data.

In [3]:
# Remove duplicated after preprocessing
#_, idx = np.unique(df1["tokens"], return_index=True)
#df1 = df1.iloc[idx, :]

#print(df1.shape)
#df1.head()
                

### Check for common words

In [4]:
docs = df1["merged_text"].tolist()
tokenized_docs = df1["merged_text"].map(lambda x: x.split())
tokenized_docs = [[token for token in tokens if token != '--']for tokens in tokenized_docs]
#tokenized_docs = df1["tokens"].tolist()
ids = df1["ID_GodotObject"].tolist()
vocab = Counter()
for token in tokenized_docs:
    vocab.update(token)
    
vocab.most_common(10)

[('werden', 246),
 ('ab', 208),
 ('Wien', 200),
 ('Maskenpflicht', 199),
 ('gelten', 199),
 ('geben', 192),
 ('sein', 188),
 ('mehr', 178),
 ('Österreich', 165),
 ('Person', 164)]

### Generate Vectors from document

In [5]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features

In [6]:
#model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=42)
#model.wv.most_similar("Ende")

In [7]:
#vectorized_docs = vectorize(tokenized_docs, model=model)
#len(vectorized_docs), len(vectorized_docs[0])

In [8]:
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=42)
model.train(tokenized_docs, total_examples=len(tokenized_docs), epochs=10)
print(model.wv.most_similar("Maske")) 

vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])

[('mund-nasen-schutz', 0.9989187717437744), ('Supermärkt', 0.9984762072563171), ('weiterhin', 0.9983648657798767), ('Klasse', 0.9983058571815491), ('Ffp2-mask', 0.998256266117096), ('einhalten', 0.9982370138168335), ('dafür', 0.9982022643089294), ('verpflichtend', 0.9981527328491211), ('außerdem', 0.9981510043144226), ('müssen', 0.9981332421302795)]


(103, 100)

NOW TRY TO FIT KNN

In [9]:
n = 7

kmeans = KMeans(n_clusters = n)
kmeans.fit(vectorized_docs)
y_kmeans = kmeans.predict(vectorized_docs)
df2 = df1[['ID_GodotObject']].copy()
df2['cluster_value'] = y_kmeans
df2.to_csv('../../data/feature/knn_clustering.csv', encoding='utf-8', index=False)

df2.head()

Unnamed: 0,ID_GodotObject,cluster_value
0,2000115059032,5
1,2000116305030,3
2,2000116325081,6
3,2000116346340,4
4,2000116371728,2


now I decided to print the text for each article in each cluster and write the result to a csv file, for later extracting impiortand opinions per cluster

In [10]:
for i in range(n):
    rslt_df = df2[df2['cluster_value'] == i].copy()
    txt_df = df1[['ID_GodotObject','merged_text']].copy()
    rslt_df = pd.merge(rslt_df, txt_df, on="ID_GodotObject")
    rslt_df.to_csv('../../data/feature/clusters/' + str(i) + '_cluster.csv', encoding='utf-8', index=False)
    #print(rslt_df.shape)


