In [1]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from collections import Counter
from pathlib import Path

In [2]:
# Print the installed scikit-learn version so the run can be reproduced later.
import sklearn
print(sklearn.__version__)

0.23.2


In [3]:
def get_vectors(filepath):
    """Load the vectors of a file, grouped by their key.

    Every non-empty line is expected to hold a key and a vector separated by
    a single tab; the vector is a whitespace-separated list of floats.

    Parameters
    ----------
    filepath : str or path-like
        File to read.

    Returns
    -------
    dict
        Maps each key to the list of its vectors (each a list of floats), in
        the order the lines appear in the file.

    Raises
    ------
    ValueError
        If a non-empty line does not have exactly two tab-separated fields
        or a vector component cannot be parsed as a float.
    """
    data = dict()
    # Explicit encoding: the corpus is Swedish text, and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip("\n")
            if not line:
                # A blank line (e.g. at the end of the file) carries no record.
                continue
            dwt, vector = line.split("\t")
            data.setdefault(dwt, []).append([float(v) for v in vector.split()])
    return data

In [73]:
def get_sentences(filepath):
    """Load the sentences of a file, grouped by their key.

    Every non-empty line is expected to hold three tab-separated fields: a
    key field, an integer type flag and the sentence. When the flag is 1 the
    key field is used verbatim as the key. For any other flag the key field
    is split on ``"; "`` and the sentence is stored under every part.

    Parameters
    ----------
    filepath : str or path-like
        File to read.

    Returns
    -------
    dict
        Maps each key to the list of its sentences, in the order the lines
        appear in the file.

    Raises
    ------
    ValueError
        If a non-empty line does not have exactly three tab-separated fields
        or the type flag is not an integer.
    """
    data = dict()
    # Explicit encoding: the sentences are Swedish text, and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip("\n")
            if not line:
                # A blank line (e.g. at the end of the file) carries no record.
                continue
            dwt, n_type, sentence = line.split("\t")
            # A separate name for the individual keys avoids rebinding `dwt`
            # while iterating over its own parts.
            keys = [dwt] if int(n_type) == 1 else dwt.split("; ")
            for single_key in keys:
                data.setdefault(single_key, []).append(sentence)
    return data

In [5]:
# No fixed number of clusters is requested; a distance threshold of 1.5 is
# passed instead. The same model object is later handed to get_clusters and
# re-fitted there once per key.
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
# Commented-out alternative using cosine distance.
# NOTE(review): the `metric` keyword may not exist in the scikit-learn version
# printed above (0.23.2), and the default linkage may not accept a cosine
# metric -- confirm against the installed version before enabling this line.
#clustering_model = AgglomerativeClustering(n_clusters=None, metric="cosine", distance_threshold=1.5)

In [6]:
#[1,2,2].count(1)

In [38]:
def np_cos(vectors, centroid):
    """Cosine similarity between every row of `vectors` and `centroid`.

    `vectors` is a 2-D array-like with one vector per row and `centroid` a
    1-D array-like of the same width. Returns a 1-D array holding one
    similarity per row.
    """
    dot_products = np.dot(vectors, centroid)
    row_norms = np.linalg.norm(vectors, axis=1)
    centroid_norm = np.linalg.norm(centroid)
    return dot_products / (row_norms * centroid_norm)

In [8]:
def find_example(vectors, sentences):
    """Placeholder that is not implemented yet; ignores its arguments and returns None."""
    return None

In [90]:
def get_clusters(vectors_at, sentences_at, filename, cl_model, min_frq = 5):
    """Cluster the vectors of every key in one file and print a summary.

    The vectors are read from ``vectors_at / filename`` and the sentences
    from ``sentences_at / filename``; both are grouped by key and must
    contain the same number of entries per key. For every key with at least
    two vectors, the row-normalised vectors are clustered with `cl_model`.
    For every cluster with at least `min_frq` members, the member sentence
    whose vector is most cosine-similar to the cluster centroid is printed
    as the example, together with that similarity.

    Parameters
    ----------
    vectors_at : str or path-like
        Directory holding the vector files.
    sentences_at : str or path-like
        Directory holding the sentence files.
    filename : str
        File name shared by the vector file and the sentence file.
    cl_model : object
        Clustering model exposing ``fit(X)`` and, after fitting, ``labels_``
        with one label per row of ``X``. It is re-fitted for every key.
    min_frq : int, optional
        Smallest cluster size that is printed.

    Returns
    -------
    None
        The summary is printed to standard output.

    Raises
    ------
    AssertionError
        If a key has a different number of vectors and sentences.
    KeyError
        If a key of the vector file is missing from the sentence file.
    """
    vectors_at = Path(vectors_at)
    sentences_at = Path(sentences_at)

    data = get_vectors(vectors_at / filename)
    sentences = get_sentences(sentences_at / filename)

    for key in sorted(data.keys()):
        assert len(data[key]) == len(sentences[key]), f"Not the same number of sentences as vectors! {len(data[key])} vectors vs. {len(sentences[key])} sentences"

        n_sentences = len(data[key])

        print()
        print(str(key).upper())
        print("No. sentences", n_sentences)

        if n_sentences < 2:
            # Keys with fewer than two vectors are reported but not clustered.
            continue

        arr = np.array(data[key])
        # The model is fitted on unit-length rows; the centroids below are
        # computed from the raw vectors.
        arr_norm = arr / np.linalg.norm(arr, axis=1, keepdims=True)
        cl_model.fit(arr_norm)

        # Row indices of the members of each cluster, keyed by label in order
        # of first appearance. Keeping indices (rather than vectors) lets the
        # chosen example be mapped back to its sentence.
        members = dict()
        for row_idx, label in enumerate(cl_model.labels_):
            members.setdefault(label, []).append(row_idx)

        print("No. labels", len(members))

        for label, member_idx in members.items():
            n = len(member_idx)
            if n < min_frq:
                continue

            member_vectors = arr[member_idx]
            centroid = member_vectors.mean(axis=0)
            # Search only among the cluster's own members: comparing the
            # centroid against every vector of the key could select a
            # sentence that was assigned to a different cluster.
            similarities = np_cos(member_vectors, centroid)
            best = int(np.argmax(similarities))

            print(f"LABEL_{label}, (n = {n})")
            print(f"({similarities[best]:.2f}) {sentences[key][member_idx[best]]}")
        
        

In [87]:
# Input locations for the run below: the directory with the vector files, the
# directory with the sentence files, and the file name that is looked up in
# both directories.
# NOTE(review): these are absolute paths on one machine; the notebook cannot be
# re-run elsewhere without editing them.
vec_path = "/home/max/Corpora/flashback-pol-time/yearly/contexts/vectors/sts_fbmodel"
snt_path = "/home/max/Corpora/flashback-pol-time/yearly/contexts/files"
filename = "2015.txt"

In [91]:
# Cluster every key of the selected file; only clusters with at least 20
# sentences are printed with an example.
get_clusters(
    vectors_at = vec_path, 
    sentences_at = snt_path, 
    filename = filename, 
    cl_model = clustering_model,
    min_frq = 20
)


A1_GLOBALISTISK
No. sentences 171
No. labels 15
LABEL_0, (n = 22)
(0.76) det är ett multikulturellt företag med en öppen anti-rasistisk / globalistisk agenda .
LABEL_2, (n = 25)
(0.77) han hör ju till de globalistiska grupperingar inom bl.a. media som vill destabilisera europa med massinvandring , globalism och krigshetsande anti-rysk propaganda .

N1C_BERIKAREX
No. sentences 9
No. labels 1

N1C_GLOBALISTX
No. sentences 228
No. labels 18
LABEL_3, (n = 23)
(0.80) ja , men det beror väl närmast på att vänstern i sverige ( v+mp ) valde att kort efter murens fall dra mer åt extremvänsterhåll med folkförnekelse , genustrams och fri invandring för att motivera sin existens , med det enda resultatet att de numera har gjort sig till globalistkapitalets stjärtgossar .
LABEL_1, (n = 20)
(0.76) globalistförespråkare , skulle jag kalla alla vänsterradikala , liberaldårar och nyttiga idioter som destabiliserar sina samhällen i globalismens namn .
LABEL_4, (n = 21)
(0.79) det patetiska , folkförräd