### Imports

In [1]:
import pandas as pd
import numpy as np
import torch

# NLP preprocessing
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Bert
from transformers import BertTokenizer, BertModel
# Sentence Transformers
from sentence_transformers import SentenceTransformer

# Dimensionality reduction
from sklearn.decomposition import PCA

# Plotting
import plotly.express as px

ModuleNotFoundError: No module named 'torch'

### Wien Museum

In [None]:
wm_data = pd.read_csv("data/wien_museum.csv")
wm_data.columns

In [None]:
wm_data.head()

In [None]:
# reasonable columns (selected by position) that are kept for the analysis
wm_data.columns[[0,3,4,5,6,7,8]]

In [None]:
wm_filtered = wm_data[wm_data.columns[[0,3,4,5,6,7,8]]]
#wm_filtered.head()

In [None]:
#full_text = wm_filtered[wm_filtered.columns[1:]].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1)
#wm_filtered.insert(1, "full_text", full_text, True)

In [None]:
wm_filtered = wm_filtered.assign(full_text = wm_filtered[wm_filtered.columns[1:]].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1))

In [None]:
wm_filtered["full_text"][0]

In [None]:
def preprocessing(text_data: pd.DataFrame) -> pd.DataFrame:
    """Tokenize the ``full_text`` column and drop German stop words.

    Parameters
    ----------
    text_data : pd.DataFrame
        Frame with a string column ``full_text``. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Deep copy of ``text_data`` with an additional column ``pre_text`` that
        holds, for every row, the list of remaining word tokens.

    Notes
    -----
    The token lists are attached by mapping over the column, so the result is
    correct for any index (non-contiguous or duplicated labels included). The
    previous implementation wrote rows back with a running counter and was
    therefore only correct for a default 0..n-1 index.
    """
    # \w+ keeps runs of word characters, i.e. punctuation and other special
    # characters are dropped while tokenizing.
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    # A set gives constant-time membership tests; the plain list returned by
    # NLTK turned every token lookup into a linear scan.
    german_stop_words = frozenset(stopwords.words('german'))

    # No stemming yet: it could be tried, but a quick look online suggested
    # that the quality of German stemmers is rather modest.
    # The text is not lower-cased, so the stop-word comparison is
    # case-sensitive.
    def tokenize_without_stop_words(text):
        return [word for word in tokenizer.tokenize(text)
                if word not in german_stop_words]

    helper = text_data.copy(deep=True)
    return helper.assign(pre_text=helper["full_text"].map(tokenize_without_stop_words))

wm_preprocessed = preprocessing(wm_filtered)

In [None]:
print(wm_preprocessed["full_text"][0])
print()
print(wm_preprocessed["pre_text"][0])

### BERT

In [None]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# model = BertModel.from_pretrained("bert-base-multilingual-cased")

In [None]:
# text = "Hallo ich bin Bert"
# tokenizer.tokenize(text)

In [None]:
# from tqdm import tqdm

# sentence_embeddings_bert_mean = list()
# with torch.no_grad():
#    i = 0
#    texts = wm_preprocessed["pre_text"][:100]
#    for idx in tqdm(range(len(texts))):
#        encoded_input = tokenizer(' '.join(texts[idx]), return_tensors='pt')
 #       output = model(**encoded_input)
 #       mean_sentence_embedding = np.mean(output[0].numpy(), axis = 1)[0]
 #       sentence_embeddings_bert_mean.append(mean_sentence_embedding)

In [None]:
#sentence_embeddings_bert_mean= np.array(sentence_embeddings_bert_mean)
#sentence_embeddings_bert_mean.shape

### Sentence Transformers

In [None]:
# model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [None]:
#text_list = []
#for text in wm_preprocessed["pre_text"]:
#    text_list.append(' '.join(text))

In [None]:
#sentence_embeddings = model.encode(text_list)
#sentence_embeddings.shape

#### Save sentence embeddings

In [None]:
# most precise
#np.savetxt('sentence_embeddings.csv', sentence_embeddings, delimiter=',')

####  Load sentence embeddings

In [None]:
# most precise
sentence_embeddings_txt = np.loadtxt('data/sentence_embeddings_wien_museum.csv', delimiter=',')

In [None]:
sentence_embeddings = sentence_embeddings_txt

### Dimensionality Reduction

In [None]:
test_embs = sentence_embeddings[:]
test_embs.shape

#### PCA

In [None]:
# standardize data
stand_test_embs = (test_embs - np.mean(test_embs, axis=0)) / np.std(test_embs, axis=0)

In [None]:
for n in [1,3,5,10,25,50,100,250,512]:
    stand_pca = PCA(n_components=n)
    embeddings_stand_pca = stand_pca.fit_transform(stand_test_embs)
    print(n, sum(stand_pca.explained_variance_ratio_))
    

In [None]:
# components to try -> 50,75,100,150,200,250, 512

**Mean Sentence Embedding from Bert**

In [None]:
#pca = PCA(n_components=3)
#embeddings_pca_bert = pca.fit_transform(sentence_embeddings_bert_mean)
#pca.explained_variance_ratio_
#sum(pca.explained_variance_ratio_)

#### TSNE -> kein brauchbares Ergebnis

#### LDA, KernelPCA

* Matrix Decomposition Algorithms + Manifold Learning Algorithms -> scikit-learn
* Unsupervised Learning Skript + Slides

### Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
from sklearn.mixture import GaussianMixture
from sklearn.cluster import Birch
# from sklearn.cluster import AffinityPropagation -> zu wenig RAM

#### K-Means

In [None]:
nr_clusters = 8
k_means = KMeans(n_clusters=nr_clusters, random_state=123).fit(sentence_embeddings)
k_means_labels = k_means.labels_
print(k_means.labels_)
print(k_means.cluster_centers_)

#### Plot and color by cluster

In [None]:
#wm_preprocessed

In [None]:
# Maximum number of title characters shown in the hover label.
length = 50

# 3-D PCA projection of the sentence embeddings, used as plot coordinates.
# It is computed here because `embeddings_pca` is otherwise only assigned in a
# much later cell, so this cell raised a NameError on a fresh kernel.
embeddings_pca = PCA(n_components=3).fit_transform(sentence_embeddings)

# Build the plotting frame in one chain instead of adding columns to a slice
# of wm_filtered (which triggers SettingWithCopyWarning).
df_k_means = (
    wm_filtered[["id", "classifications", "subjects"]]
    .assign(
        # .str[:length] shortens long titles, leaves short ones unchanged and
        # does not fail on missing titles (len(NaN) raised a TypeError).
        title=wm_filtered["title"].str[:length],
        x=embeddings_pca[:, 0],
        y=embeddings_pca[:, 1],
        z=embeddings_pca[:, 2],
        label=k_means_labels,
    )
    # plotly hover/colour columns must not contain missing values
    .fillna('NaN')
)

title = "Visualization of: Sentence Transformer Vectors + PCA + K-means Color"

In [None]:
df_k_means

In [None]:
fig = px.scatter_3d(df_k_means, x='x', y='y', z='z', 
                    color='label', hover_name="title", hover_data=["id", "classifications"],
                    width=1100, height=800,# adjust height and width
                    title=title)

fig.update_layout(legend = dict(
                        font = dict(size = 10)
                        ), 
                  
                  hoverlabel=dict(
                        font_size=9,
                        )
                )

fig.update_traces(marker_size = 3)

fig.show()

#### Plot statistics of the clusters

In [None]:
# Number of objects per classification over the whole dataset ...
counts = (
    df_k_means["classifications"]
    .value_counts()
    .to_frame(name="counts_full")
    .rename_axis("classifications")
    .reset_index()
)

# ... followed by one extra column per k-means cluster (counts_0, counts_1, ...).
for cluster_id in range(nr_clusters):
    in_cluster = df_k_means["label"] == cluster_id
    cluster_counts = (
        df_k_means.loc[in_cluster, "classifications"]
        .value_counts()
        .to_frame(name=f"counts_{cluster_id}")
        .rename_axis("classifications")
        .reset_index()
    )
    # left join keeps every classification of the full dataset
    counts = counts.merge(cluster_counts, on="classifications", how="left")

# classifications that do not occur in a cluster get a count of 0
counts = counts.fillna(0)

In [None]:
counts

In [None]:
import altair as alt

In [None]:
# plot on full dataset
alt.data_transformers.disable_max_rows()
bars = alt.Chart(counts).mark_bar().encode(
    x=alt.X('classifications', sort='-y'),
    y='counts_full',
)

bars.configure_axis(labelLimit=400)

In [None]:
#counts

In [None]:
import altair_saver 
alt.renderers.enable('altair_saver', fmts=['vega-lite', 'png', 'pdf'])


# Per-cluster count columns of `counts`; derived from nr_clusters so the chart
# stays correct when the number of k-means clusters is changed (the list used
# to be hard-coded as counts_0 ... counts_7).
cluster_count_columns = [f"counts_{i}" for i in range(nr_clusters)]

# transform_fold turns the wide count columns into long form:
# "key" holds the column name, "value" the count.
bars = alt.Chart(counts).transform_fold(
    cluster_count_columns,
).mark_bar().encode(
    alt.X('classifications', sort='-y', axis=alt.Axis(labelAngle=-90)),
    y='value:Q',
    color=alt.Color('key:N', legend=alt.Legend(
        orient='none',
        legendX=130, legendY=-40,
        direction='horizontal',
        titleAnchor='middle'))
).properties(
    height=500
)


bars.configure_axis(labelLimit=400)

#bars.save('x.pdf')

In [None]:
# Plot the k-means frame explicitly. This cell used the generic name `data`,
# which is not assigned before this point on a fresh kernel (NameError) and
# otherwise refers to whatever frame a later cell stored in it last.
fig = px.scatter_3d(df_k_means, x='x', y='y', z='z',
                    color='label', hover_name="title", hover_data=["id", "classifications"],
                    width=1100, height=800,# adjust height and width
                    title=title)

fig.update_layout(legend = dict(
                        font = dict(size = 10)
                        ),

                  hoverlabel=dict(
                        font_size=9,
                        )
                )

fig.update_traces(marker_size = 3)

fig.show()

#### DBSCAN

In [None]:
# manhattan distance
# cosine distance
dbs = DBSCAN(eps=5, min_samples=520).fit(sentence_embeddings)
dbs_labels = dbs.labels_

In [None]:
print(dbs_labels)
print(len(set(dbs_labels)))

In [None]:
length = 50
df_dbs = wm_filtered[:][["id","classifications", "subjects"]]
df_dbs["title"] = wm_filtered["title"][:].apply(lambda x: x[:length] if len(x)>length else x)
df_dbs["x"] = embeddings_pca[:,0]
df_dbs["y"] = embeddings_pca[:,1]
df_dbs["z"] = embeddings_pca[:,2]
df_dbs["label"] = dbs_labels

df_dbs.fillna('NaN', inplace=True)
df_dbs.head()

data = df_dbs
title = "Visualization of: Sentence Transformer Vectors + PCA + df_dbs Color"

In [None]:
fig = px.scatter_3d(data, x='x', y='y', z='z', 
                    color='label', hover_name="title", hover_data=["id", "classifications"],
                    width=1100, height=800,# adjust height and width
                    title=title)

fig.update_layout(legend = dict(
                        font = dict(size = 10)
                        ), 
                  
                  hoverlabel=dict(
                        font_size=9,
                        )
                )

fig.update_traces(marker_size = 3)

fig.show()

In [None]:
#optics = OPTICS().fit(sentence_embeddings)
#print(optics.labels_)
#print(optics.cluster_centers_)

In [None]:
gm = GaussianMixture(random_state=0).fit(sentence_embeddings)

In [None]:
brc = Birch(n_clusters=None).fit(sentence_embeddings)
print(brc.labels_)

### Plotting

#### PCA

In [None]:
length = 50
df_pca = wm_filtered[:][["classifications", "subjects"]]
df_pca["title"] = wm_filtered["title"][:].apply(lambda x: x[:length] if len(x)>length else x)
df_pca["x"] = embeddings_pca[:,0]
df_pca["y"] = embeddings_pca[:,1]
df_pca["z"] = embeddings_pca[:,2]

df_pca.fillna('NaN', inplace=True)
df_pca.head()

data = df_pca
title = "Visualization of: Sentence Transformer Vectors + PCA"

In [None]:
fig = px.scatter_3d(data, x='x', y='y', z='z', 
                    color='subjects', hover_name="title",
                    width=1100, height=800,# adjust height and width
                    title=title)

fig.update_layout(legend = dict(
                        font = dict(size = 10)
                        ), 
                  
                  hoverlabel=dict(
                        font_size=9,
                        )
                )
fig.show()

**Bert sentence vectors**

In [None]:
# Plotting frame for the first 100 objects using the BERT-based PCA coordinates.
# NOTE(review): `embeddings_pca_bert` is only assigned in commented-out code
# further up, so this cell fails on a fresh kernel — confirm whether the BERT
# experiment should be re-enabled or this cell removed.
length = 50
test_df_pca_bert = wm_filtered[:100][["classifications", "subjects"]]
test_df_pca_bert["title"] = wm_filtered["title"][:100].apply(lambda x: x[:length] if len(x)>length else x)
test_df_pca_bert["x"] = embeddings_pca_bert[:,0]
test_df_pca_bert["y"] = embeddings_pca_bert[:,1]
test_df_pca_bert["z"] = embeddings_pca_bert[:,2]
test_df_pca_bert.fillna('NaN', inplace=True)
test_df_pca_bert.head()

data = test_df_pca_bert
title = "Visualization of: Bert Sentence Vectors + PCA"

In [None]:
# 3-D scatter of the frame currently stored in `data`, coloured by subject.
fig = px.scatter_3d(data, x='x', y='y', z='z', 
                    color='subjects', hover_name="title",
                    width=1100, height=800,# adjust height and width
                    title=title)

fig.update_layout(legend = dict(
                        font = dict(size = 10)
                        ), 
                  
                  hoverlabel=dict(
                        font_size=9,
                        )
                )
fig.show()

# similar, but not sure whether it is really any better???

#### KernelPCA

In [None]:
# Plotting frame for the first 100 objects using the `embeddings_kpca` coordinates.
# NOTE(review): no KernelPCA fit is visible in this notebook; `embeddings_kpca`
# is first assigned by the FactorAnalysis cell further down, so on a fresh
# kernel this cell fails — confirm where the KernelPCA('poly') projection
# should come from.
length = 30
test_dataframe_kpca = wm_filtered[:100][["classifications", "subjects"]]
test_dataframe_kpca["title"] = wm_filtered["title"][:100].apply(lambda x: x[:length] if len(x)>length else x)
test_dataframe_kpca["x"] = embeddings_kpca[:,0]
test_dataframe_kpca["y"] = embeddings_kpca[:,1]
test_dataframe_kpca["z"] = embeddings_kpca[:,2]
test_dataframe_kpca.fillna('NaN', inplace=True)
test_dataframe_kpca.head()

data = test_dataframe_kpca
title = "Visualization of: Sentence Transformer Vectors + KernelPCA('poly')"

In [None]:
fig = px.scatter_3d(data, x='x', y='y', z='z', 
                    color='subjects', hover_name="title",
                    width=1100, height=800,# adjust height and width
                    title=title)

fig.update_layout(legend = dict(
                        font = dict(size = 10)
                        ), 
                  
                  hoverlabel=dict(
                        font_size=9,
                        )
                )
fig.show()

#### Factor Analysis

In [None]:
from sklearn.decomposition import FactorAnalysis

method = 'FactorAnalysis'

# The names `kpca` / `embeddings_kpca` are kept (although this is not a
# KernelPCA) because other cells read `embeddings_kpca`.
kpca = FactorAnalysis(n_components=3, random_state=0)
embeddings_kpca = kpca.fit_transform(test_embs)

# Maximum number of title characters shown in the hover label.
length = 30
# Use exactly as many metadata rows as there are embedded samples. The frame
# used to be hard-sliced to 100 rows, which raised a length-mismatch error
# whenever test_embs held a different number of rows.
n_embedded = embeddings_kpca.shape[0]
test_dataframe_kpca = (
    wm_filtered[:n_embedded][["classifications", "subjects"]]
    .assign(
        # .str[:length] also copes with missing titles
        title=wm_filtered["title"][:n_embedded].str[:length],
        x=embeddings_kpca[:, 0],
        y=embeddings_kpca[:, 1],
        z=embeddings_kpca[:, 2],
    )
    .fillna('NaN')
)

data = test_dataframe_kpca
# (a stray closing parenthesis was removed from the title)
title = f"Visualization of: Sentence Transformer Vectors + {method}"

fig = px.scatter_3d(data, x='x', y='y', z='z',
                    color='subjects', hover_name="title",
                    width=1100, height=800,# adjust height and width
                    title=title)

fig.update_layout(legend = dict(
                        font = dict(size = 10)
                        ),
                  hoverlabel=dict(
                        font_size=9,
                        )
                )
fig.show()

#### Truncated SVD

In [None]:
from sklearn.decomposition import TruncatedSVD
# separates the courier / transport-backpack photos less well

method = 'TruncatedSVD'

# The names `kpca` / `embeddings_kpca` are kept (although this is not a
# KernelPCA) because other cells read `embeddings_kpca`.
kpca = TruncatedSVD(n_components=3, random_state=0)
embeddings_kpca = kpca.fit_transform(test_embs)

# Maximum number of title characters shown in the hover label.
length = 30
# Use exactly as many metadata rows as there are embedded samples. The frame
# used to be hard-sliced to 100 rows, which raised a length-mismatch error
# whenever test_embs held a different number of rows.
n_embedded = embeddings_kpca.shape[0]
test_dataframe_kpca = (
    wm_filtered[:n_embedded][["classifications", "subjects"]]
    .assign(
        # .str[:length] also copes with missing titles
        title=wm_filtered["title"][:n_embedded].str[:length],
        x=embeddings_kpca[:, 0],
        y=embeddings_kpca[:, 1],
        z=embeddings_kpca[:, 2],
    )
    .fillna('NaN')
)

data = test_dataframe_kpca
# (a stray closing parenthesis was removed from the title)
title = f"Visualization of: Sentence Transformer Vectors + {method}"

fig = px.scatter_3d(data, x='x', y='y', z='z',
                    color='subjects', hover_name="title",
                    width=1100, height=800,# adjust height and width
                    title=title)

fig.update_layout(legend = dict(
                        font = dict(size = 10)
                        ),
                  hoverlabel=dict(
                        font_size=9,
                        )
                )
fig.show()

#### Increase number of samples

In [None]:
n_components = 3

transformed_embeddings = list()
for n in [100, 1000, 10000, 15000, 30000, 45000, 1000000]:
    test_embs = sentence_embeddings[:n]
    print("Nr of samples:", test_embs.shape[0])
    
    pca = PCA(n_components=n_components)
    embeddings_pca = pca.fit_transform(test_embs)
    
    transformed_embeddings.append((n,embeddings_pca))
    print(pca.explained_variance_ratio_)
    print(sum(pca.explained_variance_ratio_))
    print()
    

In [None]:
# (sample count, 3-D PCA coordinates) of the first entry of the sample-size experiment
n,embedding = transformed_embeddings[0]

# Maximum number of title characters shown in the hover label.
length = 50
# Built in one chain: adding columns to a slice of wm_filtered triggers
# SettingWithCopyWarning, and .str[:length] also copes with missing titles.
df = (
    wm_filtered[:n][["classifications", "subjects"]]
    .assign(
        title=wm_filtered["title"][:n].str[:length],
        x=embedding[:, 0],
        y=embedding[:, 1],
        z=embedding[:, 2],
    )
    .fillna('NaN')
)

data = df
title = f"Visualization of: Sentence Transformer Vectors + PCA + {n} Samples"


fig = px.scatter_3d(data, x='x', y='y', z='z',
                    color='subjects', hover_name="title",
                    width=1100, height=800,# adjust height and width
                    title=title)

fig.update_layout(legend = dict(font = dict(size = 10)),
                  hoverlabel=dict(font_size=9)
                )
fig.show()

In [None]:
# (sample count, 3-D PCA coordinates) of the second entry of the sample-size experiment
n,embedding = transformed_embeddings[1]

# Maximum number of title characters shown in the hover label.
length = 50
# Built in one chain: adding columns to a slice of wm_filtered triggers
# SettingWithCopyWarning, and .str[:length] also copes with missing titles.
df = (
    wm_filtered[:n][["classifications", "subjects"]]
    .assign(
        title=wm_filtered["title"][:n].str[:length],
        x=embedding[:, 0],
        y=embedding[:, 1],
        z=embedding[:, 2],
    )
    .fillna('NaN')
)

data = df
title = f"Visualization of: Sentence Transformer Vectors + PCA + {n} Samples"


fig = px.scatter_3d(data, x='x', y='y', z='z',
                    color='subjects', hover_name="title",
                    width=1200, height=800,# adjust height and width
                    title=title)

fig.update_layout(legend = dict(font = dict(size = 10)),
                  hoverlabel=dict(font_size=9)
                )
fig.show()

In [None]:
# This cell is meant to fail (it used to contain a plain sentence, i.e. a
# SyntaxError). Presumably it is here to stop "Run All" before the sections
# below — confirm. An explicit raise keeps that effect and states the intent.
raise RuntimeError("Intentional error: this cell is supposed to fail.")

### MAK

In [None]:
#import xml.etree.ElementTree as ET

#xmlTree = ET.parse(path)

#data = list()
#for elem in xmlTree.iter():
#    data.append((elem.tag,elem.text))
#    print(elem)
#    print(elem.text)

#### Load data

In [None]:
mak_1 = pd.read_csv("data/mak_1.csv")
mak_2 = pd.read_csv("data/mak_2.csv")
mak_3 = pd.read_csv("data/mak_3.csv")

In [None]:
frames = [mak_1, mak_2, mak_3]
mak = pd.concat(frames)

mak

In [None]:
mak.columns

In [None]:
interesting = [0,2,4,15,16,17,21,28,29,31,34,36,38]
mak.columns[interesting]

In [None]:
mak_filtered = mak[mak.columns[interesting]]
#mak_filtered

In [None]:
mak_filtered = mak_filtered.assign(full_text = mak_filtered[mak_filtered.columns[1:]].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1))

In [None]:
# Positional access: `mak` is a concatenation of three frames whose index has
# not been reset yet at this point, so the label 0 is not unique and
# ["full_text"][0] returned several rows instead of the first text.
mak_filtered["full_text"].iloc[0]

In [None]:
mak_filtered.reset_index(drop=True, inplace=True)

In [None]:
def preprocessing(text_data: pd.DataFrame) -> pd.DataFrame:
    """Tokenize the ``full_text`` column and drop German stop words.

    Parameters
    ----------
    text_data : pd.DataFrame
        Frame with a string column ``full_text``. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Deep copy of ``text_data`` with an additional column ``pre_text`` that
        holds, for every row, the list of remaining word tokens.

    Notes
    -----
    The token lists are attached by mapping over the column, so the result is
    correct for any index (non-contiguous or duplicated labels included). The
    previous implementation wrote rows back with a running counter and was
    therefore only correct for a default 0..n-1 index.
    """
    # \w+ keeps runs of word characters, i.e. punctuation and other special
    # characters are dropped while tokenizing.
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    # A set gives constant-time membership tests; the plain list returned by
    # NLTK turned every token lookup into a linear scan.
    german_stop_words = frozenset(stopwords.words('german'))

    # No stemming yet: it could be tried, but a quick look online suggested
    # that the quality of German stemmers is rather modest.
    # The text is not lower-cased, so the stop-word comparison is
    # case-sensitive.
    def tokenize_without_stop_words(text):
        return [word for word in tokenizer.tokenize(text)
                if word not in german_stop_words]

    helper = text_data.copy(deep=True)
    return helper.assign(pre_text=helper["full_text"].map(tokenize_without_stop_words))

mak_preprocessed = preprocessing(mak_filtered)

In [None]:
#mak_preprocessed

#### Sentence Transformers

In [None]:
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [None]:
text_list = []
for text in mak_preprocessed["pre_text"]:
    text_list.append(' '.join(text))

In [None]:
sentence_embeddings_mak = model.encode(text_list)
sentence_embeddings_mak.shape

In [None]:
sentence_embeddings_mak

#### Save sentence embeddings

In [None]:
# most precise
np.savetxt('sentence_embeddings_mak.csv', sentence_embeddings_mak, delimiter=',')

####  Load sentence embeddings

In [None]:
# most precise
#sentence_embeddings_mak = np.loadtxt('sentence_embeddings_mak.csv', delimiter=',')

### Belvedere

In [None]:
bel = pd.read_csv("data/belvedere.csv").reset_index()
#bel

In [None]:
bel_filtered = bel[bel.columns[[0,1,2,3,4,11,12,14,16,22]]]

In [None]:
bel_filtered = bel_filtered.assign(full_text = bel_filtered[bel_filtered.columns[1:]].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1))

In [None]:
bel_filtered["full_text"][0]

In [None]:
bel_preprocessed = preprocessing(bel_filtered)

In [None]:
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [None]:
text_list = []
for text in bel_preprocessed["pre_text"]:
    text_list.append(' '.join(text))

In [None]:
sentence_embeddings_bel = model.encode(text_list)
sentence_embeddings_bel.shape

In [None]:
sentence_embeddings_bel

In [None]:
np.savetxt('sentence_embeddings_bel.csv', sentence_embeddings_bel, delimiter=',')

In [None]:
x = np.array([[1,2,3]])
y = np.array([[2,3,4]])

np.concatenate([x,y], axis=0)

### Distance matrix experiments

In [None]:
import sklearn
import sklearn.neighbors as neighbors
import scipy.spatial as spatial

In [None]:
sentence_embeddings.shape

In [None]:
x = sentence_embeddings[:30]
print(x.shape)

x_ids = list(range(30))
print(x_ids)

In [None]:
# standardize data
stand_sentence_embeddings = (sentence_embeddings - np.mean(sentence_embeddings, axis=0)) / np.std(sentence_embeddings, axis=0)
stand_pca = PCA(n_components=3)
coordinates_3d = stand_pca.fit_transform(stand_sentence_embeddings)

In [None]:
# Metadata + 3-D coordinates used to display triplets.
# Built in one chain: adding columns to a column-slice of wm_filtered triggers
# SettingWithCopyWarning.
triplet_dataframe = (
    wm_filtered[["id","classifications", "subjects", "full_text"]]
    .assign(
        # titles are shortened to 75 characters for the hover label;
        # .str[:75] also copes with missing titles
        title=wm_filtered["title"].str[:75],
        x=coordinates_3d[:, 0],
        y=coordinates_3d[:, 1],
        z=coordinates_3d[:, 2],
    )
    .fillna('NaN')
)

#triplet_dataframe

#### sklearn

In [None]:
#euc = sklearn.metrics.pairwise_distances(x, sentence_embeddings, metric="euclidean")
#cos = sklearn.metrics.pairwise_distances(x, sentence_embeddings, metric="cosine")
#man = sklearn.metrics.pairwise_distances(x, sentence_embeddings, metric="manhattan")

In [None]:
def calc_distances(query, data, metric, k):
    """Find the k nearest and the k farthest rows of ``data`` for each query row.

    Parameters
    ----------
    query : array of shape (n_queries, n_features)
        Query vectors.
    data : array of shape (n_samples, n_features)
        Vectors that are searched.
    metric : str or callable
        Passed through to ``sklearn.metrics.pairwise_distances``.
    k : int
        Number of nearest / farthest entries to return.

    Returns
    -------
    tuple or list of tuple
        ``(min_k_ids, min_k_dists, max_k_ids, max_k_dists)``: row positions in
        ``data`` and the matching distances, both ordered by ascending
        distance. For a single query row the bare tuple is returned, otherwise
        a list with one tuple per query row.

    Notes
    -----
    The closest entry (position 0 after sorting) is always skipped, which
    removes the query itself when the query rows are part of ``data``.
    Ids are returned as integers; they used to be floats because ids and
    distances were packed into one float array before sorting.
    """
    def k_nearest_and_farthest(dists):
        # stable sort keeps the original order of equally distant rows,
        # exactly like the previous Python-level sorted() call
        order = np.argsort(dists, kind="stable")
        nearest = order[1:k+1]   # skip the closest entry (see Notes)
        farthest = order[-k:]
        return nearest, dists[nearest], farthest, dists[farthest]

    # distances of every query row to every row in data
    dists_matrix = sklearn.metrics.pairwise_distances(query, data, metric=metric)

    if query.shape[0] == 1:
        return k_nearest_and_farthest(dists_matrix[0])

    return [k_nearest_and_farthest(dists) for dists in dists_matrix]

In [None]:
def create_triplets(query, query_ids, embeddings, metric="cosine", k=15, rng=None):
    """Build one (sample, similar, dissimilar) id triplet per query row.

    For every query row the k nearest and k farthest rows of ``embeddings``
    are determined with ``calc_distances``; one id is then drawn at random
    from each of the two groups.

    Parameters
    ----------
    query : array of shape (n_queries, n_features)
        Query vectors.
    query_ids : sequence
        Id of each query row, in the same order as ``query``.
    embeddings : array of shape (n_samples, n_features)
        Vectors that are searched.
    metric : str, default "cosine"
        Distance metric handed to ``calc_distances``.
    k : int, default 15
        Size of the nearest / farthest neighbourhoods.
    rng : numpy.random.Generator, optional
        Random generator used for the draws; pass a seeded generator for
        reproducible triplets. A fresh unseeded generator is used by default.

    Returns
    -------
    neighborhoods : list of tuple
        One ``(min_k_ids, min_k_dists, max_k_ids, max_k_dists)`` tuple per
        query row.
    triplets : list of list
        ``[query_id, similar_id, dissimilar_id]`` per query row.
    """
    neighborhoods = calc_distances(query, embeddings, metric, k)
    # calc_distances returns a bare tuple for a single query row; wrap it so
    # the loop below always iterates over neighbourhoods.
    if query.shape[0] == 1:
        neighborhoods = [neighborhoods]

    if rng is None:
        rng = np.random.default_rng()

    triplets = []
    for i, (min_k_ids, _min_k_dists, max_k_ids, _max_k_dists) in enumerate(neighborhoods):
        # Draw within the actual neighbourhood size: it is smaller than k when
        # `embeddings` has too few rows, and drawing from range(k) then
        # indexed past the end.
        similar_id = min_k_ids[rng.integers(len(min_k_ids))]
        dissimilar_id = max_k_ids[rng.integers(len(max_k_ids))]

        triplets.append([query_ids[i], int(similar_id), int(dissimilar_id)])

    return neighborhoods, triplets

In [None]:
# Query with the standardized vectors of the selected samples. The raw
# embeddings `x` were previously compared against the standardized
# embeddings, i.e. query and data lived in different spaces, and the
# "skip the closest hit (= the query itself)" step in calc_distances did not
# hold.
neighborhoods, triplets = create_triplets(stand_sentence_embeddings[x_ids], x_ids, stand_sentence_embeddings)

In [None]:
def display_one_triplet(triplets, dataframe):
    """Print and plot one randomly chosen triplet.

    Parameters
    ----------
    triplets : list of [sample_id, similar_id, dissimilar_id]
        Index labels into ``dataframe``.
    dataframe : pd.DataFrame
        Must provide the columns ``full_text``, ``title``, ``id``,
        ``classifications``, ``x``, ``y`` and ``z``.

    The texts of the three objects are printed and the three points are shown
    in a 3-D scatter plot coloured by their role in the triplet.
    """
    picker = np.random.default_rng()
    chosen = triplets[picker.integers(len(triplets))]
    sample_id, similar_id, dissimilar_id = chosen

    # concatenated text of each member of the triplet
    sample_text = dataframe.loc[sample_id]["full_text"]
    similar_text = dataframe.loc[similar_id]["full_text"]
    dissimilar_text = dataframe.loc[dissimilar_id]["full_text"]

    # the three rows, tagged with their role for the plot colouring
    roles = ["sample", "similar", "disimilar"]
    triplet_df = dataframe.loc[chosen].assign(label = roles)

    for heading, text in (("Sample:\n", sample_text),
                          ("Similar:\n", similar_text),
                          ("Dissimilar:\n", dissimilar_text)):
        print(heading, text)
        print()

    fig = px.scatter_3d(
        triplet_df,
        x='x', y='y', z='z',
        color='label',
        hover_name="title",
        hover_data=["id", "classifications"],
        width=1100, height=800,  # adjust height and width
        title="One triplet",
    )
    fig.update_layout(
        legend={"font": {"size": 10}},
        hoverlabel={"font_size": 9},
    )
    fig.update_traces(marker_size = 3)
    fig.show()

In [None]:
def display_all_triplets(triplets, dataframe):
    """Plot every triplet in one 3-D scatter plot, coloured per triplet.

    Parameters
    ----------
    triplets : list of [sample_id, similar_id, dissimilar_id]
        Index labels into ``dataframe``. Must not be empty (``pd.concat``
        raises ``ValueError`` for an empty list).
    dataframe : pd.DataFrame
        Must provide the columns ``title``, ``id``, ``classifications``,
        ``x``, ``y`` and ``z``.
    """
    # One small frame per triplet: the three rows tagged with their role
    # ("label") and with the running number of the triplet ("sample_id",
    # kept as a string so plotly treats it as a category).
    frames = [
        dataframe.loc[triplet].assign(
            label=["sample", "similar", "disimilar"],
            sample_id=str(position),
        )
        for position, triplet in enumerate(triplets)
    ]
    # Concatenate once; growing a DataFrame inside the loop copies all
    # previous rows on every iteration.
    triplets_df = pd.concat(frames, ignore_index=True)

    fig = px.scatter_3d(triplets_df, x='x', y='y', z='z',
                    color='sample_id', hover_name="title", hover_data=["id", "classifications", "label"],
                    width=1100, height=800,# adjust height and width
                    title="All triplets")

    fig.update_layout(legend = dict(
                            font = dict(size = 10)
                            ),

                      hoverlabel=dict(
                            font_size=9,
                            )
                    )

    fig.update_traces(marker_size = 3)

    fig.show()
    
    
display_all_triplets(triplets, triplet_dataframe)

In [None]:
def display_all_dis_similar(query_ids, neighborhoods, dataframe, query_pos=0):
    """Plot one sample together with all of its similar and dissimilar neighbours.

    Parameters
    ----------
    query_ids : sequence
        Ids (index labels of ``dataframe``) of the queried samples.
    neighborhoods : list of tuple
        One ``(min_k_ids, min_k_dists, max_k_ids, max_k_dists)`` tuple per
        entry of ``query_ids``, in the same order.
    dataframe : pd.DataFrame
        Must provide the columns ``title``, ``id``, ``classifications``,
        ``x``, ``y`` and ``z``.
    query_pos : int, default 0
        Position within ``query_ids`` / ``neighborhoods`` of the sample to
        show.
    """
    sample_id = query_ids[query_pos]
    # `neighborhoods` is ordered like `query_ids`, so it has to be addressed
    # by position. The sample id itself was used as the position before, which
    # only worked while the ids happened to be 0, 1, 2, ...
    sim_ids, _sim_dists, dissim_ids, _dissim_dists = neighborhoods[query_pos]

    # ids may arrive as floats; cast them so that they are valid index labels
    sim_ids = np.asarray(sim_ids).astype(int)
    dissim_ids = np.asarray(dissim_ids).astype(int)

    # .loc[[sample_id]] keeps the column dtypes; the earlier
    # .loc[id].to_frame().transpose() turned every column (x, y, z included)
    # into object dtype.
    df = pd.concat(
        [
            dataframe.loc[[sample_id]].assign(label="sample"),
            dataframe.loc[sim_ids].assign(label="similar"),
            dataframe.loc[dissim_ids].assign(label="disimilar"),
        ],
        ignore_index=True,
    )

    fig = px.scatter_3d(df, x='x', y='y', z='z',
                    color='label', hover_name="title", hover_data=["id", "classifications"],
                    width=1100, height=800,# adjust height and width
                    title="All similar and dissimlar samples for one sample")

    fig.update_layout(legend = dict(
                            font = dict(size = 10)
                            ),

                      hoverlabel=dict(
                            font_size=9,
                            )
                    )

    fig.update_traces(marker_size = 3)

    fig.show()
    
# `plot_neighborhoods` is not defined anywhere in this notebook; the function
# defined in this cell is display_all_dis_similar.
display_all_dis_similar(x_ids, neighborhoods, triplet_dataframe)

In [None]:
# Maximum number of title characters shown in the hover label.
length = 50
# "full_text" is included because display_one_triplet prints it; the chain
# avoids adding columns to a slice of wm_filtered (SettingWithCopyWarning).
triplet_dataframe = (
    wm_filtered[["id","classifications", "subjects", "full_text"]]
    .assign(
        # .str[:length] also copes with missing titles
        title=wm_filtered["title"].str[:length],
        x=embeddings_pca[:, 0],
        y=embeddings_pca[:, 1],
        z=embeddings_pca[:, 2],
    )
    # fill the frame that is plotted (this used to fill df_k_means, a
    # copy/paste slip)
    .fillna('NaN')
)

# Pass the frame that carries the x/y/z coordinates; wm_filtered has none, so
# the scatter plot inside display_one_triplet could not be drawn.
display_one_triplet(triplets, triplet_dataframe)

In [None]:
wm_data.loc[[34575, 34078]]

In [None]:
# NOTE(review): `X` is not assigned anywhere in the visible notebook (only the
# lower-case `x`, the first 30 sentence embeddings) — confirm which array the
# KD-tree should be built on.
sklearn_kdTree = neighbors.KDTree(X)#, metric="")

In [None]:
sklearn_kdTree.valid_metrics

### Combine datasets


In [33]:
# wien museum
wm_data = pd.read_csv("data/wien_museum.csv")
# mak
mak_1 = pd.read_csv("data/mak_1.csv")
mak_2 = pd.read_csv("data/mak_2.csv")
mak_3 = pd.read_csv("data/mak_3.csv")
mak = pd.concat([mak_1, mak_2, mak_3])
# belvedere
bel = pd.read_csv("data/belvedere.csv").reset_index()

## extract interesting columns
wm_filtered = wm_data[wm_data.columns[[0,3,4,5,6,7,8]]]
mak_filtered = mak[mak.columns[[0,2,4,5,15,16,17,21,28,29,31,34,36,38]]]
mak_filtered.reset_index(drop=True, inplace=True)
bel_filtered = bel[bel.columns[[0,10,1,2,3,4,11,12,14,16,21,22]]]

  mak_1 = pd.read_csv("data/mak_1.csv")
  mak_3 = pd.read_csv("data/mak_3.csv")


In [34]:
# create one column that contains all text data
wm_filtered = wm_filtered.assign(full_text = wm_filtered[wm_filtered.columns[1:]].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1))
mak_filtered = mak_filtered.assign(full_text = mak_filtered[mak_filtered.columns[1:]].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1))
bel_filtered = bel_filtered.assign(full_text = bel_filtered[bel_filtered.columns[2:]].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1))
#print("Filtering done")

In [35]:
# Add the bookkeeping columns for the Wien Museum records: the name of the id
# column, a museum tag, and the object / media URLs taken from the raw data.
wm_filtered = wm_filtered.assign(id_column_name = "id")
wm_filtered = wm_filtered.assign(museum = "wm")
wm_filtered = wm_filtered.assign(url=wm_data["url"])
wm_filtered = wm_filtered.assign(media_url=wm_data["multimedia_default"])
# NOTE(review): the MAK and Belvedere cells build their *_to_combine frames
# with a "dataset_name" column, whereas this one uses "museum" — confirm
# which column name is intended before the frames are combined.
wm_to_combine = wm_filtered[["id","id_column_name", "title", "museum" ]]
#wm_to_combine.head()

wm_filtered.head()

Unnamed: 0,id,title,artistsProducers,classifications,dates,districts,subjects,full_text,id_column_name,dataset_name,museum,url,media_url
0,1902639,Handbeschriebener Zettel mit Angebot zur Nachb...,,Grafik- und Fotosammlung | Hand- und Druckschr...,2020,,"Handschrift, handgeschriebener Text",Handbeschriebener Zettel mit Angebot zur Nachb...,id,wm,wm,https://sammlung.wienmuseum.at/objekt/1902639/,https://sammlung.wienmuseum.at/openapi-images/...
1,1902640,Zetteldokumentation zu den Corona-Balkonkonzer...,,Gebrauchsgegenstände | Erinnerungsgegenstände ...,2020,17. Bezirk: Hernals,"Handschrift, handgeschriebener Text",Zetteldokumentation zu den Corona-Balkonkonzer...,id,wm,wm,https://sammlung.wienmuseum.at/objekt/1902640/,https://sammlung.wienmuseum.at/openapi-images/...
2,1902641,Zetteldokumentation zu den Corona-Balkonkonzer...,,Gebrauchsgegenstände | Erinnerungsgegenstände ...,2020,17. Bezirk: Hernals,"Handschrift, handgeschriebener Text",Zetteldokumentation zu den Corona-Balkonkonzer...,id,wm,wm,https://sammlung.wienmuseum.at/objekt/1902641/,https://sammlung.wienmuseum.at/openapi-images/...
3,1902642,Zetteldokumentation zu den Corona-Balkonkonzer...,,Grafik- und Fotosammlung | Gebrauchsgrafik,2020,,,Zetteldokumentation zu den Corona-Balkonkonzer...,id,wm,wm,https://sammlung.wienmuseum.at/objekt/1902642/,https://sammlung.wienmuseum.at/openapi-images/...
4,1902643,Zetteldokumentation zu den Corona-Balkonkonzer...,,Gebrauchsgegenstände | Erinnerungsgegenstände ...,2020,17. Bezirk: Hernals,"Handschrift, handgeschriebener Text",Zetteldokumentation zu den Corona-Balkonkonzer...,id,wm,wm,https://sammlung.wienmuseum.at/objekt/1902643/,https://sammlung.wienmuseum.at/openapi-images/...


In [44]:
wm_for_dataset = wm_filtered[["museum", "id", "url", "media_url"]]
wm_for_dataset

Unnamed: 0,museum,id,url,media_url
0,wm,1902639,https://sammlung.wienmuseum.at/objekt/1902639/,https://sammlung.wienmuseum.at/openapi-images/...
1,wm,1902640,https://sammlung.wienmuseum.at/objekt/1902640/,https://sammlung.wienmuseum.at/openapi-images/...
2,wm,1902641,https://sammlung.wienmuseum.at/objekt/1902641/,https://sammlung.wienmuseum.at/openapi-images/...
3,wm,1902642,https://sammlung.wienmuseum.at/objekt/1902642/,https://sammlung.wienmuseum.at/openapi-images/...
4,wm,1902643,https://sammlung.wienmuseum.at/objekt/1902643/,https://sammlung.wienmuseum.at/openapi-images/...
...,...,...,...,...
62586,wm,1034409,https://sammlung.wienmuseum.at/objekt/1034409/,https://sammlung.wienmuseum.at/openapi-images/...
62587,wm,1033920,https://sammlung.wienmuseum.at/objekt/1033920/,https://sammlung.wienmuseum.at/openapi-images/...
62588,wm,1033912,https://sammlung.wienmuseum.at/objekt/1033912/,https://sammlung.wienmuseum.at/openapi-images/...
62589,wm,910481,https://sammlung.wienmuseum.at/objekt/910481/,https://sammlung.wienmuseum.at/openapi-images/...


In [37]:
# Add the bookkeeping columns for the MAK records in one assign call:
#   id_column_name, dataset_name -> constant labels for this source
#   id                           -> copy of the "priref" column
#   museum                       -> constant museum tag
#   url                          -> collection link built from "priref"
# No media_url is set for MAK: the line that used to be commented out here
# read it from wm_data["multimedia_default"], i.e. from a different frame.
mak_filtered = mak_filtered.assign(
    id_column_name="priref",
    dataset_name="mak",
    id=mak_filtered["priref"],
    museum="mak",
    url=mak_filtered["priref"].apply(
        lambda x: f"https://sammlung.mak.at/sammlung_online?id=collect-{x}"
    ),
)

# Columns kept for the combined text/metadata frame.
mak_combine_columns = ["id", "id_column_name", "title", "dataset_name"]
mak_to_combine = mak_filtered.loc[:, mak_combine_columns]

In [38]:
# Select the museum tag, object id and object URL columns of the MAK records
# (this selection has no media_url column) and show the resulting frame.
mak_dataset_columns = ["museum", "id", "url"]
mak_for_dataset = mak_filtered.loc[:, mak_dataset_columns]
mak_for_dataset

Unnamed: 0,museum,id,url
0,mak,10,https://sammlung.mak.at/sammlung_online?id=col...
1,mak,100,https://sammlung.mak.at/sammlung_online?id=col...
2,mak,1000,https://sammlung.mak.at/sammlung_online?id=col...
3,mak,10000,https://sammlung.mak.at/sammlung_online?id=col...
4,mak,10001,https://sammlung.mak.at/sammlung_online?id=col...
...,...,...,...
259593,mak,99960,https://sammlung.mak.at/sammlung_online?id=col...
259594,mak,9997,https://sammlung.mak.at/sammlung_online?id=col...
259595,mak,9998,https://sammlung.mak.at/sammlung_online?id=col...
259596,mak,9999,https://sammlung.mak.at/sammlung_online?id=col...


In [41]:
# Add the bookkeeping columns for the Belvedere records in one assign call:
#   id_column_name, dataset_name -> constant labels for this source
#   id / title                   -> copies of "Identifier" / "Title"
#   museum                       -> constant museum tag
#   url / media_url              -> "IsShownAt" / "Object" columns of `bel`
# NOTE(review): url and media_url are read from `bel`, not `bel_filtered`.
# assign() aligns a Series on the index, so this presumably relies on both
# frames sharing the same index -- confirm.
bel_filtered = bel_filtered.assign(
    id_column_name="Identifier",
    dataset_name="bel",
    id=bel_filtered["Identifier"],
    title=bel_filtered["Title"],
    museum="bel",
    url=bel["IsShownAt"],
    media_url=bel["Object"],
)

# Columns kept for the combined text/metadata frame.
bel_combine_columns = ["id", "id_column_name", "title", "dataset_name"]
bel_to_combine = bel_filtered.loc[:, bel_combine_columns]

In [43]:
# Select the museum tag, object id, object URL and media URL columns of the
# Belvedere records and show the resulting frame.
bel_dataset_columns = ["museum", "id", "url", "media_url"]
bel_for_dataset = bel_filtered.loc[:, bel_dataset_columns]
bel_for_dataset

Unnamed: 0,museum,id,url,media_url
0,bel,1093a-p,https://sammlung.belvedere.at/objects/97/16-an...,https://sammlung.belvedere.at/internal/media/d...
1,bel,3986,https://sammlung.belvedere.at/objects/2780/abe...,https://sammlung.belvedere.at/internal/media/d...
2,bel,5474,https://sammlung.belvedere.at/objects/4225/abe...,https://sammlung.belvedere.at/internal/media/d...
3,bel,3873,https://sammlung.belvedere.at/objects/2667/abe...,https://sammlung.belvedere.at/internal/media/d...
4,bel,5100,https://sammlung.belvedere.at/objects/3851/abe...,https://sammlung.belvedere.at/internal/media/d...
...,...,...,...,...
5108,bel,7240,https://sammlung.belvedere.at/objects/7028/zwe...,https://sammlung.belvedere.at/internal/media/d...
5109,bel,11046/51,https://sammlung.belvedere.at/objects/53934/zw...,https://sammlung.belvedere.at/internal/media/d...
5110,bel,9328,https://sammlung.belvedere.at/objects/10142/zw...,https://sammlung.belvedere.at/internal/media/d...
5111,bel,3293,https://sammlung.belvedere.at/objects/2090/zwe...,https://sammlung.belvedere.at/internal/media/d...


In [50]:
# Stack the per-museum link frames in the order wm, mak, bel and renumber the
# rows 0..N-1 so the index is unique across museums.
museum_link_frames = [wm_for_dataset, mak_for_dataset, bel_for_dataset]
df_combined_dataset = pd.concat(museum_link_frames, ignore_index=True)
df_combined_dataset

Unnamed: 0,museum,id,url,media_url
0,wm,1902639,https://sammlung.wienmuseum.at/objekt/1902639/,https://sammlung.wienmuseum.at/openapi-images/...
1,wm,1902640,https://sammlung.wienmuseum.at/objekt/1902640/,https://sammlung.wienmuseum.at/openapi-images/...
2,wm,1902641,https://sammlung.wienmuseum.at/objekt/1902641/,https://sammlung.wienmuseum.at/openapi-images/...
3,wm,1902642,https://sammlung.wienmuseum.at/objekt/1902642/,https://sammlung.wienmuseum.at/openapi-images/...
4,wm,1902643,https://sammlung.wienmuseum.at/objekt/1902643/,https://sammlung.wienmuseum.at/openapi-images/...
...,...,...,...,...
327297,bel,7240,https://sammlung.belvedere.at/objects/7028/zwe...,https://sammlung.belvedere.at/internal/media/d...
327298,bel,11046/51,https://sammlung.belvedere.at/objects/53934/zw...,https://sammlung.belvedere.at/internal/media/d...
327299,bel,9328,https://sammlung.belvedere.at/objects/10142/zw...,https://sammlung.belvedere.at/internal/media/d...
327300,bel,3293,https://sammlung.belvedere.at/objects/2090/zwe...,https://sammlung.belvedere.at/internal/media/d...


In [52]:
# Stack the per-museum metadata frames in the order wm, mak, bel and renumber
# the rows 0..N-1 so the index is unique across museums.
museum_text_frames = [wm_to_combine, mak_to_combine, bel_to_combine]
df_combined = pd.concat(museum_text_frames, ignore_index=True)

In [51]:
# Load the pre-computed sentence embeddings of all museums as one float array
# (presumably rows are ordered wm, then mak, then bel -- confirm against the
# cell that wrote the file).
#
# np.loadtxt needed about two minutes for this file and the run was
# interrupted, so the CSV is parsed with pandas' C parser instead, which is
# typically much faster on large numeric files.
#   header=None                   -> the file has no header row, only numbers
#   dtype=np.float64              -> same dtype np.loadtxt returns by default
#   float_precision="round_trip"  -> parse floats exactly like Python's float()
combined = pd.read_csv(
    'data/sentence_embeddings_combined.csv',
    header=None,
    dtype=np.float64,
    float_precision="round_trip",
).to_numpy()

KeyboardInterrupt: 

In [None]:
# Standardise every embedding dimension to zero mean and unit variance.
# NOTE: a dimension with zero standard deviation would divide by zero here.
stand_combined_embs = (combined - np.mean(combined, axis=0)) / np.std(combined, axis=0)

# Project onto the first three principal components for the 3-D plot.
# random_state fixes the seed of the randomized SVD solver that scikit-learn
# may select for large inputs, so re-running the cell gives the same result.
pca_3d = PCA(n_components=3, random_state=0)
embeddings_3d = pca_3d.fit_transform(stand_combined_embs)
#print(3, sum(pca_3d.explained_variance_ratio_))

In [None]:
# Attach the three PCA coordinates to the metadata frame as columns x, y, z
# (component 0 -> x, component 1 -> y, component 2 -> z).
for component_idx, axis_name in enumerate(("x", "y", "z")):
    df_combined[axis_name] = embeddings_3d[:, component_idx]

In [None]:
# Show df_combined as this cell's output.
df_combined

In [None]:
# Interactive 3-D scatter of the x/y/z columns of df_combined: one colour per
# dataset_name, the record title shown on hover, fixed 2500x1250 canvas.
fig = px.scatter_3d(
    df_combined,
    x='x',
    y='y',
    z='z',
    color='dataset_name',
    hover_name="title",
    width=2500,
    height=1250,
    title="Combined Data",
)

# Keep the legend visible and use font size 10 for legend and hover labels.
fig.update_layout(
    showlegend=True,
    legend_font_size=10,
    hoverlabel_font_size=10,
)

# Draw every point with marker size 3, then render the figure.
fig.update_traces(marker=dict(size=3))
fig.show()

Load the sentence embeddings of each museum separately and combine them afterwards.

In [53]:
# Read the per-museum sentence-embedding matrices (comma-separated numbers)
# in the order wm, mak, bel.
embedding_files = (
    'data/sentence_embeddings_wm.csv',
    'data/sentence_embeddings_mak.csv',
    'data/sentence_embeddings_bel.csv',
)
se_wm, se_mak, se_bel = [np.loadtxt(csv_path, delimiter=',') for csv_path in embedding_files]

In [54]:
# Author's note carried over: rename "ind" to "sep".
# Concatenate the per-museum embedding matrices along the row axis in the
# order wm, mak, bel, then show the shape of the result.
museum_embeddings = [se_wm, se_mak, se_bel]
combined_sep = np.concatenate(museum_embeddings)
combined_sep.shape

(327302, 512)

In [55]:
# Number of principal components to keep.
n = 100

# Standardise every embedding dimension to zero mean and unit variance.
# NOTE: a dimension with zero standard deviation would divide by zero here.
stand_combined_embs_sep = (combined_sep - np.mean(combined_sep, axis=0)) / np.std(combined_sep, axis=0)

# Reduce to n components. The "_3d" names are kept because other cells read
# them, even though n (not 3) components are computed here.
# random_state fixes the seed of the randomized SVD solver that scikit-learn
# may select for large inputs, so re-running the cell gives the same result.
pca_3d_sep = PCA(n_components=n, random_state=0)
embeddings_3d_sep = pca_3d_sep.fit_transform(stand_combined_embs_sep)

# Report n and the share of variance explained by the n components together.
print(n, sum(pca_3d_sep.explained_variance_ratio_))

100 0.8000651852083122


In [58]:
# Attach the first n PCA components to the link frame as columns 0..n-1.
# Inserting the columns one by one in a loop fragments the DataFrame (pandas
# emits a PerformanceWarning for it), so all n columns are built as a single
# frame and joined with one concat. pd.concat returns a new frame, so
# df_combined_dataset itself is left unmodified.
pca_component_columns = pd.DataFrame(
    embeddings_3d_sep[:, :n],
    index=df_combined_dataset.index,  # same row labels -> plain side-by-side join
    columns=range(n),                 # integer column labels, as the loop produced
)
dataset = pd.concat([df_combined_dataset, pca_component_columns], axis=1)

  dataset[i]=embeddings_3d_sep[:,i]


In [64]:
# Write the frame (including its index) to CSV.
# NOTE: the file name hard-codes "100d"; keep it in sync with n if n changes.
combined_dataset_path = "data/combined_dataset_se=100d.csv"
dataset.to_csv(combined_dataset_path)

In [None]:
# Work on an independent deep copy of the metadata frame and attach the first
# three PCA components of the separately loaded embeddings as x, y, z.
df_combined_sep = df_combined.copy(deep=True).assign(
    x=embeddings_3d_sep[:, 0],
    y=embeddings_3d_sep[:, 1],
    z=embeddings_3d_sep[:, 2],
)

In [None]:
# Show df_combined_sep as this cell's output.
df_combined_sep

In [None]:
# Show df_combined as this cell's output.
df_combined

In [None]:
# Interactive 3-D scatter of the x/y/z columns of df_combined_sep: one colour
# per dataset_name, the record title shown on hover, fixed 2500x1250 canvas.
scatter_options = dict(
    x='x',
    y='y',
    z='z',
    color='dataset_name',
    hover_name="title",
    width=2500,
    height=1250,
    title="Combined Data",
)
fig = px.scatter_3d(df_combined_sep, **scatter_options)

# Keep the legend visible and use font size 10 for legend and hover labels.
layout_options = dict(
    showlegend=True,
    legend=dict(font=dict(size=10)),
    hoverlabel=dict(font=dict(size=10)),
)
fig.update_layout(**layout_options)

# Draw every point with marker size 3, then render the figure.
fig.update_traces(marker=dict(size=3))
fig.show()

In [None]:
# Show df_combined as this cell's output.
df_combined

In [None]:
# Standardise every embedding dimension to zero mean and unit variance.
# NOTE: a dimension with zero standard deviation would divide by zero here.
stand_combined_embs_sep = (combined_sep - np.mean(combined_sep, axis=0)) / np.std(combined_sep, axis=0)

# Three-component PCA of the separately loaded embeddings.
# NOTE: this rebinds pca_3d_sep / embeddings_3d_sep, replacing any result
# stored under those names by an earlier run.
# random_state fixes the seed of the randomized SVD solver that scikit-learn
# may select for large inputs, so re-running the cell gives the same result.
pca_3d_sep = PCA(n_components=3, random_state=0)
embeddings_3d_sep = pca_3d_sep.fit_transform(stand_combined_embs_sep)

# Report the share of variance explained by the three components together.
print(3, sum(pca_3d_sep.explained_variance_ratio_))

In [3]:
# Sanity check: the half-open range 4..104 contains exactly 100 integers.
len(range(4, 104))

100