In [193]:
import pandas as pd
import numpy as np
import json
import pickle
from bs4 import BeautifulSoup
from collections import defaultdict
from plotly import graph_objs as go
from sklearn.manifold import TSNE
import umap

In [194]:
# Load the pickled toot tables, one file per instance.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
troet_cafe, sueden_social, nrw_social = map(
    pd.read_pickle,
    ('Data/troet.pickle', 'Data/sueden.pickle', 'Data/nrw.pickle'),
)

In [195]:
# Tag every toot with its author's handle, i.e. the "acct" entry of the
# toot's "account" record.
for instance_toots in (troet_cafe, sueden_social, nrw_social):
    instance_toots["unique_user_id"] = instance_toots["account"].apply(
        lambda account: account["acct"]
    )

### Jaccard similarity

In [196]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Both inputs are reduced to sets first, so repeated elements count once.
    (Deriving the union size from the raw input lengths, as an
    inclusion-exclusion shortcut, over-counts whenever an input contains
    duplicates.)

    Parameters
    ----------
    list1, list2 : iterable of hashable
        The collections to compare; order and multiplicity are ignored.

    Returns
    -------
    float
        A value in [0, 1]. Two empty collections are treated as identical
        and yield 1.0 instead of raising ZeroDivisionError.
    """
    set1, set2 = set(list1), set(list2)
    union_size = len(set1 | set2)
    if union_size == 0:
        # Both inputs are empty: nothing distinguishes them.
        return 1.0
    return len(set1 & set2) / union_size

In [197]:
# Distinct user handles seen on each of the three instances.
troet_cafe_user, sueden_social_user, nrw_social_user = (
    instance_toots['unique_user_id'].unique()
    for instance_toots in (troet_cafe, sueden_social, nrw_social)
)

In [188]:
# Pairwise user overlap between the instances, reported as Jaccard similarity.
for label, users_a, users_b in (
    ("Jaccard Similarity between Troet Cafe and Sueden Social: ", troet_cafe_user, sueden_social_user),
    ("Jaccard Similarity between Troet Cafe and NRW Social: ", troet_cafe_user, nrw_social_user),
    ("Jaccard Similarity between Sueden Social and NRW Social: ", sueden_social_user, nrw_social_user),
):
    print(label, jaccard_similarity(users_a, users_b))

Jaccard Similarity between Troet Cafe and Sueden Social:  0.40209383481969757
Jaccard Similarity between Troet Cafe and NRW Social:  0.4347986701145179
Jaccard Similarity between Sueden Social and NRW Social:  0.4939448595722752


### Prepare embedding comparison

In [166]:
# Keep only rows whose `no_toots` value is strictly greater than 10
# (presumably the author's toot count -- TODO confirm), then renumber the rows.
troet_cafe, sueden_social, nrw_social = (
    instance_toots.query("no_toots > 10").reset_index(drop=True)
    for instance_toots in (troet_cafe, sueden_social, nrw_social)
)

# Stack the three instances into one frame; rows sharing a `url` are kept
# only once (first occurrence wins).
df_all = (
    pd.concat([troet_cafe, sueden_social, nrw_social])
    .drop_duplicates(subset=['url'])
    .reset_index(drop=True)
)

In [167]:
# Collect every toot embedding under the handle of the user who posted it.
dict_user_emb = defaultdict(list)
for user_id, toot_embedding in zip(df_all['unique_user_id'], df_all['embedding']):
    dict_user_emb[user_id].append(toot_embedding)

# Collapse each user's embeddings into a single vector: their element-wise mean.
dict_user_emb = {
    user_id: np.mean(user_embeddings, axis=0)
    for user_id, user_embeddings in dict_user_emb.items()
}

In [168]:
# Encode, for every user, which servers their toots were collected from.
# Each entry is [troet_cafe, sueden_social, nrw_social] with 1 = present.
# A user can be present on several servers, so more than one flag may be set.

# Build the membership lookups once, outside the loop: recomputing
# `.unique()` per user and scanning the resulting array made the original
# loop quadratic in the number of users.
troet_cafe_ids = set(troet_cafe['unique_user_id'])
sueden_social_ids = set(sueden_social['unique_user_id'])
nrw_social_ids = set(nrw_social['unique_user_id'])

list_avg_emb = []
list_user_server = []

for key, value in dict_user_emb.items():
    # Keep embeddings and flags in the same order so index i refers to the
    # same user in both lists.
    list_avg_emb.append(value)
    list_user_server.append([
        int(key in troet_cafe_ids),
        int(key in sueden_social_ids),
        int(key in nrw_social_ids),
    ])

### Visualize embeddings using t-SNE

In [174]:
# Stack the per-user mean embeddings into one array, one row per user
# (assumes every embedding has the same length).
x = list(list_avg_emb)
X = np.array(x)

# t-SNE is stochastic; a fixed seed makes the layout identical on every re-run.
TSNE_SEED = 42
tsne = TSNE(n_components=2, random_state=TSNE_SEED)
tsne_results = tsne.fit_transform(X)

# Split the projected points by server membership. Boolean masks keep each
# group a 2-D (k, 2) array even when k == 0, so the column slicing below
# cannot fail on an empty group.
server_flags = np.array(list_user_server).reshape(-1, 3)
on_one_server = server_flags.sum(axis=1) == 1
troet_cafe_reduced_emb = tsne_results[on_one_server & (server_flags[:, 0] == 1)]
sueden_social_reduced_emb = tsne_results[on_one_server & (server_flags[:, 1] == 1)]
nrw_social_reduced_emb = tsne_results[on_one_server & (server_flags[:, 2] == 1)]
# Everything that is not on exactly one server goes into the shared bucket.
multiple_server = tsne_results[~on_one_server]

# One scatter trace per group, added in a fixed order so colours stay stable.
fig = go.Figure()
for trace_name, points in (
    ('Troet Cafe', troet_cafe_reduced_emb),
    ('Sueden Social', sueden_social_reduced_emb),
    ('NRW Social', nrw_social_reduced_emb),
    ('Multiple Servers', multiple_server),
):
    fig.add_trace(go.Scatter(x=points[:, 0], y=points[:, 1], mode='markers', name=trace_name))

fig.update_layout(
    title="TSNE of User Embeddings",
    xaxis_title="TSNE 1",
    yaxis_title="TSNE 2",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)

fig.show()
# Also save a standalone interactive copy next to the notebook.
fig.write_html("tsne_user_embeddings.html")

### Visualize embeddings using UMAP

In [175]:
# Project the per-user mean embeddings (rows of X) to 2-D with UMAP.
# UMAP is stochastic; a fixed seed makes the layout identical on every re-run
# (umap-learn runs single-threaded when a seed is given).
UMAP_SEED = 42
reducer = umap.UMAP(random_state=UMAP_SEED)
embedding = reducer.fit_transform(X)

# Split the projected points by server membership. Boolean masks keep each
# group a 2-D (k, 2) array even when k == 0, so the column slicing below
# cannot fail on an empty group.
server_flags = np.array(list_user_server).reshape(-1, 3)
on_one_server = server_flags.sum(axis=1) == 1
troet_cafe_reduced_emb = embedding[on_one_server & (server_flags[:, 0] == 1)]
sueden_social_reduced_emb = embedding[on_one_server & (server_flags[:, 1] == 1)]
nrw_social_reduced_emb = embedding[on_one_server & (server_flags[:, 2] == 1)]
# Everything that is not on exactly one server goes into the shared bucket.
multiple_server = embedding[~on_one_server]

# One scatter trace per group, added in a fixed order so colours stay stable.
fig = go.Figure()
for trace_name, points in (
    ('Troet Cafe', troet_cafe_reduced_emb),
    ('Sueden Social', sueden_social_reduced_emb),
    ('NRW Social', nrw_social_reduced_emb),
    ('Multiple Servers', multiple_server),
):
    fig.add_trace(go.Scatter(x=points[:, 0], y=points[:, 1], mode='markers', name=trace_name))

fig.update_layout(
    title="UMAP of User Embeddings",
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)

fig.show()
# Also save a standalone interactive copy next to the notebook.
fig.write_html("umap_user_embeddings.html")

### Test: toot URLs shared between instances

In [171]:
# Number of distinct toot URLs present in both the NRW Social and Sueden Social frames.
nrw_urls = set(nrw_social["url"])
sueden_urls = set(sueden_social["url"])
agreements = len(nrw_urls & sueden_urls)

print(f"There are {agreements} agreements between the two lists.")

There are 20345 agreements between the two lists.


In [172]:
# Number of distinct toot URLs present in both the NRW Social and Troet Cafe frames.
nrw_urls = set(nrw_social["url"])
troet_urls = set(troet_cafe["url"])
agreements = len(nrw_urls & troet_urls)

print(f"There are {agreements} agreements between the two lists.")

There are 28544 agreements between the two lists.


In [173]:
# Number of distinct toot URLs present in both the Sueden Social and Troet Cafe frames.
sueden_urls = set(sueden_social["url"])
troet_urls = set(troet_cafe["url"])
agreements = len(sueden_urls & troet_urls)

print(f"There are {agreements} agreements between the two lists.")

There are 22825 agreements between the two lists.
