In [48]:
import os
import sys
from itertools import combinations


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sentence_transformers
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_rand_score, rand_score, mutual_info_score, adjusted_mutual_info_score, fowlkes_mallows_score
from sklearn.metrics.cluster import contingency_matrix
import pickle
from tqdm.notebook import tqdm

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_input.load_data import load_corpus

In [2]:
model_name = 'paraphrase-multilingual-mpnet-base-v2'
corpus = load_corpus("../../data")

In [38]:
model = sentence_transformers.SentenceTransformer(model_name)

In [6]:
embeddings = model.encode(corpus.argument.to_list(), show_progress_bar=True)

Batches:   0%|          | 0/1013 [00:00<?, ?it/s]

In [8]:
pickle.dump(embeddings, open("../../data/corpus_embeddings.pkl", "wb"))

In [3]:
embeddings = pickle.load(open("../../data/corpus_embeddings.pkl", "rb"))

In [4]:
crps = pd.read_json("../../data/corpus.jsonl", lines=True)

In [5]:
corpus.argument.to_list() == crps.argument.to_list()

True

Testing clustering for overlap to full profiles

In [6]:
P_ID = {name:i for i, name in enumerate(crps.demographic_profile.apply(lambda x: [x[attr] for attr in list(x.keys())[:-1]]).astype(str).unique())}

In [7]:
len(P_ID.keys())

1038

In [8]:
corpus["P_ID"] = crps.demographic_profile.apply(lambda x: [x[attr] for attr in list(x.keys())[:-1]]).astype(str).apply(lambda x: P_ID[x])

In [13]:
kmeans = KMeans(n_clusters=len(P_ID.keys()), random_state=0)
kmeans.fit(embeddings)
X = kmeans.labels_
Y = corpus.P_ID.to_list()
adjusted_rand_score(X, Y)

0.0057567834688547195

Clustering for individual attributes

In [14]:
# nominal attributes
for col in ['stance', 'topic', 'gender', 'residence', 'civil_status', 'denomination', 'education', 'political_spectrum']:
    mapping = {val:i for i, val in enumerate(corpus[f"{col}"].unique())}
    corpus[f"{col}_num"] = corpus[f"{col}"].apply(lambda x: mapping[x])
# ordinal attributes
# age
mapping = {
    "18-34":0,
    "35-49":1,
    "50-64":2,
    "65+":3
}
corpus["age_num"] = corpus["age"].apply(lambda x: mapping[x])

In [28]:
for attribute in ["gender_num", "age_num", "education_num", "denomination_num", "residence_num", "political_spectrum_num", "stance_num", "topic_num"]:
    n_clusters = len(corpus[attribute].unique())
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(embeddings)
    X = kmeans.labels_
    y = corpus[attribute].to_list()
    print(f"Adjusted Rand Score for {attribute}: {adjusted_rand_score(X, y)}")



Adjusted Rand Score for gender_num: 0.0008942619759932484
Adjusted Rand Score for age_num: -0.001137410864742708
Adjusted Rand Score for education_num: 0.0007345807987916514
Adjusted Rand Score for denomination_num: 0.00040783745708385637
Adjusted Rand Score for residence_num: 0.0006700647807115293
Adjusted Rand Score for political_spectrum_num: -0.0010603167491186306
Adjusted Rand Score for stance_num: 0.000524420741516086
Adjusted Rand Score for topic_num: 0.19471668155307859


Cluster for issue_id

In [19]:
issues_id = {name:i for i, name in enumerate(corpus.important_political_issues.astype(str).unique())}

In [20]:
len(issues_id.keys())

120

In [21]:
corpus["issues_id"] = corpus.important_political_issues.astype(str).apply(lambda x: issues_id[x])

In [23]:
corpus.important_political_issues.apply(len).max()

6

In [22]:
kmeans = KMeans(n_clusters=len(issues_id.keys()), random_state=0)
kmeans.fit(embeddings)
X = kmeans.labels_
Y = corpus.issues_id.to_list()
adjusted_rand_score(X, Y)

0.003091884256469536

In [24]:
corpus["first"] = corpus.important_political_issues.apply(lambda x: x[0] if len(x) > 0 else None)

In [27]:
kmeans = KMeans(n_clusters=len(corpus["first"].unique()), random_state=0)
kmeans.fit(embeddings)
X = kmeans.labels_
Y = corpus["first"].to_list()
adjusted_rand_score(X, Y)

-0.0006181048244818353

Combinations of attributes

In [37]:
attributes = ["gender_num", "age_num", "education_num", "denomination_num", "residence_num", "political_spectrum_num", "stance_num", "topic_num", "issues_id"]
for attribute_a, attribute_b in tqdm(combinations(attributes, 2), total=len(list(combinations(attributes, 2)))):
    # constructing a unique id for the combination of the two attributes
    attributes_id = {name:i for i, name in enumerate(corpus[[attribute_a, attribute_b]].apply(lambda x: (x[0], x[1]), axis=1).astype(str).unique())}
    # clustering
    kmeans = KMeans(n_clusters=len(attributes_id.keys()), random_state=0)
    kmeans.fit(embeddings)
    X = kmeans.labels_
    Y = corpus[[attribute_a, attribute_b]].apply(lambda x: (x[0], x[1]), axis=1).astype(str).apply(lambda x: attributes_id[x]).to_list()
    # calculating the adjusted rand score
    print(f"Adjusted Rand Score for {attribute_a} and {attribute_b}: {adjusted_rand_score(X, Y)}")

  0%|          | 0/36 [00:00<?, ?it/s]

Adjusted Rand Score for gender_num and age_num: 0.0018522705209112849
Adjusted Rand Score for gender_num and education_num: 0.0010161029550364506
Adjusted Rand Score for gender_num and denomination_num: 0.0018973983823261076
Adjusted Rand Score for gender_num and residence_num: 0.0015521705678500453
Adjusted Rand Score for gender_num and political_spectrum_num: 0.0013411862577388653
Adjusted Rand Score for gender_num and stance_num: 0.0037217497091557505
Adjusted Rand Score for gender_num and topic_num: 0.18084802066235958
Adjusted Rand Score for gender_num and issues_id: 0.0031894203356265445
Adjusted Rand Score for age_num and education_num: 0.0007354544876367023
Adjusted Rand Score for age_num and denomination_num: 0.000569581478830517
Adjusted Rand Score for age_num and residence_num: -0.0008948336430417577
Adjusted Rand Score for age_num and political_spectrum_num: 0.0002995339123927528
Adjusted Rand Score for age_num and stance_num: 0.0016400918106708518
Adjusted Rand Score for a

Doing the clustering per topic per attribute

In [49]:
for topic in tqdm(corpus["topic"].unique(), total=len(corpus["topic"].unique())): 
    # gather indices of the topic
    indices = corpus[corpus["topic"] == topic].index
    subcorpus = corpus.loc[indices]
    # only consider the embeddings of the topic
    embeddings_subset = embeddings[indices]
    for attribute in ["gender_num", "age_num", "education_num", "denomination_num", "residence_num", "political_spectrum_num", "issues_id"]:
        n_clusters = len(corpus[attribute].unique())
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        kmeans.fit(embeddings_subset)
        X = kmeans.labels_
        y = subcorpus[attribute].to_list()
        print(f"{topic}, {attribute}:\nRI={rand_score(X, y)}, ARI={adjusted_rand_score(X, y)}, MI={mutual_info_score(X, y)}, AMI={adjusted_mutual_info_score(X, y)}, FMI={fowlkes_mallows_score(X, y)}\n{contingency_matrix(X, y)}\n\n")

  0%|          | 0/12 [00:00<?, ?it/s]

Immigration, gender_num:
RI=0.5148420169016499, ARI=0.015939741351086514, MI=0.0019813789111226, AMI=0.002924112910076566, FMI=0.5727911604576772
[[ 523  371]
 [1509  802]]


Immigration, age_num:
RI=0.6003398637818171, ARI=0.0013564354776844025, MI=0.0021574098107036838, AMI=0.0005686955850499256, FMI=0.2758735844364618
[[282 238 230  53]
 [254 235 282  55]
 [332 290 279  49]
 [200 186 190  50]]


Immigration, education_num:
RI=0.7193665873975783, ARI=0.003688822120502029, MI=0.04906063256909662, AMI=0.010886660256576936, FMI=0.14917097922324513
[[ 56 162 121   2  16   7  17  26  16   3   7   0   5]
 [ 37 114  82   1   9   5  12  19   9   1   2   2   5]
 [ 18  15  31   1   0   1  11  19  16   0   1   0   1]
 [ 21  50  25   3   4   0   7  11   7   1   1   1   2]
 [  7  30  32   0   1   1   6   3   5   0   0   0   1]
 [ 21  66  64   2  14   4  11  10   5   1   2   0   0]
 [ 14  54  31   0   2   0   3   8   5   0   1   0   4]
 [ 60 159 116   4  17   9  18  25  15   1   5   0   0]
 [ 16  

In [41]:
len(embeddings)

32387

In [54]:
for topic in tqdm(corpus["topic"].unique(), total=len(corpus["topic"].unique())): 
    # gather indices of the topic
    indices = corpus[corpus["topic"] == topic].index
    subcorpus = corpus.loc[indices]
    # only consider the embeddings of the topic
    embeddings_subset = embeddings[indices]
    kmeans = KMeans(n_clusters=len(attributes_id.keys()), random_state=0)
    kmeans.fit(embeddings_subset)
    X = kmeans.labels_
    attributes = ["gender_num", "age_num", "education_num", "denomination_num", "residence_num", "political_spectrum_num", "stance_num", "topic_num", "issues_id"]
    for attribute_a, attribute_b in combinations(attributes, 2):
        # constructing a unique id for the combination of the two attributes
        attributes_id = {name:i for i, name in enumerate(subcorpus[[attribute_a, attribute_b]].apply(lambda x: (x[0], x[1]), axis=1).astype(str).unique())}
        y = subcorpus[[attribute_a, attribute_b]].apply(lambda x: (x[0], x[1]), axis=1).astype(str).apply(lambda x: attributes_id[x]).to_list()
        # print(f"{topic}, {attribute_a}-{attribute_b}:\nRI={rand_score(X, y)}, ARI={adjusted_rand_score(X, y)}, MI={mutual_info_score(X, y)}, AMI={adjusted_mutual_info_score(X, y)}, FMI={fowlkes_mallows_score(X, y)}\n{contingency_matrix(X, y)}\n\n")
        print(f"{topic}, {attribute_a}-{attribute_b}:\nRI={rand_score(X, y)}, ARI={adjusted_rand_score(X, y)}, MI={mutual_info_score(X, y)}, AMI={adjusted_mutual_info_score(X, y)}, FMI={fowlkes_mallows_score(X, y)}\n\n")

  0%|          | 0/12 [00:00<?, ?it/s]

Immigration, gender_num-age_num:
RI=0.8355446877051113, ARI=0.004022457737055631, MI=0.4543176126400551, AMI=0.03597700327662408, FMI=0.04223860753287338


Immigration, gender_num-education_num:
RI=0.8681710264665268, ARI=0.002972580751884134, MI=0.7390677191928007, AMI=0.03636958464061899, FMI=0.03525167551302264


Immigration, gender_num-denomination_num:
RI=0.8324288477157064, ARI=0.005608018343812916, MI=0.5604114518158041, AMI=0.041915632187458716, FMI=0.046837262286254865


Immigration, gender_num-residence_num:
RI=0.5293766956670776, ARI=0.0007993784616091125, MI=0.17591273451750294, AMI=0.016116766871351725, FMI=0.0578435462422135


Immigration, gender_num-political_spectrum_num:
RI=0.8401900120948658, ARI=0.012685996376458477, MI=0.760653412203699, AMI=0.07951888521510249, FMI=0.06409485656184148


Immigration, gender_num-stance_num:
RI=0.7211461492167552, ARI=0.006297957738327927, MI=0.3809171510073206, AMI=0.06752780523590046, FMI=0.06293217931764487


Immigration, gender_nu

In [56]:
for topic in tqdm(corpus["topic"].unique(), total=len(corpus["topic"].unique())): 
    # gather indices of the topic
    indices = corpus[corpus["topic"] == topic].index
    subcorpus = corpus.loc[indices]
    # only consider the embeddings of the topic
    embeddings_subset = embeddings[indices]
    kmeans = KMeans(n_clusters=len(attributes_id.keys()), random_state=0)
    kmeans.fit(embeddings_subset)
    X = kmeans.labels_
    attributes = ["gender_num", "age_num", "education_num", "denomination_num", "residence_num", "political_spectrum_num", "stance_num", "topic_num", "issues_id"]
    for attribute_a, attribute_b, attribute_c in combinations(attributes, 3):
        # constructing a unique id for the combination of the three attributes
        attributes_id = {name:i for i, name in enumerate(subcorpus[[attribute_a, attribute_b, attribute_c]].apply(lambda x: (x[0], x[1], x[2]), axis=1).astype(str).unique())}
        y = subcorpus[[attribute_a, attribute_b, attribute_c]].apply(lambda x: (x[0], x[1], x[2]), axis=1).astype(str).apply(lambda x: attributes_id[x]).to_list()
        # print(f"{topic}, {attribute_a}-{attribute_b}:\nRI={rand_score(X, y)}, ARI={adjusted_rand_score(X, y)}, MI={mutual_info_score(X, y)}, AMI={adjusted_mutual_info_score(X, y)}, FMI={fowlkes_mallows_score(X, y)}\n{contingency_matrix(X, y)}\n\n")
        print(f"{topic}, {attribute_a}-{attribute_b}-{attribute_c}:\nRI={rand_score(X, y)}, ARI={adjusted_rand_score(X, y)}, MI={mutual_info_score(X, y)}, AMI={adjusted_mutual_info_score(X, y)}, FMI={fowlkes_mallows_score(X, y)}\n\n")

  0%|          | 0/12 [00:00<?, ?it/s]

Immigration, gender_num-age_num-education_num:
RI=0.8179366275774627, ARI=0.0040633119672228105, MI=0.12218049265315875, AMI=0.014226577360736688, FMI=0.0824824558577515


Immigration, gender_num-age_num-denomination_num:
RI=0.8102971909138538, ARI=0.010676300257366749, MI=0.09247787325431134, AMI=0.016381037064468636, FMI=0.10122826144055977


Immigration, gender_num-age_num-residence_num:
RI=0.7482059282371295, ARI=0.005228741647760243, MI=0.03360772896593236, AMI=0.008283719928396658, FMI=0.15283658017405988


Immigration, gender_num-age_num-political_spectrum_num:
RI=0.8119799548536248, ARI=0.009684373292177618, MI=0.11595700626869372, AMI=0.020495578261166245, FMI=0.09792675084331919


Immigration, gender_num-age_num-stance_num:
RI=0.7883258251678382, ARI=0.009898159201074709, MI=0.0730446407830716, AMI=0.024996628340223228, FMI=0.12361406769246276


Immigration, gender_num-age_num-topic_num:
RI=0.7344758209804048, ARI=0.004895905880021881, MI=0.02319343643332447, AMI=0.0080279765

In [58]:
embeddings[0].shape

(768,)

In [59]:
for topic in tqdm(corpus["topic"].unique(), total=len(corpus["topic"].unique())): 
    # gather indices of the topic
    indices = corpus[corpus["topic"] == topic].index
    subcorpus = corpus.loc[indices]
    # only consider the embeddings of the topic
    embeddings_subset = embeddings[indices]
    kmeans = KMeans(n_clusters=len(attributes_id.keys()), random_state=0)
    kmeans.fit(embeddings_subset)
    X = kmeans.labels_
    attributes = ["gender_num", "age_num", "education_num", "denomination_num", "residence_num", "political_spectrum_num", "stance_num", "topic_num", "issues_id"]
    for attribute_a, attribute_b, attribute_c, attribute_d in combinations(attributes, 4):
        # constructing a unique id for the combination of the four attributes
        attributes_id = {name:i for i, name in enumerate(subcorpus[[attribute_a, attribute_b, attribute_c, attribute_d]].apply(lambda x: (x[0], x[1], x[2], x[3]), axis=1).astype(str).unique())}
        y = subcorpus[[attribute_a, attribute_b, attribute_c, attribute_d]].apply(lambda x: (x[0], x[1], x[2], x[3]), axis=1).astype(str).apply(lambda x: attributes_id[x]).to_list()
        # print(f"{topic}, {attribute_a}-{attribute_b}:\nRI={rand_score(X, y)}, ARI={adjusted_rand_score(X, y)}, MI={mutual_info_score(X, y)}, AMI={adjusted_mutual_info_score(X, y)}, FMI={fowlkes_mallows_score(X, y)}\n{contingency_matrix(X, y)}\n\n")
        print(f"{topic}, {attribute_a}-{attribute_b}-{attribute_c}:\nRI={rand_score(X, y)}, ARI={adjusted_rand_score(X, y)}, MI={mutual_info_score(X, y)}, AMI={adjusted_mutual_info_score(X, y)}, FMI={fowlkes_mallows_score(X, y)}\n\n")

  0%|          | 0/12 [00:00<?, ?it/s]

Immigration, gender_num-age_num-education_num:
RI=0.9727779822803398, ARI=0.02095456801765695, MI=1.745850024139615, AMI=0.07430869000038086, FMI=0.0349500663905576


Immigration, gender_num-age_num-education_num:
RI=0.9569011824143377, ARI=0.012643963665658797, MI=1.147969679843555, AMI=0.06327191244781329, FMI=0.03354940150146449


Immigration, gender_num-age_num-education_num:
RI=0.9766419121184323, ARI=0.026677702309274798, MI=1.9169811374110308, AMI=0.08284641007567263, FMI=0.038577439964130114


Immigration, gender_num-age_num-education_num:
RI=0.9701361987063752, ARI=0.021790652854363123, MI=1.4788829788251494, AMI=0.08410943467984931, FMI=0.037279614933428594


Immigration, gender_num-age_num-education_num:
RI=0.9518877534127582, ARI=0.012169642712109548, MI=1.0138471156472404, AMI=0.06190202575239837, FMI=0.03496453257039115


Immigration, gender_num-age_num-education_num:
RI=0.9852222553321609, ARI=0.04036322149423888, MI=2.811592450334386, AMI=0.09878051528120745, FMI=0.0509