# First `politician2vec` demo

## To do
- Descriptive statistics/viz for entire dataset
- Settle on centroid calculation method (handle outliers better, remove alt method?)
- Compile populism-related words for constructing axes
- Determine possibilities of comparative analysis
- Consistently change "topic" to "party"/"cluster" or the like

In [1]:
%pip uninstall politician2vec -y

Found existing installation: politician2vec 0.0.1
Uninstalling politician2vec-0.0.1:
  Successfully uninstalled politician2vec-0.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install git+ssh://git@github.com/mathiasbruun/politician2vec.git

Collecting git+ssh://****@github.com/mathiasbruun/politician2vec.git
  Cloning ssh://****@github.com/mathiasbruun/politician2vec.git to /private/var/folders/8q/02vc3fzn3r1fv7wzycyspjz80000gn/T/pip-req-build-7_1xy0e5
  Running command git clone -q 'ssh://****@github.com/mathiasbruun/politician2vec.git' /private/var/folders/8q/02vc3fzn3r1fv7wzycyspjz80000gn/T/pip-req-build-7_1xy0e5
Building wheels for collected packages: politician2vec
  Building wheel for politician2vec (setup.py) ... [?25ldone
[?25h  Created wheel for politician2vec: filename=politician2vec-0.0.1-py3-none-any.whl size=27418 sha256=54eca48890b4bf060429ce14c88f06e1c5172f1c5298f82fef97eb3503aa62e5
  Stored in directory: /private/var/folders/8q/02vc3fzn3r1fv7wzycyspjz80000gn/T/pip-ephem-wheel-cache-64_l8cex/wheels/ac/44/62/b0b3ddf2882cd1b1d1cc4e060c5c525b951ae01496d65cd472
Successfully built politician2vec
Installing collected packages: politician2vec
Successfully installed politician2vec-0.0.1
Note: you may need to rest

In [3]:
from politician2vec import Politician2Vec
from politician2vec.utils import *
import pickle
import pandas as pd
import numpy as np
import multiprocessing
available_workers = multiprocessing.cpu_count()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
test_data_path = 'data/clean/combined/subset_party_imputed_v2.p'

In [5]:
# Load the pre-processed dataset from disk.
# NOTE: pickle.load can execute arbitrary code -- only use it on files you produced or trust.
with open(test_data_path, 'rb') as p:
    test_data = pickle.load(p)

# Show the shape of the loaded object as the cell output.
test_data.shape

(349594, 7)

In [6]:
# parl_data = test_data.loc[
#     (test_data['source'] == 'parliament') &
#     ~(test_data['party'].isin(['SIU', 'NQ', 'JF', 'SP']))
# ]
#
# tw_data = test_data.loc[
#     test_data['source'] == 'twitter'
# ]
#
# Keep only rows whose source is 'meta' and whose document text is non-null.
from_meta = test_data['source'] == 'meta'
has_doc = test_data['doc'].notna()
meta_data = test_data.loc[from_meta & has_doc]

In [7]:
meta_data['party'].value_counts(dropna = False)

V      7182
SF     4900
S      3465
LA     3424
KF     2825
DF     2059
NB     1909
RV     1565
EL      823
FG      689
ALT     175
DD       76
M        28
KD       18
SIU       9
UFG       5
Name: party, dtype: int64

In [8]:
# Build one "super document" per (full_name, party) pair by joining all of that pair's
# texts with '. '. A politician who appears under several parties gets one row per party.
grouped_docs = meta_data.groupby(['full_name', 'party'])['doc'].apply('. '.join).reset_index()

In [9]:
# Model inputs: one super document per row of grouped_docs and, in the same
# row order, the party label of that row as a NumPy array.
docs = grouped_docs['doc'].tolist()
parties = np.array(grouped_docs['party'].tolist())

In [10]:
#party_lab_map = {party: i for i, party in enumerate(set(parties))}

In [11]:
#num_party_labs = grouped_docs['party'].map(party_lab_map).tolist()

In [10]:
# Options for n-gram detection; passed to Politician2Vec as ngram_vocab_args.
# NOTE(review): presumably forwarded to gensim's Phrases model -- confirm before tuning.
ngram_options = {
    'min_count': 5,
    'threshold': 1,
    'delimiter': ' '
}

In [12]:
# Train the model on the politician-level super documents. The known party labels
# are supplied as custom clusters and party positions are estimated with the 'mean' method.
# The speed preset below is a quick setting and is flagged to be changed for real runs.
pol2vec_model = Politician2Vec(
    documents = docs,
    custom_clusters = parties,
    party_inference_method = 'mean',
    tokenizer = preproc_docs,
    embedding_model = 'doc2vec',
    min_count = 50,
    ngram_vocab = True,
    ngram_vocab_args = ngram_options,
    speed = 'fast-learn', # CHANGE FOR REAL RUNS
    workers = available_workers
    #doc2vec_vector_size = 300,
    #doc2vec_window = 8,
    #doc2vec_samples_threshold = 1e-5
)

2022-11-26 17:03:09,568 - politician2vec - INFO - Pre-processing documents for training
2022-11-26 17:03:27,115 - politician2vec - INFO - Creating joint document/word embedding
2022-11-26 17:06:44,001 - politician2vec - INFO - Estimating party positions using mean...
2022-11-26 17:06:44,263 - politician2vec - INFO - All done!


In [14]:
# TODO: This should probably be implemented as a method of the Politician2Vec class
#
# NOTE: As of 2022-11-26, it has been decided to keep party 'inference' parallel to the original top2vec implementation.
#       This functionality may prove useful in outlier detection, but--importantly--it requires the ex ante known party affiliations
#       to be mapped to model output ex post instead of relying on inferred parties. Specifically, the 'estimated' party affiliations
#       may vary from ground truth labels in the case of semantically outlying politicians with only few data points.

def inspect_topic(politician2vec_model, topic_idx, n_docs=None, query_substr=None):
    '''
    Print top words and top docs for a given
    topic.
    -------
    politician2vec_model: fitted model exposing get_num_topics(), get_topics(),
        get_topic_sizes() and search_documents_by_topic().

    topic_idx (int): automatically assigned topic number (i.e. 0-indexed).

    n_docs (int, optional): n top documents to print for a given topic.
        Default is to print all docs within a given topic.

    query_substr (str, optional): if specified, only documents containing
        this substring (case-insensitive) will be printed. Cannot be specified
        with n_docs, as this would return only results within a subset of
        topic docs.

    Raises ValueError if both n_docs and query_substr are specified.
    '''

    # Throw exception if substring query attempted on subset of docs!
    # Checked first so that no model look-ups are made for an invalid call.
    if query_substr and n_docs:
        raise ValueError('Please do NOT specify n_docs with substring query!\nOtherwise the search is only carried out for a subset of topic docs.')

    num_topics = politician2vec_model.get_num_topics()
    topic_words, _, _ = politician2vec_model.get_topics(num_topics)

    # Default to every doc in the topic; override with n_docs, if specified
    topic_sizes, _ = politician2vec_model.get_topic_sizes()
    docs_to_return = n_docs if n_docs else topic_sizes[topic_idx]

    # Get docs for input topic id
    documents, _, _ = politician2vec_model.search_documents_by_topic(
        topic_num=topic_idx,
        num_docs=docs_to_return
        )

    # Limit output to docs containing certain substring, if specified.
    # Both sides are lower-cased so that queries with capitals can match.
    if query_substr:
        needle = query_substr.lower()
        documents = [doc for doc in documents if needle in doc.lower()]

    # Print output
    top_words = topic_words[topic_idx]
    print(f'--- TOP {len(top_words)} WORDS ---\n', top_words, '\n')

    print(f'--- TOP {docs_to_return} DOCS. SUBSTRING QUERY: {query_substr} (n = {len(documents)}) ---\n', documents)

In [15]:
pol2vec_model.get_documents_topics([143])

(array([12]),
 array([0.5790218], dtype=float32),
 array([['danmark fremad', 'fremad kan', 'tørvikanvi fremad',
         'politik plads', 'ved 💚', 'kommer løse', 'så rørt',
         'klippekortsordningen så', 'hævder lige', 'samfund danner',
         'fremad virkelighedenkalder', 'kræver løsninger',
         'lige medlemskab', 'tigger hjælp', 'så hovedløst', 'gøre livet',
         'løsninger vores', 'partiet både', 'vendes så', 'gøre næste',
         'mulighed komme', 'tage næste', 'næste store', 'flere får',
         'partiet plads', 'løse klima', 'hjemmehjælpen så', 'tid plads',
         'tungen lige', 'gøre nytte', 'reserver plads',
         'omstændigheder så', 'erfaringer kan', 'kan tage', 'tid se',
         'politiske administrative', 'så kan', 'dag tage',
         'vores finanslovsudspil', 'grønne løsninger', 'store græssere',
         'løse store', 'ingen kæmper', 'gymnasietid får', 'se lige',
         'mener brug', 'politiske sæson', 'lige nui', '💪 stadig',
         'fremad tø

In [16]:
word_vectors = pol2vec_model.model.wv
word_vectors.most_similar(positive = ['borgerlige'], topn = 25)

[('nye', 0.5160134434700012),
 ('seier', 0.4674854576587677),
 ('udlændinge', 0.4106956422328949),
 ('christensen', 0.404934287071228),
 ('borgerliges', 0.402588427066803),
 ('udvises', 0.3882119953632355),
 ('🦢', 0.38634029030799866),
 ('politikerne', 0.3846708834171295),
 ('venstrefløjen', 0.36683985590934753),
 ('partier', 0.3573037087917328),
 ('asylstop', 0.3544532060623169),
 ('politikernes', 0.3408563435077667),
 ('df', 0.336365282535553),
 ('peter', 0.3342103958129883),
 ('islam', 0.33332639932632446),
 ('uacceptabelt', 0.3332825303077698),
 ('jobcentrene', 0.3306253254413605),
 ('stemte', 0.32915198802948),
 ('dom', 0.32888272404670715),
 ('indflydelse', 0.3235616683959961),
 ('nedlægge', 0.3232681155204773),
 ('boje', 0.32262131571769714),
 ('kriminelle', 0.32082024216651917),
 ('vanvid', 0.32075250148773193),
 ('nej', 0.31394249200820923)]

In [21]:
inspect_topic(pol2vec_model, 12, n_docs=3, query_substr=None)

--- TOP 50 WORDS ---
 ['danmark fremad' 'fremad kan' 'tørvikanvi fremad' 'politik plads' 'ved 💚'
 'kommer løse' 'så rørt' 'klippekortsordningen så' 'hævder lige'
 'samfund danner' 'fremad virkelighedenkalder' 'kræver løsninger'
 'lige medlemskab' 'tigger hjælp' 'så hovedløst' 'gøre livet'
 'løsninger vores' 'partiet både' 'vendes så' 'gøre næste'
 'mulighed komme' 'tage næste' 'næste store' 'flere får' 'partiet plads'
 'løse klima' 'hjemmehjælpen så' 'tid plads' 'tungen lige' 'gøre nytte'
 'reserver plads' 'omstændigheder så' 'erfaringer kan' 'kan tage' 'tid se'
 'politiske administrative' 'så kan' 'dag tage' 'vores finanslovsudspil'
 'grønne løsninger' 'store græssere' 'løse store' 'ingen kæmper'
 'gymnasietid får' 'se lige' 'mener brug' 'politiske sæson' 'lige nui'
 '💪 stadig' 'fremad tør'] 

--- TOP 3 DOCS. SUBSTRING QUERY: None (n = 3) ---
 ['Kom med i "Det politiske mødested", hvor vi finder løsninger på tidens udfordringer - gennem samtale om fælles erfaringer, fremfor regneark.'

In [22]:
grouped_docs.loc[grouped_docs['doc'] == 'Vi kan alle bidrage til et bedre miljø i hverdagen.\nVi prøver med små midler at bruge naturens egne processer!']

Unnamed: 0,full_name,party,doc
143,Orla Hav,S,Vi kan alle bidrage til et bedre miljø i hverd...


In [102]:
model_path = 'embedding_models/politician2vec_test_meta_v1.txt'

In [103]:
pol2vec_model.save(model_path)

### Viz dev below

In [68]:
pol2vec_model, doc2vec_model = load_politician2vec_from_txt(model_path)

Loading Politician2Vec model...
Retrieving document embedding...
All done!


In [55]:
doc2vec2tensor(
    doc2vec_model,
    temp_w2v_path = 'tensorboard_input/temp/doc_tensor_meta.w2v',
    tsv_prefix = 'tensorboard_input/meta',
    output_docvecs = True,
    output_wordvecs = False
)

You have elected to extract only document vectors.
Please note that further preprocessing -- such as filtering based on topics of
interest -- may be desired in order to facilitate TensorBoard visualisation.
Please see get_doc_topic_df(), vector_subset2tensor_without_words(), and
metadata2tensor()

Saving temp w2v file and converting to tensor. This may take a while...


2022-11-25 11:47:41,558 - word2vec2tensor - INFO - running /Users/mathiasbruun/me/anaconda3/lib/python3.7/site-packages/gensim/scripts/word2vec2tensor.py -i tensorboard_input/temp/doc_tensor_meta.w2v -o tensorboard_input/meta
2022-11-25 11:47:41,559 - keyedvectors - INFO - loading projection weights from tensorboard_input/temp/doc_tensor_meta.w2v
2022-11-25 11:47:41,601 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (194, 300) matrix of type float32 from tensorboard_input/temp/doc_tensor_meta.w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-11-25T11:47:41.590260', 'gensim': '4.1.2', 'python': '3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 05:57:50) \n[Clang 11.1.0 ]', 'platform': 'Darwin-21.4.0-x86_64-i386-64bit', 'event': 'load_word2vec_format'}
2022-11-25 11:47:41,643 - word2vec2tensor - INFO - 2D tensor file saved to tensorboard_input/meta_tensor.tsv
2022-11-25 11:47:41,644 - word2vec2tensor - INFO - Tensor metadata file saved to tensorboard_in

In [70]:
n_words = len(doc2vec_model.wv)
n_docs = len(doc2vec_model.dv)
vocab = pol2vec_model.vocab

In [417]:
## TWITTER DEEPLEARN
#topic_labels = {
#    0: 'Venstre',
#    1: 'Socialdemokratiet',
#    2: 'Dansk_Folkeparti',
#    3: 'Enhedslisten',
#    4: 'Radikale_Venstre',
#    5: 'SF',
#    6: 'Konservative',
#    7: 'Liberal_Alliance',
#    8: 'Alternativet',
#    9: 'Nye_Borgerlige',
#    10: 'Frie_Grønne',
#    11: 'Kristendemokraterne',
#    12: 'Moderaterne'
#}

In [419]:
## PARLIAMENT FASTLEARN
#topic_labels = {
#    0: 'Socialdemokratiet',
#    1: 'Venstre',
#    2: 'Dansk_Folkeparti',
#    3: 'Enhedslisten',
#    4: 'Radikale Venstre',
#    5: 'SF',
#    6: 'Konservative',
#    7: 'Liberal_Alliance',
#    8: 'UFG',
#    9: 'Alternativet',
#    10: 'Danmarksdemokraterne',
#    11: 'Nye Borgerlige',
#    12: 'Frie Grønne',
#    13: 'Moderaterne',
#    14: 'Kristendemokraterne'
#}

In [24]:
# META FASTLEARN
# Hand-maintained mapping from model topic number to party label.
# NOTE(review): the numbering appears to be tied to the topic ordering of one specific
# model run -- re-check it against the model output whenever the model is retrained.
topic_labels = {
   0: 'Socialdemokratiet',
   1: 'Venstre',
   2: 'Dansk_Folkeparti',
   3: 'SF',
   4: 'Radikale_Venstre',
   5: 'Konservative',
   6: 'Enhedslisten',
   7: 'Liberal_Alliance',
   8: 'Alternativet',
   9: 'Danmarksdemokraterne',
   10: 'Nye_Borgerlige',
   11: 'Frie_Grønne',
   12: 'UFG',
   13: 'Kristendemokraterne',
   14: 'Grønlandsk',
   15: 'Moderaterne'
}

In [25]:
# Build the document/topic table. The number of substantive topics is derived from
# topic_labels (16 entries) instead of a hard-coded literal, so the two cannot drift apart
# when the label mapping is edited.
topic_df = get_doc_topic_df(
    pol2vec_model,
    no_substantive_topics = len(topic_labels),
    snippets = True
)

Topic sizes before filtering (topic 16 is "Other"):

[[ 0 44]
 [ 1 42]
 [ 2 22]
 [ 3 16]
 [ 4 14]
 [ 5 14]
 [ 6 14]
 [ 7 10]
 [ 8  5]
 [ 9  5]
 [10  5]
 [11  3]
 [12  3]
 [13  2]
 [14  1]
 [15  1]]


In [26]:
# TODO: Okay, we have clearly made a mistake. By swapping party classes for politicians,
# we are now telling the model to find politician-level centroids wrt. politician superdocs, yielding...
# ... well, obviously, 154 centroids, corresponding to the 154 super documents. That makes no sense,
# since the centroid and the superdocument will be the same by construction.
#
# We either want document-level centroids clustered by politician OR politician-level centroids clustered
# by party. So politician superdocs + input party label array.
#
# TODO: Also extract topic vectors themselves, which we usually never do!
#
# TODO: Calculate medoid instead of centroid? To account for noisy estimates of politician positions?
#
# TODO: Perhaps chunk the politician-level superdocuments according to a certain special character used
# to join them? Could increase performance in doc2vec.

In [27]:
# Overwrite the 'snippet' column with the politician's name. This is an index-aligned
# assignment, so it assumes topic_df and grouped_docs share the same row index -- TODO confirm.
topic_df['snippet'] = grouped_docs['full_name']
# Party label looked up from the model-assigned topic number in 'top'
# (i.e. not taken from grouped_docs['party']).
topic_df['party'] = topic_df['top'].map(topic_labels)

In [28]:
parties

array(['SIU', 'LA', 'ALT', 'S', 'RV', 'S', 'V', 'S', 'RV', 'SF', 'S', 'V',
       'SF', 'S', 'S', 'V', 'KF', 'S', 'S', 'KF', 'KF', 'V', 'S', 'SF',
       'LA', 'V', 'SF', 'EL', 'ALT', 'S', 'LA', 'V', 'V', 'S', 'S', 'DD',
       'DF', 'DD', 'DF', 'KF', 'DF', 'V', 'EL', 'S', 'V', 'EL', 'V', 'V',
       'EL', 'FG', 'SF', 'V', 'V', 'V', 'EL', 'LA', 'S', 'SF', 'RV', 'S',
       'SF', 'V', 'V', 'SF', 'V', 'V', 'S', 'DF', 'S', 'KD', 'RV', 'S',
       'LA', 'V', 'S', 'DF', 'DF', 'SF', 'SF', 'V', 'S', 'S', 'KF', 'RV',
       'RV', 'DF', 'DF', 'V', 'SF', 'KD', 'RV', 'V', 'V', 'DF', 'S', 'NB',
       'UFG', 'V', 'LA', 'S', 'S', 'LA', 'DD', 'DF', 'RV', 'V', 'V', 'S',
       'KF', 'EL', 'S', 'V', 'EL', 'V', 'DF', 'V', 'DF', 'V', 'DF', 'RV',
       'KF', 'S', 'S', 'DF', 'S', 'NB', 'V', 'DF', 'M', 'S', 'KF', 'S',
       'V', 'V', 'DF', 'DF', 'RV', 'KF', 'S', 'KF', 'EL', 'NB', 'LA', 'S',
       'KF', 'EL', 'EL', 'KF', 'S', 'EL', 'NB', 'S', 'DF', 'NB', 'DF',
       'DF', 'SF', 'V', 'RV', 'RV', 'S', 'KF

In [29]:
pol2vec_model.doc_top

array([14,  7,  8,  0,  4,  0,  1,  0,  4,  3,  0,  1,  3,  0,  0,  1,  5,
        0,  0,  5,  5,  1,  0,  3,  7,  1,  3,  6,  8,  0,  7,  1,  1,  0,
        0,  9,  2,  9,  2,  5,  2,  1,  6,  0,  1,  6,  1,  1,  6, 11,  3,
        1,  1,  1,  6,  7,  0,  3,  4,  0,  3,  9,  1,  3,  1,  1,  0,  0,
        0, 13,  0,  0,  7,  1,  0,  2,  2,  3,  3,  1,  0,  0,  5,  4,  4,
        2,  2,  1,  3, 13,  4,  1,  1,  2,  0, 10, 12,  1,  7,  0,  0,  7,
        9,  2,  4,  1,  1,  0,  5,  6,  0,  1,  6,  1,  2,  1,  2,  1,  2,
        4,  5,  0,  0,  2,  0, 10,  1,  2, 15,  0,  5,  0,  1,  1,  2,  2,
        4,  5,  0,  5,  6, 10,  7, 12,  5,  6,  6,  5,  2,  6, 10,  0,  2,
       10,  2,  2,  3,  1,  4,  4,  0,  5,  8,  3,  0,  0,  2,  6,  3,  3,
       11,  7, 12,  0,  0,  4,  1,  4,  1,  2,  8, 11,  6,  9,  1,  5,  6,
        1,  0,  1,  8,  1,  0,  3,  1,  0,  1,  1,  7,  0,  4])

In [30]:
import umap
import umap.plot
umap.plot.output_notebook()

In [31]:
# Party label -> plot colour. Keys must cover every value of topic_labels,
# otherwise .map(colormap) yields NaN for the missing label.
colormap = {
    'Socialdemokratiet': '#C8042C',
    'Venstre': '#1A4289',
    'Dansk_Folkeparti': '#265BA4',
    'Enhedslisten': '#BF2850',
    'SF': '#B42D27',
    'Konservative': '#24573E',
    'Radikale_Venstre': '#D82E8A',
    'Liberal_Alliance': '#E7B657',
    'Alternativet': '#75FB4C',
    'Nye_Borgerlige': '#1E4B57',
    'Kristendemokraterne': '#566197',
    'Frie_Grønne': '#E4F04E',
    'Danmarksdemokraterne': '#3470BC',
    'Moderaterne': '#7A308B',
    'UFG': 'red',
    # Previously missing although 'Grønlandsk' is a topic label; neutral placeholder colour.
    'Grønlandsk': '#808080'
}

In [32]:
# Look up each row's colour from its party label; labels absent from colormap give NaN.
topic_df['color'] = topic_df['party'].map(colormap)
# Nested dict: cmap['color'] maps each 'snippet' value to that row's colour.
cmap = topic_df[['snippet', 'color']].set_index('snippet').to_dict()

In [33]:
# Fit a 2D UMAP projection of the document vectors. UMAP is stochastic, so a fixed
# random_state is set to make the layout reproducible across kernel restarts.
mapper = umap.UMAP(n_neighbors=15, metric='cosine', n_components=2, random_state=42).fit(pol2vec_model.document_vectors)

In [34]:
grouped_docs.loc[grouped_docs.full_name == 'Lars Løkke Rasmussen']

Unnamed: 0,full_name,party,doc
96,Lars Løkke Rasmussen,UFG,"Kom med i ""Det politiske mødested"", hvor vi fi..."
97,Lars Løkke Rasmussen,V,Danmark bør ledes fra midten. Snart flyver Sør...


In [35]:
topic_df.loc[topic_df['snippet'] == 'Orla Hav']

Unnamed: 0,doc,top,snippet,party,color
143,143,12,Orla Hav,UFG,red


In [36]:
# Colour the points by party. When color_key is a dict, umap.plot looks colours up by
# label value, so the keys must be the party labels passed as `labels` (not politician
# names). Labels without an entry in colormap fall back to grey instead of NaN.
party_labels = topic_df['top'].map(topic_labels)
party_color_key = {label: colormap.get(label, 'grey') for label in party_labels.unique()}

p = umap.plot.interactive(
    mapper,
    labels = party_labels,
    color_key = party_color_key,
    point_size = 8,
    hover_data = topic_df
    )
umap.plot.show(p)

In [78]:
len(pol2vec_model.topic_vectors)

16

In [79]:
metadata2tensor(
    topic_df,
    metadata_path = 'tensorboard_input/meta_metadata.tsv',
    label_list = topic_labels
)

In [80]:
# test read of tensor output
with open('tensorboard_input/meta_tensor.tsv','r') as r:
    lines_test = r.readlines()

len(lines_test)

194

In [81]:
def partyvecs2tensor(party_vecs, out_path):
    '''
    Write party vectors to a TSV file.
    -------
    party_vecs (iterable of iterables): one vector per party; each value is
        written via str().

    out_path (str): file to write. Each vector becomes one line of
        tab-separated values; lines are joined by newlines with no trailing
        newline at the end of the file.
    '''
    rows = ['\t'.join(str(component) for component in vec) for vec in party_vecs]

    with open(out_path, 'w') as out_file:
        out_file.write('\n'.join(rows))

In [82]:
def concat_tensors(tensor_files, out_path):
    '''
    Stack several header-less TSV files on top of each other (in the order
    given) and write the result to out_path as a header-less TSV without an
    index column.
    -------
    tensor_files (list of str): paths of the TSV files to concatenate.

    out_path (str): path of the combined TSV file.

    Cells are read as plain strings with NA detection switched off, so values
    are copied through verbatim: labels such as "NA" are not blanked, leading
    zeros are kept and floats are not re-formatted on the round trip.
    '''
    frames = [
        pd.read_csv(path, sep = '\t', header = None, dtype = str, keep_default_na = False)
        for path in tensor_files
    ]

    concat_tensor = pd.concat(frames)

    concat_tensor.to_csv(out_path, sep = '\t', header = False, index = False)

In [83]:
partyvecs2tensor(
    party_vecs = pol2vec_model.topic_vectors,
    out_path = 'tensorboard_input/meta_parties.tsv'
)

In [84]:
# One-column frame ('doc') holding the party labels, indexed by topic number.
label_df = pd.Series(topic_labels, name = 'doc').to_frame()
#label_df['topic'] = ''

In [85]:
label_df.to_csv(
    'tensorboard_input/meta_parties_metadata.tsv',
    sep = '\t',
    header = False,
    index = False
)

In [86]:
# Combine docvecs and partyvecs
politician_file = 'tensorboard_input/meta_tensor.tsv'
party_file = 'tensorboard_input/meta_parties.tsv'

tensor_files = [politician_file, party_file]
concat_tensors(tensor_files, 'tensorboard_input/meta_combined.tsv')

In [87]:
# Combine metadata
politician_file = 'tensorboard_input/meta_metadata.tsv'
party_file = 'tensorboard_input/meta_parties_metadata.tsv'

tensor_files = [politician_file, party_file]
concat_tensors(tensor_files, 'tensorboard_input/meta_combined_metadata.tsv')