# First `politician2vec` demo

## To do
- Descriptive statistics/viz for entire dataset
- Settle on centroid calculation method (handle outliers better, remove alt method?)
- Compile populism-related words for constructing axes
- Determine possibilities of comparative analysis
- Consistently change "topic" to "party"/"cluster" or the like

In [12]:
%pip uninstall politician2vec -y

Found existing installation: politician2vec 0.0.1
Uninstalling politician2vec-0.0.1:
  Successfully uninstalled politician2vec-0.0.1
Note: you may need to restart the kernel to use updated packages.


In [13]:
%pip install git+ssh://git@github.com/mathiasbruun/politician2vec.git

Collecting git+ssh://****@github.com/mathiasbruun/politician2vec.git
  Cloning ssh://****@github.com/mathiasbruun/politician2vec.git to /private/var/folders/8q/02vc3fzn3r1fv7wzycyspjz80000gn/T/pip-req-build-fsg34oka
  Running command git clone -q 'ssh://****@github.com/mathiasbruun/politician2vec.git' /private/var/folders/8q/02vc3fzn3r1fv7wzycyspjz80000gn/T/pip-req-build-fsg34oka
Building wheels for collected packages: politician2vec
  Building wheel for politician2vec (setup.py) ... [?25ldone
[?25h  Created wheel for politician2vec: filename=politician2vec-0.0.1-py3-none-any.whl size=27418 sha256=58a4ae4ee4e1350bf9fb2793ad8a56ea6660655007757c07b8e1aad22e19f845
  Stored in directory: /private/var/folders/8q/02vc3fzn3r1fv7wzycyspjz80000gn/T/pip-ephem-wheel-cache-hngzo8rd/wheels/ac/44/62/b0b3ddf2882cd1b1d1cc4e060c5c525b951ae01496d65cd472
Successfully built politician2vec
Installing collected packages: politician2vec
Successfully installed politician2vec-0.0.1
Note: you may need to rest

In [14]:
from politician2vec import Politician2Vec
from politician2vec.utils import *
import pickle
import pandas as pd
import numpy as np
import multiprocessing
available_workers = multiprocessing.cpu_count()

In [15]:
test_data_path = 'data/clean/combined/subset_party_imputed_v2.p'

In [16]:
# Load the pickled dataset found at test_data_path.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open(test_data_path, 'rb') as p:
    test_data = pickle.load(p)

# Show (n_rows, n_columns) as a quick sanity check of what was loaded.
test_data.shape

(349594, 7)

In [19]:
# parl_data = test_data.loc[
#     (test_data['source'] == 'parliament') &
#     ~(test_data['party'].isin(['SIU', 'NQ', 'JF', 'SP']))
# ]
#
# tw_data = test_data.loc[
#     test_data['source'] == 'twitter'
# ]
#
# Keep only rows whose source is 'meta' and whose doc is not missing
meta_data = test_data[
    test_data['source'].eq('meta')
    & test_data['doc'].notna()
]

In [20]:
meta_data['party'].value_counts(dropna = False)

V      7307
SF     4869
S      3441
LA     3424
KF     2700
DF     2087
NB     1909
RV     1593
EL      823
FG      675
ALT     220
DD       48
M        28
KD       14
SIU       9
UFG       5
Name: party, dtype: int64

In [21]:
# Build one "super-document" per (full_name, party) pair by joining all of that
# pair's docs with '. '; reset_index() turns the group keys back into columns,
# giving the columns full_name, party and doc.
grouped_docs = meta_data.groupby(['full_name', 'party'])['doc'].apply('. '.join).reset_index()

In [22]:
#docs = [doc for doc in tw_data.doc]
# Model inputs: the list of super-documents and, in the same row order,
# an array holding the party label of each super-document
docs = grouped_docs['doc'].tolist()
parties = np.array(grouped_docs['party'].tolist())

In [23]:
# Options handed to the model via ngram_vocab_args
ngram_options = dict(
    min_count = 5,
    threshold = 1,
    delimiter = ' '
)

In [24]:
# Fit the model on the per-politician super-documents (docs), passing each
# document's party label (parties) as its predefined cluster.
# NOTE(review): presumably party positions are then the mean of the member
# document vectors (party_inference_method = 'mean') — confirm in the package.
# NOTE(review): the three commented-out doc2vec_* keyword arguments are kept
# for reference only; enabling them also requires a comma after `workers`.
pol2vec_model = Politician2Vec(
    documents = docs,
    custom_clusters = parties,
    party_inference_method = 'mean',
    tokenizer = preproc_docs,
    embedding_model = 'doc2vec',
    min_count = 50,
    ngram_vocab = True,
    ngram_vocab_args = ngram_options,
    speed = 'fast-learn', # CHANGE FOR REAL RUNS
    workers = available_workers
    #doc2vec_vector_size = 300,
    #doc2vec_window = 8,
    #doc2vec_samples_threshold = 1e-5
)

2022-11-25 11:34:03,751 - politician2vec - INFO - Pre-processing documents for training
2022-11-25 11:34:24,489 - politician2vec - INFO - Creating joint document/word embedding
2022-11-25 11:37:33,078 - politician2vec - INFO - Estimating party positions using mean...
2022-11-25 11:37:33,237 - politician2vec - INFO - All done!


In [25]:
# TODO: This should probably be implemented as a method of the Politician2Vec class
def inspect_topic(politician2vec_model, topic_idx, n_docs=None, query_substr=None):
    '''
    Print top words and top docs for a given
    topic.
    -------
    politician2vec_model: fitted model exposing get_num_topics(), get_topics(),
        get_topic_sizes() and search_documents_by_topic().

    topic_idx (int): automatically assigned topic number (i.e. 0-indexed).

    n_docs (int, optional): n top documents to print for a given topic.
        Default is to print all docs within a given topic.

    query_substr (str, optional): if specified, only documents containing
        this substring will be printed. Matching is case-insensitive. Cannot
        be specified with n_docs, as this would return only results within a
        subset of topic docs.

    Returns None; all output is printed.

    Raises ValueError if both n_docs and query_substr are specified.
    '''

    # Throw exception if substring query attempted on subset of docs!
    # Checked first so no model query is run for an invalid call.
    if query_substr and n_docs:
        raise ValueError('Please do NOT specify n_docs with substring query!\nOtherwise the search is only carried out for a subset of topic docs.')

    num_topics = politician2vec_model.get_num_topics()
    topic_words, _, _ = politician2vec_model.get_topics(num_topics)

    # Get topic sizes so we know max n docs
    topic_sizes, _ = politician2vec_model.get_topic_sizes()
    docs_to_return = topic_sizes[topic_idx]

    # Override n docs to return, if specified
    if n_docs:
        docs_to_return = n_docs

    # Get docs for input topic id
    documents, _, _ = politician2vec_model.search_documents_by_topic(
        topic_num=topic_idx,
        num_docs=docs_to_return
        )

    # Limit output to docs containing certain substring, if specified.
    # Both sides are lowercased so that queries with capitals still match.
    if query_substr:
        needle = query_substr.lower()
        documents = [doc for doc in documents if needle in doc.lower()]

    # Print output
    print('--- TOP 50 WORDS ---\n', topic_words[topic_idx], '\n')

    print(f'--- TOP {docs_to_return} DOCS. SUBSTRING QUERY: {query_substr} (n = {len(documents)}) ---\n', documents)

In [26]:
word_vectors = pol2vec_model.model.wv
word_vectors.most_similar(positive = ['borgerlige'], topn = 25)

[('seier', 0.5170028209686279),
 ('nye', 0.5106543302536011),
 ('christensen', 0.4494542181491852),
 ('borgerliges', 0.4007401466369629),
 ('🦢', 0.3813905417919159),
 ('peter', 0.3760186433792114),
 ('partier', 0.370720237493515),
 ('udlændinge', 0.351047158241272),
 ('asylstop', 0.3485393226146698),
 ('venstrefløjen', 0.3371717631816864),
 ('forklarer', 0.3317083716392517),
 ('større', 0.32527339458465576),
 ('udvises', 0.3219779133796692),
 ('politikerne', 0.3218039870262146),
 ('parti', 0.32109203934669495),
 ('islam', 0.3183261752128601),
 ('df', 0.31765878200531006),
 ('politikernes', 0.31552839279174805),
 ('netop', 0.3065086901187897),
 ('kriminelle', 0.30640098452568054),
 ('boje', 0.3052135109901428),
 ('jobcentrene', 0.30413007736206055),
 ('nedlægge', 0.3030591309070587),
 ('lande', 0.2970431447029114),
 ('rigsretssag', 0.29516732692718506)]

In [51]:
inspect_topic(pol2vec_model, 0, n_docs=1, query_substr=None)

--- TOP 50 WORDS ---
 ['🌹 tak' '🌹 valgkampen' 'gerne 🌹' '🌹 så' '😊 🌹' 'folketinget 🌹' '🌹 velfærd'
 '️ 🌹' 'forskel 🌹' 'kampen 🌹' 'tech-skat sammen' 'virkelig gode'
 'mere valgkampen' '🧓 🌹' 'hele valgkampen' '🌹 synes' 'socialdemokratiet 🌹'
 'gerne gode' 'store teknologivirksomheder' '🌹 ❤' 'tak gode'
 'godt hinanden' '🌹 🌹' '💚 🌹' 'godt gerne' 'velfærd 🌹' 'tider 🌹' 'først 🌹'
 'sandra 🌹' '🌹 del' 'krydret god' 'valgkamp dag' 'godt se'
 'resten valgkampen' 'hovedkontor sammen' '🌹 🇩' 'stor tak' 'utryg tid'
 'god valgkamp' 'politik grøn' 'politikere bor' 'minde hinanden'
 'tid store' '🌹 😊' 'tid opgaverne' 'andre gode' '🌹 socialdemokratiet'
 'sammen kan' 'tak god' 'valgkampen 🙏'] 

--- TOP 1 DOCS. SUBSTRING QUERY: None (n = 1) ---
 ['Socialdemokratiet vil sætte velfærden først, indføre en ret til tidligere folkepension for de mest nedslidte og give en bæredygtig verden videre til vores børn. Det kan lade sig gøre. Men det kræver det en ny regering, der vil gøre gode tider bedre for alle.🌹 #LevStær

In [52]:
model_path = 'embedding_models/politician2vec_test_meta_v1.txt'

In [53]:
pol2vec_model.save(model_path)

### Viz dev below

In [54]:
pol2vec_model, doc2vec_model = load_politician2vec_from_txt(model_path)

Loading Politician2Vec model...
Retrieving document embedding...
All done!


In [55]:
doc2vec2tensor(
    doc2vec_model,
    temp_w2v_path = 'tensorboard_input/temp/doc_tensor_meta.w2v',
    tsv_prefix = 'tensorboard_input/meta',
    output_docvecs = True,
    output_wordvecs = False
)

You have elected to extract only document vectors.
Please note that further preprocessing -- such as filtering based on topics of
interest -- may be desired in order to facilitate TensorBoard visualisation.
Please see get_doc_topic_df(), vector_subset2tensor_without_words(), and
metadata2tensor()

Saving temp w2v file and converting to tensor. This may take a while...


2022-11-25 11:47:41,558 - word2vec2tensor - INFO - running /Users/mathiasbruun/me/anaconda3/lib/python3.7/site-packages/gensim/scripts/word2vec2tensor.py -i tensorboard_input/temp/doc_tensor_meta.w2v -o tensorboard_input/meta
2022-11-25 11:47:41,559 - keyedvectors - INFO - loading projection weights from tensorboard_input/temp/doc_tensor_meta.w2v
2022-11-25 11:47:41,601 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (194, 300) matrix of type float32 from tensorboard_input/temp/doc_tensor_meta.w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-11-25T11:47:41.590260', 'gensim': '4.1.2', 'python': '3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 05:57:50) \n[Clang 11.1.0 ]', 'platform': 'Darwin-21.4.0-x86_64-i386-64bit', 'event': 'load_word2vec_format'}
2022-11-25 11:47:41,643 - word2vec2tensor - INFO - 2D tensor file saved to tensorboard_input/meta_tensor.tsv
2022-11-25 11:47:41,644 - word2vec2tensor - INFO - Tensor metadata file saved to tensorboard_in

In [56]:
n_words = len(doc2vec_model.wv)
n_docs = len(doc2vec_model.dv)
vocab = pol2vec_model.vocab

In [417]:
## TWITTER DEEPLEARN
#topic_labels = {
#    0: 'Venstre',
#    1: 'Socialdemokratiet',
#    2: 'Dansk_Folkeparti',
#    3: 'Enhedslisten',
#    4: 'Radikale_Venstre',
#    5: 'SF',
#    6: 'Konservative',
#    7: 'Liberal_Alliance',
#    8: 'Alternativet',
#    9: 'Nye_Borgerlige',
#    10: 'Frie_Grønne',
#    11: 'Kristendemokraterne',
#    12: 'Moderaterne'
#}

In [419]:
## PARLIAMENT FASTLEARN
#topic_labels = {
#    0: 'Socialdemokratiet',
#    1: 'Venstre',
#    2: 'Dansk_Folkeparti',
#    3: 'Enhedslisten',
#    4: 'Radikale Venstre',
#    5: 'SF',
#    6: 'Konservative',
#    7: 'Liberal_Alliance',
#    8: 'UFG',
#    9: 'Alternativet',
#    10: 'Danmarksdemokraterne',
#    11: 'Nye Borgerlige',
#    12: 'Frie Grønne',
#    13: 'Moderaterne',
#    14: 'Kristendemokraterne'
#}

In [57]:
# META FASTLEARN
# Maps each topic/cluster index to the party label written to the
# TensorBoard metadata files.
topic_labels = {
    0: 'Socialdemokratiet',
    1: 'Venstre',
    2: 'Dansk_Folkeparti',
    3: 'SF',
    4: 'Radikale_Venstre',
    5: 'Enhedslisten',
    6: 'Konservative',
    7: 'Liberal_Alliance',
    8: 'Alternativet',
    9: 'Nye_Borgerlige',
    10: 'UFG',
    11: 'Danmarksdemokraterne',
    12: 'Grønlandsk',  # spelling fixed (was 'Grøndlandsk')
    13: 'Frie_Grønne',
    14: 'Kristendemokraterne',
    15: 'Moderaterne'
}

In [73]:
# Build the per-document topic assignment table.
# NOTE(review): no_substantive_topics = 16 equals the number of entries in
# topic_labels — presumably the two must be kept in sync; confirm.
topic_df = get_doc_topic_df(
    pol2vec_model,
    no_substantive_topics = 16,
    snippets = True
)

Topic sizes before filtering (topic 16 is "Other"):

[[ 0 42]
 [ 1 40]
 [ 2 23]
 [ 3 16]
 [ 4 14]
 [ 5 14]
 [ 6 13]
 [ 7 10]
 [ 8  8]
 [ 9  5]
 [10  3]
 [11  2]
 [12  1]
 [13  1]
 [14  1]
 [15  1]]


In [74]:
# TODO: Okay, we have clearly made a mistake. By swapping party classes for politicians,
# we are now telling the model to find politician-level centroids wrt. politician superdocs, yielding...
# ... well, obviously, 154 centroids, corresponding to the 154 super documents. That makes no sense,
# since the centroid and the superdocument will be the same by construction.
#
# We either want document-level centroids clustered by politician OR politician-level centroids clustered
# by party. So politician superdocs + input party label array.
#
# TODO: Also extract topic vectors themselves, which we usually never do!
#
# TODO: Calculate medoid instead of centroid? To account for noisy estimates of politician positions?
#
# TODO: Perhaps chunk the politician-level superdocuments according to a certain special character used
# to join them? Could increase performance in doc2vec.

In [75]:
grouped_docs.head()

Unnamed: 0,full_name,party,doc
0,Aki-Matilda Høegh-Dam,SIU,✨Sapiillutit inuiaqatigiinni peqataasarit.✨\n\...
1,Alex Vanopslagh,LA,Røde Alex har kun hånlig latter til overs for ...
2,Alternativet,ALT,Til tonerne af jazz og smagen af lækker cider ...
3,Anders Kronborg,S,Udvikling til hjemstavnen. Vestjylland skal ig...
4,Andreas Steenberg,RV,Hvor dum og uretfærdig kan vores udlændingelov...


In [76]:
# Label each document row with the politician's name via the 'snippet' column.
# NOTE(review): Series assignment aligns on the index, so this assumes
# topic_df and grouped_docs share the same index and row order; a mismatch
# would silently produce NaN or misplaced names — confirm.
topic_df['snippet'] = grouped_docs['full_name']

In [77]:
topic_df

Unnamed: 0,doc,top,snippet
0,0,12,Aki-Matilda Høegh-Dam
1,1,7,Alex Vanopslagh
2,2,8,Alternativet
3,3,0,Anders Kronborg
4,4,4,Andreas Steenberg
...,...,...,...
189,189,1,Ulla Tørnæs
190,190,1,Venstre
191,191,7,Villum Christensen
192,192,0,Yildiz Akdogan


In [78]:
len(pol2vec_model.topic_vectors)

16

In [79]:
metadata2tensor(
    topic_df,
    metadata_path = 'tensorboard_input/meta_metadata.tsv',
    label_list = topic_labels
)

In [80]:
# test read of tensor output
with open('tensorboard_input/meta_tensor.tsv','r') as r:
    lines_test = r.readlines()

len(lines_test)

194

In [81]:
def partyvecs2tensor(party_vecs, out_path):
    '''
    Write party vectors to a tab-separated text file.
    -------
    party_vecs: iterable of vectors; each vector is an iterable of values
        that are formatted with str().

    out_path (str): file to write. It is overwritten if it exists.

    Each vector becomes one line of tab-separated values. Lines are joined
    with newlines and no trailing newline is written.
    '''
    rows = [
        '\t'.join(str(component) for component in party_vec)
        for party_vec in party_vecs
    ]

    with open(out_path, 'w') as tensor_file:
        tensor_file.write('\n'.join(rows))

In [82]:
def concat_tensors(tensor_files, out_path):
    '''
    Stack several headerless TSV files on top of each other and write the
    result to a single headerless TSV file.
    -------
    tensor_files (list of str): paths of the TSV files to combine, in the
        order their rows should appear in the output. Works for both tensor
        files and metadata files.

    out_path (str): path of the combined TSV file to write.

    Every field is read as a plain string, with default NA parsing disabled,
    so values are written back exactly as they were read: integers are not
    turned into floats when the files differ in number of columns, and
    labels such as 'NA' are not treated as missing. Rows from files with
    fewer columns are padded with empty fields.

    Raises ValueError (from pd.concat) if tensor_files is empty.
    '''
    frames = [
        pd.read_csv(
            file,
            sep = '\t',
            header = None,
            dtype = str,
            keep_default_na = False
        )
        for file in tensor_files
    ]

    concat_tensor = pd.concat(frames)

    concat_tensor.to_csv(out_path, sep = '\t', header = False, index = False)

In [83]:
partyvecs2tensor(
    party_vecs = pol2vec_model.topic_vectors,
    out_path = 'tensorboard_input/meta_parties.tsv'
)

In [84]:
# One row per topic label, held in a single column named 'doc'
label_df = pd.Series(topic_labels, name = 'doc').to_frame()
#label_df['topic'] = ''

In [85]:
label_df.to_csv(
    'tensorboard_input/meta_parties_metadata.tsv',
    sep = '\t',
    header = False,
    index = False
)

In [86]:
# Combine docvecs and partyvecs
# Row order matters: politician rows come first and party rows last. The
# combined metadata file is built from its inputs in the same order, so that
# tensor rows and metadata rows line up.
politician_file = 'tensorboard_input/meta_tensor.tsv'
party_file = 'tensorboard_input/meta_parties.tsv'

tensor_files = [politician_file, party_file]
concat_tensors(tensor_files, 'tensorboard_input/meta_combined.tsv')

In [87]:
# Combine metadata
# NOTE: politician_file, party_file and tensor_files are rebound here; after
# this cell they point at the metadata TSVs rather than the tensor TSVs.
# concat_tensors is reused as-is, with politicians first and parties last.
politician_file = 'tensorboard_input/meta_metadata.tsv'
party_file = 'tensorboard_input/meta_parties_metadata.tsv'

tensor_files = [politician_file, party_file]
concat_tensors(tensor_files, 'tensorboard_input/meta_combined_metadata.tsv')