# First `politician2vec` demo

## To do
- Descriptive statistics/viz for entire dataset
- Settle on centroid calculation method (handle outliers better, remove alt method?)
- Compile populism-related words for constructing axes
- Determine possibilities of comparative analysis
- Consistently change "topic" to "party"/"cluster" or the like

In [1]:
!pip uninstall politician2vec -y

Found existing installation: politician2vec 0.0.1
Uninstalling politician2vec-0.0.1:
  Successfully uninstalled politician2vec-0.0.1


In [2]:
!pip install --upgrade git+ssh://git@github.com/mathiasbruun/politician2vec.git

Collecting git+ssh://****@github.com/mathiasbruun/politician2vec.git
  Cloning ssh://****@github.com/mathiasbruun/politician2vec.git to /private/var/folders/8q/02vc3fzn3r1fv7wzycyspjz80000gn/T/pip-req-build-u70tct1u
  Running command git clone -q 'ssh://****@github.com/mathiasbruun/politician2vec.git' /private/var/folders/8q/02vc3fzn3r1fv7wzycyspjz80000gn/T/pip-req-build-u70tct1u
Building wheels for collected packages: politician2vec
  Building wheel for politician2vec (setup.py) ... [?25ldone
[?25h  Created wheel for politician2vec: filename=politician2vec-0.0.1-py3-none-any.whl size=26812 sha256=e4438ad92b7987d7f2fb413f19c7859c994e02fea772856cc9ace4fa22aa2f20
  Stored in directory: /private/var/folders/8q/02vc3fzn3r1fv7wzycyspjz80000gn/T/pip-ephem-wheel-cache-kj72_zz0/wheels/ac/44/62/b0b3ddf2882cd1b1d1cc4e060c5c525b951ae01496d65cd472
Successfully built politician2vec
Installing collected packages: politician2vec
Successfully installed politician2vec-0.0.1


In [3]:
from politician2vec import Politician2Vec
from politician2vec.utils import *
import pickle
import pandas as pd
import numpy as np
import multiprocessing
available_workers = multiprocessing.cpu_count()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [275]:
test_data_path = 'data/clean/combined/subset_party_imputed_v2.p'

In [276]:
with open(test_data_path, 'rb') as p:
    test_data = pickle.load(p)

test_data.shape

(349594, 7)

In [404]:
parl_data = test_data.loc[
    (test_data['source'] == 'parliament') &
    ~(test_data['party'].isin(['SIU', 'NQ', 'JF', 'SP']))
]

In [278]:
tw_data = test_data.loc[
    test_data['source'] == 'twitter'
]

In [204]:
tw_data['party'].value_counts(dropna = False)

S      54566
V      37099
EL     27629
RV     22041
SF     21593
ALT    18070
KF     13241
DF     12363
LA     10844
FG      4196
UFG     3722
NB      1834
M        157
KD         2
Name: party, dtype: int64

In [405]:
grouped_docs = parl_data.groupby(['full_name', 'party'])['doc'].apply('. '.join).reset_index()

In [406]:
#docs = [doc for doc in tw_data.doc]
docs = [doc for doc in grouped_docs.doc]
parties = np.array([party for party in grouped_docs.party])

In [407]:
ngram_options = {
    'min_count': 5,
    'threshold': 1,
    'delimiter': ' '
}

In [408]:
#TODO: PUSH POLITICIAN2VEC, RESTART KERNEL AND REINSTALL BEFORE NEXT RUN!!!
pol2vec_model = Politician2Vec(
    documents = docs,
    custom_clusters = parties,
    party_inference_method = 'mean',
    tokenizer = preproc_docs,
    embedding_model = 'doc2vec',
    min_count = 50,
    ngram_vocab = True,
    ngram_vocab_args = ngram_options,
    speed = 'fast-learn', # CHANGE FOR REAL RUNS
    workers = available_workers
    #doc2vec_vector_size = 300,
    #doc2vec_window = 8,
    #doc2vec_samples_threshold = 1e-5
)

2022-11-25 10:24:51,418 - politician2vec - INFO - Pre-processing documents for training
2022-11-25 10:27:04,512 - politician2vec - INFO - Creating joint document/word embedding
2022-11-25 10:48:11,036 - politician2vec - INFO - Projecting vectors to 5D space using UMAP (HDBSCAN clustering disabled!)
2022-11-25 10:48:13,939 - politician2vec - INFO - Estimating party positions using mean...
2022-11-25 10:48:14,184 - politician2vec - INFO - All done!


In [35]:
# TODO: This should probably be implemented as a method of the Politician2Vec class
def inspect_topic(politician2vec_model, topic_idx, n_docs=None, query_substr=None):
    '''
    Print top words and top docs for a given
    topic.
    -------
    politician2vec_model: model object providing get_num_topics(), get_topics(),
        get_topic_sizes() and search_documents_by_topic().

    topic_idx (int): automatically assigned topic number (i.e. 0-indexed). Used
        to index the topic words and topic sizes returned by the model and
        passed on as topic_num when searching documents.

    n_docs (int, optional): n top documents to print for a given topic.
        Default is to print all docs within a given topic.

    query_substr (str, optional): if specified, only documents containing
        this substring (compared case-insensitively) will be printed. Cannot
        be specified with n_docs, as this would return only results within
        a subset of topic docs.

    Raises ValueError if both n_docs and query_substr are specified.
    '''

    # Reject the invalid argument combination up front, before any model
    # queries are run (a substring search on a subset of docs is misleading).
    if query_substr and n_docs:
        raise ValueError('Please do NOT specify n_docs with substring query!\nOtherwise the search is only carried out for a subset of topic docs.')

    num_topics = politician2vec_model.get_num_topics()
    topic_words, _word_scores, _topic_nums = politician2vec_model.get_topics(num_topics)

    # Topic size is the maximum number of docs that can be returned;
    # an explicit n_docs overrides it.
    topic_sizes, _topic_nums = politician2vec_model.get_topic_sizes()
    docs_to_return = n_docs if n_docs else topic_sizes[topic_idx]

    # Get docs for input topic id
    documents, _document_scores, _document_ids = politician2vec_model.search_documents_by_topic(
        topic_num=topic_idx,
        num_docs=docs_to_return
    )

    # Limit output to docs containing the substring, if specified. Both sides
    # are lowercased so that queries with capital letters can match too.
    if query_substr:
        needle = query_substr.lower()
        documents = [doc for doc in documents if needle in doc.lower()]

    # Print output
    print('--- TOP 50 WORDS ---\n', topic_words[topic_idx], '\n')

    print(f'--- TOP {docs_to_return} DOCS. SUBSTRING QUERY: {query_substr} (n = {len(documents)}) ---\n', documents)

In [409]:
# NOTE(review): `word_vectors` is never assigned in the visible notebook, so this
# cell relies on leftover kernel state (or on a name provided by the star import
# from politician2vec.utils -- TODO confirm). Bind it explicitly before this call
# so the cell survives Restart & Run All.
# Returns the 25 entries most similar to 'borgerlige'.
word_vectors.most_similar(positive = ['borgerlige'], topn = 25)

[('andet', 0.9987069964408875),
 ('kritiserer', 0.9986788034439087),
 ('hvilket', 0.9985679984092712),
 ('politikere', 0.9985299110412598),
 ('la', 0.9985120892524719),
 ('sidder', 0.9984993934631348),
 ('kommission', 0.9984745979309082),
 ('fald', 0.9984733462333679),
 ('stille', 0.998467743396759),
 ('rød', 0.9984644651412964),
 ('anklage', 0.9984567761421204),
 ('s-regeringen', 0.9984514713287354),
 ('skandale', 0.998436689376831),
 ('kritisere', 0.9984209537506104),
 ('statsminister', 0.9984197020530701),
 ('åbenbart', 0.9984117150306702),
 ('ellers', 0.9984065890312195),
 ('lyver', 0.9984044432640076),
 ('ydelserne', 0.9984024167060852),
 ('påstår', 0.998391330242157),
 ('vide', 0.9983769655227661),
 ('gjorde', 0.9983758330345154),
 ('indvandring', 0.9983720779418945),
 ('magten', 0.9983720183372498),
 ('simpelthen', 0.9983711242675781)]

In [411]:
inspect_topic(pol2vec_model, 1, n_docs=1, query_substr=None)

--- TOP 50 WORDS ---
 ['venstre socialdemokratiet' 'svar glæder' 'sammen venstre'
 'venstre gerne' 'tak ministeren' 'hvorfor venstre' 'bla fokus'
 'regeringen ønsker' 'sagde ministeren' 'flere ord' 'kommer tage'
 'svaret første' 'hvilket håber' 'vores side' 'mellem regeringen'
 'venstre siger' 'svar venstres' 'ellers får' 'lige svaret'
 'enige regeringen' 'nødt komme' 'derfor regeringen' 'ønsker nemlig'
 'samme hammel' 'formand jamen' 'komme tale' 'endnu højere' 'kun glad'
 'forstå socialdemokratiet' 'regeringen enig' 'glad venstre' 'stå uden'
 'følge regeringens' 'taler sammen' 'forstå venstre' 'igen får'
 'taler dag' 'går vej' 'netop skabe' 'glæder ministeren'
 'derudover ønsker' 'konkret betyder' 'handler bla' 'prøve få'
 'regeringen gjort' 'dermed bedre' 'vej ønsker' 'endnu mindre'
 'første del' 'siger ministeren'] 

--- TOP 1 DOCS. SUBSTRING QUERY: None (n = 1) ---
 ["Tak for det. Jeg vil godt følge op på det sidste spørgsmål, for det synes jeg faktisk var ret interessant. Sociald

In [412]:
model_path = 'embedding_models/politician2vec_test_parl_v1.txt'

In [413]:
pol2vec_model.save(model_path)

### Viz dev below

In [414]:
pol2vec_model, doc2vec_model = load_politician2vec_from_txt(model_path)

Loading Politician2Vec model...
Retrieving document embedding...
All done!


In [415]:
doc2vec2tensor(
    doc2vec_model,
    temp_w2v_path = 'tensorboard_input/temp/doc_tensor_parl.w2v',
    tsv_prefix = 'tensorboard_input/parl',
    output_docvecs = True,
    output_wordvecs = False
)

You have elected to extract only document vectors.
Please note that further preprocessing -- such as filtering based on topics of
interest -- may be desired in order to facilitate TensorBoard visualisation.
Please see get_doc_topic_df(), vector_subset2tensor_without_words(), and
metadata2tensor()

Saving temp w2v file and converting to tensor. This may take a while...


2022-11-25 10:50:04,913 - word2vec2tensor - INFO - running /Users/mathiasbruun/me/anaconda3/lib/python3.7/site-packages/gensim/scripts/word2vec2tensor.py -i tensorboard_input/temp/doc_tensor_parl.w2v -o tensorboard_input/parl
2022-11-25 10:50:04,913 - keyedvectors - INFO - loading projection weights from tensorboard_input/temp/doc_tensor_parl.w2v
2022-11-25 10:50:04,964 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (237, 300) matrix of type float32 from tensorboard_input/temp/doc_tensor_parl.w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-11-25T10:50:04.949135', 'gensim': '4.1.2', 'python': '3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 05:57:50) \n[Clang 11.1.0 ]', 'platform': 'Darwin-21.4.0-x86_64-i386-64bit', 'event': 'load_word2vec_format'}
2022-11-25 10:50:05,087 - word2vec2tensor - INFO - 2D tensor file saved to tensorboard_input/parl_tensor.tsv
2022-11-25 10:50:05,090 - word2vec2tensor - INFO - Tensor metadata file saved to tensorboard_in

In [416]:
n_words = len(doc2vec_model.wv)
n_docs = len(doc2vec_model.dv)
vocab = pol2vec_model.vocab

In [417]:
## TWITTER DEEPLEARN
#topic_labels = {
#    0: 'Venstre',
#    1: 'Socialdemokratiet',
#    2: 'Dansk_Folkeparti',
#    3: 'Enhedslisten',
#    4: 'Radikale_Venstre',
#    5: 'SF',
#    6: 'Konservative',
#    7: 'Liberal_Alliance',
#    8: 'Alternativet',
#    9: 'Nye_Borgerlige',
#    10: 'Frie_Grønne',
#    11: 'Kristendemokraterne',
#    12: 'Moderaterne'
#}

In [419]:
## PARLIAMENT FASTLEARN
#topic_labels = {
#    0: 'Socialdemokratiet',
#    1: 'Venstre',
#    2: 'Dansk_Folkeparti',
#    3: 'Enhedslisten',
#    4: 'Radikale Venstre',
#    5: 'SF',
#    6: 'Konservative',
#    7: 'Liberal_Alliance',
#    8: 'UFG',
#    9: 'Alternativet',
#    10: 'Danmarksdemokraterne',
#    11: 'Nye Borgerlige',
#    12: 'Frie Grønne',
#    13: 'Moderaterne',
#    14: 'Kristendemokraterne'
#}

In [420]:
topic_df = get_doc_topic_df(
    pol2vec_model,
    no_substantive_topics = 15,
    snippets = True
)

Topic sizes before filtering (topic 15 is "Other"):

[[ 0 49]
 [ 1 43]
 [ 2 36]
 [ 3 16]
 [ 4 15]
 [ 5 14]
 [ 6 14]
 [ 7 13]
 [ 8 11]
 [ 9 11]
 [10  5]
 [11  4]
 [12  3]
 [13  2]
 [14  1]]


In [421]:
# TODO: Okay, we have clearly made a mistake. By swapping party classes for politicians,
# we are now telling the model to find politician-level centroids wrt. politician superdocs, yielding...
# ... well, obviously, 154 centroids, corresponding to the 154 super documents. That makes no sense,
# since the centroid and the superdocument will be the same by construction.
#
# We either want document-level centroids clustered by politician OR politician-level centroids clustered
# by party. So politician superdocs + input party label array.
#
# TODO: Also extract topic vectors themselves, which we usually never do!
#
# TODO: Calculate medoid instead of centroid? To account for noisy estimates of politician positions?
#
# TODO: Perhaps chunk the politician-level superdocuments according to a certain special character used
# to join them? Could increase performance in doc2vec.

In [422]:
grouped_docs.head()

Unnamed: 0,full_name,party,doc
0,Alex Ahrendtsen,DF,"Tak. Det åbne spørgsmål er jo, hvad Socialdemo..."
1,Alex Vanopslagh,LA,"Tak for det, og tak for den fine udtale af eft..."
2,Anders Kronborg,S,Tak for ordet. Først og fremmest vil jeg sige ...
3,Anders Samuelsen,LA,"Tak for det, og tak for en saglig og konstrukt..."
4,Andreas Steenberg,RV,I Radikale Venstre forstår vi lovforslaget såd...


In [423]:
topic_df['snippet'] = grouped_docs['full_name']

In [424]:
topic_df

Unnamed: 0,doc,top,snippet
0,0,2,Alex Ahrendtsen
1,1,7,Alex Vanopslagh
2,2,0,Anders Kronborg
3,3,7,Anders Samuelsen
4,4,5,Andreas Steenberg
...,...,...,...
232,232,9,Ulla Sandbæk
233,233,1,Ulla Tørnæs
234,234,7,Villum Christensen
235,235,0,Yildiz Akdogan


In [425]:
len(pol2vec_model.topic_vectors)

15

In [426]:
# Write the TensorBoard metadata file for the politician-level document vectors.
# NOTE(review): `topic_labels` has no live definition in the visible notebook --
# its only assignments sit in the commented-out label cells. Unless the name is
# supplied by `from politician2vec.utils import *`, this call fails with
# NameError on a fresh kernel.
# NOTE(review): the commented-out mappings also look stale for this run: the
# recorded topic_df output assigns Andreas Steenberg (party RV in grouped_docs)
# to `top` == 5, which both commented-out mappings label as SF. TODO: confirm,
# and derive the index -> party mapping from the model instead of hard-coding it.
metadata2tensor(
    topic_df,
    metadata_path = 'tensorboard_input/parl_metadata.tsv',
    label_list = topic_labels
)

In [427]:
# test read of tensor output
with open('tensorboard_input/parl_tensor.tsv','r') as r:
    lines_test = r.readlines()

len(lines_test)

237

In [428]:
# Write the model's topic (party) vectors to a TSV file for TensorBoard.
# NOTE(review): in the visible notebook `partyvecs2tensor` is defined in a later
# cell than this call. Unless the star import from politician2vec.utils already
# provides it (TODO confirm), Restart & Run All fails here with NameError --
# the definition cell should be moved above this one.
partyvecs2tensor(
    party_vecs = pol2vec_model.topic_vectors,
    out_path = 'tensorboard_input/parl_parties.tsv'
)

In [429]:
def partyvecs2tensor(party_vecs, out_path):
    '''
    Write party vectors to a tab-separated text file, one vector per line.
    -------
    party_vecs (iterable of iterables): each inner iterable is one vector;
        every value is converted with str().

    out_path (str): path of the file to write (overwritten if it exists).
        No newline is written after the last row.
    '''
    # Join values with tabs directly instead of appending a tab to every value
    # and stripping afterwards; stripping would also drop trailing empty fields
    # and thereby change the width of a row.
    rows = ['\t'.join(str(val) for val in vec) for vec in party_vecs]

    with open(out_path, 'w') as f:
        f.write('\n'.join(rows))

In [430]:
def concat_tensors(tensor_files, out_path):
    '''
    Stack several header-less, tab-separated files row-wise into one file.
    -------
    tensor_files (list of str): paths of the TSV files to concatenate, in the
        order in which their rows should appear in the output.

    out_path (str): path of the combined TSV file (written without header
        and without index).

    This function is applied to both vector files and metadata (label) files,
    so every cell is read as a verbatim string: numeric text is not
    re-formatted and labels such as "NA" are not turned into missing values.
    '''
    frames = [
        pd.read_csv(
            file,
            sep = '\t',
            header = None,
            dtype = str,              # keep the original text of every cell
            keep_default_na = False   # do not parse "NA", "null", ... as NaN
        )
        for file in tensor_files
    ]

    combined = pd.concat(frames)

    combined.to_csv(out_path, sep = '\t', header = False, index = False)

In [431]:
label_df = pd.DataFrame(topic_labels, index = ['doc']).T
#label_df['topic'] = ''

In [432]:
label_df.to_csv(
    'tensorboard_input/parl_parties_metadata.tsv',
    sep = '\t',
    header = False,
    index = False
)

In [433]:
# Combine docvecs and partyvecs
politician_file = 'tensorboard_input/parl_tensor.tsv'
party_file = 'tensorboard_input/parl_parties.tsv'

tensor_files = [politician_file, party_file]
concat_tensors(tensor_files, 'tensorboard_input/parl_combined.tsv')

In [434]:
# Combine metadata
politician_file = 'tensorboard_input/parl_metadata.tsv'
party_file = 'tensorboard_input/parl_parties_metadata.tsv'

tensor_files = [politician_file, party_file]
concat_tensors(tensor_files, 'tensorboard_input/parl_combined_metadata.tsv')