# `politician2vec` modelling pipeline

In [2]:
# Make sure politician2vec is up to date
#%pip uninstall politician2vec -y
#%pip install git+ssh://git@github.com/mathiasbruun/politician2vec.git

# Make sure to auto-reload politician2vec in case there are remote changes
%load_ext autoreload
%autoreload 2

In [3]:
# Imports
from politician2vec import Politician2Vec
from politician2vec.utils import *
import pickle
import pandas as pd
import numpy as np
import multiprocessing
available_workers = multiprocessing.cpu_count()
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits import mplot3d
from mpl_toolkits.mplot3d import Axes3D

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mathiasbruun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# Specify path to preprocessed dataset
test_data_path = 'data/clean/preprocessed_docs_2015_low.p'

In [5]:
# We remove parties with very few utterances (< 500), which prevents small/new parties and North-Atlantic MPs
# from skewing the embedding with very few, concentrated observations. We also remove non-affiliated MPs (UFG).
# KD is removed because it is represented by only a single MP during the entire period covered.

# Load the preprocessed frame from the pickle at test_data_path and keep only rows
# whose `party` is not one of the listed party codes.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
parl_data = pd.read_pickle(test_data_path).query("party not in ['SIU', 'NQ', 'JF', 'SP', 'M', 'DD', 'UFG', 'KD']")
# Display (n_rows, n_columns) of the filtered frame.
parl_data.shape

(156448, 6)

In [6]:
# Number of rows per remaining party (missing party labels are not counted here).
parl_data['party'].value_counts()

S      32236
V      28410
DF     22067
EL     20054
SF     10523
LA     10512
KF     10065
RV      8882
ALT     7892
NB      5164
FG       643
Name: party, dtype: int64

In [7]:
# Same count, but with missing values included as their own category; identical
# output to the plain count means no row has a missing party label.
parl_data['party'].value_counts(dropna = False)

S      32236
V      28410
DF     22067
EL     20054
SF     10523
LA     10512
KF     10065
RV      8882
ALT     7892
NB      5164
FG       643
Name: party, dtype: int64

In [8]:
# Collapse the frame to one row per (full_name, party) pair by space-joining
# every `doc` string belonging to that pair.
grouped_docs = (
    parl_data
    .groupby(['full_name', 'party'])['doc']
    .apply(' '.join)
    .reset_index()
)

In [9]:
# Parallel sequences taken row by row from grouped_docs: the joined text of each
# row and the party label of that same row.
docs = grouped_docs['doc'].tolist()
parties = np.array(grouped_docs['party'].tolist())

In [10]:
# Options forwarded to the model as `ngram_vocab_args`.
ngram_options = dict(
    min_count=5,
    threshold=1,
    delimiter='_',
)

In [10]:
# Train the Politician2Vec model on one concatenated document per politician,
# passing the known party label of each document as its cluster.
# The recorded log for this cell shows roughly 35 minutes from start to "All done!".
# NOTE(review): the three doc2vec_* lines at the bottom are disabled; re-enabling
# them also requires a trailing comma after the `workers` argument.
pol2vec_model = Politician2Vec(
    documents = docs,
    custom_clusters = parties,
    party_inference_method = 'mean',
    tokenizer = tokenize_docs,
    embedding_model = 'doc2vec',
    min_count = 50, # consider setting this higher!
    ngram_vocab = True,
    ngram_vocab_args = ngram_options,
    speed = 'fast-learn', # CHANGE FOR REAL RUNS
    workers = available_workers
    #doc2vec_vector_size = 300,
    #doc2vec_window = 8,
    #doc2vec_samples_threshold = 1e-5
)

2023-04-09 15:14:51,241 - politician2vec - INFO - Pre-processing documents for training
2023-04-09 15:18:52,641 - politician2vec - INFO - Creating joint document/word embedding
2023-04-09 15:49:06,982 - politician2vec - INFO - Estimating party positions using mean...
2023-04-09 15:49:08,052 - politician2vec - INFO - All done!


In [11]:
# TODO: This should probably be implemented as a method of the Politician2Vec class
#
# NOTE: As of 2022-11-26, it has been decided to keep party 'inference' parallel to the original top2vec implementation.
#       This functionality may prove useful in outlier detection, but--importantly--it requires the ex ante known party affiliations
#       to be mapped to model output ex post instead of relying on inferred parties. Specifically, the 'estimated' party affiliations
#       may vary from ground truth labels in the case of semantically outlying politicians with only few data points.

def inspect_party(politician2vec_model, party_idx, n_docs=None, query_substr=None):
    '''
    Print the top words and the top documents for a given party.
    -------
    politician2vec_model: model object exposing get_num_parties(),
        get_parties(), get_party_sizes() and search_documents_by_party().

    party_idx (int): automatically assigned party number (i.e. 0-indexed).

    n_docs (int, optional): n top documents to print for a given party.
        Default is to print all docs within a given party.

    query_substr (str, optional): if specified, only documents containing
        this substring (case-insensitively) will be printed. Cannot be
        specified with n_docs, as this would return only results within a
        subset of party docs.

    Raises ValueError if both n_docs and query_substr are given.
    Returns None; all results are printed.
    '''

    # Reject the unsupported combination up front, before any model queries run.
    if query_substr and n_docs:
        raise ValueError('Please do NOT specify n_docs with substring query!\nOtherwise the search is only carried out for a subset of party docs.')

    num_parties = politician2vec_model.get_num_parties()
    party_words, _, _ = politician2vec_model.get_parties(num_parties)

    # The party size is the maximum number of docs that can be requested;
    # an explicit n_docs overrides it.
    party_sizes, _ = politician2vec_model.get_party_sizes()
    docs_to_return = n_docs if n_docs else party_sizes[party_idx]

    # Get docs for input party id
    documents, _, _ = politician2vec_model.search_documents_by_party(
        party_num=party_idx,
        num_docs=docs_to_return
        )

    # Limit output to docs containing the substring, if specified. Both sides
    # are lower-cased, so a query with capital letters can still match.
    if query_substr:
        needle = query_substr.lower()
        documents = [doc for doc in documents if needle in doc.lower()]

    # Print output
    print('--- TOP 50 WORDS ---\n', party_words[party_idx], '\n')

    print(f'--- TOP {docs_to_return} DOCS. SUBSTRING QUERY: {query_substr} (n = {len(documents)}) ---\n', documents)

In [12]:
# Look up the 25 nearest neighbours of the word 'uværdig' via the model's word vectors.
# NOTE(review): the recorded output of this cell is a NameError for `pol2vec_model`,
# i.e. it was last run in a kernel where the model had not been trained or loaded.
word_vectors = pol2vec_model.model.wv
word_vectors.most_similar(positive = ['uværdig'], topn = 25)

NameError: name 'pol2vec_model' is not defined

In [34]:
# Show the top words and the single top document for party index 6.
inspect_party(pol2vec_model, party_idx=6, n_docs=1)

--- TOP 50 WORDS ---
 ['radikal side' 'god radikal' 'radikal ordfører' 'radikal politik'
 'hinanden stedet' 'tak radikal' 'måde lave' 'mere energieffektive'
 'enig vigtigt' 'lov diskutere' 'muligheder muligheder' 'finde bedste'
 'lov bruge' 'får større' 'gå langt' 'uden samtidig' 'imod grundlæggende'
 'grundlæggende idé' 'bedste mening' 'virkeligheden handler'
 'sætte sammen' 'synes svært' 'andet sted' 'lyst diskutere' 'bedre sammen'
 'egentlig hellere' 'sætte gang' 'stedet sætte' 'gjort nogen'
 'særlig store' 'andre muligheder' 'ser store' 'får mest' 'idé prøve'
 'prøve tage' 'svært imod' 'taget imod' 'tager sted' 'stedet holde'
 'grundlæggende god' 'hvordan kommer' 'gang lave' 'mening lave'
 'slet diskutere' 'vores mening' 'mening tale' 'sammen finder' 'kan lave'
 'tænke prøve' 'ligger inde'] 

--- TOP 1 DOCS. SUBSTRING QUERY: None (n = 1) ---
 ['jeg har siddet og lyttet lidt til debatten og har bare et simpelt spørgsmål nemlig om de er imponeret over visionerne i regeringens forslag

In [19]:
# Destination file for the trained model. The save call is currently disabled;
# uncomment it to persist the model after training.
model_path = 'embedding_models/fastlearn_2015_low.txt'
#pol2vec_model.save(model_path)

_____

## Viz dev

In [13]:
# Reload a previously saved model from disk. This rebinds `pol2vec_model`, so
# everything below uses the loaded model rather than one trained in this session.
# `load_politician2vec_from_txt` presumably comes from the star import of
# politician2vec.utils — TODO confirm.
model_path = 'embedding_models/fastlearn_2015_low.txt'
pol2vec_model, doc2vec_model = load_politician2vec_from_txt(model_path)

Loading Politician2Vec model...
Retrieving document embedding...
All done!


In [14]:
def filter_vocab(ngrams, keep = 'singles'):
    '''
    Split the vocabulary of the global `pol2vec_model` by n-gram membership.
    -------
    ngrams (iterable of str): the vocabulary entries considered n-grams.

    keep (str): 'singles' returns the vocabulary words that are NOT in
        `ngrams`; 'ngrams' returns the vocabulary words that ARE in `ngrams`.

    Returns a list of words in the order they appear in pol2vec_model.vocab.
    Raises ValueError for any other value of `keep`.
    '''

    if keep not in ('singles', 'ngrams'):
        raise ValueError(f"keep must be 'singles' or 'ngrams', got {keep!r}")

    # A set gives constant-time membership tests; scanning the n-gram list once
    # per vocabulary word is quadratic and impractically slow for large vocabularies.
    ngram_set = set(ngrams)
    want_ngrams = keep == 'ngrams'

    return [word for word in pol2vec_model.vocab if (word in ngram_set) == want_ngrams]

# Treat every vocabulary entry that contains a space as an n-gram.
ngrams = [word for word in pol2vec_model.vocab if ' ' in word]

# Vocabulary subset used by the similarity helpers in this notebook.
filtered_vocab = filter_vocab(ngrams, keep = 'singles') # keep = 'singles' or 'ngrams'

# Build a new array holding the word vector of each kept word, row-aligned with
# filtered_vocab (row i belongs to filtered_vocab[i]). Nothing is removed from
# the model itself.
filtered_vecs = np.array([pol2vec_model.word_vectors[pol2vec_model.word_indexes[word]] for word in filtered_vocab])


KeyboardInterrupt: 

In [None]:
# get n words from filtered_vecs with highest cosine similarity to a given ngram
def get_similar_words(ngram, n = 10):
    '''
    Return the n entries of the global `filtered_vocab` whose vectors are most
    cosine-similar to the vector of `ngram`.
    -------
    ngram (str): query word; must be a key of pol2vec_model.word_indexes.

    n (int): maximum number of results.

    Returns a list of (word, cosine_similarity) tuples, most similar first.
    The query word itself is never included.
    '''

    ngram_vec = pol2vec_model.word_vectors[pol2vec_model.word_indexes[ngram]]
    # calc cos sim with all other word vecs
    cos_sim = np.dot(filtered_vecs, ngram_vec) / (np.linalg.norm(filtered_vecs, axis = 1) * np.linalg.norm(ngram_vec))
    # Take one candidate more than requested so that dropping the query word
    # still leaves n results.
    candidates = np.argsort(cos_sim)[::-1][:n+1]
    # Drop the query word, then cap at n: when the query word is not part of
    # filtered_vocab nothing is dropped and n+1 candidates would otherwise remain.
    top_n = [idx for idx in candidates if filtered_vocab[idx] != ngram][:n]

    # return words and cosine sims for top n
    return [(filtered_vocab[idx], cos_sim[idx]) for idx in top_n]

In [None]:
# The 25 nearest neighbours of 'ulighed' according to the model's own word vectors.
pol2vec_model.model.wv.most_similar(positive = ['ulighed'], topn = 25)

[('uligheden', 0.5812593102455139),
 ('sundhed', 0.4702708423137665),
 ('ulighedsskabende', 0.4524366557598114),
 ('ligheden', 0.41566354036331177),
 ('ginikoefficienten', 0.4106815755367279),
 ('lighed', 0.3897664546966553),
 ('samfund', 0.3867873549461365),
 ('velfærdssamfund', 0.38276106119155884),
 ('socialt', 0.38263416290283203),
 ('største', 0.37656646966934204),
 ('fattige', 0.3748002052307129),
 ('ulige', 0.37209275364875793),
 ('fattigdom', 0.3704148828983307),
 ('indkomster', 0.3683091998100281),
 ('øger', 0.3614976704120636),
 ('formuer', 0.3612634241580963),
 ('levealder', 0.36082783341407776),
 ('samfundet', 0.3605209290981293),
 ('rigeste', 0.3587345480918884),
 ('retfærdigt', 0.3583432137966156),
 ('rigdom', 0.3537467122077942),
 ('politik', 0.3530895709991455),
 ('rige', 0.3512168824672699),
 ('sundhedsvæsenet', 0.34275534749031067),
 ('sundhedsvæsen', 0.33953893184661865)]

In [None]:
# Up to 80 nearest neighbours of 'rigeste' among the vectors in filtered_vecs;
# which kind of entry is returned depends on how filtered_vocab was built.
get_similar_words('rigeste', n = 80)

[('rigeste pct', 0.81262606),
 ('rigeste land', 0.8031045),
 ('rigeste mest', 0.8023713),
 ('rigeste danskere', 0.802011),
 ('kommet rigeste', 0.8017404),
 ('mens rigeste', 0.80126375),
 ('rigeste del', 0.8008297),
 ('givet rigeste', 0.7996219),
 ('rigeste lande', 0.7984037),
 ('rigeste gode', 0.7981652),
 ('danmarks rigeste', 0.797878),
 ('rigeste kommuner', 0.79778004),
 ('rigeste mennesker', 0.79722345),
 ('rigeste danmark', 0.79692566),
 ('pct rigeste', 0.79653335),
 ('kommer rigeste', 0.79612947),
 ('kun rigeste', 0.7941848),
 ('landets rigeste', 0.7904355),
 ('rigeste samfund', 0.7897128),
 ('rigeste samfundet', 0.7882949),
 ('rigeste familier', 0.77656937),
 ('blandt rigeste', 0.7719261),
 ('verdens rigeste', 0.7683592),
 ('skattelettelser rigeste', 0.7668241),
 ('rigeste procent', 0.7438826),
 ('rigeste fattigste', 0.7430326),
 ('skattelettelse rigeste', 0.7252129),
 ('skatten rigeste', 0.72379214),
 ('rigeste pensionister', 0.7206244),
 ('rigeste boligejere', 0.71191543),
 ('s

In [134]:
# Sanity check: number of n-gram entries, then (n_kept_words, vector_dim) of the
# filtered vector matrix.
print(len(ngrams))
filtered_vecs.shape

116572


(15212, 300)

In [135]:
# Get the top n similar ngrams to a given ngram
def get_ngram_similarities(pol2vec_model, word, n=10):
    '''
    Return the n nearest neighbours of `word`, as reported by the
    `most_similar` method of the model's word vectors (pol2vec_model.model.wv).
    '''
    keyed_vectors = pol2vec_model.model.wv
    neighbours = keyed_vectors.most_similar(word, topn=n)
    return neighbours


In [None]:
# NOTE: This is a bit of a hack... Must be implemented internally in the pol2vec model learning

# Party codes ordered by how many rows each party has in grouped_docs
# (value_counts sorts by descending count).
party_labels = list(grouped_docs['party'].value_counts().index)

# get closest 50 words to a given party vector
def get_closest_words(party_vec, n=50):
    '''
    Print the n entries of the global `filtered_vocab` whose vectors (rows of
    the global `filtered_vecs`) are most cosine-similar to `party_vec`.
    -------
    party_vec (array-like): vector with the same dimensionality as the rows
        of filtered_vecs.

    n (int): number of words to print; 0 (or a negative value) prints nothing.

    Prints one "word  similarity" line per word, most similar first.
    Returns None.
    '''
    # cosine similarity between the party vector and every filtered word vector
    similarities = np.dot(filtered_vecs, party_vec) / (np.linalg.norm(filtered_vecs, axis=1) * np.linalg.norm(party_vec))

    # Indices of the n most similar words, best first. Slicing the descending
    # order with [:n] keeps n=0 empty, whereas [-n:] would select every word.
    best_first = np.argsort(similarities)[::-1][:max(n, 0)]

    # print (word, cosine similarity) for each word, largest similarity first
    for idx in best_first:
        print("{0:30}{1}".format(filtered_vocab[idx], similarities[idx]))

# One vector per party, as stored on the model.
party_vecs = pol2vec_model.party_vectors

# Print the 50 closest words for every party in party_labels.
# NOTE(review): this pairs party_labels[i] with party_vecs[i], so it assumes the
# model stores its party vectors in the same order as party_labels — TODO confirm.
for i, party in enumerate(party_labels):
    print(f'\n--- TOP WORDS FOR PARTY {party} ---\n')
    get_closest_words(party_vecs[i], n=50)

In [15]:
# Sizes of the loaded embedding: number of word vectors (.wv) and document
# vectors (.dv) in doc2vec_model, plus the model's vocabulary.
n_words = len(doc2vec_model.wv)
n_docs = len(doc2vec_model.dv)
vocab = pol2vec_model.vocab

In [16]:
## PARLIAMENT FASTLEARN
# Maps an integer party index to a display name.
# NOTE(review): this rebinds `party_labels`, which an earlier cell defines as a
# list of party codes; cells that index it positionally behave differently
# depending on which definition ran last.
# NOTE(review): the mapping has 15 entries, including 'UFG', 'Danmarksdemokraterne',
# 'Moderaterne' and 'Kristendemokraterne', while the data-loading cell excludes the
# codes UFG, DD, M and KD (presumably these parties) and only 11 parties remain in
# the data. The indices may therefore not match this model's party numbering —
# TODO confirm against the model before using these labels.
party_labels = {
   0: 'Socialdemokratiet',
   1: 'Venstre',
   2: 'Dansk_Folkeparti',
   3: 'Enhedslisten',
   4: 'Radikale Venstre',
   5: 'SF',
   6: 'Konservative',
   7: 'Liberal_Alliance',
   8: 'UFG',
   9: 'Alternativet',
   10: 'Danmarksdemokraterne',
   11: 'Nye Borgerlige',
   12: 'Frie Grønne',
   13: 'Moderaterne',
   14: 'Kristendemokraterne'
}