# Class 8: Local and Pretrained Embeddings Using Gensim - Tutorial

In [None]:
# Import basic Python modules
import os
import pickle
import platform
import multiprocessing

# Regular expressions
import re

# Data management
import numpy as np
import pandas as pd
from collections import namedtuple, Counter

# Progress bars
from tqdm import tqdm

# Gensim
import gensim
from gensim.models.doc2vec import Doc2Vec

# SpaCy
import spacy

# DANLP
from danlp.models.embeddings import load_wv_with_gensim

# Scikit-learn
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances

# Plotting
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
sns.set_style('darkgrid')

In [None]:
# # # # Working Directory # # # #

# Pick the machine-specific home folder, then append the course folder
home_dir = '/home/rask/' if platform.system() == 'Linux' else 'C:/Users/au535365/'
wd = os.path.join(home_dir, 'Dropbox/teaching/css_fall2023')

# Make the course folder the working directory so relative paths resolve against it
os.chdir(wd)

# Show the active working directory to confirm the change
os.getcwd()

In [None]:
# File ids of the speech data sets (each id is later combined with the base url and '.csv')
files = [
    '20001', '20011', '20012', '20021', '20031', '20041', '20042',
    '20051', '20061', '20071', '20072', '20081', '20091', '20101',
    '20102', '20111', '20121', '20131', '20141', '20142', '20151',
    '20161', '20171', '20181', '20182', '20191', '20201', '20211',
]

# Base url under which the speech files are hosted
base_url = 'https://raw.githubusercontent.com/mraskj/css_fall2023/master/data/ft-speeches/'

In [None]:
def stratified_sampling(dataframe, strata_col, group_col, n_size=100, return_dataframe=True):

    """
    Draw a reproducible random sample of up to `n_size` rows from every group.

    Rows are first restricted to those whose `strata_col` value lies within the
    25th-75th percentile range (both bounds inclusive) of that column. From the
    remaining rows, `n_size` row labels are drawn without replacement for each
    distinct value of `group_col`; a group with fewer than `n_size` remaining rows
    contributes all of them. NumPy's global random seed is reset to 10 on every
    call, so repeated calls on the same data return the same sample.

    Parameters:
    dataframe (DataFrame): The DataFrame to sample from. It is not modified.
    strata_col (str): Name of the numeric column used for the percentile filter.
        If the column is missing but a 'text' column exists, it is computed as
        `len(text)` of every row (i.e. the number of characters).
    group_col (str): Name of the column whose distinct values define the groups
        that are sampled separately.
    n_size (int): Maximum number of rows sampled per group (default is 100).
    return_dataframe (bool): If True, return a DataFrame with the sampled data;
        if False, return a list of sampled index labels (default is True).

    Returns:
    DataFrame or list: If return_dataframe is True, the sampled rows with the
        original index labels moved into a column by `reset_index(drop=False)`.
        If return_dataframe is False, the list of sampled index labels.

    Raises:
    KeyError: If `group_col` is missing, or if `strata_col` is missing and there
        is no 'text' column to derive it from.

    Example:
    >>> sampled_data = stratified_sampling(df, 'n_words', 'party', n_size=200)
    >>> sampled_indices = stratified_sampling(df, 'n_words', 'party', n_size=300, return_dataframe=False)
    """

    # Set seed to be able to replicate
    np.random.seed(10)

    # Validity checks
    if group_col not in dataframe.columns:
        raise KeyError(f"Dataframe must have column {group_col}")

    # Groups are taken before the percentile filter, from the requested column
    groups = list(dataframe[group_col].unique())

    if strata_col not in dataframe.columns:
        if 'text' not in dataframe.columns:
            raise KeyError(f"Dataframe must have column {strata_col}")
        # Work on a copy so the caller's DataFrame does not silently gain a column
        dataframe = dataframe.copy()
        dataframe[strata_col] = dataframe['text'].apply(len)

    # Nothing to sample from: quantiles of an empty column are undefined
    if dataframe.empty:
        return dataframe.reset_index(drop=False) if return_dataframe else []

    # Keep rows within the 25th-75th percentile span of the strata column
    p25, p75 = np.quantile(dataframe[strata_col], q=[.25, .75])
    dataframe = dataframe.loc[dataframe[strata_col].between(p25, p75)]

    # Apply sampling
    samples = []
    for group in groups:

        indices = list(dataframe.index[dataframe[group_col] == group])

        # A group can be empty once the percentile filter has been applied
        if not indices:
            continue

        n_samples = min(len(indices), n_size)
        samples += list(np.random.choice(indices, size=n_samples, replace=False))

    if return_dataframe:
        return dataframe.loc[dataframe.index.isin(samples)].reset_index(drop=False)
    return samples

In [None]:
# Read in data: one CSV per file id. Only files with more than 10,000 rows are
# sampled and kept; smaller files are skipped.
sampled_frames = []
for file in tqdm(files):
    df_term = pd.read_csv(base_url + file + '.csv')
    #df_term = pd.read_csv('data/ft-speeches/' + file + '.csv')
    if len(df_term) > 10000:
        sample_df = stratified_sampling(dataframe=df_term, strata_col='n_words', group_col='party')
        sampled_frames.append(sample_df)

# Concatenate once after the loop (instead of growing a DataFrame inside it) and
# renumber the rows 0..n-1; fall back to an empty DataFrame when no file qualified
df = pd.concat(sampled_frames, ignore_index=True) if sampled_frames else pd.DataFrame()

In [None]:
# Load the spaCy model "da_core_news_md".
# If you get an error, you should download the model first: uncomment the line below
# (remove the hashtag #) and run it.
# !python -m spacy download da_core_news_md
spacy_pipeline_da = spacy.load("da_core_news_md")

In [None]:
# Collect the raw speech texts in a plain list
corpus_raw = df['text'].tolist()

# Run every speech through the spaCy pipeline and keep its tokens as a list
tokens_raw = [list(spacy_pipeline_da(text)) for text in tqdm(corpus_raw, position=0, leave=True)]

# IGNORE THIS
# tokenized_docs = [x[0].doc for x in tokens_raw]
# for p, ixs in df.groupby('period').groups.items():
#     with open(f'data/ft-speeches-tokenized/spacy_tokens_{p}.pkl', 'wb') as f:
#         pickle.dump([tokenized_docs[i] for i in list(ixs)], f)

In [None]:
# IGNORE THIS
# Read in tokens
#tokens_raw = []
#for p, ixs in df.groupby('period').groups.items():
#    print(f"Period {p}")
#    with open(f'data/ft-speeches-tokenized/spacy_tokens_{p}.pkl', 'rb') as f:
#        tokens_raw += pickle.load(f)
#        f.close()

In [None]:
# Remove:
# - stopwords
# - punctuation
# - digits
# - spaces
# - tokens shorter than 3 characters

stop_words = sorted(spacy_pipeline_da.Defaults.stop_words)

# Set copy of the stop words: membership tests are constant-time instead of a list scan
stop_word_lookup = set(stop_words)

# Remove stopwords
# NOTE(review): the comparison uses the token text as written, so it is case-sensitive;
# a capitalised stop word is kept unless the stop list contains that exact spelling - confirm intended.
tokens_cleaned = [[x for x in token if x.text not in stop_word_lookup] for token in tqdm(tokens_raw)]

# Remove punctuation
tokens_cleaned = [[x for x in token if not x.is_punct] for token in tqdm(tokens_cleaned)]

# Remove digits
tokens_cleaned = [[x for x in token if not x.is_digit] for token in tqdm(tokens_cleaned)]

# Remove spaces
tokens_cleaned = [[x for x in token if not x.is_space] for token in tqdm(tokens_cleaned)]

# Remove tokens shorter than 3 characters
tokens_cleaned = [[x for x in token if len(x) >= 3] for token in tqdm(tokens_cleaned)]

In [None]:
# Number of CPU cores reported for this computer; passed as `workers` when fitting the embedding models
cores = multiprocessing.cpu_count()

In [None]:
# Extract the lowercased tokens of all speeches given by RV legislators.
# Expected length per the original note: 21*100=2100 (21 years with 100 sampled
# speeches each - see stratified sampling).
rv_rows = list(df.groupby('party').groups['RV'])
tokens_subset = [[token.lower_ for token in tokens_cleaned[row]] for row in rv_rows]

## Word2Vec Example

In [None]:
# Example: Fit word2vec model for RV
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size` and `epochs` - confirm against the installed gensim version.
w2v_example_params = {
    'workers': cores,
    'size': 200,
    'min_count': 5,
    'window': 10,
    'sample': 1e-2,
    'iter': 10,
}
w2v_example = gensim.models.Word2Vec(tokens_subset, **w2v_example_params)

In [None]:
# Compute the 10 most similar words for 'flygtninge' (Danish for 'refugees') in the example model
w2v_example.wv.most_similar('flygtninge', topn=10)

In [None]:
# Unpack the 20 most similar words for 'flygtninge' and their similarity scores into two separate objects
top_words, top_sims = zip(*w2v_example.wv.most_similar('flygtninge', topn=20))

## Doc2Vec Example

In [None]:
def generate_tags(*metatags):
    """
    Build one combined tag per sample from several parallel tag sequences.

    The sequences are walked in lockstep; the elements found at the same position
    are converted to strings and joined with a hyphen ('-'). The walk stops at the
    end of the shortest sequence.

    Parameters:
    *metatags (iterables): One or more sequences of tags of (ideally) equal length.

    Returns:
    list: A list of combined tags, one string per position.

    Example:
    >>> generate_tags(('A', 'B'), ('1', '2'))
    ['A-1', 'B-2']
    """
    combined = []
    for parts in zip(*metatags):
        combined.append('-'.join(str(part) for part in parts))
    return combined

def generate_iterator(words, metatags):
    """
    Pair tokenized documents with their tags as a list of namedtuples.

    Each element of the result is a namedtuple called 'docs' with two fields:
    'words' holds one document's tokens and 'tags' holds a one-element list with
    the string form of that document's tag. Pairing stops at the end of the shorter
    input.

    Parameters:
    words (nested list): One list of tokens per document.
    metatags (list): One tag per document, used to fit a Doc2Vec.

    Returns:
    list: A list of namedtuples with the fields 'words' and 'tags'.

    Example:
    >>> generate_iterator([['apple'], ['banana']], ['fruit', 'yellow'])
    [docs(words=['apple'], tags=['fruit']), docs(words=['banana'], tags=['yellow'])]
    """
    doc_record = namedtuple('docs', ['words', 'tags'])
    records = []
    for doc_words, tag in zip(words, metatags):
        records.append(doc_record(words=doc_words, tags=[str(tag)]))
    return records

In [None]:
# Generate tags: one string per speech, built from the speech's party
tags = []
for parts in zip(df.party):
    tags.append('-'.join(str(part) for part in parts))
tags

In [None]:
# Generate the party tags with the generate_tags function instead of the inline expression
generate_tags(df.party)

In [None]:
# Generate iterator for doc2vec
# The fields are named 'words' and 'tags' (the same names generate_iterator uses),
# because Doc2Vec reads a document's tokens from `words` and its labels from `tags`.
speech_iterator = namedtuple('docs', 'words tags')
iterator = [speech_iterator(x, [y]) for x, y in zip(tokens_cleaned, tags)]
iterator

In [None]:
# Generate the iterator with the generate_iterator function (pairs each cleaned speech with its tag)
iterator = generate_iterator(words=tokens_cleaned, metatags=tags)

In [None]:
# Construct iterator for RV: every RV speech is tagged with its period
rv_rows = list(df.groupby('party').groups['RV'])

# Period of each RV speech, in row order
period_tags = [df.iloc[row].period for row in rv_rows]

# Lowercased tokens of each RV speech, in the same order
tokens_subset = [[token.lower_ for token in tokens_cleaned[row]] for row in rv_rows]

iterator = generate_iterator(words=tokens_subset, metatags=period_tags)

In [None]:
# Fit the Doc2Vec model on the tagged documents in `iterator` with:
#    - `vector_size=100`
#    - `window=10`
#    - `min_count=5`
#    - `workers=cores` (the CPU count computed earlier)
#    - `epochs=10`
#    - `sample=1e-3`

d2v = Doc2Vec(iterator,
              vector_size=100, 
              window=10, 
              min_count=5,
              workers=cores,
              epochs=10, 
              sample = 1e-3)

In [None]:
# Compute the most similar words for 'asyl' (Danish for 'asylum') from the word vectors of the trained d2v model
d2v.wv.most_similar('asyl')

## Party Differences In Sentiment on Immigration Words

In [None]:
# Read in AFINN: tab-separated file without a header row, columns named word and score
afinn = pd.read_csv('data/AFINN-da-32.txt', sep='\t', header=None, names=['word', 'score'])

# Peek at ten random entries
print(afinn.sample(10))

# Keep only single words (entries that split into exactly one whitespace-separated piece)
pieces_per_entry = afinn['word'].apply(lambda entry: len(entry.split()))
afinn = afinn.loc[pieces_per_entry == 1].reset_index(drop=True)

# Boolean variable: True when the word's score is above zero
afinn['positive'] = afinn['score'].gt(0)

# Define a function to compute the intersection between two lists
def intersection(l1, l2):
    """Return the elements of l1 that also occur in l2, in l1's order (duplicates in l1 are kept)."""
    shared = []
    for candidate in l1:
        if candidate in l2:
            shared.append(candidate)
    return shared

# Define function to compute the number of positive and negative words and their mean score
def get_wordscores(df_, dict_, key_):
    """
    Summarise the scores of the shared words, split by positive/negative.

    Parameters:
    df_ (DataFrame): Lexicon with the columns 'word', 'score' and 'positive'.
    dict_ (dict): Must hold 'shared_words' (words to keep) and 'vocab_size'.
    key_: Label written to the 'party' column of every returned row.

    Returns:
    DataFrame: One row per value of 'positive' among the kept words, with the
        columns 'positive', 'count', 'mean', 'party' and 'vocab_size'.
    """
    # Restrict the lexicon to the shared words
    kept = df_[df_['word'].isin(dict_['shared_words'])]

    # Descriptive statistics of the score per positive/negative group
    score_stats = kept.groupby('positive').describe()['score'].reset_index()

    summary = score_stats[['positive', 'count', 'mean']]
    n_rows = len(summary)
    return summary.assign(party=[key_] * n_rows,
                          vocab_size=[dict_['vocab_size']] * n_rows)

# Define function to compute size of vocab
def vocab_size(tokens):
    """
    Count word frequencies over all documents.

    Parameters:
    tokens (nested iterable): One iterable of words per document.

    Returns:
    tuple: (Counter mapping each word to its number of occurrences,
        number of distinct words).
    """
    frequencies = Counter(word for doc in tokens for word in doc)
    return frequencies, len(frequencies)

# Plain list of the AFINN words, in lexicon order
afinn_words = afinn['word'].tolist()

In [None]:
# Map every party to the row labels of its speeches
df_groups = df.groupby('party').groups

# Define list of parties (the keys of the mapping)
parties = list(df_groups)

In [None]:
# Per party: the fitted word2vec model and the similarity results for 'asyl'
party_dict = {}

# Per-party score tables, concatenated once after the loop (instead of growing a
# DataFrame inside it)
party_score_frames = []

for party in tqdm(parties):

    party_dict[party] = {}

    # Lowercased tokens of all speeches given by this party
    tokens_subset = [[x.lower_ for x in tokens_cleaned[i]] for i in list(df_groups[party])]

    wordfreq, vsize = vocab_size(tokens=tokens_subset)

    # NOTE(review): `size`/`iter` are the gensim 3.x keyword names (gensim 4 uses
    # `vector_size`/`epochs`) - confirm against the installed gensim version.
    w2v_party = gensim.models.Word2Vec(tokens_subset,
                   workers=cores,
                   size=100,
                   min_count=5,
                   window = 10,
                   sample = 1e-3,
                   iter = 10
                   )

    party_dict[party]['word2vec'] = w2v_party

    # 100 most similar words for 'asyl' and their similarity scores
    # NOTE(review): this raises a KeyError if 'asyl' did not make it into a party's
    # vocabulary (e.g. fewer than min_count occurrences) - confirm every party has it.
    top_words, top_sims = zip(*party_dict[party]['word2vec'].wv.most_similar('asyl', topn=100))

    party_dict[party]['results'] = {'words': list(top_words),
                                    'similarity': list(top_sims),
                                    'shared_words': intersection(afinn_words, list(top_words)),
                                    'vocab_size': vsize}

    party_score_frames.append(get_wordscores(afinn, party_dict[party]['results'], party))

# Row labels are kept as returned per party; empty DataFrame when there were no parties
party_score_df = pd.concat(party_score_frames) if party_score_frames else pd.DataFrame()

In [None]:
# Display the per-party table of positive/negative word counts and mean scores
party_score_df

In [None]:
# Per party and positive/negative: mean of 'mean' and sum of 'count', with the
# positive/negative values spread into side-by-side columns
grouped = (
    party_score_df
    .groupby(['party', 'positive'])
    .agg({'mean': 'mean', 'count': 'sum'})
    .unstack()
    .reset_index()
)

# Create a dumbbell plot of the mean scores
fig, ax = plt.subplots(figsize=(12, 8))
index = range(len(grouped['party']))
line_height = 0.2  # Vertical offset of the text labels above the points

pos_means, neg_means = grouped['mean'][True], grouped['mean'][False]
pos_counts, neg_counts = grouped['count'][True], grouped['count'][False]

# One horizontal line per party, connecting its positive and negative mean
for i in index:
    mean_pos, mean_neg = pos_means[i], neg_means[i]
    ax.plot([mean_pos, mean_neg], [i, i], marker='o', markersize=10, label=grouped['party'][i])

    # Word counts as text labels above each dot
    ax.text(mean_pos, i + line_height, f"Pos: {int(pos_counts[i])}", ha='center')
    ax.text(mean_neg, i + line_height, f"Neg: {int(neg_counts[i])}", ha='center')


# Captions for the negative and positive half of the x-axis
ax.text(-1.75, -1.5, 'Negative words', ha='center', fontsize=12)
ax.text(1.75, -1.5, 'Positive words', ha='center', fontsize=12)
ax.set_xlim(left=-3.5, right=3.5)
ax.set_yticks(index)
ax.set_yticklabels(grouped['party'])
#plt.xlabel('Mean Value')
# plt.title('Dumbbell Plot of Mean Values by Party and Positivity')
# plt.legend()

plt.show()

In [None]:
# Create a dumbbell plot of the number of positive and negative words
fig, ax = plt.subplots(figsize=(12, 8))
index = range(len(grouped['party']))
line_height = 0.2  # Vertical offset of the text labels above the points

pos_counts = grouped['count'][True]
neg_counts = grouped['count'][False]

# One horizontal line per party, connecting its positive and negative word count
for i in index:
    n_pos, n_neg = pos_counts[i], neg_counts[i]
    ax.plot([n_pos, n_neg], [i, i], marker='o', markersize=10, label=grouped['party'][i])

    # Word counts as text labels above each dot
    ax.text(n_pos, i + line_height, f"Pos: {int(n_pos)}", ha='center')
    ax.text(n_neg, i + line_height, f"Neg: {int(n_neg)}", ha='center')

# Axis limits, ticks and labels
ax.set_xlim(left=0, right=20)
ax.set_yticks(index)
ax.set_xticks(range(0, 21))
ax.set_yticklabels(grouped['party'])
plt.xlabel('Number of Words')
plt.show()

## Pretrained Embeddings

In [None]:
import gensim.downloader
from danlp.models.embeddings import load_wv_with_gensim

In [None]:
# Load the pretrained English model 'word2vec-google-news-300' through gensim's downloader
# NOTE(review): presumably downloads the model on first use - confirm before running offline
w2v_en = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# Load the pretrained Danish model 'conll17.da.wv' through DaNLP's gensim loader
w2v_da = load_wv_with_gensim('conll17.da.wv')

In [None]:
# The 10 most similar words for 'israel' in the English model
w2v_en.most_similar('israel', topn=10)

In [None]:
# The 20 most similar words for 'israel' in the Danish model
w2v_da.most_similar('israel', topn=20)

## Analogies

In [None]:
# Analogies: father is to son as mother is to X
# ('moderen' = the mother and 'sønnen' = the son are added, 'faderen' = the father is subtracted)
w2v_da.most_similar(positive=['moderen', 'sønnen'], negative=['faderen'])