In [8]:
import pandas as pd
from collections import Counter
import numpy as np
from tqdm import tqdm
from scipy.spatial import distance
from scipy.stats import wasserstein_distance
import matplotlib.pyplot as plt

In [9]:
with open("attributes.txt", mode="r") as f:
    attributes = f.readlines()
    attributes = [a.strip() for a in attributes]
    
attributes

['B_COUNTRY',
 'G_TOWNSIZE2',
 'H_SETTLEMENT',
 'H_URBRURAL',
 'E1_LITERACY',
 'Q260',
 'X003R2',
 'Q263',
 'Q266',
 'Q269',
 'Q270',
 'Q272',
 'Q273',
 'Q275R',
 'Q279',
 'Q281',
 'Q287',
 'Q288R',
 'Q289',
 'Q284']

In [10]:
df = pd.read_csv("WVS_Cross-National_Wave_7_csv_v5_0.csv")
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,version,doi,A_WAVE,A_YEAR,A_STUDY,B_COUNTRY,B_COUNTRY_ALPHA,C_COW_NUM,C_COW_ALPHA,D_INTERVIEW,...,WVS_Polmistrust_PartyVoter,WVS_LR_MedianVoter,WVS_LibCon_MedianVoter,v2psbars,v2psorgs,v2psprbrch,v2psprlnks,v2psplats,v2xnp_client,v2xps_party
0,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2018,2,20,AND,232,AND,20070001,...,62.434211,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2018,2,20,AND,232,AND,20070002,...,62.434211,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2018,2,20,AND,232,AND,20070003,...,62.434211,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2018,2,20,AND,232,AND,20070004,...,,,,,,,,,,
4,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2018,2,20,AND,232,AND,20070005,...,66.964286,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


In [11]:
questions = [f"Q{i}" for i in range(1, 260)]

In [12]:
def get_alignment(dist1, dist2, metric):
    epsilon = 1e5
    
    if metric == "js":
        return 1 - distance.jensenshannon(dist1+epsilon, dist2+epsilon)
    else:
        return 1 - wasserstein_distance(
            np.arange(dist1.shape[0]), 
            np.arange(dist2.shape[0]), 
            u_weights=dist1+epsilon, 
            v_weights=dist2+epsilon
        ) / (dist2.shape[0] - 1)


def get_alignment_matrix(att_dists, metric="js"):    
    alignment_matrix = np.zeros((att_dists.shape[0], att_dists.shape[0]))
    for i in range(att_dists.shape[0]):
        for j in range(att_dists.shape[0]):
            alignment_matrix[i][j] = get_alignment(att_dists[i], att_dists[j], metric)
            
    return alignment_matrix

In [13]:
def get_attribute_options(attribute):
    alignment_matrices = list()
    att_options = {a for a in df[attribute] if a > 0}
    for question in tqdm(questions):
        response_options = sorted(list({r for r in df[question] if r > 0}))
        question_df = df[df[question].isin(response_options)]
        atts_options4question = {a for a in question_df[attribute] if a > 0}
        att_options = att_options.intersection(atts_options4question)
        
    return att_options

def get_attribute_metrics(attribute):
    att_options = get_attribute_options(attribute)
    alignment_matrices = list()
    
    pbar = tqdm(questions, desc=attribute)
    print("aaahahaaha")
    for question in pbar:
        response_options = sorted(list({r for r in df[question] if r > 0}))
        question_df = df[df[question].isin(response_options)]
        att_prob_dists = list()

        for att_option in att_options:
            att_option_df = question_df[question_df[attribute] == att_option] # get the df associated with respondents of the particular attribute option
            if len(att_option_df) == 0:
                continue

            q_responses = list(att_option_df[question])
            response_counter = Counter(q_responses) # get the frequency of each question response option
            response_freq = dict(response_counter)

            for r in response_options:
                if not r in response_freq:
                    response_freq[r] = 0

            prob_dist = np.array([response_freq[key] for key in sorted(response_freq.keys())])
            prob_dist = prob_dist / np.sum(prob_dist) # get prob dist for respondents with att option on the question
            att_prob_dists.append(prob_dist)

        att_prob_dists = np.array(att_prob_dists)
        alignment_matrix = get_alignment_matrix(att_prob_dists, metric="emd") # compute NxN similarity matrix between N attribute options
        alignment_matrices.append(alignment_matrix)

    alignment_matrices = np.array(alignment_matrices)
    avg_alignment_matrix = np.mean(alignment_matrices, axis=0)

    std = np.std(avg_alignment_matrix[np.tril_indices(avg_alignment_matrix.shape[0], k=-1)])
    min_alignment = np.min(avg_alignment_matrix)
    
    return std, min_alignment, avg_alignment_matrix

In [14]:
att_metric_dict = dict()

for i in range(len(attributes)):
    std, min_alignment, avg_alignment_matrix = get_attribute_metrics(attributes[i])
    att_metric_dict[attributes[i]] = {
        "std": std,
        "min_alignment": min_alignment
    }
    
    
    plt.imshow(avg_alignment_matrix, cmap='hot', interpolation='nearest')
    plt.colorbar()
    plt.savefig(os.path.join("attribute_heatmaps", f"{attributes[i]}.png"))
    plt.clf()

  4%|█▉                                             | 11/259 [00:04<01:51,  2.22it/s]


KeyboardInterrupt: 