In [1]:
from datasets import load_dataset
import ast
import re
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("Anthropic/llm_global_opinions")["train"]
dataset

Dataset({
    features: ['question', 'selections', 'options', 'source'],
    num_rows: 2556
})

In [3]:
def extract_curly_braces(string):
    # Using regular expression to find the substring within curly braces
    match = re.search(r'\{.*?\}', string)
    if match:
        return match.group(0)
    else:
        return None
    
def remove_parentheses(string):
    # Using regular expression to remove parentheses and their contents
    result = re.sub(r'\s*\([^()]*\)', '', string)
    return result

def map_to_str_dict(sample):    
    str_selections = sample["selections"]
    str_selections = extract_curly_braces(str_selections)
    str_selections = remove_parentheses(str_selections)
    str_selections = str_selections.strip()

    return {
        "question": sample["question"],
        "selections": str_selections,
        "options": sample["options"],
        "source": sample["source"]
    }

In [4]:
dataset = dataset.map(map_to_str_dict, num_proc=10)

print(dataset[0]["question"])
print(dataset[0]["selections"])
print(dataset[0]["options"])
print(dataset[0]["source"])

When it comes to Germany’s decision-making in the European Union, do you think Germany has too much influence, has too little influence or has about the right amount of influence?
{'Belgium': [0.21, 0.07, 0.69, 0.03], 'France': [0.35, 0.09, 0.54, 0.02], 'Germany': [0.13131313131313133, 0.30303030303030304, 0.5252525252525253, 0.04040404040404041], 'Greece': [0.86, 0.04, 0.1, 0.0], 'Italy': [0.6138613861386139, 0.0297029702970297, 0.3465346534653465, 0.009900990099009901], 'Netherlands': [0.2, 0.06, 0.72, 0.02], 'Spain': [0.53, 0.03, 0.43, 0.01], 'Sweden': [0.15, 0.02, 0.82, 0.01]}
['Has too much influence', 'Has too little influence', 'Has about the right amount of influence', 'DK/Refused']
GAS


In [5]:
all_selections = dataset["selections"]
all_countries = list()
for selection in all_selections:
    dict_selection = ast.literal_eval(selection)
    selection_countries = list(dict_selection.keys())
    all_countries += selection_countries


counter = 0
country2code = dict()

all_countries = set(all_countries)
for country in all_countries:
    country2code[country] = counter
    counter += 1
    print(country)

Peru
Finland
South Korea
Bosnia Herzegovina
Bulgaria
Taiwan
Venezuela
Morocco
Argentina
Burkina Faso
Canada
Israel
Angola
Ethiopia
Greece
Nicaragua
Bangladesh
Kenya
Bolivia
Czech Rep.
Croatia
S. Africa
Ecuador
Japan
Estonia
Belgium
Cyprus
Thailand
Slovakia
Colombia
Vietnam
Guatemala
Honduras
Ukraine
El Salvador
India
Montenegro
Jordan
Iran
Brazil
Taiwan ROC
Spain
Egypt
Hungary
Malaysia
Iraq
Armenia
North Macedonia
Mongolia
Ivory Coast
Azerbaijan
Chile
Belarus
Mali
Portugal
Ghana
Tanzania
Slovenia
Kuwait
United States
Iceland
Albania
Kazakhstan
Uganda
Great Britain
France
Pakistan
Czechia
Romania
Turkey
New Zealand
Uruguay
Sweden
Lithuania
Italy
Tunisia
Singapore
Switzerland
Zimbabwe
Kyrgyzstan
Mexico
Uzbekistan
Andorra
Palest. ter.
Nigeria
Northern Ireland
Australia
Lebanon
Philippines
Indonesia
Serbia
Norway
Hong Kong SAR
Tajikistan
Maldives
Austria
Latvia
Russia
Netherlands
S. Korea
Poland
Senegal
Libya
Macau SAR
Denmark
Georgia
Britain
Germany
Puerto Rico
China
Myanmar


In [12]:
with open("countries.txt", mode="a") as f:
    for country in countries:
        f.write(f"{country}\n")

{'Albania',
 'Andorra',
 'Angola',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Bolivia',
 'Bosnia Herzegovina',
 'Brazil',
 'Britain',
 'Bulgaria',
 'Burkina Faso',
 'Canada',
 'Chile',
 'China',
 'Colombia',
 'Croatia',
 'Cyprus',
 'Czech Rep.',
 'Czechia',
 'Denmark',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Ethiopia',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Ghana',
 'Great Britain',
 'Greece',
 'Guatemala',
 'Honduras',
 'Hong Kong SAR',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Israel',
 'Italy',
 'Ivory Coast',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kuwait',
 'Kyrgyzstan',
 'Latvia',
 'Lebanon',
 'Libya',
 'Lithuania',
 'Macau SAR',
 'Malaysia',
 'Maldives',
 'Mali',
 'Mexico',
 'Mongolia',
 'Montenegro',
 'Morocco',
 'Myanmar',
 'Netherlands',
 'New Zealand',
 'Nicaragua',
 'Nigeria',
 'North Macedonia',
 'Northern Ireland',
 'Norway',
 'Pakistan',
 'Palest. ter.

In [6]:
df_question_id = list()
df_question = list()
df_option = list()
df_option_num = list()
df_option_tot = list()
df_country = list()
df_country_id = list()
df_score = list()
df_source = list()

for i in tqdm(range(len(dataset))):
    question = dataset[i]["question"]
    selections = dataset[i]["selections"]
    options = dataset[i]["options"]
    source = dataset[i]["source"]

    dict_selection = ast.literal_eval(selections)
    countries = list(dict_selection.keys())
    num_countries = len(countries)

    list_options = ast.literal_eval(options)
    num_options = len(list_options)

    for j in range(num_countries):
        country = countries[j]
        country_selections = dict_selection[country]

        for k in range(num_options):
            option = list_options[k]
            score = country_selections[k]
            
            df_question_id.append(i)
            df_question.append(question)
            df_option.append(option)
            df_option_num.append(k)
            df_option_tot.append(num_options)
            df_country.append(country)
            df_country_id.append(country2code[country])
            df_score.append(score)
            df_source.append(source)        


100%|██████████| 2556/2556 [00:02<00:00, 855.48it/s] 


In [7]:
df_dict = {
    "question_id": df_question_id,
    "question": df_question,
    "option": df_option,
    "option_id": df_option_num,
    "num_options": df_option_tot,
    "country": df_country,
    "country_id": df_country_id,
    "score": df_score,
    "source": df_source
}

dataset_df = pd.DataFrame(df_dict)
# dataset_df.to_csv("dataset.csv")


In [8]:
dataset_df

Unnamed: 0,question_id,question,option,option_id,num_options,country,country_id,score,source
0,0,When it comes to Germany’s decision-making in ...,Has too much influence,0,4,Belgium,25,0.210,GAS
1,0,When it comes to Germany’s decision-making in ...,Has too little influence,1,4,Belgium,25,0.070,GAS
2,0,When it comes to Germany’s decision-making in ...,Has about the right amount of influence,2,4,Belgium,25,0.690,GAS
3,0,When it comes to Germany’s decision-making in ...,DK/Refused,3,4,Belgium,25,0.030,GAS
4,0,When it comes to Germany’s decision-making in ...,Has too much influence,0,4,France,65,0.350,GAS
...,...,...,...,...,...,...,...,...,...
256630,2555,Here are two statements people sometimes make ...,Economy growth and creating jobs,1,6,Northern Ireland,85,0.361,WVS
256631,2555,Here are two statements people sometimes make ...,Other answer,2,6,Northern Ireland,85,0.007,WVS
256632,2555,Here are two statements people sometimes make ...,Don't know,3,6,Northern Ireland,85,0.039,WVS
256633,2555,Here are two statements people sometimes make ...,No answer,4,6,Northern Ireland,85,0.001,WVS


In [9]:
dataset_df["question"][500]

'Do you think this change in the working conditions for ordinary workers is largely because of the way the world has become more connected or mostly for other reasons?'

In [10]:
for country in country2code:
    num_samples = len(dataset_df[dataset_df["country"] == country])
    print(f"{country}: {num_samples}")

Peru: 3145
Finland: 960
South Korea: 1768
Bosnia Herzegovina: 960
Bulgaria: 2095
Taiwan: 294
Venezuela: 3346
Morocco: 2491
Argentina: 4086
Burkina Faso: 357
Canada: 4031
Israel: 1929
Angola: 411
Ethiopia: 2759
Greece: 3637
Nicaragua: 2265
Bangladesh: 2890
Kenya: 4789
Bolivia: 2523
Czech Rep.: 1410
Croatia: 960
S. Africa: 2545
Ecuador: 1689
Japan: 4552
Estonia: 960
Belgium: 650
Cyprus: 1690
Thailand: 2290
Slovakia: 2827
Colombia: 2500
Vietnam: 2814
Guatemala: 2100
Honduras: 411
Ukraine: 3695
El Salvador: 879
India: 3348
Montenegro: 960
Jordan: 4497
Iran: 1631
Brazil: 4389
Taiwan ROC: 1690
Spain: 4057
Egypt: 3812
Hungary: 2561
Malaysia: 3053
Iraq: 1814
Armenia: 1696
North Macedonia: 960
Mongolia: 1689
Ivory Coast: 694
Azerbaijan: 960
Chile: 3158
Belarus: 960
Mali: 714
Portugal: 960
Ghana: 1811
Tanzania: 1819
Slovenia: 960
Kuwait: 558
United States: 5305
Iceland: 953
Albania: 960
Kazakhstan: 1662
Uganda: 1625
Great Britain: 1621
France: 4393
Pakistan: 4381
Czechia: 1690
Romania: 1690
Turk

In [11]:
# train_df, test_df = train_test_split(dataset_df, test_size=0.2, random_state=42)
# train_df.to_csv("train_ds.csv")
# test_df.to_csv("test_ds.csv")