In [6]:
import pandas as pd
import seaborn as sns

pd.set_option('display.max_columns', 26) # show all columns in the output

In [51]:
df = pd.read_csv('WVS_Cross-National_Wave_7_csv_v6_0.csv',
                 sep=',',
                 usecols=[ # Because the dataset is huge, just import the following columns:
                     "A_STUDY", "A_WAVE", "A_YEAR", "B_COUNTRY_ALPHA", "J_INTDATE", # Metadaten des Interviews für die Auswertung
                     "G_TOWNSIZE", "G_TOWNSIZE2", "H_SETTLEMENT", "H_URBRURAL", # 
                     "Q5", "Q6", "Q7", "Q8", "Q9", "Q43", "Q44", # Social Values, Norms, Stereotypes
                     "Q66", "Q67", "Q75", "Q77", "Q82", # Social Capital, Trust and Organizational Memberships
                     "Q107", "Q108", "Q110", # Economic values
                     "Q122", # Migration
                     "Q235", # ... Political Culture and Political Regimes 
                ], dtype={'B_COUNTRY_ALPHA': 'category'} # just overriding this as everything else is correctly detected automatically by Pandas
                )

df.sample(n=10, random_state=2024)

Unnamed: 0,A_WAVE,A_YEAR,A_STUDY,B_COUNTRY_ALPHA,J_INTDATE,G_TOWNSIZE,G_TOWNSIZE2,H_SETTLEMENT,H_URBRURAL,Q5,Q6,Q7,Q8,Q9,Q43,Q44,Q66,Q67,Q75,Q77,Q82,Q107,Q108,Q110,Q122,Q235
17813,7,2018,2,CHN,20180807,7,4,4,1,2,3,1,1,1,3,1,2,2,2,2,-4,4,4,5,0,2
35755,7,2018,2,IDN,20180604,2,1,5,2,1,1,1,2,2,1,2,3,3,1,3,3,8,7,7,0,4
27223,7,2022,2,GBR,20220526,7,4,2,1,2,4,1,2,2,1,1,3,3,1,4,4,5,3,6,2,1
76962,7,2020,2,SGP,-4,8,5,1,1,2,1,1,2,1,1,2,3,3,3,3,2,4,5,4,2,1
37488,7,2023,2,IND,20230609,6,3,3,1,1,1,1,1,1,3,1,4,4,2,4,2,1,-1,-1,0,-1
72202,7,2018,2,PRI,20180803,5,3,4,1,2,1,2,2,1,3,1,3,3,3,3,3,5,5,1,1,4
6214,7,2018,2,BGD,20181217,5,3,2,2,1,1,1,2,1,3,1,2,2,1,1,1,5,4,3,2,4
28422,7,2022,2,GBR,20220829,7,4,3,1,2,4,2,1,1,1,2,4,4,3,4,3,5,1,5,2,3
69886,7,2018,2,PER,20180904,1,1,5,2,1,1,2,2,1,1,1,2,2,2,2,2,7,7,6,0,2
90239,7,2017,2,USA,20170502,8,5,3,1,2,3,2,1,1,1,1,3,3,2,3,3,5,1,7,0,2


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97220 entries, 0 to 97219
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   A_WAVE           97220 non-null  int64   
 1   A_YEAR           97220 non-null  int64   
 2   A_STUDY          97220 non-null  int64   
 3   B_COUNTRY_ALPHA  97220 non-null  category
 4   J_INTDATE        97220 non-null  int64   
 5   G_TOWNSIZE       97220 non-null  int64   
 6   G_TOWNSIZE2      97220 non-null  int64   
 7   H_SETTLEMENT     97220 non-null  int64   
 8   H_URBRURAL       97220 non-null  int64   
 9   Q5               97220 non-null  int64   
 10  Q6               97220 non-null  int64   
 11  Q7               97220 non-null  int64   
 12  Q8               97220 non-null  int64   
 13  Q9               97220 non-null  int64   
 14  Q43              97220 non-null  int64   
 15  Q44              97220 non-null  int64   
 16  Q66              97220 non-null  int64  

### For further analysis: how many interviews conducted per country?

In [87]:
no_of_interviews_per_country = df.groupby(by=["B_COUNTRY_ALPHA"], observed=False)["B_COUNTRY_ALPHA"].aggregate("count").sort_values(ascending=False)
no_of_interviews_per_country

B_COUNTRY_ALPHA
CAN    4018
IDN    3200
CHN    3036
GBR    2609
USA    2596
       ... 
ARG    1003
URY    1000
CYP    1000
CHL    1000
NIR     447
Name: B_COUNTRY_ALPHA, Length: 66, dtype: int64

# Economy
Additional data used:
- [IMF. (2024). Die 20 Länder mit dem größten Bruttoinlandsprodukt (BIP) pro Kopf im Jahr 2023 (in US-Dollar). Statista. Statista GmbH. Zugriff: 17. Juni 2024. https://de.statista.com/statistik/daten/studie/166224/umfrage/ranking-der-20-laender-mit-dem-groessten-bruttoinlandsprodukt-pro-kopf/][1]

[1]: https://de.statista.com/statistik/daten/studie/166224/umfrage/ranking-der-20-laender-mit-dem-groessten-bruttoinlandsprodukt-pro-kopf/

In [91]:
# For some of the highest BIP countries, there's not data in our dataset:
selected_countries = ["NIR", "CHE", "NOR", "SGP", "USA", "ISL", "QAT", "MCU", "DEN"]
for country in selected_countries:
    if country not in no_of_interviews_per_country:
        print(f"There's no interview data for the country {country}")

There's no interview data for the country CHE
There's no interview data for the country NOR
There's no interview data for the country ISL
There's no interview data for the country QAT
There's no interview data for the country MCU
There's no interview data for the country DEN


In [92]:
econ_df = df.copy()
econ_df = econ_df.loc[econ_df["B_COUNTRY_ALPHA"].isin(selected_countries),
                      ["A_YEAR", "B_COUNTRY_ALPHA", "J_INTDATE",
                     "G_TOWNSIZE", "G_TOWNSIZE2", "H_SETTLEMENT", "H_URBRURAL",
                     "Q5", "Q6", "Q7", "Q8", "Q9", "Q43", "Q44"]
                     ]

econ_df.sample(n=10)

Unnamed: 0,A_YEAR,B_COUNTRY_ALPHA,J_INTDATE,G_TOWNSIZE,G_TOWNSIZE2,H_SETTLEMENT,H_URBRURAL,Q5,Q6,Q7,Q8,Q9,Q43,Q44
63284,2022,NIR,20220702,7,4,1,1,3,4,1,2,2,1,1
76616,2020,SGP,-4,8,5,1,1,2,2,2,1,2,1,1
91774,2017,USA,20170502,8,5,3,1,1,3,1,2,1,1,1
92317,2017,USA,20170504,5,3,4,2,1,1,2,2,1,3,2
76738,2020,SGP,-4,8,5,1,1,2,2,1,2,1,2,2
92296,2017,USA,20170501,8,5,3,1,1,1,2,1,1,3,1
90604,2017,USA,20170502,-5,-5,4,1,2,1,1,2,1,3,3
92305,2017,USA,20170503,5,3,4,1,1,3,1,1,2,1,1
90652,2017,USA,20170515,7,4,3,1,3,2,1,2,1,1,2
77049,2020,SGP,-4,8,5,1,1,3,4,1,2,2,3,1


In [85]:
"NIR" in no_interviews_per_country

True