# Selected socio-economic determinants of social trust

Author: Mateusz Kasprowicz
Date: January 2024

## Load libraries

In [1]:
import random
import os

import pandas as pd
from ydata_profiling import ProfileReport
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

## Constants

In [2]:
random_state = 0

np.random.seed(random_state)
os.environ["PYTHONHASHSEED"] = str(random_state)
random.seed(random_state)

In [3]:
sklearn.set_config(transform_output="pandas")

## Load data

In [4]:
columns_used = ["cntry", 
                "agea", 
                "eduyrs", 
                "gndr", 
                "domicil", # A big city, suburbs, town or small city, country village, etc.
                "hinctnta", # Household's total net income, all sources
                "uemp3m", # Ever unemployed and seeking work for a period more than three months
                "ppltrst", # Most people can be trusted or you can't be too careful
                ]

In [5]:
data = pd.read_stata(r"../data/ESS10SC_STATA/ESS10SC.dta", columns=columns_used)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22074 entries, 0 to 22073
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   cntry     22074 non-null  object  
 1   agea      21028 non-null  category
 2   eduyrs    20074 non-null  category
 3   gndr      21439 non-null  category
 4   domicil   21291 non-null  category
 5   hinctnta  17232 non-null  category
 6   uemp3m    20676 non-null  category
 7   ppltrst   21921 non-null  category
dtypes: category(7), object(1)
memory usage: 328.7+ KB


In [7]:
data[["agea", "eduyrs"]] = data[["agea", "eduyrs"]].astype(pd.Int64Dtype())

In [8]:
data.sample(5)

Unnamed: 0,cntry,agea,eduyrs,gndr,domicil,hinctnta,uemp3m,ppltrst
6676,DE,77,,Female,,,,2
14824,IL,42,16.0,Male,Suburbs or outskirts of big city,H - 10th decile,Yes,7
18791,RS,61,12.0,Female,Town or small city,C - 3rd decile,Yes,You can't be too careful
11088,DE,51,13.0,Female,Town or small city,S - 6th decile,No,6
8420,DE,33,15.0,Male,Country village,R - 2nd decile,No,3


## Overall EDA

In [9]:
# ProfileReport(data, title="Profiling Report").to_file("../data/EDA_full.html")

## Modeling

### Model for Poland

#### Preprocess data

In [10]:
data_pl = data.loc[data.cntry == "PL"].drop(columns=["cntry"])

In [11]:
data_pl.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2065 entries, 16217 to 18281
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   agea      1972 non-null   Int64   
 1   eduyrs    1944 non-null   Int64   
 2   gndr      2065 non-null   category
 3   domicil   1989 non-null   category
 4   hinctnta  1579 non-null   category
 5   uemp3m    1940 non-null   category
 6   ppltrst   2041 non-null   category
dtypes: Int64(2), category(5)
memory usage: 63.7 KB


In [12]:
data_pl.isna().sum()

agea         93
eduyrs      121
gndr          0
domicil      76
hinctnta    486
uemp3m      125
ppltrst      24
dtype: int64

In [13]:
# ProfileReport(data_se, title="Profiling Report for Poland").to_file("../data/EDA_PL.html")

Number of observations with at least one NaN value

In [14]:
data_pl.loc[data_pl.isna().any(axis=1)].shape[0]

600

In [37]:
data_pl_preprocessed = data_pl.dropna()

In [40]:
data_pl_preprocessed[["agea", "eduyrs"]] = data_pl_preprocessed[["agea", "eduyrs"]].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pl_preprocessed[["agea", "eduyrs"]] = data_pl_preprocessed[["agea", "eduyrs"]].astype(int)


In [53]:
data_pl_preprocessed = data_pl_preprocessed.assign(top_20pct = np.where((data_pl_preprocessed.hinctnta == "H - 10th decile") | (data_pl_preprocessed.hinctnta == "D - 9th decile"), 1, 0)).drop(columns=["hinctnta"])

#### Build a model

In [41]:
# ordinal logistic regression: https://www.statsmodels.org/stable/examples/notebooks/generated/ordinal_regression.html
# https://www.statsmodels.org/dev/generated/statsmodels.miscmodels.ordinal_model.OrderedModel.html
model_pl = OrderedModel.from_formula("ppltrst ~ agea + eduyrs + gndr + domicil + hinctnta + uemp3m", data_pl_preprocessed, distr="logit")

res_log = model_pl.fit(method='bfgs', disp=False)

In [42]:
res_log.summary()

0,1,2,3
Dep. Variable:,ppltrst,Log-Likelihood:,-3035.7
Model:,OrderedModel,AIC:,6125.0
Method:,Maximum Likelihood,BIC:,6268.0
Date:,"Sun, 07 Jan 2024",,
Time:,20:13:33,,
No. Observations:,1465,,
Df Residuals:,1438,,
Df Model:,17,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gndr[T.Female],-0.0993,0.095,-1.048,0.295,-0.285,0.086
domicil[T.Suburbs or outskirts of big city],0.1469,0.227,0.648,0.517,-0.298,0.591
domicil[T.Town or small city],0.0123,0.124,0.099,0.921,-0.231,0.256
domicil[T.Country village],0.0237,0.132,0.179,0.858,-0.236,0.283
domicil[T.Farm or home in countryside],-0.2343,0.262,-0.893,0.372,-0.748,0.280
hinctnta[T.R - 2nd decile],0.2412,0.219,1.103,0.270,-0.188,0.670
hinctnta[T.C - 3rd decile],-0.1351,0.231,-0.585,0.559,-0.588,0.318
hinctnta[T.M - 4th decile],-0.0579,0.226,-0.257,0.797,-0.500,0.384
hinctnta[T.F - 5th decile],-0.2068,0.229,-0.905,0.366,-0.655,0.241


In [65]:
model_pl_v2 = OrderedModel.from_formula("ppltrst ~ agea + eduyrs + gndr + domicil + top_20pct + uemp3m", data_pl_preprocessed, distr="logit")

res_log = model_pl_v2.fit(method='bfgs', disp=False)
res_log.summary()

0,1,2,3
Dep. Variable:,ppltrst,Log-Likelihood:,-3039.6
Model:,OrderedModel,AIC:,6117.0
Method:,Maximum Likelihood,BIC:,6218.0
Date:,"Sun, 07 Jan 2024",,
Time:,20:38:14,,
No. Observations:,1465,,
Df Residuals:,1446,,
Df Model:,9,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gndr[T.Female],-0.1021,0.094,-1.089,0.276,-0.286,0.082
domicil[T.Suburbs or outskirts of big city],0.1362,0.225,0.605,0.545,-0.305,0.577
domicil[T.Town or small city],0.0333,0.123,0.271,0.787,-0.208,0.274
domicil[T.Country village],0.0444,0.132,0.337,0.736,-0.214,0.303
domicil[T.Farm or home in countryside],-0.2060,0.261,-0.789,0.430,-0.718,0.306
uemp3m[T.No],0.0893,0.099,0.905,0.365,-0.104,0.283
agea,-0.0012,0.003,-0.459,0.646,-0.006,0.004
eduyrs,0.0798,0.015,5.432,0.000,0.051,0.109
top_20pct,0.2934,0.120,2.442,0.015,0.058,0.529


### Model for Sweden

In [58]:
data_se = data[data.cntry == "SE"].drop(columns=["cntry"])

In [59]:
data_se.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2287 entries, 19787 to 22073
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   agea      2260 non-null   Int64   
 1   eduyrs    2125 non-null   Int64   
 2   gndr      2280 non-null   category
 3   domicil   2279 non-null   category
 4   hinctnta  2241 non-null   category
 5   uemp3m    2264 non-null   category
 6   ppltrst   2272 non-null   category
dtypes: Int64(2), category(5)
memory usage: 70.4 KB


In [60]:
data_se.isna().sum()

agea         27
eduyrs      162
gndr          7
domicil       8
hinctnta     46
uemp3m       23
ppltrst      15
dtype: int64

In [None]:
# ProfileReport(data_pl, title="Profiling Report for Poland").to_file("../data/EDA_PL.html")

Number of observations with at least one NaN value

In [61]:
data_se.loc[data_se.isna().any(axis=1)].shape[0]

232

In [62]:
data_se_preprocessed = data_se.dropna()

In [63]:
data_se_preprocessed[["agea", "eduyrs"]] = data_se_preprocessed[["agea", "eduyrs"]].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_se_preprocessed[["agea", "eduyrs"]] = data_se_preprocessed[["agea", "eduyrs"]].astype(int)


In [64]:
data_se_preprocessed = data_se_preprocessed.assign(top_20pct = np.where((data_se_preprocessed.hinctnta == "H - 10th decile") | (data_se_preprocessed.hinctnta == "D - 9th decile"), 1, 0)).drop(columns=["hinctnta"])

In [66]:
model_pl_v2 = OrderedModel.from_formula("ppltrst ~ agea + eduyrs + gndr + domicil + top_20pct + uemp3m", data_se_preprocessed, distr="logit")

res_log = model_pl_v2.fit(method='bfgs', disp=False)
res_log.summary()

0,1,2,3
Dep. Variable:,ppltrst,Log-Likelihood:,-4578.4
Model:,OrderedModel,AIC:,9195.0
Method:,Maximum Likelihood,BIC:,9302.0
Date:,"Sun, 07 Jan 2024",,
Time:,20:38:39,,
No. Observations:,2055,,
Df Residuals:,2036,,
Df Model:,9,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gndr[T.Female],-0.0899,0.078,-1.155,0.248,-0.243,0.063
domicil[T.Suburbs or outskirts of big city],-0.0248,0.119,-0.207,0.836,-0.259,0.209
domicil[T.Town or small city],-0.1105,0.112,-0.987,0.324,-0.330,0.109
domicil[T.Country village],-0.0396,0.137,-0.288,0.773,-0.308,0.229
domicil[T.Farm or home in countryside],-0.0684,0.153,-0.448,0.654,-0.367,0.231
uemp3m[T.No],0.2125,0.089,2.396,0.017,0.039,0.386
agea,0.0145,0.002,6.996,0.000,0.010,0.019
eduyrs,0.0571,0.009,6.235,0.000,0.039,0.075
top_20pct,0.4668,0.094,4.941,0.000,0.282,0.652
