# Selected socio-economic determinants of social trust

Author: Mateusz Kasprowicz
Date: January 2024

## Load libraries

In [1]:
import random

import pandas as pd
from ydata_profiling import ProfileReport
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import sklearn
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

## Constants

In [2]:
# Seed both the standard-library generator and NumPy's legacy global
# generator with the same value so that stochastic steps are repeatable.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

## Load data

In [6]:
# Variables to read from the survey file.
# Less obvious variable names:
#   domicil  - a big city, suburbs, town or small city, country village, etc.
#   hinctnta - household's total net income, all sources
#   uemp3m   - ever unemployed and seeking work for a period of more than three months
#   ppltrst  - most people can be trusted or you can't be too careful
plain_columns = ["cntry", "agea", "eduyrs", "gndr"]
annotated_columns = ["domicil", "hinctnta", "uemp3m", "ppltrst"]
columns_used = plain_columns + annotated_columns

In [7]:
# Read only the selected variables from the Stata file.
data = pd.read_stata(
    filepath_or_buffer=r"../data/ESS10SC_STATA/ESS10SC.dta",
    columns=columns_used,
)

In [9]:
# Print the column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22074 entries, 0 to 22073
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   cntry     22074 non-null  object  
 1   agea      21028 non-null  category
 2   eduyrs    20074 non-null  category
 3   gndr      21439 non-null  category
 4   domicil   21291 non-null  category
 5   hinctnta  17232 non-null  category
 6   uemp3m    20676 non-null  category
 7   ppltrst   21921 non-null  category
dtypes: category(7), object(1)
memory usage: 328.7+ KB


In [11]:
# Age and years of education are loaded as categoricals; cast them to pandas'
# nullable integer dtype so that missing values are kept as <NA>.
data = data.astype({column: pd.Int64Dtype() for column in ("agea", "eduyrs")})

In [12]:
# Show five randomly drawn rows as a quick sanity check of the loaded data.
data.sample(n=5)

Unnamed: 0,cntry,agea,eduyrs,gndr,domicil,hinctnta,uemp3m,ppltrst
6676,DE,77,,Female,,,,2
14824,IL,42,16.0,Male,Suburbs or outskirts of big city,H - 10th decile,Yes,7
18791,RS,61,12.0,Female,Town or small city,C - 3rd decile,Yes,You can't be too careful
11088,DE,51,13.0,Female,Town or small city,S - 6th decile,No,6
8420,DE,33,15.0,Male,Country village,R - 2nd decile,No,3


## Overall EDA

In [13]:
# Build the profiling report for the whole data set and save it as HTML.
eda_full_path = "../data/EDA_full.html"
profile = ProfileReport(df=data, title="Profiling Report")
profile.to_file(output_file=eda_full_path)

## Modeling

### Model for Poland

#### Preprocess data

In [17]:
# Keep only the rows for Poland; the country column is then constant, so drop it.
is_poland = data["cntry"].eq("PL")
data_pl = data.loc[is_poland].drop(columns="cntry")

In [25]:
# Number of rows in the Polish subset.
len(data_pl)

2065

In [None]:
# Build the profiling report for the Polish subset and save it as HTML.
eda_pl_path = "../data/EDA_PL.html"
profile = ProfileReport(df=data_pl, title="Profiling Report for Poland")
profile.to_file(output_file=eda_pl_path)

Number of observations with at least one NaN value

In [34]:
# Flag rows with a missing value in at least one column, then count them.
rows_with_nan = data_pl.isna().any(axis="columns")
int(rows_with_nan.sum())

600

In [36]:
# TODO: decide which NaNs to drop and which to impute, then apply the same rules to the SE data

#### Build a model

In [None]:
# Ordinal logistic regression of ppltrst on the remaining columns, Poland.
# Example notebook: https://www.statsmodels.org/stable/examples/notebooks/generated/ordinal_regression.html
# API: https://www.statsmodels.org/dev/generated/statsmodels.miscmodels.ordinal_model.OrderedModel.html

# Interim choice: complete-case analysis. OrderedModel rejects missing values,
# so every row with a NaN is dropped here, pending the open TODO on which NaNs
# to drop and which to impute.
model_data_pl = data_pl.dropna()

# Outcome: the trust answer as a categorical. Levels no longer present after
# dropping incomplete rows are removed.
# NOTE(review): assumes ppltrst is an *ordered* categorical whose category
# order matches the answer scale - confirm.
endog_pl = model_data_pl["ppltrst"].cat.remove_unused_categories()

# Predictors: numeric columns as plain floats, categorical columns as dummies.
predictors_pl = model_data_pl.drop(columns=["ppltrst"]).astype(
    {"agea": "float64", "eduyrs": "float64"}
)
for col in predictors_pl.select_dtypes(include="category").columns:
    # An unused level would yield an all-zero dummy column (singular design).
    predictors_pl[col] = predictors_pl[col].cat.remove_unused_categories()

# drop_first=True leaves out one reference level per factor; OrderedModel must
# not receive a constant, and a full dummy set would add up to one.
exog_pl = pd.get_dummies(predictors_pl, drop_first=True, dtype=float)

model_pl = OrderedModel(endog=endog_pl,
                        exog=exog_pl,
                        distr="logit")
res_log_pl = model_pl.fit(method='bfgs', disp=False)
# Keep the original result name as an alias for code that still refers to it.
res_log = res_log_pl
res_log_pl.summary()

### Model for Sweden

In [21]:
# Keep only the rows for Sweden; the country column is then constant, so drop it.
is_sweden = data["cntry"].eq("SE")
data_se = data.loc[is_sweden].drop(columns="cntry")

In [None]:
# Build the profiling report for the Swedish subset and save it as HTML.
eda_se_path = "../data/EDA_SE.html"
profile = ProfileReport(df=data_se, title="Profiling Report for Sweden")
profile.to_file(output_file=eda_se_path)

In [None]:
# Ordinal logistic regression of ppltrst on the remaining columns, Sweden.

# Interim choice: complete-case analysis. OrderedModel rejects missing values,
# so every row with a NaN is dropped here until the handling of missing data
# (drop vs. impute) has been decided.
model_data_se = data_se.dropna()

# Outcome: the trust answer as a categorical. Levels no longer present after
# dropping incomplete rows are removed.
# NOTE(review): assumes ppltrst is an *ordered* categorical whose category
# order matches the answer scale - confirm.
endog_se = model_data_se["ppltrst"].cat.remove_unused_categories()

# Predictors: numeric columns as plain floats, categorical columns as dummies.
predictors_se = model_data_se.drop(columns=["ppltrst"]).astype(
    {"agea": "float64", "eduyrs": "float64"}
)
for col in predictors_se.select_dtypes(include="category").columns:
    # An unused level would yield an all-zero dummy column (singular design).
    predictors_se[col] = predictors_se[col].cat.remove_unused_categories()

# drop_first=True leaves out one reference level per factor; OrderedModel must
# not receive a constant, and a full dummy set would add up to one.
exog_se = pd.get_dummies(predictors_se, drop_first=True, dtype=float)

model_se = OrderedModel(endog=endog_se,
                        exog=exog_se,
                        distr="logit")
res_log_se = model_se.fit(method='bfgs', disp=False)
# Keep the original result name as an alias for code that still refers to it.
res_log = res_log_se
res_log_se.summary()