# 01 Data processing (Immigration)
Two dataframes are prepared:
- df_bg contains all background characteristics used for regression analysis
- df_sq contains the ___immigration___ search query columns for further data prep in 02_query_processing

Original dataset available at OSF: https://osf.io/yu64r/

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# setting paths
# Project root: read from the SEARCHING_FOR_BIAS_PATH environment variable when
# it is set; otherwise fall back to the original local checkout location, so the
# notebook can run on another machine without editing this cell.
PATH = os.environ.get('SEARCHING_FOR_BIAS_PATH', '/Users/marieke/SearchingForBias')

In [3]:
# read the tab-separated data set into a dataframe and report (rows, columns)
df = pd.read_csv(PATH+'/data/DigSocSurvey_26012021.tsv', sep='\t')
df.shape

(1994, 479)

In [4]:
# optional preview of the loaded dataframe (left disabled)
#df.head()

In [5]:
# column selection
# vote items base_voteInt_1 .. base_voteInt_18, followed by the _TEXT field of item 14
vote_cols = ['base_voteInt_{}'.format(item) for item in range(1, 19)] + ['base_voteInt_14_TEXT']
# columns that go into df_bg (the ID column is added when the frame is built)
cols = [
    "base_lft",
    "base_gender",
    "base_opl",
    "base_regio",
    "base_polar",
    "base_intpol",
    "base_voteTK2017",
    "base_voteTK2017_14_TEXT",
    "MVH_att_im_1",
    "MVH_att_im_2",
    "MVH_att_im_3",
    "MVH_att_importance_1",
] + vote_cols
# the three immigration search query columns that go into df_sq
sq_cols = ["MVH_search_im_{}".format(item) for item in range(1, 4)]

In [6]:
# search queries: independent copy holding the ID column plus the search query columns
df_sq = df.loc[:, ["ID", *sq_cols]].copy()
df_sq.head(5)

Unnamed: 0,ID,MVH_search_im_1,MVH_search_im_2,MVH_search_im_3
0,1,data criminaliteit allochtonen,immigratie data,criminaliteit demografie
1,2,asielzoekers,alochtonen,asielzoekerscentrum
2,3,Immigranten in Nederland laatste 5 jaar CBS,Reden van vluchten immigranten,Refugees in Europe
3,4,Geen,Geen,Geen
4,5,reintegratie vluchtelingen,asielzoekers Nederland,vluchtelingen


In [7]:
# persist the search query frame; 02_query_processing continues from this pickle
pd.to_pickle(df_sq, PATH+"/data/immigration/01_df_sq.pkl")

In [8]:
# independent variables: independent copy holding the ID column plus the selected columns
df_bg = df.loc[:, ["ID", *cols]].copy()
df_bg.head(5)

Unnamed: 0,ID,base_lft,base_gender,base_opl,base_regio,base_polar,base_intpol,base_voteTK2017,base_voteTK2017_14_TEXT,MVH_att_im_1,...,base_voteInt_10,base_voteInt_11,base_voteInt_12,base_voteInt_13,base_voteInt_14,base_voteInt_15,base_voteInt_16,base_voteInt_17,base_voteInt_18,base_voteInt_14_TEXT
0,1,36,1,6,4,7.0,5.0,1,,1,...,,,,,,,,,,
1,2,18,2,3,5,7.0,,16,,3,...,,,,,,,,,,
2,3,19,2,5,2,4.0,6.0,16,,4,...,,,,,,,,1.0,,
3,4,43,2,5,12,5.0,5.0,16,,3,...,,,,,,,,1.0,,
4,5,67,2,3,12,2.0,7.0,7,,4,...,,,,,,,,,,


#### Immigration attitude

In [9]:
# the three immigration attitude items; print the value distribution of each,
# including missing values
att_cols = ["MVH_att_im_{}".format(item) for item in (1, 2, 3)]
for att_col in att_cols:
    print(df_bg[att_col].value_counts(dropna=False))

4    566
3    329
5    323
6    238
1    192
7    174
2    172
Name: MVH_att_im_1, dtype: int64
4    502
5    457
3    289
6    256
1    182
2    170
7    138
Name: MVH_att_im_2, dtype: int64
5    374
4    353
6    310
3    257
1    256
7    255
2    189
Name: MVH_att_im_3, dtype: int64


In [10]:
# attitude score: row-wise mean of the three items
# (Cronbach's alpha is computed on the final sample size in another script)
df_bg["att_im_mean"] = df_bg.loc[:, att_cols].mean(axis="columns")
df_bg["att_im_mean"].describe()

count    1994.000000
mean        4.106319
std         1.510793
min         1.000000
25%         3.000000
50%         4.333333
75%         5.000000
max         7.000000
Name: att_im_mean, dtype: float64

#### Issue importance

In [11]:
# value distribution of the issue importance item, missing values included
df_bg["MVH_att_importance_1"].value_counts(dropna=False)

3    713
4    686
2    308
5    159
1    128
Name: MVH_att_importance_1, dtype: int64

#### Political orientation

In [12]:
# value distribution of political orientation, missing values included
df_bg["base_polar"].value_counts(dropna=False)
# NaN row of the output: 8 respondents missing

5.0     393
7.0     264
8.0     220
6.0     206
4.0     205
3.0     205
2.0     141
11.0    141
1.0      71
0.0      52
10.0     47
9.0      41
NaN       8
Name: base_polar, dtype: int64

In [13]:
# code 11 means "I don't want to say": recode it as missing
df_bg['base_polar'] = df_bg['base_polar'].replace({11: np.nan})
# after the recode 149 respondents are missing

#### Political interest

In [14]:
# value distribution of political interest, missing values included
df_bg["base_intpol"].value_counts(dropna=False)
# NaN row of the output: 13 respondents missing

6.0     648
7.0     275
5.0     193
3.0     155
0.0     149
9.0     133
4.0     120
2.0     119
1.0     104
10.0     85
NaN      13
Name: base_intpol, dtype: int64

#### Vote choice

In [15]:
# how many respondents chose multiple parties: sum the vote items 1-14 per row
# and show the share of respondents for each total
party_cols = ["base_voteInt_{}".format(item) for item in range(1, 15)]
df_bg['sum_parties'] = df_bg.loc[:, party_cols].sum(axis="columns")
df_bg['sum_parties'].value_counts(normalize=True)

1.0    0.469910
0.0    0.249248
2.0    0.158977
3.0    0.121866
Name: sum_parties, dtype: float64

In [16]:
# lookup from vote item number (1-17) to the label used as column name later on
parties = dict(enumerate(
    [
        'VVD',
        'PVV',
        'CDA',
        'D66',
        'GroenLinks',
        'SP',
        'PvdA',
        'ChristenUnie',
        'PvdD',
        '50Plus',
        'SGP',
        'DENK',
        'FvD',
        'Andere_partij',
        'Blank',
        'I_do_not_know_yet',
        'I_do_not_want_to_say',
    ],
    start=1,
))

In [17]:
# labels in dictionary order, and the matching original column names
# (vote items 1-14 extended with items 15-17)
parties2 = [parties[item] for item in parties]
party_cols2 = [*party_cols, "base_voteInt_15", "base_voteInt_16", "base_voteInt_17"]

In [18]:
# give the vote item columns their party labels (pairs matched by position)
df_bg.rename(
    columns={old_name: new_name for old_name, new_name in zip(party_cols2, parties2)},
    inplace=True,
)

In [19]:
# missing entries in the party columns become 0
df_bg[parties2] = df_bg[parties2].fillna(value=0)

In [20]:
# print the value distribution of every party column, missing values included
for party in parties2:
    print(df_bg[party].value_counts(dropna=False))

0.0    1600
1.0     394
Name: VVD, dtype: int64
0.0    1729
1.0     265
Name: PVV, dtype: int64
0.0    1830
1.0     164
Name: CDA, dtype: int64
0.0    1765
1.0     229
Name: D66, dtype: int64
0.0    1709
1.0     285
Name: GroenLinks, dtype: int64
0.0    1786
1.0     208
Name: SP, dtype: int64
0.0    1756
1.0     238
Name: PvdA, dtype: int64
0.0    1885
1.0     109
Name: ChristenUnie, dtype: int64
0.0    1837
1.0     157
Name: PvdD, dtype: int64
0.0    1948
1.0      46
Name: 50Plus, dtype: int64
0.0    1954
1.0      40
Name: SGP, dtype: int64
0.0    1980
1.0      14
Name: DENK, dtype: int64
0.0    1865
1.0     129
Name: FvD, dtype: int64
0.0    1972
1.0      22
Name: Andere_partij, dtype: int64
0.0    1970
1.0      24
Name: Blank, dtype: int64
0.0    1895
1.0      99
Name: I_do_not_know_yet, dtype: int64
0.0    1641
1.0     353
Name: I_do_not_want_to_say, dtype: int64


In [21]:
# optional look at the values in base_voteInt_14_TEXT (left disabled);
# presumably the free-text answers belonging to item 14 - TODO confirm
#df_bg.base_voteInt_14_TEXT.value_counts(dropna=False)

#### Education level (low middle high)

In [22]:
# collapse the seven education codes into three categories:
# codes 1-2 -> 1, codes 3-5 -> 2, codes 6-7 -> 3
opl = {
    code: category
    for category, codes in ((1, (1, 2)), (2, (3, 4, 5)), (3, (6, 7)))
    for code in codes
}
df_bg['opl_3cat'] = df_bg['base_opl'].replace(opl)

In [23]:
# value distribution of the three-category education variable, missing values included
df_bg["opl_3cat"].value_counts(dropna=False)

2    1036
3     792
1     166
Name: opl_3cat, dtype: int64

#### Gender

In [24]:
# value distribution of gender, missing values included
df_bg["base_gender"].value_counts(dropna=False)

1    1004
2     985
3       5
Name: base_gender, dtype: int64

#### Age

In [25]:
# value distribution of age, missing values included
df_bg["base_lft"].value_counts(dropna=False)

54    82
51    71
52    70
63    69
58    66
66    62
65    60
67    60
60    60
64    59
53    57
56    56
61    54
62    53
55    49
57    48
30    47
34    45
41    44
59    43
22    38
50    37
40    37
28    36
26    34
39    34
23    34
20    33
42    33
24    32
18    32
38    31
27    31
44    29
43    29
33    27
31    26
35    26
47    25
32    25
29    24
36    24
25    24
19    24
21    23
45    23
37    21
49    16
46    14
48    13
69     1
70     1
77     1
68     1
Name: base_lft, dtype: int64

In [26]:
# optional preview of the processed dataframe (left disabled)
#df_bg.head()

In [27]:
# persist the processed dataframe as a pickle
pd.to_pickle(df_bg, PATH+"/data/immigration/02_df_bg.pkl")