This script to prepare the data for testing out the synthethic data

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
eurob = pd.read_csv("data/SI395.csv") ## 38,718 rows 480 columns
dfshape = eurob.shape
print("data dimensions = {}".format(dfshape)) ##

#all_calls = list(eurob.columns)
#print(all_calls)

data dimensions = (38718, 480)


We need two different datasets out of the Eurobarometer

To produce these, I'm going to subset by column name, first eyeballing the start and end of the list so sanity check. I'm looking to make sure that the first and last column names in the subset look like survey questions rather than metadata.

- Responses
These are the columns that correspond to survey responses. According to my notes, these correspond to 57:412. The column names generally start with "q."

In [3]:
questions_subset = list(eurob.columns)[57:412]

## Check the first column not selected & verify that format does not look like survey q
print(list(eurob.columns)[413]) 


capi_cawi


- Respondant attributes
We will use these for the visualizations. These are: unique ID, country, social class, d63, qb7_2, and measures of political orientation and trust.

[1] "uniqid"       "isocntry"     "d63"          "qb7_2"        "class"        "trusttradm"   "trustwebonly"
 [8] "trustallm"    "trustnom"     "polorient"    "mediatrust"  

These correspond to "uniqid", "isocntry", and dummied-variables for: "d63", "qb7_2", qa6a, qa20, qd4a, d63, qd1

Revision: We also need to pull the uniqueID (uniqid), or we will not be able to match the metadata.

In [4]:
## Question codes for the IDVs (from 00demographics.R in the original)
## QA6a - trust in media
## QA20 -- who trust for info on covid
## QD4a -- where get news
## Qd3 is self-identified class
## Qd1 is political orientation

trust = [col for col in eurob.columns if col.startswith('qa6a')] ## trust in media
covidinfo = [col for col in eurob if col.startswith('qa20')] ## who trust for covid news
news= [col for col in eurob if col.startswith('qd4a')] ## news consumption
social_class = [col for col in eurob if col.startswith('d63')]# self-identified social class 
pol_oriet = [col for col in eurob if col.startswith('d1')] ## these are demographics (education, employment, gender)
    
idvs_subset = (['uniqid', 'isocntry', 'd63', 'qb7_2'] + trust + covidinfo + news + social_class + pol_oriet)

print(idvs_subset)
## Which are missing in our data:
## Evidently these are renamed variables from the data. 
## Find the original 
#print(set(idvs_subset).difference(set(all_calls))) 
#print(list(eurob.columns))


['uniqid', 'isocntry', 'd63', 'qb7_2', 'qa6a_1', 'qa6a_2', 'qa6a_3', 'qa6a_4', 'qa6a_5', 'qa6at', 'qa20.1', 'qa20.2', 'qa20.3', 'qa20.4', 'qa20.5', 'qa20.6', 'qa20.7', 'qa20.8', 'qa20.9', 'qa20.10', 'qd4a', 'd63', 'd11', 'd11r1', 'd11r2', 'd10', 'd15a', 'd15a_r1', 'd15a_r2', 'd15b', 'd15b_r', 'd1', 'd1r1', 'd1r2']


In [5]:
## Subset the data:
## Pull the union of the questions and idvs:

cols_tot = list(set(questions_subset + idvs_subset))
print(len(cols_tot)) ## 360 deduped into a list

eurob2 = eurob[cols_tot]
print(eurob2.shape) ## 38718, 360
print(eurob.shape) ## 38718, 480, so removed 12 columns

print(eurob2.iloc[0:5, 0:7]) #eyeball

del eurob ## reduce confusion

360
(38718, 360)
(38718, 480)
   qb6_4  d73_4  qa5.15  qa10_3  qa18_4  qb5.11  qc3b.8
0    2.0    2.0     0.0       1     3.0     0.0     0.0
1    1.0    1.0     0.0       1     2.0     0.0     0.0
2    NaN    1.0     NaN       1     NaN     NaN     0.0
3    NaN    1.0     NaN       2     NaN     NaN     0.0
4    1.0    1.0     0.0       3     2.0     0.0     0.0


Standardize column names (some have '.' and some have "_"):

In [6]:
eurob2.columns = [c.replace(".", "_") for c in list(eurob2.columns)]

#print(type(eurob2))
#print(list(eurob2.columns))

Construct the composite variables: trusttradm, trustwebonly, trustallm, trustnom, mediatrust, class, polorient 

In [7]:
## Class:
eurob2["d63"].value_counts() 

eurob2['class'] = 0 #None 

eurob2.loc[eurob2['d63'] == 1, 'class'] = 1 #'WorkingClass'
eurob2.loc[eurob2['d63'] == 2, 'class'] = 2 #'LowerMiddle'
eurob2.loc[eurob2['d63'] == 3, 'class'] = 3 #'MiddleClass'
eurob2.loc[eurob2['d63'] == 4, 'class'] = 4 #'UpperMiddleClass'
eurob2.loc[eurob2['d63'] == 5, 'class'] = 5 #'UpperClass'
eurob2.loc[eurob2['d63'].isin([6, 7, 8, 9]), 'class'] = 5 #'Other/None/DN/Refused'

eurob2["class"].value_counts() 


class
3    19726
1     7473
2     6886
4     3892
5      741
Name: count, dtype: int64

In [8]:
## Political orientation

eurob2['polorient'] = 0 # None

eurob2.loc[eurob2['d1'].isin([1, 2]), 'polorient'] = 1 #'FarLeft'
eurob2.loc[eurob2['d1'].isin([3, 4]), 'polorient'] = 2 #'Left'
eurob2.loc[eurob2['d1'].isin([5, 6]), 'polorient'] = 3 #'Center'
eurob2.loc[eurob2['d1'].isin([7, 8]), 'polorient'] =  4 #'Right'
eurob2.loc[eurob2['d1'].isin([9, 10]), 'polorient'] = 5 #'FarRight'
eurob2.loc[eurob2['d1'].isin([97, 98]), 'polorient'] =  6 #'Refused/DNt'

eurob2["polorient"].value_counts() 
eurob2["polorient"].describe()


count    38718.000000
mean         3.130327
std          1.260320
min          1.000000
25%          2.000000
50%          3.000000
75%          4.000000
max          6.000000
Name: polorient, dtype: float64

In [9]:
## Media Trust
eurob2['mediatrust'] = 9 #'Other'

eurob2.loc[((eurob2['qa6a_1'] == 1) | 
           (eurob2['qa6a_2'] == 1) | 
           ((eurob2['qa6a_3'] == 1) & 
            (eurob2['qa6a_4'] == 2) & 
            (eurob2['qa6a_5'] == 2))), 
          'mediatrust'] = 1 #'TrustTrad'

eurob2.loc[((eurob2['qa6a_1'] == 2) & 
           (eurob2['qa6a_2'] == 2) & 
           (eurob2['qa6a_3'] == 2) & 
           ((eurob2['qa6a_4'] == 1) | 
            (eurob2['qa6a_5'] == 1))), 
          'mediatrust'] = 2 #'TrustWebOnly'

eurob2.loc[((eurob2['qa6a_1'] == 1) & 
           (eurob2['qa6a_2'] == 1) & 
           (eurob2['qa6a_3'] == 1) & 
           (eurob2['qa6a_4'] == 1) & 
           (eurob2['qa6a_5'] == 1)), 'mediatrust'] = 3 #'TrustAll'

eurob2.loc[((eurob2['qa6a_1'] == 2) & 
           (eurob2['qa6a_2'] == 2) & 
           (eurob2['qa6a_3'] == 2) & 
           (eurob2['qa6a_4'] == 2) & 
           (eurob2['qa6a_5'] == 2)), 'mediatrust'] = 4 # 'TrustNone'

print(eurob2['mediatrust'].value_counts())

mediatrust
1    21988
4     7414
3     4123
9     2953
2     2240
Name: count, dtype: int64


In [10]:
# Trust no media:

eurob2['trustnom']  = 0

eurob2.loc[((eurob2['qa6a_1'] == 2) & ## written press
           (eurob2['qa6a_2'] == 2) &  ## Radio
           (eurob2['qa6a_3'] == 2) & ## TV
           (eurob2['qa6a_4'] == 2) & ## Internet
           (eurob2['qa6a_5'] == 2)),  ## online social nets
          'trustnom'] = 1

print(eurob2['trustnom'].value_counts())


eurob2['trustallm'] = 0
      
eurob2.loc[((eurob2['qa6a_1'] ==1) & ## written press
            (eurob2['qa6a_2'] ==1) & ## radio
            (eurob2['qa6a_3'] ==1) & ## TV
            (eurob2['qa6a_4'] == 1) &  ## Internet
            (eurob2['qa6a_5'] == 1)),
      'trustallm'] <- 1 ## online social nets
      
print(eurob2['trustallm'].value_counts()) ### Note-- all 0

trustnom
0    31304
1     7414
Name: count, dtype: int64
trustallm
0    38718
Name: count, dtype: int64


In [11]:
## Web only

eurob2['trustwebonly'] = 0

eurob2.loc[((eurob2['qa6a_1'] == 2) & 
           (eurob2['qa6a_2'] == 2) & 
           (eurob2['qa6a_3'] == 2) & 
           ((eurob2['qa6a_4'] == 1) | 
            (eurob2['qa6a_5'] == 1))),
          'trustwebonly'] = 1

print(eurob2['trustwebonly'].value_counts())

## Trust traditional:
eurob2['trusttradm'] = 0

# Assigning value 1 based on conditions
eurob2.loc[((eurob2['qa6a_1'] == 1) | 
           (eurob2['qa6a_2'] == 1) | 
           ((eurob2['qa6a_3'] == 1) & 
            (eurob2['qa6a_4'] == 2) & 
            (eurob2['qa6a_5'] == 2))),
          'trusttradm'] = 1

print(eurob2['trusttradm'].value_counts())


trustwebonly
0    36478
1     2240
Name: count, dtype: int64
trusttradm
1    26111
0    12607
Name: count, dtype: int64


In [12]:
print(len(eurob2["isocntry"].value_counts())) ## 40 countries represented


40


## Part Two: Sample from the full data

Taking a subset of the rows to reduce the resources needed from the synthetic data algorithm.

(At some point, I might do away with this entirely if it is worth investing in the full infrastructure.)

In [13]:

eurob_toy = eurob2.sample(frac=0.1,
                         replace=False, 
                         random_state= 42424)

print(eurob_toy.shape) ##3872 x 367

(3872, 367)


In [14]:
## Save the data:

eurob_toy.to_csv("eurob_toy.csv", index=False)

## Save the vector of userID: