In [1]:
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the raw survey export. A forward slash is used as the path separator:
# it works on every platform and avoids the invalid '\s' escape sequence that
# a backslash inside a normal (non-raw) string literal produces.
data = pd.read_csv('data/survey_data.csv', low_memory=False)

In [4]:
#copy fields to be used in model

# Survey fields carried forward into the working frame. Entries that are
# commented out are deliberately left out of the selection; the trailing
# comment on each question code is the wording of that question.
model_fields = [
    'age',
    #'age_category',
    'gender',
    'education',
    'employment_status',
    'household_income',
    'subjective_income',
    'geography',
    'region',
    #'country',
    #'country_code',
    'country_index',
    #'country_income',
    #'national_weight', #national weight, for analysis at the country level
    #'population_weight', #population weight (included factor to project to 15+ population in each country) for analysis of pooled multi-country data
    'Q1', #how much do you, personally, know about science? do you know a lot, some, not much, or nothing at all?
    'Q2', #on this survey, when i say 'science' i mean the understanding we have about the world from observation and testing. when i say 'scientists' i mean people who study the planet earth, nature and medicine, among other things. how much did you understand the meaning of 'science' and 'scientists' that was just read? did you understand all of it, some of it, not much of it, or none of it?
    'Q3', #do you think studying diseases is a part of science?
    'Q4', #do you think writing poetry is a part of science?
    'Q5A', #have you, personally, ever, learned about science at primary school?
    'Q5B', #have you, personally, ever, learned about science at secondary school?
    'Q5C', #have you, personally, ever, learned about science at college/university?
    'Q6', #have you, personally, tried to get any information about science in the past 30 days?
    'Q7', #have you, personally, tried to get any information about medicine, disease, or health in the past 30 days?
    'Q8', #would you, personally, like to know more about science?
    'Q9', #would you, personally, like to know more about medicine, disease, or health?
    'Q10A', #in (country), do you have confidence in each of the following, or not? how about non-governmental organizations or non-profit organizations.
    'Q10B', #in (country), do you have confidence in each of the following, or not? how about hospitals and health clinics.
    'Q11A', #how much do you trust each of the following? how about the people in your neighborhood? do you trust them a lot, some, not much, or not at all?
    'Q11B', #how much do you trust each of the following? how about the national government in this country? do you trust them a lot, some, not much, or not at all?
    'Q11C', #how much do you trust each of the following? how about scientists in this country? do you trust them a lot, some, not much, or not at all?
    'Q11D', #how much do you trust each of the following? how about journalists in this country? do you trust them a lot, some, not much, or not at all?
    'Q11E', #how much do you trust each of the following? how about doctors and nurses in this country? do you trust them a lot, some, not much, or not at all?
    'Q11F', #how much do you trust each of the following? how about people who work at non-governmental organizations or non-profit organizations? do you trust them a lot, some, not much, or not at all?
    'Q11G', #how much do you trust each of the following? how about traditional healers (or country equivalent) in this country? do you trust them a lot, some, not much, or not at all?
    'Q12', #in general, would you say that you trust science a lot, some, not much, or not at all?
    'Q13', #in general, how much do you trust scientists to find out accurate information about the world? a lot, some, not much, or not at all?
    'Q14A', #how much do you trust scientists working in colleges/universities in this country to do each of the following? to do their work with the intention of benefiting the public. do you trust them to do this a lot, some, not much, or not at all?
    'Q14B', #how much do you trust scientists working in colleges/universities in this country to do each of the following? to be open and honest about who is paying for their work. do you trust them to do this a lot, some, not much, or not at all?
    'Q15A', #now, thinking about companies - for example, those who make medicines or agricultural supplies - how much do you trust scientists working for companies in this country to do each of the following? to do their work with the intention of benefiting the public. do you trust them to do this a lot, some, not much, or not at all?
    'Q15B', #now, thinking about companies - for example, those who make medicines or agricultural supplies - how much do you trust scientists working for companies in this country to do each of the following? to be open and honest about who is paying for their work. do you trust them to do this a lot, some, not much, or not at all?
    'Q16', #in general, do you think the work that scientists do benefits most, some, or very few people in this country?
    'Q17', #in general, do you think the work that scientists do benefits people like you in this country?
    'Q18', #overall, do you think that science and technology will help improve life for the next generation?
    'Q19', #overall, do you think that science and technology will increase/decrease the number of jobs in your local area in the next five years?
    'Q20', #which of the following people do you trust most to give you medical or health advice?
    'Q21', #in general, how much do you trust medical and health advice from the government in this country? a lot, some, not much, or not at all?
    'Q22', #in general, how much do you trust medical and health advice from medical workers, such as doctors and nurses, in this country? a lot, some, not much, or not at all?
    'Q23', #a vaccine is given to people to strengthen their body's ability to fight certain diseases. sometimes people are given a vaccine as an injection, but vaccines can also be given by mouth or some other way. before today, had you ever heard of a vaccine?
    'Q24', #do you strongly or somewhat agree, strongly or somewhat disagree or neither agree nor disagree with the following statement? vaccines are important for children to have.
    'Q25', #do you strongly or somewhat agree, strongly or somewhat disagree or neither agree nor disagree with the following statement? vaccines are safe.
    'Q26', #do you strongly or somewhat agree, strongly or somewhat disagree or neither agree nor disagree with the following statement? vaccines are effective.
    #'Q27', #do you, personally, have any children?
    #'Q28', #(if respondent is a parent) to the best of your knowledge have any of your children ever received a vaccine that was supposed to prevent them from getting childhood diseases such as (examples), or not?
    'D1', #are you religious?
    #'Q29', #(if respondent identifies with a religion) has science ever disagreed with the teachings of your religion?
    #'Q30', #(if respondent believes science has disagreed with teachings of religion) generally speaking, when science disagrees with the teachings of your religion, what do you believe? science or the teachings of your religion?
    #'ViewOfScience', #wellcome global monitor trust in scientists index
    #'WGM_Index', #wellcome global monitor trust in scientists index (recoded into 3 categories)
    #'WGM_Indexr', #how a person views personal & societal benefit of science
]

# Work on an independent copy so later edits never touch the raw frame.
df = data[model_fields].copy()

In [5]:
#standardise and remove rows with null fields and rows with questions unanswered

def nanNoAnswer(x):
    """Return None for a 'no answer' marker, otherwise return x unchanged.

    A value counts as 'no answer' when it equals '!' or '?', or when its
    string form is 'nan' (which covers a float NaN as well as the text 'nan').
    """
    # Guard clause: anything that is not one of the markers passes straight through.
    if x != '!' and x != '?' and str(x) != 'nan':
        return x
    return None
    
# Blank out the 'no answer' markers column by column so that dropna() can
# remove them together with the genuinely missing values.
for column in df.columns:
    df[column] = df[column].apply(nanNoAnswer)

# Keep only fully answered rows. drop=True discards the old row labels instead
# of inserting them into the frame as an extra 'index' column.
df = df.dropna().reset_index(drop=True)

In [6]:
#return category labels as integers
#loose convention: where the answers have a natural order, code them from the smallest/lowest/least desirable answer upwards as 0, 1...N; otherwise the integer assigned to each label is arbitrary

def convAge(x):
    """Map an age label to an integer: 'young' -> 0, 'middle' -> 1, 'old' -> 2.

    Any other value yields None.
    """
    age_codes = {'young': 0, 'middle': 1, 'old': 2}
    return age_codes.get(x)
def convGen(x):
    """Map a gender label to an integer: 'male' -> 0, 'female' -> 1.

    Any other value yields None.
    """
    gender_codes = {'male': 0, 'female': 1}
    return gender_codes.get(x)
def convEdu(x):
    """Return 1 when the text x contains the substring 'time', otherwise 0.

    NOTE(review): a 'time' substring looks like a test for employment labels
    (e.g. 'full time'), yet this helper is named for education - confirm which
    column it is meant to encode.
    """
    # The membership test gives a bool; int() turns it into the 1/0 flag.
    return int('time' in x)
def convSubjIncome(x):
    """Map a subjective-income label to its rank.

    'struggling' -> 0, 'managing' -> 1, 'comfortable' -> 2; anything else
    yields None.
    """
    # Position in the tuple is the integer code.
    ladder = ('struggling', 'managing', 'comfortable')
    return ladder.index(x) if x in ladder else None
def convGeog(x):
    """Map a geography label to an integer: 'rural' -> 0, 'urban' -> 1.

    Any other value yields None.
    """
    geography_codes = {'rural': 0, 'urban': 1}
    return geography_codes.get(x)
def convReg(x):
    """Map a region label to its integer code (0-17); unknown labels yield None.

    The codes carry no ordering meaning; they are simply the position of the
    label in the tuple below.
    """
    regions = (
        'northern america',            # 0
        'north africa',                # 1
        'middle east',                 # 2
        'south asia',                  # 3
        'southeast asia',              # 4
        'northern europe',             # 5
        'western europe',              # 6
        'southern europe',             # 7
        'eastern europe',              # 8
        'east asia',                   # 9
        'south america',               # 10
        'central america and mexico',  # 11
        'western africa',              # 12
        'eastern africa',              # 13
        'southern africa',             # 14
        'aus/nz',                      # 15
        'central asia',                # 16
        'central africa',              # 17
    )
    for code, label in enumerate(regions):
        if x == label:
            return code
    return None
def convAuthority(x):
    """Map a most-trusted-advice-source label to its integer code (0-5).

    Labels outside the list below yield None.
    """
    # Position in the tuple is the integer code.
    sources = (
        'famous person',
        'religious leader',
        'traditional healer',
        'friend/family',
        'doctor/nurse',
        'An other',
    )
    for code, label in enumerate(sources):
        if x == label:
            return code
    return None
def vaxSafe(x):
    """Return 1 when the answer score x is 1 or 0.75, otherwise 0.

    The score is compared numerically, so both the text forms ('1', '0.75')
    and numeric forms (1, 1.0, 0.75) are recognised. Comparing against the
    strings only would label every row 0 if the column were parsed as numbers.

    Values that cannot be read as a number (None, arbitrary text) yield 0.
    """
    try:
        score = float(x)
    except (TypeError, ValueError):
        return 0
    # 1.0 and 0.75 are exactly representable floats, so equality is safe here.
    return 1 if score in (1.0, 0.75) else 0

In [7]:
#add integer category columns 

# (new column, source column, converter) - applied in this order.
derived_columns = [
    ('int_gender', 'gender', convGen),
    ('int_education', 'education', convEdu),
    ('int_subj_income', 'subjective_income', convSubjIncome),
    ('int_geography', 'geography', convGeog),
    ('int_region', 'region', convReg),
    ('int_advice', 'Q20', convAuthority),
    ('vax_safe', 'Q25', vaxSafe),
]

for new_name, source_name, converter in derived_columns:
    df[new_name] = df[source_name].apply(lambda x: converter(x))

In [8]:
#expect a higher number of rows where vaccines are believed to be safe
#equalise the dataset by randomly selecting from the vax_safe subset a sample equal in size to the number of rows where vaccines are believed to be unsafe

# Fixed seed for the down-sampling below, so the same 'safe' rows are drawn on
# every run instead of a different random subset each time.
BALANCE_SEED = 324

safe = df['vax_safe'] == 1
unsafe = df['vax_safe'] == 0

safe_index = df[safe].index.tolist()
unsafe_index = df[unsafe].index.tolist()

# Draw as many 'safe' rows as there are 'unsafe' rows. random.sample raises
# ValueError if there are fewer 'safe' rows than 'unsafe' rows.
L = len(unsafe_index)
new_index = random.Random(BALANCE_SEED).sample(safe_index, L)

all_index = new_index + unsafe_index

# all_index holds index labels, so select with .loc (label-based), not .iloc
# (position-based). drop=True discards the old labels instead of adding them
# to the frame as an extra column.
df = df.loc[all_index].reset_index(drop=True)

In [9]:
# Model inputs, in the column order handed to the classifier.
feature_columns = [
    'age', 'household_income',
    'Q1', 'Q2', 'Q3', 'Q4',
    'Q5A', 'Q5B', 'Q5C',
    'Q6', 'Q7', 'Q8', 'Q9',
    'Q10A', 'Q10B',
    'Q11A', 'Q11B', 'Q11C', 'Q11D', 'Q11E', 'Q11F', 'Q11G',
    'Q12', 'Q13',
    'Q14A', 'Q14B',
    'Q15A', 'Q15B',
    'Q16', 'Q17', 'Q18', 'Q19',
    'Q21', 'Q22', 'Q23', 'Q24', 'Q26',
    'D1',
    'int_gender', 'int_education', 'int_subj_income', 'int_geography',
    'country_index', 'int_advice',
]

X = df[feature_columns].copy()

# Target is kept as a single-column frame.
y = df[['vax_safe']].copy()

In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=324,
)

In [11]:
# Decision tree limited to 50 leaves, with a fixed random_state.
tree_settings = {'max_leaf_nodes': 50, 'random_state': 0}
vax_classifier = DecisionTreeClassifier(**tree_settings)
vax_classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=50,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [12]:
# Predicted vax_safe label for every held-out row.
predictions = vax_classifier.predict(X=X_test)

In [13]:
# Share of held-out rows whose predicted label matches the true label.
accuracy_score(y_test, predictions)

0.7744961779013204