In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf

import pingouin as pg
from scipy.stats import fisher_exact
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind

from matplotlib import ticker

# Render matplotlib figures inline in the notebook output.
%matplotlib inline 

# Print the interpreter version so it is recorded next to the results.
from platform import python_version
print(python_version())

In [None]:
# List the working directory (relies on IPython automagic for %ls) to check which data files are available.
ls

In [None]:
# Load the first survey data set into `df` and preview its last rows.
# The commented-out line is the previously used source file (presumably the
# Malawi psychometrics data, judging by its name — TODO confirm).
#df = pd.read_stata('DatasetPsychometricsMalawi.dta')
df = pd.read_stata('AlexMartinToCheckScores.dta')
df.tail()

In [None]:
# Build the item-level frame `df_clean` from the labelled responses in `df`.
# Every item is scored 0-3. Any response whose label is not listed in the
# item's mapping stays NaN and is treated as missing further down.
df_clean = pd.DataFrame()
df_clean['id'] = df.index

# label -> score mappings, one per question family
confidence_scores = {'Strongly disagree': 0, 'Somewhat disagree': 1,
                     'Somewhat agree': 2, 'Strongly agree': 3}
# The integer key 3 matches responses stored as the bare number 3.
talks_scores = {'No': 0, 3: 3, 'No one': 3,
                'Yes, many people': 0, 'Yes, a few people': 0, 'Yes, one person': 0}
# Concern items are reverse-scored: disagreement scores high.
concerns_scores = {'Strongly disagree': 3, 'Somewhat disagree': 2,
                   'Somewhat agree': 1, 'Strongly agree': 0}
feels_others_scores = {'Strongly disagree': 0, 'Disagree': 1,
                       'Agree': 2, 'Strongly agree': 3}

# question -> mapping; insertion order fixes the column order of df_clean
item_scores = {}
#confidence
item_scores.update(dict.fromkeys(
    ['sec3_q24', 'sec3_q25', 'sec3_q30', 'sec3_q31', 'sec3_32', 'sec3_q37'], confidence_scores))
#talks ('sec5_q57' is deliberately left out)
item_scores.update(dict.fromkeys(['sec5_q51', 'sec5_q53', 'sec5_q58'], talks_scores))
#concerns
item_scores.update(dict.fromkeys(
    ['sec3_q33', 'sec3_q34', 'sec3_q35', 'sec3_q36'], concerns_scores))
#feels others
item_scores.update(dict.fromkeys(['sec5_q56'], feels_others_scores))

for col, scores in item_scores.items():
    # astype(object) so the lookup works on plain values whether or not the
    # source column is categorical; unmapped labels become NaN.
    df_clean[col] = df[col].astype(object).map(scores).astype(float)

#outcome
df_clean['anydoses'] = df['anydoses']

# Keep only respondents with a known outcome. drop=True discards the old
# index instead of inserting it as an 'index' column: that column would later
# be cast to category and enter the imputation model as a predictor, although
# it is only a row identifier ('id' already records the original row).
df_clean = df_clean[df_clean['anydoses'].notna()].reset_index(drop=True).copy()

df_clean.tail()

In [None]:
# Number of rows currently in df_clean.
len(df_clean)

In [None]:
# Missing data by question: count of NaN values in each column of df_clean.
df_clean.isna().sum()

In [None]:
# How many questions did each person skip?
# Output index = number of NaN values in a row, value = number of rows with that count.
pd.DataFrame(df_clean.isna().sum(axis=1)).groupby(0).size()

In [None]:
# Number of observations (rows) with at least one missing value.
df_clean.id[df_clean.isna().sum(axis=1)>0].count()

In [None]:
# NOTE(review): hard-coded ratio, presumably (rows with any missing value) / (total rows)
# copied by hand from the outputs above — confirm the numbers still match the data.
30/253

In [None]:
# Store every column except the respondent id as a pandas categorical,
# then preview the converted columns.
categorical_cols = df_clean.columns.drop('id')
df_clean = df_clean.astype({name: 'category' for name in categorical_cols})

df_clean[categorical_cols].tail()

In [None]:
# (rows, columns) of the cleaned frame.
df_clean.shape

In [None]:
# Number of rows for each value of the outcome column `anydoses`.
df_clean.groupby('anydoses').size()

In [None]:
import miceforest as mf

# Columns that go into the imputation model. Row identifiers are excluded:
# 'id' as before, and also 'index' (created when reset_index() is called
# without drop=True), which would otherwise be treated as a categorical
# predictor with one level per row.
imputation_cols = df_clean.columns.drop(['id', 'index'], errors='ignore')

# Make sure every modelled column is categorical (a no-op if already cast).
for col in imputation_cols:
    df_clean[col] = df_clean[col].astype('category')

# Number of imputed data sets; the pooling cells below reuse this value.
num_datasets = 20

kernel = mf.ImputationKernel(
  df_clean[imputation_cols],
  num_datasets=num_datasets,
  random_state=1991,        # fixed seed so the imputations are reproducible
  mean_match_candidates=0   # Skip mean matching
)

# Run 3 MICE iterations.
kernel.mice(3)

In [None]:
# Inspect the first completed (imputed) data set.
kernel.complete_data(dataset=0)

In [None]:
# average scores
#
# Per imputed data set: rebuild the composite scores from the item columns,
# then store the mean (coefs) and the standard error of the mean (errs) of
# every column named in `cols`. Row d of each frame belongs to data set d.

cols = ['motivationscore',
        'confidence', 'sec3_q24', 'sec3_q25', 'sec3_q30', 'sec3_q31', 'sec3_32', 'sec3_q37',
        'talks', 'sec5_q51', 'sec5_q53', 'sec5_q58',  # 'sec5_q57' is left out
        'concerns', 'sec3_q33', 'sec3_q34', 'sec3_q35', 'sec3_q36',
        'feelsothers']

confidence_items = ['sec3_q24', 'sec3_q25', 'sec3_q30', 'sec3_q31', 'sec3_32', 'sec3_q37']
talks_items = ['sec5_q51', 'sec5_q53', 'sec5_q58']
concerns_items = ['sec3_q33', 'sec3_q34', 'sec3_q35', 'sec3_q36']
subscales = ['confidence', 'talks', 'concerns', 'feelsothers']

mean_rows = []
sem_rows = []
for d in range(num_datasets):
    # Cast every column to integer so the scores can be summed.
    df_imputed = kernel.complete_data(dataset=d).astype('int')

    df_imputed['confidence'] = df_imputed[confidence_items].sum(axis=1)
    df_imputed['talks'] = df_imputed[talks_items].sum(axis=1)
    df_imputed['concerns'] = df_imputed[concerns_items].sum(axis=1)
    df_imputed['feelsothers'] = df_imputed['sec5_q56']
    df_imputed['motivationscore'] = df_imputed[subscales].sum(axis=1)

    mean_rows.append([df_imputed[name].mean() for name in cols])
    sem_rows.append([df_imputed[name].sem() for name in cols])

coefs = pd.DataFrame(mean_rows, columns=cols)
errs = pd.DataFrame(sem_rows, columns=cols)

In [None]:
# Pool the per-data-set means with Rubin's rules and show "mean ± 95% half-width".
#   W = mean of the squared standard errors       (within-imputation variance)
#   B = sample variance of the per-data-set means (between-imputation variance)
#   T = W + (1 + 1/m) * B                         (total variance), m = num_datasets
# The half-width uses the normal approximation (1.96 * sqrt(T)).
within_var = (errs ** 2).mean()
between_var = coefs.var()  # pandas uses ddof=1, so this already divides by m - 1
stderr_pooled = (within_var + (1 + 1/num_datasets) * between_var) ** 0.5
pd.DataFrame(round(coefs.mean(), 2).astype(str) + ' ± ' + round(stderr_pooled * 1.96, 2).astype(str))

In [None]:
# odds ratios
#
# Per imputed data set, fit one logistic regression of `anydoses` on each
# column in `cols` (plus an intercept) and keep the slope (coefs) and its
# standard error (errs). Row d of each frame belongs to data set d.

cols = ['motivationscore',
        'confidence', 'sec3_q24', 'sec3_q25', 'sec3_q30', 'sec3_q31', 'sec3_32', 'sec3_q37',
        'talks', 'sec5_q51', 'sec5_q53', 'sec5_q58',  # 'sec5_q57' is left out
        'concerns', 'sec3_q33', 'sec3_q34', 'sec3_q35', 'sec3_q36',
        'feelsothers']

confidence_items = ['sec3_q24', 'sec3_q25', 'sec3_q30', 'sec3_q31', 'sec3_32', 'sec3_q37']
talks_items = ['sec5_q51', 'sec5_q53', 'sec5_q58']
concerns_items = ['sec3_q33', 'sec3_q34', 'sec3_q35', 'sec3_q36']
subscales = ['confidence', 'talks', 'concerns', 'feelsothers']

coef_rows = []
se_rows = []
for d in range(num_datasets):
    # Cast every column to integer so the scores can be summed and modelled.
    df_imputed = kernel.complete_data(dataset=d).astype('int')

    df_imputed['const'] = 1  # intercept column for sm.Logit
    df_imputed['confidence'] = df_imputed[confidence_items].sum(axis=1)
    df_imputed['talks'] = df_imputed[talks_items].sum(axis=1)
    df_imputed['concerns'] = df_imputed[concerns_items].sum(axis=1)
    df_imputed['feelsothers'] = df_imputed['sec5_q56']
    df_imputed['motivationscore'] = df_imputed[subscales].sum(axis=1)

    y = df_imputed['anydoses']
    slopes = []
    slope_ses = []
    for col in cols:
        X = df_imputed[['const', col]]
        res = sm.Logit(y, X).fit(maxiter = 100)
        # Position 0 is the intercept, position 1 the predictor.
        slopes.append(res.params.iloc[1])
        slope_ses.append(res.bse.iloc[1])

    coef_rows.append(slopes)
    se_rows.append(slope_ses)

coefs = pd.DataFrame(coef_rows, columns=cols)
errs = pd.DataFrame(se_rows, columns=cols)

In [None]:
import scipy.stats as st

# Pool the log-odds coefficients across imputations with Rubin's rules:
#   W = mean of the squared standard errors  (within-imputation variance)
#   B = sample variance of the coefficients  (between-imputation variance)
#   T = W + (1 + 1/m) * B                    (total variance), m = num_datasets
coef_mi = coefs.mean()
within_var = (errs ** 2).mean()
between_var = coefs.var()  # pandas uses ddof=1, so this already divides by m - 1
stderr_mi = (within_var + (1 + 1/num_datasets) * between_var) ** 0.5

# Odds ratio with a 95% confidence interval (normal approximation on the log-odds scale).
odds_ratio = np.round(np.exp(coef_mi), 2).astype(str)
ci_low = np.round(np.exp(coef_mi - 1.96 * stderr_mi), 2).astype(str)
ci_high = np.round(np.exp(coef_mi + 1.96 * stderr_mi), 2).astype(str)
df_ans = pd.DataFrame(odds_ratio + ' [' + ci_low + ', ' + ci_high + ']', columns = ['OR'])

# Two-sided p-value; sf() keeps precision in the tail where 1 - cdf() rounds to 0.
df_ans['p'] = np.round(2 * st.norm.sf(np.abs(coef_mi/stderr_mi)), 4)
df_ans

In [None]:
# Kenya data: the cells below repeat the cleaning, imputation and pooling steps on the Kenya survey.

In [None]:
# List the working directory (relies on IPython automagic for %ls) to check which data files are available.
ls

In [None]:
# Replace `df` with the second survey data set and preview its last rows.
df = pd.read_stata('hpvaxdataset20230113_1347.dta')
df.tail()

In [None]:
# Column dtypes of the raw frame.
df.dtypes

In [None]:
# Full list of column names in the raw frame.
list(df.columns)

In [None]:
# Extra questions of interest:
#'sec5_q64', 'sec5_q63', 'sec6_q73', 'sec6_q74', 'sec3_q33', 'sec5_q60', 'sec5_q62', 'Correct_A_q83'
# Count rows per response value (missing values included) for one of them.
df.groupby('Correct_A_q83', dropna=False, observed=True).size()

In [None]:
# Build the item-level frame `df_clean` from the labelled responses in `df`.
# Response labels are compared after dropping their first three characters
# (presumably a numeric code prefix such as "1. " — TODO confirm). Any label
# not listed in an item's mapping stays NaN and is treated as missing.
df_clean = pd.DataFrame()
df_clean['id'] = df.index
df_clean['caseid'] = df.caseid

# label -> score mappings, one per question family
confidence_scores = {'Strongly disagree': 0, 'Somewhat disagree': 1,
                     'Somewhat agree': 2, 'Strongly agree': 3}
talks_scores = {'No': 0, 'Yes': 3, 'No one': 3,
                'Yes, many people': 0, 'Yes, a few people': 0, 'Yes, one person': 0}
# Concern items are reverse-scored: disagreement scores high.
concerns_scores = {'Strongly disagree': 3, 'Somewhat disagree': 2,
                   'Somewhat agree': 1, 'Strongly agree': 0}
feels_others_scores = {'Strongly disagree': 0, 'Disagree': 1,
                       'Agree': 2, 'Strongly agree': 3}

# question -> mapping; insertion order fixes the column order of df_clean
item_scores = {}
#confidence
item_scores.update(dict.fromkeys(
    ['sec3_q28', 'sec3_q29', 'sec3_q36', 'sec3_q37', 'sec3_q38', 'sec3_q45'], confidence_scores))
#talks ('sec5_q65' is deliberately left out)
item_scores.update(dict.fromkeys(['sec5_q59', 'sec5_q61', 'sec5_q66'], talks_scores))
#concerns
item_scores.update(dict.fromkeys(
    ['sec3_q39', 'sec3_q40', 'sec3_q41', 'sec3_q42'], concerns_scores))
#feels others
item_scores.update(dict.fromkeys(['sec5_q64'], feels_others_scores))

for col, scores in item_scores.items():
    df_clean[col] = df[col].str[3:].map(scores).astype(float)

#outcome
df_clean['anydoses'] = df['AnyHPVdoses'].str[3:].map({'No': 0, 'Yes': 1}).astype(float)

# extra questions: 'sec5_q64', 'sec5_q63', 'sec6_q73', 'sec6_q74', 'sec3_q33', 'sec5_q60', 'sec5_q62', 'Correct_A_q83'
extra_cols = ['sec5_q64', 'sec5_q63', 'sec6_q73', 'sec6_q74', 'sec3_q33', 'sec5_q60', 'sec5_q62', 'Correct_A_q83']
# One shared mapping covers the different answer scales used by these questions.
extra_scores = {
    'Strongly disagree': 3, 'Disagree': 2, 'Somewhat disagree': 2,
    'Agree': 1, 'Somewhat agree': 1, 'Strongly agree': 0,
    'Alot': 3, 'Some': 2, 'Not much': 1, 'Not at all': 0,
    'Not at all comfortable': 0, 'Somewhat comfortable': 1, 'Very comfortable': 2,
    'No': 0, 'Yes': 1,
}
for col in extra_cols:
    if col in item_scores:
        # 'sec5_q64' is the "feels others" item and is already scored above with
        # agreement scoring high. Re-scoring it here used the reversed scale and
        # silently flipped the feelsothers component of the motivation score.
        continue
    df_clean[col] = df[col].str[3:].map(extra_scores).astype(float)

# Unanswered 'Correct_A_q83' counts as 0. Done explicitly by name rather than
# through the loop variable left over after the loop has finished.
df_clean['Correct_A_q83'] = df_clean['Correct_A_q83'].fillna(0)

df_clean.tail()

In [None]:
# Column dtypes of the cleaned frame.
df_clean.dtypes

In [None]:
# Number of rows currently in df_clean.
len(df_clean)

In [None]:
# Missing data by question: count of NaN values in each column of df_clean.
df_clean.isna().sum()

In [None]:
# How many questions did each person skip?
# Output index = number of NaN values in a row, value = number of rows with that count.
pd.DataFrame(df_clean.isna().sum(axis=1)).groupby(0).size()

In [None]:
# Number of observations (rows) with at least one missing value.
df_clean.id[df_clean.isna().sum(axis=1)>0].count()

In [None]:
# Show the rows that have more than 14 missing values.
df_clean[df_clean.isna().sum(axis=1)>14]

In [None]:
# Exclude the 2 people who skipped 15 questions: keep only rows with at most
# 14 missing values (the threshold is hard-coded).
df_clean = df_clean[df_clean.isna().sum(axis=1)<=14].copy()

In [None]:
# Sanity check: no rows with more than 14 missing values should remain (expect an empty frame).
df_clean[df_clean.isna().sum(axis=1)>14]

In [None]:
# Renumber the rows 0..n-1. drop=True discards the old index instead of
# inserting it as an 'index' column: that column would later be cast to
# category and enter the imputation model as a predictor ('id' already keeps
# the original row number). Rebinding instead of inplace=True also keeps the
# cell safe to re-run.
df_clean = df_clean.reset_index(drop=True)

In [None]:
# Store every column except the respondent id as a pandas categorical,
# then preview the converted columns.
categorical_cols = df_clean.columns.drop('id')
df_clean = df_clean.astype({name: 'category' for name in categorical_cols})

df_clean[categorical_cols].tail()

In [None]:
# NOTE(review): hard-coded ratio, presumably (rows with any missing value) / (total rows)
# copied by hand from the outputs above — confirm the numbers still match the data.
369/1345

In [None]:
# Number of rows in df_clean at this point.
len(df_clean)

In [None]:
import miceforest as mf

# sec3_q28 is filled with a constant instead of being modelled.
# NOTE(review): the original comment said "it's almost always 0, so just
# impute 0", but the code fills 3.0 — confirm which value is intended.
df_clean['sec3_q28'] = df_clean['sec3_q28'].fillna(3.0)

# Columns that go into the imputation model. Identifier columns are excluded:
# 'id' as before, plus 'index' (created by reset_index() without drop=True)
# and 'caseid', which would otherwise act as categorical predictors that
# identify rows rather than describe answers.
imputation_cols = df_clean.columns.drop(['id', 'index', 'caseid'], errors='ignore')

# Make sure every modelled column is categorical (a no-op if already cast).
for col in imputation_cols:
    df_clean[col] = df_clean[col].astype('category')

# Number of imputed data sets; the pooling cells below reuse this value.
num_datasets = 20

kernel = mf.ImputationKernel(
  df_clean[imputation_cols],
  num_datasets=num_datasets,
  random_state=1991,        # fixed seed so the imputations are reproducible
  mean_match_candidates=0   # Skip mean matching
)

# Run 3 MICE iterations.
kernel.mice(3)

In [None]:
# Inspect the first completed (imputed) data set.
kernel.complete_data(dataset=0)

In [None]:
#### average scores
#
# Per imputed data set: rebuild the composite scores from the item columns,
# then store the mean (coefs) and the standard error of the mean (errs) of
# every column named in `cols`. Row d of each frame belongs to data set d.

cols = ['motivationscore',
        'confidence', 'sec3_q28', 'sec3_q29', 'sec3_q36', 'sec3_q37', 'sec3_q38', 'sec3_q45',
        'talks', 'sec5_q59', 'sec5_q61', 'sec5_q66',  # 'sec5_q65' is left out
        'concerns', 'sec3_q39', 'sec3_q40', 'sec3_q41', 'sec3_q42',
        'feelsothers',
        'sec5_q64', 'sec5_q63', 'sec6_q73', 'sec6_q74', 'sec3_q33', 'sec5_q60', 'sec5_q62', 'Correct_A_q83']

confidence_items = ['sec3_q28', 'sec3_q29', 'sec3_q36', 'sec3_q37', 'sec3_q38', 'sec3_q45']
talks_items = ['sec5_q59', 'sec5_q61', 'sec5_q66']
concerns_items = ['sec3_q39', 'sec3_q40', 'sec3_q41', 'sec3_q42']
subscales = ['confidence', 'talks', 'concerns', 'feelsothers']

mean_rows = []
sem_rows = []
for d in range(num_datasets):
    # Cast every column to integer so the scores can be summed.
    df_imputed = kernel.complete_data(dataset=d).astype('int')

    df_imputed['confidence'] = df_imputed[confidence_items].sum(axis=1)
    df_imputed['talks'] = df_imputed[talks_items].sum(axis=1)
    df_imputed['concerns'] = df_imputed[concerns_items].sum(axis=1)
    df_imputed['feelsothers'] = df_imputed['sec5_q64']
    df_imputed['motivationscore'] = df_imputed[subscales].sum(axis=1)

    mean_rows.append([df_imputed[name].mean() for name in cols])
    sem_rows.append([df_imputed[name].sem() for name in cols])

coefs = pd.DataFrame(mean_rows, columns=cols)
errs = pd.DataFrame(sem_rows, columns=cols)

In [None]:
# Pool the per-data-set means with Rubin's rules and show "mean ± 95% half-width".
#   W = mean of the squared standard errors       (within-imputation variance)
#   B = sample variance of the per-data-set means (between-imputation variance)
#   T = W + (1 + 1/m) * B                         (total variance), m = num_datasets
# The half-width uses the normal approximation (1.96 * sqrt(T)).
within_var = (errs ** 2).mean()
between_var = coefs.var()  # pandas uses ddof=1, so this already divides by m - 1
stderr_pooled = (within_var + (1 + 1/num_datasets) * between_var) ** 0.5
pd.DataFrame(round(coefs.mean(), 2).astype(str) + ' ± ' + round(stderr_pooled * 1.96, 2).astype(str))

In [None]:
# odds ratios
#
# Per imputed data set, fit one logistic regression of `anydoses` on each
# column in `cols` (plus an intercept) and keep the slope (coefs) and its
# standard error (errs). Row d of each frame belongs to data set d.

cols = ['motivationscore',
        'confidence', 'sec3_q28', 'sec3_q29', 'sec3_q36', 'sec3_q37', 'sec3_q38', 'sec3_q45',
        'talks', 'sec5_q59', 'sec5_q61', 'sec5_q66',  # 'sec5_q65' is left out
        'concerns', 'sec3_q39', 'sec3_q40', 'sec3_q41', 'sec3_q42',
        'feelsothers',
        'sec5_q64', 'sec5_q63', 'sec6_q73', 'sec6_q74', 'sec3_q33', 'sec5_q60', 'sec5_q62', 'Correct_A_q83']

confidence_items = ['sec3_q28', 'sec3_q29', 'sec3_q36', 'sec3_q37', 'sec3_q38', 'sec3_q45']
talks_items = ['sec5_q59', 'sec5_q61', 'sec5_q66']
concerns_items = ['sec3_q39', 'sec3_q40', 'sec3_q41', 'sec3_q42']
subscales = ['confidence', 'talks', 'concerns', 'feelsothers']

coef_rows = []
se_rows = []
for d in range(num_datasets):
    # Cast every column to integer so the scores can be summed and modelled.
    df_imputed = kernel.complete_data(dataset=d).astype('int')

    df_imputed['const'] = 1  # intercept column for sm.Logit
    df_imputed['confidence'] = df_imputed[confidence_items].sum(axis=1)
    df_imputed['talks'] = df_imputed[talks_items].sum(axis=1)
    df_imputed['concerns'] = df_imputed[concerns_items].sum(axis=1)
    df_imputed['feelsothers'] = df_imputed['sec5_q64']
    df_imputed['motivationscore'] = df_imputed[subscales].sum(axis=1)

    y = df_imputed['anydoses']
    slopes = []
    slope_ses = []
    for col in cols:
        X = df_imputed[['const', col]]
        res = sm.Logit(y, X).fit(maxiter = 100)
        # Position 0 is the intercept, position 1 the predictor.
        slopes.append(res.params.iloc[1])
        slope_ses.append(res.bse.iloc[1])

    coef_rows.append(slopes)
    se_rows.append(slope_ses)

coefs = pd.DataFrame(coef_rows, columns=cols)
errs = pd.DataFrame(se_rows, columns=cols)

In [None]:
import scipy.stats as st

# Pool the log-odds coefficients across imputations with Rubin's rules:
#   W = mean of the squared standard errors  (within-imputation variance)
#   B = sample variance of the coefficients  (between-imputation variance)
#   T = W + (1 + 1/m) * B                    (total variance), m = num_datasets
coef_mi = coefs.mean()
within_var = (errs ** 2).mean()
between_var = coefs.var()  # pandas uses ddof=1, so this already divides by m - 1
stderr_mi = (within_var + (1 + 1/num_datasets) * between_var) ** 0.5

# Odds ratio with a 95% confidence interval (normal approximation on the log-odds scale).
odds_ratio = np.round(np.exp(coef_mi), 2).astype(str)
ci_low = np.round(np.exp(coef_mi - 1.96 * stderr_mi), 2).astype(str)
ci_high = np.round(np.exp(coef_mi + 1.96 * stderr_mi), 2).astype(str)
df_ans = pd.DataFrame(odds_ratio + ' [' + ci_low + ', ' + ci_high + ']', columns = ['OR'])

# Two-sided p-value; sf() keeps precision in the tail where 1 - cdf() rounds to 0.
df_ans['p'] = np.round(2 * st.norm.sf(np.abs(coef_mi/stderr_mi)), 4)
df_ans

In [None]:
# Preview the last rows of the cleaned frame before joining the follow-up data.
df_clean.tail()

In [None]:
# List the working directory (relies on IPython automagic for %ls) to check which data files are available.
ls

In [None]:
# Load the follow-up spreadsheet; it is joined to df_clean on the case id below.
df_new = pd.read_excel('parent_vax_followup2024.xlsx')

In [None]:
# Integer copy of the case id, used as the left-hand merge key.
df_clean['caseid2'] = df_clean.caseid.astype(int)

In [None]:
# Columns carried into the follow-up analysis: the scored items plus the
# baseline outcome ('anydoses') and the follow-up outcome ('vax_any').
cols = [
    'sec3_q28', 'sec3_q29', 'sec3_q36', 'sec3_q37', 'sec3_q38', 'sec3_q45',
    'sec5_q59', 'sec5_q61', 'sec5_q66',  # 'sec5_q65' is left out
    'sec3_q39', 'sec3_q40', 'sec3_q41', 'sec3_q42',
    'sec5_q64', 'anydoses', 'vax_any',
]

# Inner join on the case id. Both frames have a 'caseid' column, so the
# baseline one comes out as 'caseid_x'.
merged = df_clean.merge(df_new, left_on='caseid2', right_on='caseid')
df_two = merged.loc[:, ['caseid_x', *cols]].copy()

# Preview the respondents whose baseline `anydoses` is 0.
df_two.loc[df_two['anydoses'] == 0].tail()

In [None]:
# Follow-up analysis subset: respondents whose baseline `anydoses` is 0.
df_clean_2 = df_two[df_two.anydoses==0].copy()

In [None]:
# Number of respondents in the follow-up subset.
len(df_clean_2)

In [None]:
# Export the case ids with both outcomes, renamed to anydoses_2022 (baseline
# `anydoses`) and anydoses_2024 (follow-up `vax_any`). The frame index is written too.
df_clean_2[['caseid_x', 'anydoses', 'vax_any']].rename(columns={'caseid_x':'caseid', 'anydoses':'anydoses_2022', 'vax_any':'anydoses_2024'}).to_csv('Kenya2024_caseids.csv')

In [None]:
import miceforest as mf

# sec3_q28 is filled with a constant instead of being modelled.
# NOTE(review): the original comment said "it's almost always 0, so just
# impute 0", but the code fills 3.0 — confirm which value is intended.
df_clean_2['sec3_q28'] = df_clean_2['sec3_q28'].fillna(3.0)

# Cast the follow-up subset itself. The previous version assigned the cast
# columns into df_clean and built the kernel on df_clean: index alignment then
# attached follow-up values to unrelated baseline rows, and every respondent
# outside the subset (including their vax_any outcome) was imputed.
followup_cols = df_clean_2.columns.drop('caseid_x')
for col in followup_cols:
    df_clean_2[col] = df_clean_2[col].astype('category')

# Number of imputed data sets; the pooling cells below reuse this value.
num_datasets = 20

kernel = mf.ImputationKernel(
  # Renumber rows 0..n-1: the subset keeps the gaps left by the anydoses filter.
  df_clean_2[followup_cols].reset_index(drop=True),
  num_datasets=num_datasets,
  random_state=1991,        # fixed seed so the imputations are reproducible
  mean_match_candidates=0   # Skip mean matching
)

# Run 3 MICE iterations.
kernel.mice(3)

In [None]:
# Inspect the first completed (imputed) data set.
kernel.complete_data(dataset=0)

In [None]:
#### average scores
#
# Per imputed data set: rebuild the composite scores from the item columns,
# then store the mean (coefs) and the standard error of the mean (errs) of
# every column named in `cols`. Row d of each frame belongs to data set d.
# The extra questions ('sec5_q64', 'sec5_q63', 'sec6_q73', 'sec6_q74',
# 'sec3_q33', 'sec5_q60', 'sec5_q62', 'Correct_A_q83') are not reported here.

cols = ['motivationscore',
        'confidence', 'sec3_q28', 'sec3_q29', 'sec3_q36', 'sec3_q37', 'sec3_q38', 'sec3_q45',
        'talks', 'sec5_q59', 'sec5_q61', 'sec5_q66',  # 'sec5_q65' is left out
        'concerns', 'sec3_q39', 'sec3_q40', 'sec3_q41', 'sec3_q42',
        'feelsothers']

confidence_items = ['sec3_q28', 'sec3_q29', 'sec3_q36', 'sec3_q37', 'sec3_q38', 'sec3_q45']
talks_items = ['sec5_q59', 'sec5_q61', 'sec5_q66']
concerns_items = ['sec3_q39', 'sec3_q40', 'sec3_q41', 'sec3_q42']
subscales = ['confidence', 'talks', 'concerns', 'feelsothers']

mean_rows = []
sem_rows = []
for d in range(num_datasets):
    # Cast every column to integer so the scores can be summed.
    df_imputed = kernel.complete_data(dataset=d).astype('int')

    df_imputed['confidence'] = df_imputed[confidence_items].sum(axis=1)
    df_imputed['talks'] = df_imputed[talks_items].sum(axis=1)
    df_imputed['concerns'] = df_imputed[concerns_items].sum(axis=1)
    df_imputed['feelsothers'] = df_imputed['sec5_q64']
    df_imputed['motivationscore'] = df_imputed[subscales].sum(axis=1)

    mean_rows.append([df_imputed[name].mean() for name in cols])
    sem_rows.append([df_imputed[name].sem() for name in cols])

coefs = pd.DataFrame(mean_rows, columns=cols)
errs = pd.DataFrame(sem_rows, columns=cols)

In [None]:
# Pool the per-data-set means with Rubin's rules and show "mean ± 95% half-width".
#   W = mean of the squared standard errors       (within-imputation variance)
#   B = sample variance of the per-data-set means (between-imputation variance)
#   T = W + (1 + 1/m) * B                         (total variance), m = num_datasets
# The half-width uses the normal approximation (1.96 * sqrt(T)).
within_var = (errs ** 2).mean()
between_var = coefs.var()  # pandas uses ddof=1, so this already divides by m - 1
stderr_pooled = (within_var + (1 + 1/num_datasets) * between_var) ** 0.5
pd.DataFrame(round(coefs.mean(), 2).astype(str) + ' ± ' + round(stderr_pooled * 1.96, 2).astype(str))

In [None]:
# odds ratios
#
# Per imputed data set, fit one logistic regression of the follow-up outcome
# `vax_any` on each column in `cols` (plus an intercept) and keep the slope
# (coefs) and its standard error (errs). Row d belongs to data set d.
# The extra questions ('sec5_q64', 'sec5_q63', 'sec6_q73', 'sec6_q74',
# 'sec3_q33', 'sec5_q60', 'sec5_q62', 'Correct_A_q83') are not modelled here.

cols = ['motivationscore',
        'confidence', 'sec3_q28', 'sec3_q29', 'sec3_q36', 'sec3_q37', 'sec3_q38', 'sec3_q45',
        'talks', 'sec5_q59', 'sec5_q61', 'sec5_q66',  # 'sec5_q65' is left out
        'concerns', 'sec3_q39', 'sec3_q40', 'sec3_q41', 'sec3_q42',
        'feelsothers']

confidence_items = ['sec3_q28', 'sec3_q29', 'sec3_q36', 'sec3_q37', 'sec3_q38', 'sec3_q45']
talks_items = ['sec5_q59', 'sec5_q61', 'sec5_q66']
concerns_items = ['sec3_q39', 'sec3_q40', 'sec3_q41', 'sec3_q42']
subscales = ['confidence', 'talks', 'concerns', 'feelsothers']

coef_rows = []
se_rows = []
for d in range(num_datasets):
    # Cast every column to integer so the scores can be summed and modelled.
    df_imputed = kernel.complete_data(dataset=d).astype('int')

    df_imputed['const'] = 1  # intercept column for sm.Logit
    df_imputed['confidence'] = df_imputed[confidence_items].sum(axis=1)
    df_imputed['talks'] = df_imputed[talks_items].sum(axis=1)
    df_imputed['concerns'] = df_imputed[concerns_items].sum(axis=1)
    df_imputed['feelsothers'] = df_imputed['sec5_q64']
    df_imputed['motivationscore'] = df_imputed[subscales].sum(axis=1)

    y = df_imputed['vax_any']
    slopes = []
    slope_ses = []
    for col in cols:
        X = df_imputed[['const', col]]
        res = sm.Logit(y, X).fit(maxiter = 100)
        # Position 0 is the intercept, position 1 the predictor.
        slopes.append(res.params.iloc[1])
        slope_ses.append(res.bse.iloc[1])

    coef_rows.append(slopes)
    se_rows.append(slope_ses)

coefs = pd.DataFrame(coef_rows, columns=cols)
errs = pd.DataFrame(se_rows, columns=cols)

In [None]:
import scipy.stats as st

# Pool the log-odds coefficients across imputations with Rubin's rules:
#   W = mean of the squared standard errors  (within-imputation variance)
#   B = sample variance of the coefficients  (between-imputation variance)
#   T = W + (1 + 1/m) * B                    (total variance), m = num_datasets
coef_mi = coefs.mean()
within_var = (errs ** 2).mean()
between_var = coefs.var()  # pandas uses ddof=1, so this already divides by m - 1
stderr_mi = (within_var + (1 + 1/num_datasets) * between_var) ** 0.5

# Odds ratio with a 95% confidence interval (normal approximation on the log-odds scale).
odds_ratio = np.round(np.exp(coef_mi), 2).astype(str)
ci_low = np.round(np.exp(coef_mi - 1.96 * stderr_mi), 2).astype(str)
ci_high = np.round(np.exp(coef_mi + 1.96 * stderr_mi), 2).astype(str)
df_ans = pd.DataFrame(odds_ratio + ' [' + ci_low + ', ' + ci_high + ']', columns = ['OR'])

# Two-sided p-value; sf() keeps precision in the tail where 1 - cdf() rounds to 0.
df_ans['p'] = np.round(2 * st.norm.sf(np.abs(coef_mi/stderr_mi)), 4)
df_ans

In [None]:
# Join the follow-up subset back to the raw frame in `df` on the case id and
# count the rows per value of the follow-up outcome `vax_any`.
pd.merge(df_clean_2, df, left_on='caseid_x', right_on='caseid').groupby('vax_any').size()

In [None]:
# NOTE(review): hand-entered sum, presumably of the group sizes shown above —
# confirm the numbers still match the data.
111+1+2+15+30

In [None]:
# First 25 fields of the first row of the raw frame.
df.iloc[0][0:25]