In [1]:
import os
import numpy as np
from os.path import dirname
from pandas import DataFrame, read_csv
from scipy.stats import ttest_ind
from statsmodels.stats.proportion import proportions_ztest
ROOT_DIR = dirname(dirname(dirname(os.path.realpath('__file__'))))

## Section 1: Experiment 1

In [2]:
## Load data.
demo = read_csv(os.path.join(ROOT_DIR, 'study01', 'data', 's1', 'demographics.csv'))

## Apply rejections.
reject = read_csv(os.path.join(ROOT_DIR, 'study01', 'data', 's1', 'reject.csv'))
demo = demo[demo.subject.isin(reject.query('reject == 0').subject)].reset_index(drop=True)

### 1.1 Gender

In [3]:
## Count responses.
gb = demo.groupby('gender').subject.count().reset_index(name='count')
gb['%'] = gb['count'].transform(lambda x: np.round(x / x.sum() * 100, 1))
print(gb.set_index('gender').sort_values('count', ascending=False))

        count     %
gender             
Man        55  53.4
Woman      47  45.6
Other       1   1.0


### 1.2 Age

In [4]:
demo.age.describe().round(1)

count    103.0
mean      35.5
std       10.3
min       20.0
25%       28.5
50%       33.0
75%       41.0
max       69.0
Name: age, dtype: float64

#### Discretized

In [5]:
## Discretize ages.
demo['age_cat'] = np.digitize(demo.age, [29.5, 39.5, 49.5, 59.5])

## Count responses.
gb = demo.groupby('age_cat').subject.count().reset_index(name='count')
gb['%'] = gb['count'].transform(lambda x: np.round(x / x.sum() * 100, 1))
gb.index = ['18-29', '30-39', '40-49', '50-59', '60 and older']
print(gb[['count','%']])

              count     %
18-29            30  29.1
30-39            43  41.7
40-49            19  18.4
50-59             8   7.8
60 and older      3   2.9


### 1.3 Race & ethnicity

#### Ethnicity

In [6]:
## Count responses.
gb = demo.groupby('ethnicity').subject.count().reset_index(name='count')
gb['%'] = gb['count'].transform(lambda x: np.round(x / x.sum() * 100, 1))
print(gb.set_index('ethnicity').sort_values('count', ascending=False))

                        count     %
ethnicity                          
Not Hispanic or Latino     93  90.3
Hispanic or Latino          9   8.7
Rather not say              1   1.0


#### Race

In [7]:
## Count responses.
gb = demo.groupby('race').subject.count().reset_index(name='count')
gb['%'] = gb['count'].transform(lambda x: np.round(x / x.sum() * 100, 1))
print(gb.set_index('race').sort_values('count', ascending=False))

                           count     %
race                                  
White                         80  77.7
Asian                         10   9.7
Black or African American      9   8.7
Rather not say                 4   3.9


## Section 2: Experiment 2

In [8]:
## Load data.
demo = read_csv(os.path.join(ROOT_DIR, 'study02', 'data', 's1', 'demographics.csv'))

## Apply rejections.
reject = read_csv(os.path.join(ROOT_DIR, 'study02', 'data', 's1', 'reject.csv'))
demo = demo[demo.subject.isin(reject.query('reject == 0').subject)].reset_index(drop=True)

### 2.1 Gender

In [9]:
## Count responses.
gb = demo.groupby('gender').subject.count().reset_index(name='count')
gb['%'] = gb['count'].transform(lambda x: np.round(x / x.sum() * 100, 1))
print(gb.set_index('gender').sort_values('count', ascending=False))

                count     %
gender                     
Man                65  59.1
Woman              43  39.1
Other               1   0.9
Rather not say      1   0.9


### 2.2 Age

In [10]:
demo.age.describe().round(1)

count    110.0
mean      39.6
std       11.0
min       23.0
25%       31.0
50%       37.0
75%       46.0
max       69.0
Name: age, dtype: float64

#### Discretized

In [11]:
## Discretize ages.
demo['age_cat'] = np.digitize(demo.age, [29.5, 39.5, 49.5, 59.5])

## Count responses.
gb = demo.groupby('age_cat').subject.count().reset_index(name='count')
gb['%'] = gb['count'].transform(lambda x: np.round(x / x.sum() * 100, 1))
gb.index = ['18-29', '30-39', '40-49', '50-59', '60 and older']
print(gb[['count','%']])

              count     %
18-29            21  19.1
30-39            42  38.2
40-49            26  23.6
50-59            14  12.7
60 and older      7   6.4


### 2.3 Race & ethnicity

#### Ethnicity

In [12]:
## Count responses.
gb = demo.groupby('ethnicity').subject.count().reset_index(name='count')
gb['%'] = gb['count'].transform(lambda x: np.round(x / x.sum() * 100, 1))
print(gb.set_index('ethnicity').sort_values('count', ascending=False))

                        count     %
ethnicity                          
Not Hispanic or Latino    101  91.8
Hispanic or Latino          8   7.3
Rather not say              1   0.9


#### Race

In [13]:
## Count responses.
gb = demo.groupby('race').subject.count().reset_index(name='count')
gb['%'] = gb['count'].transform(lambda x: np.round(x / x.sum() * 100, 1))
print(gb.set_index('race').sort_values('count', ascending=False))

                                            count     %
race                                                   
['White']                                      91  82.7
['Black or African American']                   8   7.3
['American Indian/Alaska Native', 'White']      3   2.7
['Rather not say']                              3   2.7
['Asian']                                       2   1.8
['Black or African American', 'White']          2   1.8
['Asian', 'White']                              1   0.9


## Section 3: Demographics comparison

In [14]:
## Load demographics (study 1).
d1 = read_csv(os.path.join(ROOT_DIR, 'study01', 'data', 's1', 'demographics.csv'))
reject = read_csv(os.path.join(ROOT_DIR, 'study01', 'data', 's1', 'reject.csv'))
d1 = d1[d1.subject.isin(reject.query('reject == 0').subject)].reset_index(drop=True)

## Load demographics (study 2).
d2 = read_csv(os.path.join(ROOT_DIR, 'study02', 'data', 's1', 'demographics.csv'))
reject = read_csv(os.path.join(ROOT_DIR, 'study02', 'data', 's1', 'reject.csv'))
d2 = d2[d2.subject.isin(reject.query('reject == 0').subject)].reset_index(drop=True)

### 1.1 Gender

In [15]:
## Perform proportions z-test.
zval, pval = proportions_ztest(
    [sum(d1.gender=='Man'), sum(d2.gender=='Man')],
    [len(d1), len(d2)]
)

## Return statistics.
print('z = %0.3f, p = %0.3f' %(zval, pval))

z = -0.837, p = 0.403


### 1.2 Age

In [16]:
## Perform 2-sample t-test.
tval, pval = ttest_ind(d1.age, d2.age)

## Return statistics.
print('t = %0.3f, p = %0.3f' %(zval, pval))

t = -0.837, p = 0.006


### 1.3 Ethnicity

In [17]:
## Perform proportions z-test.
zval, pval = proportions_ztest(
    [sum(d1.ethnicity=='Not Hispanic or Latino'), sum(d2.ethnicity=='Not Hispanic or Latino')],
    [len(d1), len(d2)]
)

## Return statistics.
print('z = %0.3f, p = %0.3f' %(zval, pval))

z = -0.391, p = 0.696


### 1.4 Race

In [18]:
## Unpack responses.
gb = DataFrame(dict(race = [r for resp in d2.race for r in eval(resp)]))

## Perform proportions z-test.
zval, pval = proportions_ztest(
    [sum(d1.race=='White'), sum(gb.race=='White')],
    [len(d1), len(gb)]
)

## Return statistics.
print('z = %0.3f, p = %0.3f' %(zval, pval))

z = -1.116, p = 0.264
