# long_covid

In [1]:
# Don't change this cell; just run it.
import numpy as np  # The array library.
import pandas as pd  # A library for loading tables.

# The OKpy testing system.
from client.api.notebook import Notebook
# `ok` is the OKpy Notebook object created from 'long_covid.ok'; the final
# cell of this notebook calls `ok.grade(...)` on it to run the tests.
ok = Notebook('long_covid.ok')

Assignment: long covid
OK, version v1.18.1



## Loading the data

These data come from [Matta et al.'s supplementary
material](https://cdn.jamanetwork.com/ama/content_public/journal/intemed/938839/ioi210066supp1_prod_1647627389.20501.pdf?Expires=1672986105&Signature=POBPWVqfFjiQjfJkNYkEORFAl94cWc-C5BSowE2m6lh2EpSBvooGrnzZyrUSQmo0d3-D~L5dtSloJ8Jzq~0KncVeLzwsot3MPYt5PEhqfT0aWEReWT1jV6XTn9MTc2rQjND7OmTjKyvvnxWMjV-bZ4lN8zCsWeMqLjqL859EVrtPfp5eTTSUwdA0qJIhMnAZyBC6iIajkdS-Jj4JYBaYoLTT7iloZEdqPhHzc95VvuiLpJGojbtoMqDYeqrGaSwoYimHRuc4asyPXqvhdcd-svUchv-SInYT6G~yk80EoEI6pLwDAof04ekVWwKAcZNCCdyItXfn07rB3ok9QhRx9g__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA)

In [2]:
# Read the table of counts, using the self-rated health score as the row
# index.  Each remaining column is one belief / serology combination.
df = pd.read_csv('matta_table_e4.csv', index_col='Self-rated health')
df

Unnamed: 0_level_0,"Belief-,Serology-","Belief+,Serology-","Belief-,Serology+","Belief+,Serology+"
Self-rated health,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3901,60,102,72
2,12547,234,346,226
3,5454,114,121,94
4,1627,23,33,33
5,776,11,15,11
6,523,10,9,12
7,192,3,6,3
8,61,1,0,0


In [3]:
# Expand the table of counts into one row per patient.  Each cell of `df`
# holds the number of patients with a given health rating, belief and
# serology result; `patients` repeats that combination `count` times.
char2code = {'-': 0, '+': 1}
# One (health, belief, serology, count) record per cell of the table, in the
# same health -> belief -> serology order as a triple nested loop would give.
cells = pd.DataFrame(
    [(health,
      char2code[belief_char],
      char2code[sero_char],
      df.loc[health, f'Belief{belief_char},Serology{sero_char}'])
     for health in range(1, 9)
     for belief_char in '-+'
     for sero_char in '-+'],
    columns=['health', 'belief', 'serology', 'count'])
# Repeat each record `count` times in one vectorized step.  Cells with a
# count of zero simply contribute no rows, so no empty frames are ever
# concatenated (which pandas warns about and which can upcast dtypes).
patients = (cells.loc[cells.index.repeat(cells['count']),
                      ['health', 'belief', 'serology']]
            .reset_index(drop=True)
            .astype(int))
patients

Unnamed: 0,health,belief,serology
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
26615,8,0,0
26616,8,0,0
26617,8,0,0
26618,8,0,0


In [4]:
# Mean self-rated health for each belief / serology combination.
by_belief_and_serology = patients.groupby(['belief', 'serology'])
by_belief_and_serology.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,health
belief,serology,Unnamed: 2_level_1
0,0,2.420757
0,1,2.310127
1,0,2.425439
1,1,2.407982


In [5]:
# Health ratings split by belief code (0 = '-' column, 1 = '+' column).
belief_codes = patients['belief']
health_ratings = patients['health']
belief_no = health_ratings[belief_codes == 0].values
belief_yes = health_ratings[belief_codes == 1].values

In [6]:
# Observed difference in mean health rating: belief-no minus belief-yes.
mean_belief_no = np.mean(belief_no)
mean_belief_yes = np.mean(belief_yes)
actual = mean_belief_no - mean_belief_yes
actual

0.001279023892392761

In [7]:
# Pool the two groups into one array, belief-no values first, ready for
# shuffling.
both = np.concatenate((belief_no, belief_yes))
both

array([1, 1, 1, ..., 7, 7, 8])

In [8]:
# Size of the belief-no group; the first `n_no` shuffled values stand in
# for this group in each permutation trial.
n_no = belief_no.shape[0]
n_no

25713

In [9]:
# Seed the generator so the permutation results below are reproducible on
# Restart-and-Run-All; change SEED to draw a different set of shuffles.
# NOTE: proportions recorded from earlier, unseeded runs will differ slightly.
SEED = 2023
rng = np.random.default_rng(SEED)

In [10]:
# Build the sampling distribution of the difference in means when the
# belief labels are shuffled at random across the pooled health ratings.
n_iters = 10000
diffs = np.zeros(n_iters)
for trial in range(n_iters):
    # Shuffle the pooled ratings, then deal them into fake "no" and "yes"
    # groups of the original sizes.
    fake_no, fake_yes = np.split(rng.permutation(both), [n_no])
    diffs[trial] = np.mean(fake_no) - np.mean(fake_yes)

In [11]:
# Proportion of shuffled differences at least as large as the observed one.
at_least_actual = diffs >= actual
np.count_nonzero(at_least_actual) / n_iters

0.5011

In [12]:
# Health ratings split by serology code (0 = '-' column, 1 = '+' column).
serology_codes = patients['serology']
ratings = patients['health']
covid_no = ratings[serology_codes == 0].values
covid_yes = ratings[serology_codes == 1].values

In [13]:
# Size of the serology-negative group, used to split each shuffled array.
n_no_covid = covid_no.shape[0]

In [14]:
# Observed difference in mean health rating: serology-negative minus
# serology-positive.
mean_covid_no = np.mean(covid_no)
mean_covid_yes = np.mean(covid_yes)
actual_covid = mean_covid_no - mean_covid_yes
actual_covid

0.06996315631464212

In [15]:
# Pool the serology groups into one array, negative values first.
both_covid = np.concatenate((covid_no, covid_yes))

In [16]:
# Sampling distribution of the difference in means when the serology
# labels are shuffled at random across the pooled health ratings.
covid_diffs = np.zeros(n_iters)
for trial in range(n_iters):
    # Shuffle, then deal into fake negative / positive groups of the
    # original sizes.
    fake_no, fake_yes = np.split(rng.permutation(both_covid), [n_no_covid])
    covid_diffs[trial] = np.mean(fake_no) - np.mean(fake_yes)

In [17]:
# Proportion of shuffled differences at least as large as the observed one.
at_least_actual_covid = covid_diffs >= actual_covid
np.count_nonzero(at_least_actual_covid) / n_iters

0.0252

## Done.

Congratulations, you're done with the assignment!  Be sure to:

- **run all the tests** (the next cell has a shortcut for that).
- **Save and Checkpoint** from the `File` menu.

In [18]:
# For your convenience, you can run this cell to run all the tests at once!
import os
# Grade every question test file (tests/q*.py).  Only `.py` files are
# considered, so stray entries that merely start with 'q' (editor backups,
# compiled files) are not turned into bogus test names, and the names are
# sorted so the reports come out in a stable order.
_ = [ok.grade(os.path.splitext(q)[0])
     for q in sorted(os.listdir("tests"))
     if q.startswith('q') and q.endswith('.py')]