In [1]:
import json
import numpy as np
import pandas as pd



## Load Data

### Prevaccine

This file contains the number of swabs for each one-month age group.

In [9]:
# Prevaccine swab denominators; the preview below shows columns `agem` and `denom`.
sample = pd.read_csv("data/kenya/sample.csv")
# Show the last five rows (the oldest age groups in the preview).
sample.tail(n=5)

Unnamed: 0,agem,denom
54,57,46
55,58,56
56,59,50
57,60,10
58,65,1


This file has the data about carriers: age (months), serotype sampled, number of individuals

In [10]:
# Prevaccine carriers; the preview below shows columns `agem`, `serotype` and `freq`.
serotypes_agem = pd.read_csv("data/kenya/serotypes_agem.csv")
# Show the first five rows.
serotypes_agem.head(n=5)

Unnamed: 0,agem,serotype,freq
0,3,6A,4
1,3,6B,3
2,3,9V,1
3,3,15C,1
4,3,19F,4


### Postvaccine data source

This file has both number of carriers and number of swabbed individuals. U5 means under five, agem is age in months. Null values for serotype indicate that the individual was not colonized.

In [11]:
# Postvaccine swabs, one row per swab; `serotype` is null for uncolonized children.
npc = pd.read_csv("data/kenya/NPC 2009-2013 U5 serotypes.csv")
# Show the first five rows.
npc.head(n=5)

Unnamed: 0,serotype,agem,year
0,,10,2009
1,,1,2009
2,11A,10,2009
3,14,11,2009
4,,7,2009


Check that the maximum age is indeed under five years (i.e. below 60 months)

In [13]:
# Largest age in months among the postvaccine swabs; under five years means < 60.
npc['agem'].max()

59

Drop the agem column, we won't be using it

In [14]:
# Rebind rather than drop in place so this cell is safe to re-run: with
# inplace=True a second execution raised KeyError because 'agem' was already gone.
npc = npc.drop(labels=['agem'], axis=1, errors='ignore')

Get serotypes that appear in prevaccine and/or postvaccine data

In [15]:
# Union of every serotype label seen in either data set. Uncolonized rows in
# `npc` have a null serotype, so the union can contain NaN.
serotype_labels = set(npc['serotype'].unique()) | set(serotypes_agem['serotype'].unique())
# Sort with an explicit key: null entries first, then labels in text order.
# A plain sorted() cannot compare NaN (a float) with str on Python 3; this key
# yields the same "NaN first, then lexicographic" order on Python 2 and 3.
serotypes = sorted(serotype_labels, key=lambda label: (pd.notnull(label), str(label)))
print(serotypes)

[nan, '1', '10A', '10B', '10F', '11A', '11D', '12B', '12F', '13', '14', '15A', '15B', '15C', '15F', '16F', '17F', '18B', '18C', '18F', '19A', '19B', '19C', '19F', '20', '21', '22A', '22F', '23A', '23B', '23F', '24F', '28A', '28F', '29', '3', '31', '33B', '33C', '33D', '34', '35A', '35B', '35F', '38', '4', '40', '5', '6A', '6B', '7C', '7F', '8', '9A', '9L', '9N', '9V', 'non-typable']


In [19]:
# Prevaccine carrier rows labelled 'non-typable' (if any).
serotypes_agem.loc[serotypes_agem['serotype'] == 'non-typable']

Unnamed: 0,agem,serotype,freq


In [20]:
# Prevaccine carrier rows with a missing serotype (if any).
serotypes_agem.loc[serotypes_agem['serotype'].isnull()]

Unnamed: 0,agem,serotype,freq


In [21]:
# Postvaccine swabs labelled 'non-typable'.
npc.loc[npc['serotype'] == 'non-typable']

Unnamed: 0,serotype,year
594,non-typable,2012
655,non-typable,2012
706,non-typable,2012


Remove what are clearly not serotypes

In [7]:
# Keep only real serotype labels: drop the null entry (uncolonized swabs) and
# the 'non-typable' placeholder.
# Filtering by value instead of removing position 0 does not depend on NaN
# sorting first, and makes the cell idempotent: re-running the positional
# version silently deleted the first genuine serotype before failing.
serotypes = [
    label for label in serotypes
    if pd.notnull(label) and label != 'non-typable'
]

In [8]:
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(serotypes)

['1', '10A', '10B', '10F', '11A', '11D', '12B', '12F', '13', '14', '15A', '15B', '15C', '15F', '16F', '17F', '18B', '18C', '18F', '19A', '19B', '19C', '19F', '20', '21', '22A', '22F', '23A', '23B', '23F', '24F', '28A', '28F', '29', '3', '31', '33B', '33C', '33D', '34', '35A', '35B', '35F', '38', '4', '40', '5', '6A', '6B', '7C', '7F', '8', '9A', '9L', '9N', '9V']


In [9]:
# Persist the ordered serotype list; the count vectors written later follow this order.
serotype_payload = {'serotypes': serotypes}
with open('outputs/serotypes_kenya.json', 'w') as out_file:
    json.dump(serotype_payload, out_file)

Make this series so that we output everything in the same order

In [10]:
# All-zero series indexed by serotype, in the canonical order.
# NOTE(review): no later cell in this view references zero_counts - confirm it is still needed.
zero_counts = pd.Series(data=0, index=serotypes)
zero_counts.head(n=5)

1      0
10A    0
10B    0
10F    0
11A    0
dtype: int64

## Prevaccine

Get the total number of samples

In [11]:
# Total number of prevaccine swabs: sum of the per-age-group denominators.
total_pre = sample['denom'].sum()
# Function-call print with format() emits the same line on Python 2 and runs on Python 3.
print('{} total prevaccine samples'.format(total_pre))

2840 total prevaccine samples


Get counts for each serotype

In [12]:
# Collapse the age dimension: total carriers per serotype, as integers.
freq_by_serotype = serotypes_agem.groupby('serotype')['freq']
counts_pre = freq_by_serotype.sum().astype(int)
counts_pre.head(n=5)

serotype
1      13
10A    56
10B     3
10F     6
11A    90
Name: freq, dtype: int64

In [13]:
# Align the prevaccine counts to the canonical serotype order, using 0 for
# serotypes that never appear in the prevaccine data.
# reindex() is used instead of counts_pre[serotypes] because indexing with a
# list that contains missing labels raises KeyError on current pandas.
counts_pre = counts_pre.reindex(serotypes, fill_value=0).astype(int)
assert (counts_pre.index == serotypes).all()

Get number of uncolonized

In [14]:
# Swabs not accounted for by any listed serotype are treated as uncolonized.
colonized_pre = sum(counts_pre)
uncolonized_pre = total_pre - colonized_pre

In [15]:
# Carriage prevalence = carriers of listed serotypes / total swabs.
# float() keeps true division on Python 2; print() form also runs on Python 3.
print('prevaccine prevalence: {:.3f}'.format(sum(counts_pre) / float(total_pre)))

prevaccine prevalence: 0.658


In [16]:
# Write per-serotype counts (canonical order) followed by the uncolonized count.
# Values are cast to built-in int because numpy integers are not
# JSON-serializable on Python 3; the file contents are unchanged.
observed_pre = [int(count) for count in counts_pre] + [int(uncolonized_pre)]
with open('outputs/observed_counts_kenya_prevaccine.json', 'w') as f:
    json.dump({'counts': observed_pre}, f)

### Calculate initial ranks

In [17]:
# Rank serotypes by prevaccine count, most frequent = 1; ties share the
# average rank (pandas' default, spelled out here).
ranks_pre = counts_pre.rank(method='average', ascending=False)

In [18]:
# Save the initial ranks in canonical serotype order.
initial_ranks = list(ranks_pre)
with open('outputs/initial_ranks_kenya.json', 'w') as out_file:
    json.dump({'ranks': initial_ranks}, out_file)

Try a different set of initial ranks

In [31]:
# Alternative starting ranks: keep the top-ranked serotype at 1, push every
# other serotype down by 5 places, then cap at the number of serotypes.
ranks_ = counts_pre.rank(ascending=False)
max_rank = len(ranks_)
below_top = ranks_ != 1
ranks_[below_top] = ranks_[below_top] + 5
ranks_[ranks_ > max_rank] = max_rank

In [36]:
# Side-by-side view of original vs. adjusted ranks.
# print() form also runs on Python 3; the column order is pinned so the table
# matches the displayed layout regardless of how pandas orders dict keys.
print(pd.DataFrame({'original': ranks_pre, 'adjusted': ranks_},
                   columns=['adjusted', 'original']))

          adjusted  original
serotype                    
1             31.5      26.5
10A           14.0       9.0
10B           43.0      38.0
10F           36.0      31.0
11A           10.0       5.0
11D           54.0      49.0
12B           48.0      43.0
12F           38.0      33.0
13            18.5      13.5
14            11.0       6.0
15A           18.5      13.5
15B           15.0      10.0
15C           20.0      15.0
15F           54.0      49.0
16F           22.5      17.5
17F           43.0      38.0
18B           56.0      55.0
18C           24.0      19.0
18F           43.0      38.0
19A           16.0      11.0
19B           25.0      20.0
19C           54.0      49.0
19F            1.0       1.0
20            27.0      22.0
21            29.0      24.0
22A           43.0      38.0
22F           43.0      38.0
23A           28.0      23.0
23B           13.0       8.0
23F            9.0       4.0
24F           38.0      33.0
28A           54.0      49.0
28F           

In [37]:
# Save the adjusted ranks in canonical serotype order.
adjusted_ranks = list(ranks_)
with open('outputs/initial_ranks_kenya_adjusted.json', 'w') as out_file:
    json.dump({'ranks': adjusted_ranks}, out_file)

## Postvaccine

In [20]:
def _count_uncolonized(year_group):
    """Return the number of swabs in one year's rows whose serotype is null."""
    return year_group.serotype.isnull().sum()


# Uncolonized swabs per year (null serotype means not colonized).
uncolonized_npc = npc.groupby('year').apply(_count_uncolonized)
uncolonized_npc

year
2008     1
2009    73
2010    47
2011    58
2012    37
2013    49
dtype: int64

Get counts by year and serotype

In [21]:
# Number of swabs per (year, serotype); rows with a null serotype fall out of
# the grouping, so only colonized swabs are counted here.
group_keys = ["year", "serotype"]
counts = npc.groupby(group_keys).size()
counts.head(n=5)

year  serotype
2009  1            3
      10A          3
      11A          6
      13           6
      14          12
dtype: int64

In [22]:
# Years that have at least one typed swab (first level of the counts index).
years = counts.index.levels[0]
# For each year, align the counts to the canonical serotype order with 0 for
# serotypes not observed that year. reindex() replaces counts.loc[yr][serotypes],
# which raises KeyError on current pandas when a serotype is missing.
# NOTE(review): labels not in `serotypes` (e.g. 'non-typable') are left out
# here and are not counted as uncolonized either - confirm that is intended.
counts_by_year = {
    yr: counts.loc[yr].reindex(serotypes, fill_value=0).astype(int)
    for yr in years
}

Double check that serotypes are presented in the same order across years

In [23]:
# Verify every year's vector is in canonical serotype order. The index of the
# current year `yr` is checked; the earlier version always re-checked
# years[0], so the other years were never verified.
for yr in years:
    assert (counts_by_year[yr].index == serotypes).all(), \
        'serotype order mismatch for year {}'.format(yr)

Write to files

In [24]:
# One file per year: per-serotype counts (canonical order) followed by that
# year's uncolonized count. Values are cast to built-in int because numpy
# integers are not JSON-serializable on Python 3; file contents are unchanged.
for yr in years:
    observed = [int(count) for count in counts_by_year[yr]] + [int(uncolonized_npc[yr])]
    with open('outputs/observed_counts_kenya_{}.json'.format(yr), 'w') as f:
        json.dump({'counts': observed}, f)

Serotype-specific immunity: 0.3 for every serotype. The 0.9 override for serotype 14 is commented out in the cell below, so it is not currently applied.

In [24]:
# One value per serotype, in canonical order; every serotype gets 0.3.
sigmas = [0.3 for _ in serotypes]
# Override for serotype 14 is left disabled, as in the original cell.
#sigmas[serotypes.index("14")] = 0.9

In [25]:
# Save the sigma values in canonical serotype order.
sigma_payload = {'sigmas': sigmas}
with open('outputs/sigmas_kenya.json', 'w') as out_file:
    json.dump(sigma_payload, out_file)