In [4]:
import json
import numpy as np
import pandas as pd

## Get observed counts for each serotype (by year)

Read in spreadsheet

In [5]:
# Load the SPARC2 serotype spreadsheet and preview its first rows.
sparc2_xls = 'data/united-states/SPARC2_CollectionYear_Serotypes_Age_2014_BMC.xls'
sparc2 = pd.read_excel(sparc2_xls)
sparc2.head()

Unnamed: 0,Collection_Year,Serotype,Number_of_Serotypes,Age_Years
0,2001,,6,1. <=1 year
1,2001,,12,2. 1> - <=2 years
2,2001,,3,3. 2> - <=3 years
3,2001,,5,4. 3> - <=4 years
4,2001,,2,5. 4> - <=5 years


Extract meaning from Age_Years

In [6]:
# Pull the numeric bounds out of labels such as "2. 1> - <=2 years".
# Group 1 is the optional lower bound (absent in "1. <=1 year"); group 2 is the upper bound.
# The pattern is a raw string so the regex escapes are not treated as string escapes.
# NOTE(review): each group captures a single digit, so a two-digit age would be
# truncated (e.g. "10" -> 1) -- confirm the spreadsheet has no such age bands.
age_bounds = sparc2.Age_Years.str.extract(r'\d\.\s(\d)?.*?(\d)')
# pd.to_numeric replaces the deprecated convert_objects(convert_numeric=True);
# errors='coerce' keeps the "unparseable becomes NaN" behaviour.
sparc2[['age_lower', 'age_upper']] = age_bounds.apply(pd.to_numeric, errors='coerce')
sparc2.head()

  if __name__ == '__main__':
  if __name__ == '__main__':


Unnamed: 0,Collection_Year,Serotype,Number_of_Serotypes,Age_Years,age_lower,age_upper
0,2001,,6,1. <=1 year,,1
1,2001,,12,2. 1> - <=2 years,1.0,2
2,2001,,3,3. 2> - <=3 years,2.0,3
3,2001,,5,4. 3> - <=4 years,3.0,4
4,2001,,2,5. 4> - <=5 years,4.0,5


See what serotypes are present

In [7]:
# NaN marks rows with no serotype recorded. The sort key puts it ahead of the named
# serotypes explicitly, so the listing does not rely on how the interpreter orders a
# float against strings (Python 3 refuses to compare them).
observed_serotypes = sparc2.Serotype.unique()
print(sorted(observed_serotypes, key=lambda s: (pd.notnull(s), s)))

[nan, u'10', u'10A', u'10B', u'11A', u'13', u'14', u'15A', u'15B/C', u'15F', u'16F', u'17F', u'18C', u'18F', u'19A', u'19F', u'20', u'21', u'22F', u'23A', u'23B', u'23F', u'24F', u'25A', u'29', u'3', u'31', u'33', u'33A', u'33F', u'34', u'35A/B', u'35B', u'35F', u'36', u'37', u'38', u'4', u'6A', u'6B', u'6C', u'7', u'7C', u'7F', u'9A', u'9N', u'9V', u'NT', u'Pool I']


Restrict our dataset to serotyped isolates (dropping missing, 'NT' and 'Pool I' entries) and to children aged 5 or under (`age_upper <= 5`)

In [8]:
# A row counts as serotyped when its Serotype is neither missing nor one of the
# placeholder labels 'Pool I' / 'NT'.
serotyped = ~(sparc2.Serotype.isnull() | sparc2.Serotype.isin(['Pool I', 'NT']))
# Rows whose upper age bound is missing compare as False and are dropped as well.
under_five = sparc2.age_upper.le(5)
sparc2 = sparc2.loc[serotyped & under_five]

In [9]:
# Preview the first five rows that survived the serotype and age filters.
sparc2.head(n=5)

Unnamed: 0,Collection_Year,Serotype,Number_of_Serotypes,Age_Years,age_lower,age_upper
7,2001,10,2,1. <=1 year,,1
8,2001,10,1,2. 1> - <=2 years,1.0,2
9,2001,11A,1,1. <=1 year,,1
10,2001,11A,1,2. 1> - <=2 years,1.0,2
11,2001,11A,5,3. 2> - <=3 years,2.0,3


Combine certain serotypes into the same group

In [10]:
# Serotypes on the left are folded into the group named on the right.
serotype_map = {
  '10A': '10',
  '10B': '10',
  '35B': '35A/B'
}
# Build a new frame instead of calling replace(..., inplace=True) on the column:
# sparc2 is a filtered selection of the original frame, so an in-place edit of one
# of its columns depends on view semantics and can warn or be silently lost.
sparc2 = sparc2.assign(Serotype=sparc2.Serotype.replace(serotype_map))

Get counts, grouped by collection year and by serotype

In [11]:
# Sum Number_of_Serotypes within each (Collection_Year, Serotype) pair; the result
# is a Series indexed by those two keys.
year_serotype_groups = sparc2.groupby(['Collection_Year', 'Serotype'])
counts = year_serotype_groups['Number_of_Serotypes'].sum()

A template Series to align serotype counts between the different years

In [12]:
# One zero per distinct serotype label in the filtered data.
all_serotypes = sparc2['Serotype'].unique()
zero_counts = pd.Series(0, index=all_serotypes)

In [23]:
# Collection years are the first level of the (year, serotype) index of `counts`.
years = counts.index.levels[0]
counts_by_year = {}
for yr in years:
  # Adding the zero template aligns this year's counts on the full serotype list;
  # serotypes not seen that year come out as NaN and are filled with 0.
  counts_by_year[yr] = (zero_counts + counts[yr]).fillna(0).astype(int)
  # Single-argument print(...) behaves the same on Python 2 and Python 3.
  print("{}: {}".format(yr, counts_by_year[yr].sum()))

2001: 126
2004: 183
2007: 263
2009: 253
2011: 311
2014: 286


Double check that all the counts are aligned

In [14]:
# Every year's counts must share the first year's serotype index (same labels, same
# order), because the JSON files written later store bare lists without labels.
reference_index = counts_by_year[years[0]].index
for yr in years:
  # Index.equals returns False for indexes of different length, whereas the
  # elementwise `==` comparison is only meaningful for equal-length indexes.
  same_index = reference_index.equals(counts_by_year[yr].index)
  if not same_index:
    print('Problem!')

## Get denominators (number of swabs) by year

Read in data

In [15]:
# Load the swab table (filtered and summed per year in the following cells).
swabs_csv = "data/united-states/SPARC_SwabbingByAge_2001-2014 MDL.csv"
swabs = pd.read_csv(swabs_csv)

Extract the lower and upper age bounds from age_range, then restrict to children aged 5 or under (`age_upper <= 5`)

In [16]:
# Group 1 is the (possibly empty) lower bound and group 2 the upper bound of each
# age_range label. The pattern is a raw string so the regex escapes are not treated
# as string escapes.
# NOTE(review): each group captures a single digit, so a two-digit age would be
# truncated -- confirm the age_range labels never exceed 9.
age_parts = swabs.age_range.str.extract(r'\w\.\W*(\d?)\W*(\d)')
# pd.to_numeric replaces the deprecated convert_objects(convert_numeric=True);
# errors='coerce' turns an empty or unparseable capture into NaN.
swabs[['age_lower', 'age_upper']] = age_parts.apply(pd.to_numeric, errors='coerce')

  if __name__ == '__main__':
  if __name__ == '__main__':


In [17]:
# Keep the age bands whose upper bound is at most 5; a missing bound compares as False.
upper_at_most_five = swabs.age_upper.le(5)
swabs = swabs.loc[upper_at_most_five]

Get total number of swabs by year

In [18]:
# Sum the `frequency` column within each year.
swabs_by_year = swabs.groupby(by=['year'])
total_swabs = swabs_by_year['frequency'].sum()
total_swabs

year
2001    560
2004    810
2007    830
2009    873
2011    889
2014    870
Name: frequency, dtype: int64

## Write to JSON files

Serotype file

In [19]:
# Serotype labels in the order shared by every year's counts, with '/' stripped
# (e.g. '15B/C' becomes '15BC').
reference_index = counts_by_year[years[0]].index
serotypes = [s.replace('/', '') for s in reference_index]
serotypes_payload = {'serotypes': list(serotypes)}
with open('outputs/serotypes_sparc2.json', 'w') as f:
  json.dump(serotypes_payload, f)

Observed counts files, including the count of uncolonized hosts

In [20]:
# One file per year: the per-serotype counts followed by the number of swabs that
# are not accounted for by any counted serotype (the uncolonized hosts).
for yr in years:
  # int() / tolist() turn NumPy integers into built-in ints; json only serializes
  # NumPy integers where they happen to subclass the built-in int.
  num_uncolonized = int(total_swabs.loc[yr] - counts_by_year[yr].sum())
  observed = counts_by_year[yr].tolist() + [num_uncolonized]
  # Open the file only after the values exist, so a failed lookup cannot leave an
  # empty output file behind.
  with open('outputs/observed_counts_sparc2_{}.json'.format(yr), 'w') as f:
    json.dump({'counts': observed}, f)

An initial ranking of serotypes

In [21]:
# Rank the serotypes by their 2001 counts, largest count first (rank 1); ties get
# pandas' default averaged rank.
counts_2001 = counts_by_year[2001]
ranks_2001 = counts_2001.rank(ascending=False)

In [22]:
# Store the 2001 ranks as a bare list, in the same order as the counts.
ranks_payload = {'ranks': list(ranks_2001)}
with open('outputs/initial_ranks_sparc2.json', 'w') as f:
  json.dump(ranks_payload, f)

Serotype-specific immunity. 0.9 for serotype 14, 0.3 for the rest

In [None]:
# Start every serotype at 0.3, then raise serotype 14 to 0.9.
# list.index raises ValueError if "14" is not among the serotypes.
sigmas = [0.3] * len(serotypes)
position_of_14 = serotypes.index("14")
sigmas[position_of_14] = 0.9

In [29]:
# Persist the per-serotype sigma values, in serotype order.
sigmas_payload = {'sigmas': sigmas}
with open('outputs/sigmas_sparc2.json', 'w') as f:
  json.dump(sigmas_payload, f)

Old Version

In [79]:
# NOTE(review): this rebinds `sparc2`, replacing the frame built from the SPARC2
# spreadsheet earlier in the notebook.
# `sheetname` is the keyword used by older pandas releases (later renamed
# `sheet_name`); it is kept as-is to match the pandas this notebook was run with.
old_workbook = 'Data Task 3 Mar 10 2015.xlsx'
sparc2 = pd.read_excel(old_workbook, sheetname='post-vaccine carriage (0)')
sparc2.head(n=10)

Unnamed: 0,Observed Prevalence <5,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Prevalence <5,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Expected Carriers 5-6 Vaccinated,Unnamed: 198,Unnamed: 199,Unnamed: 200,Unnamed: 201,Unnamed: 202,Unnamed: 203,Unnamed: 204,Unnamed: 205,Unnamed: 206
Serotype,2001.0,2004.0,2007.0,2009.0,2011.0,2014.0,2001.0,2004.0,2007.0,2009.0,...,2001,2004,2007,2009,2011,2014,2016,2018,2020,2050
6A,0.047813,0.020485,0.006024,0.003436,0.001125,0.0,0.06238,0.035498,0.007414,0.001017,...,0,0,5,4,0,1,0,1,0,0
23F,0.028253,0.002561,0.001205,0.0,0.001125,0.0,0.033744,0.018392,0.003378,0.000647,...,0,0,2,0,0,0,0,0,0,0
19F,0.02608,0.020485,0.008434,0.002291,0.004499,0.002299,0.035477,0.017933,0.002815,0.00037,...,0,0,2,0,0,0,0,0,0,0
6B,0.023906,0.003841,0.00241,0.001145,0.0,0.0,0.023347,0.019956,0.002909,0.000462,...,0,0,2,2,0,0,0,0,1,0
11A,0.01956,0.021765,0.03012,0.018328,0.023622,0.028736,0.030278,0.026209,0.029373,0.039678,...,0,0,22,99,89,101,96,109,116,115
15B/C,0.017386,0.020485,0.03253,0.033219,0.056243,0.048276,0.015595,0.021243,0.024681,0.035516,...,0,0,15,94,111,59,70,98,91,109
19A,0.015213,0.032007,0.050602,0.045819,0.038245,0.010345,0.010123,0.012231,0.024118,0.037366,...,0,0,22,94,93,13,2,0,0,0
35A/B,0.010867,0.023045,0.031325,0.020619,0.021372,0.04023,0.005928,0.006805,0.01464,0.02155,...,0,0,9,65,66,66,87,66,73,52
14,0.008693,0.002561,0.0,0.001145,0.0,0.0,0.00611,0.009012,0.00122,0.000277,...,0,0,1,1,0,0,0,0,0,0


In [110]:
# Row labels after the first one are the serotype names (the first label is the
# 'Serotype' header row visible in the preview above).
raw_serotypes = np.array(sparc2.index[1:]).astype(str)
# A list comprehension yields a real list on both Python 2 and Python 3; filter()
# returns a lazy iterator on Python 3, which later cells could neither index with
# nor serialize to JSON.
# NOTE(review): `invalid` is not defined anywhere in the visible notebook -- it must
# be supplied before this cell can run.
serotypes = [s for s in raw_serotypes if s not in invalid]

In [113]:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(serotypes)

['6A', '23F', '19F', '6B', '11A', '15B/C', '19A', '35A/B', '14', '22F', '9A', '18C', 'NT', '10', '6C', '9N', '23A', '35F', '23B', '3', '34', '4', '31', '15A', '38', '15F', '29', '25A', '7F', '16F', '33F', '17F', '21', '37', '9V', '7C', '33A', '13', '18F', '36', '20', '24F']


In [127]:
# NOTE(review): this rebinds `years`, replacing the value derived from `counts`
# earlier in the notebook.
years = (2001, 2004, 2007, 2009, 2011)
carriage_under_5 = {}
for i, yr in enumerate(years):
    # Rows are selected by serotype label and the column by position i.
    # Explicit .loc / .iloc replaces the deprecated mixed-mode .ix indexer.
    year_column = sparc2.loc[serotypes].iloc[:, i]
    carriage_under_5[yr] = list(year_column.fillna(0))

Write to JSON files

In [134]:
# Write the serotype labels to a JSON file in the working directory.
serotypes_payload = {'serotypes': serotypes}
with open('sparc2_serotypes.json', 'w') as f:
    json.dump(serotypes_payload, f)

In [None]:
# NOTE(review): unfinished cell -- it cannot run as written.
#   * open() is called without a mode, so the file is opened read-only even though
#     json.dump needs to write to it.
#   * the json.dump call below contains an unterminated string literal (a syntax
#     error) and passes no file object; the intended payload is not shown anywhere
#     in the notebook.
for yr in years:
    with open('sparc2_fake_counts_{}.json'.format(yr)) as f:
        json.dump({'})