# Pneumococcal Data Analysis

In [997]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from scipy import stats
from fractions import Fraction

In [998]:
HOME_DIR = "/Users/martinemons/polybox/Universitaet/MSc_CBB/FS2021/IDD-rotation/IDD_TB"

In [999]:
df = pd.read_csv(HOME_DIR + "/data/PneumoData.csv")

### Data Wrangling

In [1000]:
df

Unnamed: 0,Accession,Strain Name,Taxon ID,Strain Cluster (SC),Year of Isolation,Community of Isolation,Host Age (months),Serotype,Capsule locus,Consensus serotype,Sequence type,Inferred sequence type,Benzylpenicillin MIC (_g/mL),Ceftriaxone MIC (_g/mL),Trimethoprim MIC (_g/mL),Erythromycin MIC (_g/mL),Tetracycline MIC (_g/mL),Chloramphenicol MIC (_g/mL)
0,ERR129088,CH2079,5Z52R,1,2007,C,Jun-24,10A,10A,10A,816.0,816,0.023,0.023,0.125,0.064,,
1,ERR129126,LE4000,N5O68,1,2007,D,24-36,10A,10A,10A,3290.0,3290,0.023,0.023,0.094,0.094,,
2,ERR129158,LE4124,RUJ90,1,2007,D,24-36,10A,10A,10A,816.0,816,0.023,0.023,0.19,0.047,,
3,ERR129164,MD5021,29ORI,1,2007,E,Jun-24,10A,10A,10A,816.0,816,0.032,0.032,0.125,0.064,,
4,ERR129199,ND6034,RN3X8,1,2007,F,36-84,10A,10A,10A,816.0,816,0.023,0.032,0.25,0.094,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,ERR069838,388483,388483,16,2001,H,36-84,6B,NT,6B,146.0,146,0.25,0.5,>4/76,>1,0.25,
612,ERR069840,397079,397079,16,2001,J,36-84,6B,6A,6B,315.0,315,0.25,0.06,0.5/9.,>1,>8,
613,ERR065957,436154,436154,16,2001,P,Jun-24,6B,6A,6B,1954.0,1954,1,0.5,0.25/4,<0.03,0.12,
614,ERR069769,132571,132571,16,2001,N,Jun-24,9N,9N,9N,405.0,405,<0.03,<0.03,0.25/4,<0.03,0.25,


In the early 2000's the Prevnar vaccine was approved by the FDA for usage in children. It is a vaccine agains *Streptococcus pneumoniae*. It targets the serotypes 4, 6B, 9V, 14, 18C, 19F, and 23F according to the FDA. This means we can classify the vaccine type as being all but those infected with the strains that were in the Prevnar vaccine and the rest as being non-vaccine type

In [1001]:
#set all the types included in the Prevnar Vaccine to be non-vaccinated
dic = {'4': 'non-vaccinated', '6B': 'non-vaccinated', '9V': 'non-vaccinated', '14': 'non-vaccinated', '18C': 'non-vaccinated', '19F': 'non-vaccinated', '23F': 'non-vaccinated'}

#then set all to be non-vaccine that are infected with the strains included in the Prevnar vaccine
df['Vaccine-type'] = df['Serotype'].map(dic) 

#set the rest (denoted as NaN) to be vaccinated
df['Vaccine-type'] = df['Vaccine-type'].fillna('vaccinated')

one can define whether the pneumococcal isolate is resistant or sensitive whether the minimum inhibitory concentration (MIC) is above or below a certain threshold. We will perform this analysis first for a single antibiotic, namely benzylpenicillin. An MIC for benzylpenicillin above 0.06 is considered as being resistant

#converting first every entry into a string
df['Benzylpenicillin MIC (_g/mL)'] = df['Benzylpenicillin MIC (_g/mL)'].astype(str)

#cutting away the '<' and '<=' etc.
df['Benzylpenicillin MIC (_g/mL)'] = df['Benzylpenicillin MIC (_g/mL)'].map(lambda x: x.lstrip('<>='))

#mapping every entry back to a float
df['Benzylpenicillin MIC (_g/mL)'] = df['Benzylpenicillin MIC (_g/mL)'].astype(float)

#setting every value above 0.06 to be resistant and sensitive otherwise
df['Resistance-type'] = ['resistant' if x > 0.06 else 'sensitive' for x in df['Benzylpenicillin MIC (_g/mL)']]

In [1002]:
#prior setting everything to sensiti
df['Resistance-type'] = np.repeat('sensitive', 616)

In [1003]:
df = df.fillna(0)
df

Unnamed: 0,Accession,Strain Name,Taxon ID,Strain Cluster (SC),Year of Isolation,Community of Isolation,Host Age (months),Serotype,Capsule locus,Consensus serotype,Sequence type,Inferred sequence type,Benzylpenicillin MIC (_g/mL),Ceftriaxone MIC (_g/mL),Trimethoprim MIC (_g/mL),Erythromycin MIC (_g/mL),Tetracycline MIC (_g/mL),Chloramphenicol MIC (_g/mL),Vaccine-type,Resistance-type
0,ERR129088,CH2079,5Z52R,1,2007,C,Jun-24,10A,10A,10A,816.0,816,0.023,0.023,0.125,0.064,0,0,vaccinated,sensitive
1,ERR129126,LE4000,N5O68,1,2007,D,24-36,10A,10A,10A,3290.0,3290,0.023,0.023,0.094,0.094,0,0,vaccinated,sensitive
2,ERR129158,LE4124,RUJ90,1,2007,D,24-36,10A,10A,10A,816.0,816,0.023,0.023,0.19,0.047,0,0,vaccinated,sensitive
3,ERR129164,MD5021,29ORI,1,2007,E,Jun-24,10A,10A,10A,816.0,816,0.032,0.032,0.125,0.064,0,0,vaccinated,sensitive
4,ERR129199,ND6034,RN3X8,1,2007,F,36-84,10A,10A,10A,816.0,816,0.023,0.032,0.25,0.094,0,0,vaccinated,sensitive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,ERR069838,388483,388483,16,2001,H,36-84,6B,NT,6B,146.0,146,0.25,0.5,>4/76,>1,0.25,0,non-vaccinated,sensitive
612,ERR069840,397079,397079,16,2001,J,36-84,6B,6A,6B,315.0,315,0.25,0.06,0.5/9.,>1,>8,0,non-vaccinated,sensitive
613,ERR065957,436154,436154,16,2001,P,Jun-24,6B,6A,6B,1954.0,1954,1,0.5,0.25/4,<0.03,0.12,0,non-vaccinated,sensitive
614,ERR069769,132571,132571,16,2001,N,Jun-24,9N,9N,9N,405.0,405,<0.03,<0.03,0.25/4,<0.03,0.25,0,vaccinated,sensitive


In [1004]:
#some weird data points were inexistant, those were set to 0 manually for the time being
df.iloc[336,12] = 0

df.iloc[423,12] = 0

df.iloc[516,12] = 0

df.iloc[523,12] = 0

df.iloc[558,12] = 0

In [1005]:
for i in range(12,16): 
    df.iloc[:,i] = df.iloc[:,i].astype(str)

    #cutting away the '<' and '<=' etc.
    df.iloc[:,i] = df.iloc[:,i].map(lambda x: x.lstrip('<>=AprFebJan-'))
    df.iloc[:,i] = df.iloc[:,i].map(lambda x: x.rstrip('/'))
    for j in range (616):

        #mapping every entry back to a float
        try:
            df.iloc[j,i] = float(df.iloc[j,i])
        except:
            #had to write an exception if the number has written as e.g. "0.25/7" to get the correct transformation
            t = df.iloc[j,i].split('/')
            #print(t)
            df.iloc[j,i] = float(t[0])/float(t[1])
  

In [1006]:
#the MIC for the antibiotics as given by the ordering of the dataframe
MIC = np.array([0.06, 0.5, 1, 0.2])

#going through all the MICs and check whether these make for a resistant phenotype
for i in range(12,16):
    for j in range (616):
        if df.iloc[j,i] > MIC[i-12]:
            df.iloc[j,19] = 'resistant'

In [1007]:
df

Unnamed: 0,Accession,Strain Name,Taxon ID,Strain Cluster (SC),Year of Isolation,Community of Isolation,Host Age (months),Serotype,Capsule locus,Consensus serotype,Sequence type,Inferred sequence type,Benzylpenicillin MIC (_g/mL),Ceftriaxone MIC (_g/mL),Trimethoprim MIC (_g/mL),Erythromycin MIC (_g/mL),Tetracycline MIC (_g/mL),Chloramphenicol MIC (_g/mL),Vaccine-type,Resistance-type
0,ERR129088,CH2079,5Z52R,1,2007,C,Jun-24,10A,10A,10A,816.0,816,0.023,0.023,0.125,0.064,0,0,vaccinated,sensitive
1,ERR129126,LE4000,N5O68,1,2007,D,24-36,10A,10A,10A,3290.0,3290,0.023,0.023,0.094,0.094,0,0,vaccinated,sensitive
2,ERR129158,LE4124,RUJ90,1,2007,D,24-36,10A,10A,10A,816.0,816,0.023,0.023,0.19,0.047,0,0,vaccinated,sensitive
3,ERR129164,MD5021,29ORI,1,2007,E,Jun-24,10A,10A,10A,816.0,816,0.032,0.032,0.125,0.064,0,0,vaccinated,sensitive
4,ERR129199,ND6034,RN3X8,1,2007,F,36-84,10A,10A,10A,816.0,816,0.023,0.032,0.25,0.094,0,0,vaccinated,sensitive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,ERR069838,388483,388483,16,2001,H,36-84,6B,NT,6B,146.0,146,0.25,0.5,0.052632,1.0,0.25,0,non-vaccinated,resistant
612,ERR069840,397079,397079,16,2001,J,36-84,6B,6A,6B,315.0,315,0.25,0.06,0.055556,1.0,>8,0,non-vaccinated,resistant
613,ERR065957,436154,436154,16,2001,P,Jun-24,6B,6A,6B,1954.0,1954,1.0,0.5,0.0625,0.03,0.12,0,non-vaccinated,resistant
614,ERR069769,132571,132571,16,2001,N,Jun-24,9N,9N,9N,405.0,405,0.03,0.03,0.0625,0.03,0.25,0,vaccinated,sensitive


now the idea is to take a subset of the data, the non-vaccinated individuals, and make the rest of the inference based on these. The question we are asking how does antibiotic resistance behave in the non-vaccinated group after we reduced the amount of available host via vaccination.

In [1008]:
df['Vaccine-type'].value_counts()

vaccinated        537
non-vaccinated     79
Name: Vaccine-type, dtype: int64

In [1009]:
#select those individuals where the vaccine-type is non-vaccinated
df_nonvacc = df[df['Vaccine-type']=='non-vaccinated']

In [1010]:
df_nonvacc.to_csv()

',Accession,Strain Name,Taxon ID,Strain Cluster (SC),Year of Isolation,Community of Isolation,Host Age (months),Serotype,Capsule locus,Consensus serotype,Sequence type,Inferred sequence type,Benzylpenicillin MIC (_g/mL),Ceftriaxone MIC (_g/mL),Trimethoprim MIC (_g/mL),Erythromycin MIC (_g/mL),Tetracycline MIC (_g/mL),Chloramphenicol MIC (_g/mL),Vaccine-type,Resistance-type\n68,ERR129077,CH2029,STPDE,5,2007,C,<6,9V,9V,9V,162.0,162,0.023,0.023,0.25,0.094,0,0,non-vaccinated,sensitive\n84,ERR129172,MD5037,UB6XH,6,2007,E,Jun-24,6B,6A,6B,138.0,138,0.023,0.016,0.25,0.047,0,0,non-vaccinated,sensitive\n150,ERR129146,LE4081,ZXPKH,9,2007,D,Jun-24,23F,23A,23A,42.0,42,0.023,0.023,0.125,0.064,0,0,non-vaccinated,sensitive\n227,ERR129062,BR1111,6GU7V,15,2007,J,<6,19F,19F,19F,3292.0,3292,1.0,0.5,32.0,256.0,0,0,non-vaccinated,resistant\n228,ERR129078,CH2033,WMK3T,15,2007,C,36-84,19F,19F,19F,236.0,236,3.0,1.5,6.0,4.0,0,0,non-vaccinated,resistant\n263,ERR129045,BR1065,DALR8,16,2007,J,36-84,19F,19F,19F,426

In [1011]:
df_nonvacc['Resistance-type'].value_counts()

sensitive    40
resistant    39
Name: Resistance-type, dtype: int64

In [1012]:
#group by year of isolation and resistance type then count the occurences
df_nonvacc_freq = df_nonvacc.groupby(['Year of Isolation', 'Resistance-type']).size().reset_index()

#rename the columns
df_nonvacc_freq.columns = ['Year of Isolation', 'Resistance-type', 'Counts']

In [1013]:
#in order to normalise
norm = df_nonvacc['Year of Isolation'].value_counts()
norm.to_frame()

newdf = pd.DataFrame(np.repeat(norm.values,2,axis=0))
newdf.columns = ['norm']
print(newdf)

   norm
0    40
1    40
2    29
3    29
4    10
5    10


In [1014]:
df_nonvacc_freq['Relative Frequencies'] = df_nonvacc_freq['Counts']/newdf['norm']

In [1015]:
df_nonvacc_freq

Unnamed: 0,Year of Isolation,Resistance-type,Counts,Relative Frequencies
0,2001,resistant,20,0.5
1,2001,sensitive,20,0.5
2,2004,resistant,16,0.551724
3,2004,sensitive,13,0.448276
4,2007,resistant,3,0.3
5,2007,sensitive,7,0.7


### Plotting the Results

In [1016]:
#plot the histogram

fig = px.bar(df_nonvacc_freq, x='Year of Isolation', y = 'Relative Frequencies',  color = 'Resistance-type',  barmode='group', title = 'Relative Frequencies of non-vaccinated subtypes after vaccination')
fig.show()

### Shapiro Wilk test for Normality

In [1017]:
#the confidence level
alpha = 0.05

p = stats.shapiro(df_nonvacc_freq['Relative Frequencies'])
if p[1] < 0.05:
    print("The relative frequencies are not normally distributed with:", p)
else:
    print("The relative frequencies are normally distributed with:", p)

The relative frequencies are normally distributed with: ShapiroResult(statistic=0.9598776698112488, pvalue=0.8187973499298096)


the problem is a little bit, that the sample sizes are at times rather small, especially for the year 2007 as seen above with $n=10$. Never the less we will procede with a t-test as the frequency data seems to be normally distributed and the t-test should be accurate for small sample sizes

In [1018]:
df_nonvacc_freq['Relative Frequencies'][1]

0.5

In [1019]:
stats.ttest_ind()

TypeError: ttest_ind() missing 2 required positional arguments: 'a' and 'b'