# Truncated Ver. of Open+Atlas_Reuse_Data

## Importing Packages

In [1]:
import pandas as pd
from scipy import stats

## Data Import

In [2]:
# Read the truncated dataset; the path is relative to the notebook's working directory.
csv_path = 'Truncated.csv'
df = pd.read_csv(csv_path)

In [3]:
# Size of the loaded frame as a (rows, columns) tuple.
df.shape

(321, 19)

In [4]:
# List the column labels available in the dataset.
df.columns

Index(['Isolate Id', 'Study', 'Species', 'Organism Group', 'Country', 'State',
       'Gender', 'Age Group', 'Speciality', 'Source', 'In / Out Patient',
       'Year', 'Phenotype', 'Amikacin', 'Amikacin_I',
       'Amoxycillin clavulanate', 'Amoxycillin clavulanate_I', 'Ampicillin',
       'Ampicillin_I'],
      dtype='object')

In [5]:
# Preview the first rows to see what the raw values look like.
df.head()

Unnamed: 0,Isolate Id,Study,Species,Organism Group,Country,State,Gender,Age Group,Speciality,Source,In / Out Patient,Year,Phenotype,Amikacin,Amikacin_I,Amoxycillin clavulanate,Amoxycillin clavulanate_I,Ampicillin,Ampicillin_I
0,1000000.0,TEST,Pseudomonas aeruginosa,Non-Enterobacteriaceae,France,,Male,85 and Over,Emergency Room,GU: Urine,None Given,2013.0,,8,Susceptible,>32,,>32,
1,1000001.0,TEST,Pseudomonas aeruginosa,Non-Enterobacteriaceae,France,,Female,13 to 18 Years,Emergency Room,HEENT: Ears,None Given,2013.0,,4,Susceptible,>32,,>32,
2,1000002.0,TEST,Pseudomonas aeruginosa,Non-Enterobacteriaceae,France,,Female,65 to 84 Years,Nursing Home / Rehab,GU: Urine,None Given,2013.0,,4,Susceptible,>32,,>32,
3,1000003.0,TEST,Pseudomonas aeruginosa,Non-Enterobacteriaceae,France,,Male,19 to 64 Years,Medicine General,INT: Skin,None Given,2013.0,,4,Susceptible,>32,,>32,
4,1000004.0,TEST,Serratia marcescens,Enterobacteriaceae,France,,Male,19 to 64 Years,Medicine General,CVS: Blood,None Given,2013.0,,2,Susceptible,>32,Resistant,>32,Resistant


In [6]:
# Per-column non-null counts and dtypes: shows which columns have missing
# values and which were read as object (string) dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321 entries, 0 to 320
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Isolate Id                 320 non-null    float64
 1   Study                      320 non-null    object 
 2   Species                    320 non-null    object 
 3   Organism Group             320 non-null    object 
 4   Country                    320 non-null    object 
 5   State                      0 non-null      float64
 6   Gender                     317 non-null    object 
 7   Age Group                  320 non-null    object 
 8   Speciality                 320 non-null    object 
 9   Source                     320 non-null    object 
 10  In / Out Patient           320 non-null    object 
 11  Year                       320 non-null    float64
 12  Phenotype                  146 non-null    object 
 13  Amikacin                   224 non-null    object 

## Categorical Data

The Amikacin_I column describes the susceptibility of each sample to the drug Amikacin (Susceptible, Intermediate, or Resistant).

In [7]:
# Peek at the first ten susceptibility interpretations for Amikacin.
df['Amikacin_I'].iloc[:10]

0    Susceptible
1    Susceptible
2    Susceptible
3    Susceptible
4    Susceptible
5    Susceptible
6    Susceptible
7      Resistant
8    Susceptible
9    Susceptible
Name: Amikacin_I, dtype: object

In [8]:
# Count how many samples fall in each interpretation category.
# Rows where 'Amikacin_I' is NaN are not counted.
df['Amikacin_I'].value_counts()

Amikacin_I
Susceptible     180
Resistant        12
Intermediate      3
Name: count, dtype: int64

In [10]:
# Share of each interpretation category, expressed as a percentage of the
# non-missing 'Amikacin_I' values.
df['Amikacin_I'].value_counts(normalize=True).mul(100)

Amikacin_I
Susceptible     92.307692
Resistant        6.153846
Intermediate     1.538462
Name: proportion, dtype: float64

In [11]:
# Distinct values in 'Amikacin_I'; NaN appears for rows with no interpretation recorded.
df['Amikacin_I'].unique()

array(['Susceptible', 'Resistant', nan, 'Intermediate'], dtype=object)

## Categorical/Categorical Comparison

The Species column shown below contains all of the species that were tested in this study.

In [13]:
# Distinct species names present in the data; NaN marks rows with no species recorded.
df['Species'].unique()

array(['Pseudomonas aeruginosa', 'Serratia marcescens',
       'Acinetobacter pitii', 'Acinetobacter baumannii',
       'Enterobacter cloacae', 'Escherichia coli',
       'Haemophilus influenzae', 'Staphylococcus aureus',
       'Enterococcus faecium', 'Enterococcus faecalis',
       'Streptococcus agalactiae', 'Klebsiella pneumoniae',
       'Klebsiella (Enterobacter) aerogenes', 'Acinetobacter junii',
       'Klebsiella oxytoca', 'Enterobacter kobei',
       'Streptococcus pneumoniae', 'Acinetobacter, non-speciated',
       'Acinetobacter lwoffii', 'Serratia liquefaciens',
       'Enterobacter asburiae', 'Citrobacter freundii', nan], dtype=object)

In [14]:
# Number of isolates recorded for each species (most frequent first).
df['Species'].value_counts()

Species
Pseudomonas aeruginosa                 36
Staphylococcus aureus                  36
Klebsiella pneumoniae                  36
Haemophilus influenzae                 29
Escherichia coli                       26
Enterococcus faecalis                  25
Enterobacter cloacae                   20
Klebsiella oxytoca                     19
Streptococcus agalactiae               17
Serratia marcescens                    16
Acinetobacter baumannii                15
Streptococcus pneumoniae               14
Klebsiella (Enterobacter) aerogenes     8
Enterobacter kobei                      5
Enterococcus faecium                    4
Acinetobacter, non-speciated            4
Acinetobacter pitii                     3
Serratia liquefaciens                   2
Enterobacter asburiae                   2
Acinetobacter junii                     1
Acinetobacter lwoffii                   1
Citrobacter freundii                    1
Name: count, dtype: int64

In [15]:
# Percentage of isolates belonging to each species.
df['Species'].value_counts(normalize=True).mul(100)

Species
Pseudomonas aeruginosa                 11.2500
Staphylococcus aureus                  11.2500
Klebsiella pneumoniae                  11.2500
Haemophilus influenzae                  9.0625
Escherichia coli                        8.1250
Enterococcus faecalis                   7.8125
Enterobacter cloacae                    6.2500
Klebsiella oxytoca                      5.9375
Streptococcus agalactiae                5.3125
Serratia marcescens                     5.0000
Acinetobacter baumannii                 4.6875
Streptococcus pneumoniae                4.3750
Klebsiella (Enterobacter) aerogenes     2.5000
Enterobacter kobei                      1.5625
Enterococcus faecium                    1.2500
Acinetobacter, non-speciated            1.2500
Acinetobacter pitii                     0.9375
Serratia liquefaciens                   0.6250
Enterobacter asburiae                   0.6250
Acinetobacter junii                     0.3125
Acinetobacter lwoffii                   0.3125
Citro

In [16]:
# Contingency table: species as rows, Amikacin interpretation as columns.
pd.crosstab(index=df['Species'], columns=df['Amikacin_I'])

Amikacin_I,Intermediate,Resistant,Susceptible
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acinetobacter baumannii,1,11,3
Acinetobacter junii,0,0,1
Acinetobacter lwoffii,0,0,1
Acinetobacter pitii,0,0,3
"Acinetobacter, non-speciated",0,0,4
Citrobacter freundii,0,0,1
Enterobacter asburiae,0,0,2
Enterobacter cloacae,1,0,19
Enterobacter kobei,0,0,5
Escherichia coli,0,1,25


## Numerical Data Analysis

Here, the Amikacin column is read in as an object (string) type. After reviewing the data myself, the values are numeric readings — some may carry extra characters such as `<`, `>`, `=` or a unit, which the next cell strips — so the column should really be a float type. Treating it as numerical is also convenient because it lets us make direct comparisons with the Amikacin_I column, which holds the conclusions drawn from the results in the Amikacin column.

The Amikacin column contains the results of MIC (Minimum Inhibitory Concentration) tests conducted on each sample. The values are in micrograms per milliliter (µg/mL) and describe the lowest concentration of the antibiotic Amikacin that inhibits the growth of the bacteria.

In [18]:
# Convert the raw 'Amikacin' MIC strings to float.
#
# The column is read as object dtype. One regex pass removes the same tokens
# the step-by-step cleaning removed: comparison signs (<, >, =), the 'MIC'
# label, the unit ('ug/ml' / 'µg/ml', micro sign or Greek mu) and whitespace.
#
# NOTE: dropping '<' / '>' discards censoring information, so a reading such
# as '>32' is kept as 32. Keep this in mind when interpreting the summary
# statistics and correlations computed from this column.
#
# The dtype guard makes the cell safe to re-run: once the column is numeric
# there is nothing left to strip, and `.str` would raise AttributeError on a
# float column.
amikacin_raw = df['Amikacin']
if not pd.api.types.is_numeric_dtype(amikacin_raw):
    df['Amikacin'] = (
        amikacin_raw
        .str.replace(r'[<>=\s]|MIC|[u\u00b5\u03bc]g/ml', '', regex=True)
        .astype(float)
    )

In [20]:
# Cast the cleaned 'Amikacin' values to float so they can be summarised numerically.
amikacin_as_float = df['Amikacin'].astype('float64')
df['Amikacin'] = amikacin_as_float

In [21]:
# Summary statistics (count, mean, spread, quartiles) of the Amikacin MIC values.
df['Amikacin'].describe()

count    224.000000
mean       6.399554
std       14.290153
min        0.500000
25%        2.000000
50%        2.000000
75%        4.000000
max       64.000000
Name: Amikacin, dtype: float64

In [22]:
# Mean Amikacin MIC across all samples with a recorded value.
df['Amikacin'].mean()

np.float64(6.399553571428571)

## Categorical/Numerical Comparative Analysis

In [23]:
# Summary statistics of the 'Amikacin' MIC values within each 'Amikacin_I' category.
mic_by_interpretation = df.groupby('Amikacin_I')['Amikacin']
mic_by_interpretation.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Amikacin_I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Intermediate,3.0,32.0,0.0,32.0,32.0,32.0,32.0,32.0
Resistant,12.0,64.0,0.0,64.0,64.0,64.0,64.0,64.0
Susceptible,180.0,2.583333,1.833503,0.5,2.0,2.0,4.0,16.0


In [24]:
# Summary statistics of the 'Amikacin' MIC values for each species.
mic_by_species = df.groupby('Species')['Amikacin']
mic_by_species.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Acinetobacter baumannii,15.0,49.633333,25.579335,0.5,48.0,64.0,64.0,64.0
Acinetobacter junii,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Acinetobacter lwoffii,1.0,0.5,,0.5,0.5,0.5,0.5,0.5
Acinetobacter pitii,3.0,6.666667,8.082904,2.0,2.0,2.0,9.0,16.0
"Acinetobacter, non-speciated",4.0,1.75,0.5,1.0,1.75,2.0,2.0,2.0
Citrobacter freundii,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
Enterobacter asburiae,2.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0
Enterobacter cloacae,20.0,3.75,6.812334,1.0,2.0,2.0,2.0,32.0
Enterobacter kobei,5.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0
Enterococcus faecalis,0.0,,,,,,,


## Numerical/Numerical Data Analysis

In [27]:
# Convert the raw 'Ampicillin' MIC strings to float.
#
# One regex pass removes the same tokens the step-by-step cleaning removed:
# comparison signs (<, >, =), the 'MIC' label, the unit ('ug/ml' / 'µg/ml',
# micro sign or Greek mu) and whitespace; the result is then cast to float.
#
# NOTE: dropping '<' / '>' discards censoring information, so a reading such
# as '>32' is kept as 32. Keep this in mind when interpreting statistics
# computed from this column.
#
# The dtype guard makes the cell safe to re-run: after the first run the
# column is already float, and `.str` would raise AttributeError on it.
ampicillin_raw = df['Ampicillin']
if not pd.api.types.is_numeric_dtype(ampicillin_raw):
    df['Ampicillin'] = (
        ampicillin_raw
        .str.replace(r'[<>=\s]|MIC|[u\u00b5\u03bc]g/ml', '', regex=True)
        .astype(float)
    )

In [28]:
# Pearson correlation between the Amikacin and Ampicillin MIC values.
amikacin_mic = df['Amikacin']
ampicillin_mic = df['Ampicillin']
amikacin_mic.corr(ampicillin_mic, method='pearson')

np.float64(0.09268581755123168)