In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_excel("URINELOTS.xlsx")
data = data[['UHID','RAMJA Results','AIIMS Result culture']]
data.columns = ['UHID','RAMJA','AIIMS']
data.head()

Unnamed: 0,UHID,RAMJA,AIIMS
0,20220065432,Enterococcus faecalis,Enterococcus faecalis
1,20240016182,,Contamination
2,20230125389,"E.Coli, K.pneumoniae, P. aeruginosa, A. bauman...",E.coli
3,20180022685,"K.pneumoniae, E. faecalis, P. mirabilis, A. b...",CONTAMINATION
4,20201415102,,


In [3]:
data.shape

(146, 3)

In [377]:
#df_nan.to_excel('compareddata.xlsx', sheet_name='Null data')

In [4]:
import re
def cleanup(text):
    text = str(text)
    text = text.upper()
    text = text.replace(".", " ")
    text = text.replace(" AND ", " ")
    text = re.sub(r'\s+', ' ', text)  # Use re.sub for regular expression replacement
    text = text.strip()
    return text


In [5]:
bacteriatypes = {
    "COLI" : 'Escherichia coli', 
    "PNEUMO" : 'Klebsiella pneumoniae',
    "AERUGI" : 'Pseudomonas aeruginosa',
    "BAUM" : 'Acinetobacter baumannii', 
    "MIRABIL" : 'Proteus mirabilis', 
    "FAECALI" : 'Enterococcus faecalis', 
    "AUREUS" : 'Staphylococcus aureus', 
    "SAPROPHY" : 'Staphylococcus saprophyticus',
    "STERI" : 'Sterile',
    "CONTA" : 'Contamination',
    "NAN" : 'NAN'
}

In [6]:
data['cleanRAMJA'] = data['RAMJA'].apply(cleanup)
data['cleanAIIMS'] = data['AIIMS'].apply(cleanup)

In [7]:
def check(mainstring, s1, s2):
    if (mainstring.count(s1) > 0 or mainstring.count(s2) > 0):
        #print("YES")
        return(1)
    else:
        #print("NO")
        return(0)

### RAMJA Analysis

In [8]:
df_R = data.iloc[:,0:6]
genus = ["ESCHER","KLEBS","PSEUDO","ACINE","PROT","ENTER","AURE","SAPRO","STERI","CONTA","NAN"]
species = ["COLI", "PNEUMO", "AERUGI", "BAUM", "MIRABIL", "FAECALI", "AUREUS", "SAPROPHY", "STERI","CONTA","NAN"]
for s1, s2 in zip(genus,species):
    df_R[s2] = data['cleanRAMJA'].apply(lambda x: check(x, s1, s2))

df_R.rename(columns=bacteriatypes, inplace=True) #change column names
# add suffix _R to column names
# columns_to_modify = ['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus','Sterile','Contamination','NAN']
# data.rename(columns={col: f"{col}_R" for col in columns_to_modify}, inplace=True)

In [9]:
df_R.shape

(146, 16)

In [10]:
#....add a column named 'Others' contain values other than assigend catagories
df_R_temp = df_R[['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus','Sterile','Contamination','NAN']]
df_R_temp = df_R_temp == 0
df_R_temp['Others_R'] = df_R_temp.all(axis=1)
df_R['Others_R'] = df_R_temp['Others_R'].astype(int)
df_R.shape

(146, 17)

In [11]:
#...Get infections positive column in RAMJA
df_R_temp  = df_R[['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus']]
df_R_temp = df_R_temp == 1
df_R_temp['Infection_R'] = df_R_temp.any(axis=1)
df_R['Infection_R'] = df_R_temp['Infection_R'].astype(int)
df_R.shape

(146, 18)

In [12]:
#....Drop rows containing NaN in Ramaj column results
df_NonNaN_R = df_R.dropna(subset=['RAMJA'])
df_NonNaN_R.shape

(143, 18)

In [13]:
#...Remove contaminated sample and Others samples
df_R_clean = df_NonNaN_R[(df_NonNaN_R['Contamination'] == 0) & (df_NonNaN_R['Others_R'] == 0)]
df_R_clean.shape

(143, 18)

In [14]:
#....After Dopping NaN these two set of values should be equal
print(df_NonNaN_R['Infection_R'].value_counts())
print(df_NonNaN_R['Sterile'].value_counts())

1    97
0    46
Name: Infection_R, dtype: int64
0    97
1    46
Name: Sterile, dtype: int64


In [391]:
#df_R.to_excel("temp1.xlsx")

### AIIMS Analysis

In [15]:
df_A = data.iloc[:,0:6]
genus = ["ESCHER","KLEBS","PSEUDO","ACINE","PROT","ENTER","AURE","SAPRO","STERI","CONTA","NAN"]
species = ["COLI", "PNEUMO", "AERUGI", "BAUM", "MIRABIL", "FAECALI", "AUREUS", "SAPROPHY", "STERI","CONTA","NAN"]
for s1, s2 in zip(genus,species):
    df_A[s2] = data['cleanAIIMS'].apply(lambda x: check(x, s1, s2))

df_A.rename(columns=bacteriatypes, inplace=True) #change column names

# add suffix _R to column names
#columns_to_modify = ['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus','Sterile','Contamination','NAN']
#data.rename(columns={col: f"{col}_A" for col in columns_to_modify}, inplace=True)

In [16]:
#....add a column named 'Others' contain values other than assigend catagories
df_A_temp = df_A[['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus','Sterile','Contamination','NAN']]
df_A_temp = df_A_temp == 0
df_A_temp['Others_A'] = df_A_temp.all(axis=1)
df_A['Others_A'] = df_A_temp['Others_A'].astype(int)
df_A.shape

(146, 17)

In [17]:
#...Get infections positive (any infection) in AIIMS
df_A_temp  = df_A[['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus']]
df_A_temp >= 1
df_A_temp['Infection_A'] = df_A_temp.any(axis=1)
df_A['Infection_A'] = df_A_temp['Infection_A'].astype(int)
df_A.shape


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_A_temp['Infection_A'] = df_A_temp.any(axis=1)


(146, 18)

In [18]:
#...dataset containing removed NaN based on AIIMS
df_NonNaN_A = df_A.dropna(subset=['AIIMS'])

In [19]:
#...Remove contaminated sample and Others samples 
df_A_clean = df_NonNaN_A[(df_NonNaN_A['Contamination'] == 0) & (df_NonNaN_A['Others_A'] == 0)]
df_A_clean.shape

(94, 18)

In [20]:
df_A_clean.head()

Unnamed: 0,UHID,RAMJA,AIIMS,cleanRAMJA,cleanAIIMS,Escherichia coli,Klebsiella pneumoniae,Pseudomonas aeruginosa,Acinetobacter baumannii,Proteus mirabilis,Enterococcus faecalis,Staphylococcus aureus,Staphylococcus saprophyticus,Sterile,Contamination,NAN,Others_A,Infection_A
0,20220065432,Enterococcus faecalis,Enterococcus faecalis,ENTEROCOCCUS FAECALIS,ENTEROCOCCUS FAECALIS,0,0,0,0,0,1,0,0,0,0,0,0,1
2,20230125389,"E.Coli, K.pneumoniae, P. aeruginosa, A. bauman...",E.coli,"E COLI, K PNEUMONIAE, P AERUGINOSA, A BAUMANNI...",E COLI,1,0,0,0,0,0,0,0,0,0,0,0,1
5,20240012395,"Polymicrobial (E.coli, A. baumannii, P.mirabil...",E.coli,"POLYMICROBIAL (E COLI, A BAUMANNII, P MIRABILI...",E COLI,1,0,0,0,0,0,0,0,0,0,0,0,1
6,20220166372,"Pseudomonas, P.mirabilis, E.Faecalis, S. aureus",E.coli & Pseudomnas,"PSEUDOMONAS, P MIRABILIS, E FAECALIS, S AUREUS",E COLI & PSEUDOMNAS,1,0,1,0,0,0,0,0,0,0,0,0,1
8,20230149608,"E. Coli, Pseudomonas, P.mirabilis, E.Faecalis,...","E.coli,A.Baumini","E COLI, PSEUDOMONAS, P MIRABILIS, E FAECALIS, ...","E COLI,A BAUMINI",1,0,0,1,0,0,0,0,0,0,0,0,1


In [21]:
#....After Dopping NaN these two set of values should be equal
print(df_A_clean['Infection_A'].value_counts())
print(df_A_clean['Sterile'].value_counts())

0    64
1    30
Name: Infection_A, dtype: int64
1    64
0    30
Name: Sterile, dtype: int64


In [399]:
#...Values for RAMJA
Null_A = data['AIIMS'].isnull().sum(axis=0)
Non_Null_A = data['AIIMS'].count()

#...VAlues for RAMJA after dropping NaN
conta_R = df_NonNaN_A['Contamination'].sum()
Pos_R = df_NonNaN_R['Infection_R'].sum()
Neg_R = df_NonNaN_R['Sterile'].sum()

In [400]:
#df_NonNaN_A.to_excel("temp1.xlsx")

In [401]:
#......
df_R_infec = df_R_clean[df_R_clean['Infection_R'] == 1]
df_R_infec = df_R_infec[['UHID','RAMJA','AIIMS']]
df_R_infec.shape

(97, 3)

In [402]:
df_A_infec = df_A_clean[df_A_clean['Infection_A'] == 1]
df_A_infec = df_A_infec[['UHID','RAMJA','AIIMS']]
df_A_infec.shape

(30, 3)

### Common analysis

In [403]:
df_R_tempc = df_R_clean[['UHID','RAMJA','AIIMS', 'Infection_R']]
df_R_tempc.shape

(143, 4)

In [404]:
df_A_tempc = df_A_clean[['UHID','Infection_A']]
df_A_tempc.shape

(94, 2)

In [405]:
df_A_tempc.head()

Unnamed: 0,UHID,Infection_A
0,20220065432,1
2,20230125389,1
5,20240012395,1
6,20220166372,1
8,20230149608,1


In [406]:
df_C_clean = pd.merge(df_R_tempc,df_A_tempc,how='inner', on='UHID')
df_C_clean.shape

(96, 5)

In [407]:
df_C_clean.head()

Unnamed: 0,UHID,RAMJA,AIIMS,Infection_R,Infection_A
0,20220065432,Enterococcus faecalis,Enterococcus faecalis,1,1
1,20230125389,"E.Coli, K.pneumoniae, P. aeruginosa, A. bauman...",E.coli,1,1
2,20240012395,"Polymicrobial (E.coli, A. baumannii, P.mirabil...",E.coli,1,1
3,20220166372,"Pseudomonas, P.mirabilis, E.Faecalis, S. aureus",E.coli & Pseudomnas,1,1
4,20230149608,"E. Coli, Pseudomonas, P.mirabilis, E.Faecalis,...","E.coli,A.Baumini",1,1


In [23]:
#...Concordance: Get infections in both RAMJA and AIIMS
df_R_temp1  = df_R[['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus']]
df_A_temp1  = df_A[['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus']]
df_C_temp1 = df_R_temp1 & df_A_temp1
df_C_temp1['Infection_C'] = df_C_temp1.any(axis=1)
df_C = data[['UHID','RAMJA','AIIMS']]
df_C['concordance'] = df_C_temp1['Infection_C'].astype(int)
df_C.head()

In [35]:
df_C['concordance'].value_counts()

0    128
1     18
Name: concordance, dtype: int64

In [38]:
df_concordance = df_C[df_C['concordance'] == 1]
df_concordance = df_concordance[['UHID','RAMJA','AIIMS']]
df_concordance.shape

(18, 3)

In [39]:
df_concordance.head()

Unnamed: 0,UHID,RAMJA,AIIMS
0,20220065432,Enterococcus faecalis,Enterococcus faecalis
2,20230125389,"E.Coli, K.pneumoniae, P. aeruginosa, A. bauman...",E.coli
5,20240012395,"Polymicrobial (E.coli, A. baumannii, P.mirabil...",E.coli
6,20220166372,"Pseudomonas, P.mirabilis, E.Faecalis, S. aureus",E.coli & Pseudomnas
8,20230149608,"E. Coli, Pseudomonas, P.mirabilis, E.Faecalis,...","E.coli,A.Baumini"


In [64]:
# Write the DataFrames to an Excel file
with pd.ExcelWriter('compared.xlsx') as writer:
    df_nan.to_excel(writer, sheet_name='Null', index=False)
    df_nonnull.to_excel(writer, sheet_name='Non Null', index=False)
    df_R_infec.to_excel(writer, sheet_name='Ramja Positive', index=False)
    df_A_infec.to_excel(writer, sheet_name='AIIMS Positive', index=False)
    df_concordance.to_excel(writer, sheet_name='Concordance', index=False)

In [None]:

#......VAlies for both RAMJA and AIIMS
Total_Samples  = data.shape[0]

#...Values for RAMJA
Null_R = data['RAMJA'].isnull().sum(axis=0)
Non_Null_R = data['RAMJA'].count()

#...VAlues for RAMJA after dropping NaN
conta_R = df_NonNaN_R['Contamination'].sum()
Pos_R = df_NonNaN_R['Infection_R'].sum()
Neg_R = df_NonNaN_R['Sterile'].sum()


### Confusion matrix bwtween RAMJA and AIIMS

In [84]:
df_infection = df_infection.dropna(subset=['RAMJA','AIIMS'])

In [86]:
df_infection.head()

Unnamed: 0,UHID,RAMJA,AIIMS,Infection_A,Infection_R
0,20220065432,Enterococcus faecalis,Enterococcus faecalis,1,1
2,20230125389,"E.Coli, K.pneumoniae, P. aeruginosa, A. bauman...",E.coli,1,1
3,20180022685,"K.pneumoniae, E. faecalis, P. mirabilis, A. b...",CONTAMINATION,0,1
5,20240012395,"Polymicrobial (E.coli, A. baumannii, P.mirabil...",E.coli,1,1
6,20220166372,"Pseudomonas, P.mirabilis, E.Faecalis, S. aureus",E.coli & Pseudomnas,1,1


In [88]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(df_infection['Infection_A'],df_infection['Infection_R'])
cm


array([[34, 54],
       [ 7, 23]], dtype=int64)

In [90]:
print(df_infection['Infection_A'].value_counts())
print(df_infection['Infection_R'].value_counts())

0    88
1    30
Name: Infection_A, dtype: int64
1    77
0    41
Name: Infection_R, dtype: int64


In [92]:
#...Values for AIIMS
N_Aiims  = data.shape[0]
aiimsNull = data['AIIMS'].isnull().sum(axis=0)
aiimsnonnull = N_Aiims - aiimsNull
conta_aiims = df_A['Contamination'].sum()
totalpositive_aiims = df_infection['Infection_A'].sum()

In [20]:
# from sklearn.metrics import confusion_matrix

# y_true = df_A_nonnull['Escherichia coli']
# y_pred = df_R_nonnull['Escherichia coli']

# cm = confusion_matrix(y_true, y_pred)
# print(cm)

[[68 32]
 [ 7 11]]


In [205]:
index = ['TN', 'FP', 'FN', 'TP']
columns = ['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus','Sterile']
df_cm = pd.DataFrame(cm_dfs,index,columns)
df_cm

Unnamed: 0,Escherichia coli,Klebsiella pneumoniae,Pseudomonas aeruginosa,Acinetobacter baumannii,Proteus mirabilis,Enterococcus faecalis,Staphylococcus aureus,Staphylococcus saprophyticus,Sterile
TN,68,88,103,93,91,88,90,107,42
FP,32,24,13,25,26,24,27,11,12
FN,7,4,0,0,0,3,1,0,38
TP,11,2,2,0,1,3,0,0,26


In [206]:
totalPositive = df_cm.loc['TP'] + df_cm.loc['FP']
totalNegative = df_cm.loc['TN'] + df_cm.loc['FN']
df_cm.loc['Total Positive'] = totalPositive
df_cm.loc['Total Negative'] = totalNegative

In [207]:
df_cm

Unnamed: 0,Escherichia coli,Klebsiella pneumoniae,Pseudomonas aeruginosa,Acinetobacter baumannii,Proteus mirabilis,Enterococcus faecalis,Staphylococcus aureus,Staphylococcus saprophyticus,Sterile
TN,68,88,103,93,91,88,90,107,42
FP,32,24,13,25,26,24,27,11,12
FN,7,4,0,0,0,3,1,0,38
TP,11,2,2,0,1,3,0,0,26
Total Positive,43,26,15,25,27,27,27,11,38
Total Negative,75,92,103,93,91,91,91,107,80


In [211]:
aiims2 = [TotalSamples, aiimsNull, aiimsnonnull, common_nonnull, contaaiims]

In [212]:
aiims2

[146, 27, 119, 118, 14]