In [82]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [83]:
# Read the LOT1 workbook, keep the sample id and the two result columns, and
# give the result columns the short labels used by the rest of the notebook.
result_columns = {
    'UHID': 'UHID',
    'RAMJA Result': 'RAMJA',
    'AIIMS Result culture': 'AIIMS',
}
data = pd.read_excel("RAWLOT1_temp.xlsx")[list(result_columns)].rename(columns=result_columns)
data.head()

Unnamed: 0,UHID,RAMJA,AIIMS
0,20220065432,Enterococcus faecalis,Enterococcus faecalis
1,20240016182,,
2,20230125389,E.Coli. and K.pne,E.coli
3,20180022685,P. mirabilis,CONTAMINATION
4,20201415102,,


In [22]:
#df_nan.to_excel('compareddata.xlsx', sheet_name='Null data')

In [86]:
import re
def cleanup(text):
    """Normalise a free-text culture result for substring matching.

    The value is converted with ``str()`` (so a missing value becomes the
    text ``"NAN"``), upper-cased, has a fixed list of literal substitutions
    applied in order, and finally has whitespace runs collapsed to a single
    space and the ends trimmed.
    """
    # Applied in this exact order; each pair is (literal to find, replacement).
    substitutions = (
        (".", " "),
        (" AND", ","),
        (" &", ","),
        ("POLYMICROBIAL (", ""),
        ("AUROGENOSA", "AERUGINOSA"),
    )
    normalised = str(text).upper()
    for needle, replacement in substitutions:
        normalised = normalised.replace(needle, replacement)
    return re.sub(r'\s+', ' ', normalised).strip()


In [87]:
# Search token (as used for the temporary indicator column names) -> full
# label the column is renamed to.
bacteriatypes = dict([
    ("COLI", 'Escherichia coli'),
    ("PNE", 'Klebsiella pneumoniae'),
    ("AERUGI", 'Pseudomonas aeruginosa'),
    ("BAUM", 'Acinetobacter baumannii'),
    ("MIRABIL", 'Proteus mirabilis'),
    ("FAECALI", 'Enterococcus faecalis'),
    ("AUREUS", 'Staphylococcus aureus'),
    ("SAPROPHY", 'Staphylococcus saprophyticus'),
    ("STERI", 'Sterile'),
    ("CONTA", 'Contamination'),
    ("NAN", 'NAN'),
])

In [88]:
# Add a normalised copy of each lab's raw result text (cleanRAMJA / cleanAIIMS).
for lab in ('RAMJA', 'AIIMS'):
    data['clean' + lab] = data[lab].apply(cleanup)

In [89]:
def check(mainstring, s1, s2):
    """Return 1 when `s1` or `s2` occurs in `mainstring`, otherwise 0.

    `s2` is only looked up when `s1` is not found.
    """
    occurrences = mainstring.count(s1) or mainstring.count(s2)
    return int(occurrences > 0)

In [90]:
data.shape

(128, 5)

### RAMJA Analysis

In [91]:
# Work on a copy so `data` keeps only the raw and cleaned text columns.
df_R = data.copy()
# Parallel lists: position i of `genus` and `species` are the two search
# tokens for one category; the species token doubles as the temporary
# column name.
genus = ["ESCHER","KLEBS","PSEUDO","ACINE","PROT","ENTER","AURE","SAPRO","STERI","CONTA","NAN"]
species = ["COLI", "PNE", "AERUGI", "BAUM", "MIRABIL", "FAECALI", "AUREUS", "SAPROPHY", "STERI","CONTA","NAN"]
for genus_token, species_token in zip(genus, species):
    # 1 if either token occurs in the cleaned RAMJA text, else 0.
    df_R[species_token] = df_R['cleanRAMJA'].map(
        lambda cleaned: check(cleaned, genus_token, species_token)
    )

# Swap the token column names for the full labels in `bacteriatypes`.
df_R = df_R.rename(columns=bacteriatypes)
# add suffix _R to column names
# columns_to_modify = ['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus','Sterile','Contamination','NAN']
# data.rename(columns={col: f"{col}_R" for col in columns_to_modify}, inplace=True)

In [93]:
df_R.shape

(128, 16)

In [94]:
# 'Others_R' is 1 for rows where every category indicator listed below is 0,
# i.e. the RAMJA text matched none of the assigned categories.
df_R_temp = df_R[[
    'Escherichia coli', 'Klebsiella pneumoniae', 'Pseudomonas aeruginosa',
    'Acinetobacter baumannii', 'Proteus mirabilis', 'Enterococcus faecalis',
    'Staphylococcus aureus', 'Staphylococcus saprophyticus',
    'Sterile', 'Contamination', 'NAN',
]].eq(0)
df_R['Others_R'] = df_R_temp.all(axis=1).astype(int)
df_R.shape

(128, 17)

In [95]:
df_R.head()

Unnamed: 0,UHID,RAMJA,AIIMS,cleanRAMJA,cleanAIIMS,Escherichia coli,Klebsiella pneumoniae,Pseudomonas aeruginosa,Acinetobacter baumannii,Proteus mirabilis,Enterococcus faecalis,Staphylococcus aureus,Staphylococcus saprophyticus,Sterile,Contamination,NAN,Others_R
0,20220065432,Enterococcus faecalis,Enterococcus faecalis,ENTEROCOCCUS FAECALIS,ENTEROCOCCUS FAECALIS,0,0,0,0,0,1,0,0,0,0,0,0
1,20240016182,,,NAN,NAN,0,0,0,0,0,0,0,0,0,0,1,0
2,20230125389,E.Coli. and K.pne,E.coli,"E COLI , K PNE",E COLI,1,1,0,0,0,0,0,0,0,0,0,0
3,20180022685,P. mirabilis,CONTAMINATION,P MIRABILIS,CONTAMINATION,0,0,0,0,1,0,0,0,0,0,0,0
4,20201415102,,,NAN,NAN,0,0,0,0,0,0,0,0,0,0,1,0


In [96]:
# 'Infection_R' is 1 when any of the listed indicators is 1. The list holds
# the eight organisms plus 'Contamination' and 'Others_R'; 'Sterile' and
# 'NAN' are left out.
df_R_temp = df_R[[
    'Escherichia coli', 'Klebsiella pneumoniae', 'Pseudomonas aeruginosa',
    'Acinetobacter baumannii', 'Proteus mirabilis', 'Enterococcus faecalis',
    'Staphylococcus aureus', 'Staphylococcus saprophyticus',
    'Contamination', 'Others_R',
]].eq(1)
df_R['Infection_R'] = df_R_temp.any(axis=1).astype(int)
df_R.shape

(128, 18)

In [97]:
# Keep only rows with a non-missing raw RAMJA result. Both names refer to the
# same frame.
df_R_clean = df_NonNaN_R = df_R.dropna(subset=['RAMJA'])

In [98]:
#display(df_NonNaN_R)

In [99]:
#...Remove contaminated sample and Others samples
# df_R_clean = df_NonNaN_R[(df_NonNaN_R['Contamination'] == 0) & (df_NonNaN_R['Others_R'] == 0)]
# df_R_clean.shape

In [100]:
# Sanity check: once blank rows are dropped, the Infection_R counts and the
# Sterile counts are expected to mirror each other.
for flag_column in ('Infection_R', 'Sterile'):
    print(df_R_clean[flag_column].value_counts())

1    70
0    52
Name: Infection_R, dtype: int64
0    70
1    52
Name: Sterile, dtype: int64


In [101]:
#df_R.to_excel("temp1.xlsx")

### AIIMS Analysis

In [102]:
data.shape

(128, 5)

In [103]:
# Work on a copy so `data` keeps only the raw and cleaned text columns.
df_A = data.copy()
# Parallel lists: position i of `genus` and `species` are the two search
# tokens for one category; the species token doubles as the temporary
# column name.
genus = ["ESCHER","KLEBS","PSEUDO","ACINE","PROT","ENTER","AURE","SAPRO","STERI","CONTA","NAN"]
species = ["COLI", "PNE", "AERUGI", "BAUM", "MIRABIL", "FAECALI", "AUREUS", "SAPROPHY", "STERI","CONTA","NAN"]
for genus_token, species_token in zip(genus, species):
    # 1 if either token occurs in the cleaned AIIMS text, else 0.
    df_A[species_token] = df_A['cleanAIIMS'].map(
        lambda cleaned: check(cleaned, genus_token, species_token)
    )

# Swap the token column names for the full labels in `bacteriatypes`.
df_A = df_A.rename(columns=bacteriatypes)

# add suffix _R to column names
#columns_to_modify = ['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus','Sterile','Contamination','NAN']
#data.rename(columns={col: f"{col}_A" for col in columns_to_modify}, inplace=True)

In [104]:
df_A.shape

(128, 16)

In [105]:
# 'Others_A' is 1 for rows where every category indicator listed below is 0,
# i.e. the AIIMS text matched none of the assigned categories.
df_A_temp = df_A[[
    'Escherichia coli', 'Klebsiella pneumoniae', 'Pseudomonas aeruginosa',
    'Acinetobacter baumannii', 'Proteus mirabilis', 'Enterococcus faecalis',
    'Staphylococcus aureus', 'Staphylococcus saprophyticus',
    'Sterile', 'Contamination', 'NAN',
]].eq(0)
df_A['Others_A'] = df_A_temp.all(axis=1).astype(int)
df_A.shape

(128, 17)

In [106]:
df_A.head()

Unnamed: 0,UHID,RAMJA,AIIMS,cleanRAMJA,cleanAIIMS,Escherichia coli,Klebsiella pneumoniae,Pseudomonas aeruginosa,Acinetobacter baumannii,Proteus mirabilis,Enterococcus faecalis,Staphylococcus aureus,Staphylococcus saprophyticus,Sterile,Contamination,NAN,Others_A
0,20220065432,Enterococcus faecalis,Enterococcus faecalis,ENTEROCOCCUS FAECALIS,ENTEROCOCCUS FAECALIS,0,0,0,0,0,1,0,0,0,0,0,0
1,20240016182,,,NAN,NAN,0,0,0,0,0,0,0,0,0,0,1,0
2,20230125389,E.Coli. and K.pne,E.coli,"E COLI , K PNE",E COLI,1,0,0,0,0,0,0,0,0,0,0,0
3,20180022685,P. mirabilis,CONTAMINATION,P MIRABILIS,CONTAMINATION,0,0,0,0,0,0,0,0,0,1,0,0
4,20201415102,,,NAN,NAN,0,0,0,0,0,0,0,0,0,0,1,0


In [107]:
# 'Infection_A' is 1 when any of the listed indicators is 1. The list holds
# the eight organisms plus 'Contamination' and 'Others_A'; 'Sterile' and
# 'NAN' are left out.
df_A_temp = df_A[[
    'Escherichia coli', 'Klebsiella pneumoniae', 'Pseudomonas aeruginosa',
    'Acinetobacter baumannii', 'Proteus mirabilis', 'Enterococcus faecalis',
    'Staphylococcus aureus', 'Staphylococcus saprophyticus',
    'Contamination', 'Others_A',
]].eq(1)
df_A['Infection_A'] = df_A_temp.any(axis=1).astype(int)
df_A.shape


(128, 18)

In [108]:
# Keep only rows with a non-missing raw AIIMS result. Both names refer to the
# same frame.
df_A_clean = df_NonNaN_A = df_A.dropna(subset=['AIIMS'])

In [109]:
#...Remove contaminated sample and Others samples 
# df_A_clean = df_NonNaN_A[(df_NonNaN_A['Contamination'] == 0) & (df_NonNaN_A['Others_A'] == 0)]
# df_A_clean.shape

In [110]:
# Sanity check: once blank rows are dropped, the Infection_A counts and the
# Sterile counts are expected to mirror each other.
for flag_column in ('Infection_A', 'Sterile'):
    print(df_A_clean[flag_column].value_counts())

0    56
1    55
Name: Infection_A, dtype: int64
1    56
0    55
Name: Sterile, dtype: int64


In [111]:
#df_NonNaN_A.to_excel("temp1.xlsx")

In [112]:
# Rows flagged Infection_R == 1, reduced to the id and the two raw result columns.
df_R_infec = df_R_clean.loc[df_R_clean['Infection_R'] == 1, ['UHID','RAMJA','AIIMS']]
df_R_infec.shape

(70, 3)

In [113]:
# Rows flagged Infection_A == 1, reduced to the id and the two raw result columns.
df_A_infec = df_A_clean.loc[df_A_clean['Infection_A'] == 1, ['UHID','RAMJA','AIIMS']]
df_A_infec.shape

(55, 3)

### Concordance analysis

In [114]:
#...Concordance: flag samples where RAMJA and AIIMS both report at least one
#   of the same organisms. Only the eight organism indicators take part;
#   Sterile, Contamination, NAN and Others are not compared.
organism_cols = ['Escherichia coli','Klebsiella pneumoniae','Pseudomonas aeruginosa','Acinetobacter baumannii','Proteus mirabilis','Enterococcus faecalis','Staphylococcus aureus','Staphylococcus saprophyticus']
df_R_temp1 = df_R[organism_cols]
df_A_temp1 = df_A[organism_cols]
# Element-wise AND of the 0/1 indicators: non-zero only where both labs
# flagged the same organism for the same row.
df_C_temp1 = df_R_temp1 & df_A_temp1
df_C_temp1['Infection_C'] = df_C_temp1.any(axis=1)
# Take an explicit copy: adding a column to a bare column-slice of `data`
# is what triggered pandas' SettingWithCopyWarning in this cell.
df_C = data[['UHID','RAMJA','AIIMS']].copy()
df_C['concordance'] = df_C_temp1['Infection_C'].astype(int)
df_C.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_C['concordance'] = df_C_temp1['Infection_C'].astype(int)


Unnamed: 0,UHID,RAMJA,AIIMS,concordance
0,20220065432,Enterococcus faecalis,Enterococcus faecalis,1
1,20240016182,,,0
2,20230125389,E.Coli. and K.pne,E.coli,1
3,20180022685,P. mirabilis,CONTAMINATION,0
4,20201415102,,,0


In [115]:
df_C.shape

(128, 4)

In [116]:
df_C['concordance'].value_counts()

0    103
1     25
Name: concordance, dtype: int64

In [117]:
# Keep rows where both raw result columns are present.
df_NonNaN_C = df_C[df_C[['RAMJA','AIIMS']].notna().all(axis=1)]

In [118]:
df_NonNaN_C.shape

(109, 4)

### common clean samples

In [119]:
# RAMJA side of the join: id, both raw results and the RAMJA infection flag.
df_R_tempc = df_R_clean.loc[:, ['UHID','RAMJA','AIIMS', 'Infection_R']]
df_R_tempc.shape

(122, 4)

In [120]:
# AIIMS side of the join: id and the AIIMS infection flag only.
df_A_tempc = df_A_clean.loc[:, ['UHID','Infection_A']]
df_A_tempc.shape

(111, 2)

In [121]:
# Inner join on UHID: keeps the samples present in both the RAMJA and the
# AIIMS non-blank subsets, with one infection flag per lab.
df_C_clean = df_R_tempc.merge(df_A_tempc, on='UHID', how='inner')
df_C_clean.shape

(109, 5)

In [122]:
df_C_clean.head()

Unnamed: 0,UHID,RAMJA,AIIMS,Infection_R,Infection_A
0,20220065432,Enterococcus faecalis,Enterococcus faecalis,1,1
1,20230125389,E.Coli. and K.pne,E.coli,1,1
2,20180022685,P. mirabilis,CONTAMINATION,1,1
3,20240012395,"Polymicrobial (E.coli, A. baumannii, P.mirabil...",E.coli,1,1
4,20220166372,"E.coli, Pseudomonas, P.mirabilis, E.Faecalis, ...",E.coli & Pseudomnas,1,1


#### Confusion matrix

In [123]:
from sklearn.metrics import confusion_matrix

# AIIMS is used as the reference labels and RAMJA as the predicted labels.
y_true, y_pred = df_C_clean['Infection_A'], df_C_clean['Infection_R']

cm = confusion_matrix(y_true, y_pred)
print(cm)

[[38 18]
 [ 7 46]]


In [124]:
# Total count over all cells of the confusion matrix.
cm.sum()

109

In [125]:
# Unpack the 2x2 matrix row by row: first row -> (tn, fp), second row -> (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_true, y_pred)
(tn, fp, fn, tp)

(38, 18, 7, 46)

In [126]:
# Re-arrange the four counts so positives come first: [[tp, fp], [fn, tn]].
t_cm = np.array([[tp, fp],
                 [fn, tn]])
t_cm

array([[46, 18],
       [ 7, 38]], dtype=int64)

In [127]:
# Label the re-arranged matrix: rows are the RAMJA call, columns the AIIMS call.
df_cm = pd.DataFrame(
    data=t_cm,
    columns=['AIIMS Positive', 'AIIMS Negative'],
    index=['RAMJA Positive', 'RAMJA Negative'],
)
df_cm

Unnamed: 0,AIIMS Positive,AIIMS Negative
RAMJA Positive,46,18
RAMJA Negative,7,38


#### Scores

In [73]:
def scores(y_test, y_pred):
    """Compute agreement metrics between two sets of binary 0/1 labels.

    Parameters
    ----------
    y_test : array-like of 0/1
        Reference labels (treated as ground truth).
    y_pred : array-like of 0/1
        Labels being evaluated; must have the same shape as `y_test`.

    Returns
    -------
    numpy.ndarray
        ``[accuracy, sensitivity, specificity, PPV, NPV, PLR, NLR]``.
        A metric whose denominator is zero is returned as ``inf`` or ``nan``.

    Raises
    ------
    ValueError
        If `y_test` and `y_pred` do not have the same shape.
    """
    reference = np.asarray(y_test)
    predicted = np.asarray(y_pred)
    if reference.shape != predicted.shape:
        raise ValueError(
            "y_test and y_pred must have the same shape, got "
            f"{reference.shape} and {predicted.shape}"
        )

    # Count the four cells from this function's own arguments. The earlier
    # version read a notebook-level `y_true` here, so the result ignored
    # `y_test`. Counting directly also keeps all four cells defined when one
    # class is absent from the inputs.
    tp = np.sum((reference == 1) & (predicted == 1))
    tn = np.sum((reference == 0) & (predicted == 0))
    fp = np.sum((reference == 0) & (predicted == 1))
    fn = np.sum((reference == 1) & (predicted == 0))

    # Zero denominators yield inf/nan; silence the accompanying numpy
    # warnings because that outcome is part of the documented return value.
    with np.errstate(divide='ignore', invalid='ignore'):
        #.....................Accuracy......................#
        accur = (tp + tn) / (tp + tn + fp + fn)

        #.....................Sensitivity...................#
        sens = tp / (tp + fn)

        #.....................Specificity...................#
        spec = tn / (tn + fp)

        #..........Positive predictive value (PPV)..........#
        PPV = tp / (tp + fp)

        #..........Negative predictive value (NPV)..........#
        NPV = tn / (tn + fn)

        #..........Positive likelihood ratio (PLR)..........#
        PLR = sens / (1 - spec)

        #..........Negative likelihood ratio (NLR)..........#
        NLR = (1 - sens) / spec

    errorList = np.array([accur, sens, spec, PPV, NPV, PLR, NLR])
    return errorList

In [128]:
# Row labels, in the same order as the array returned by scores().
scoreid = ['Accuracy', 'Sensitivity', 'Specificity', 'PPV', 'NPV', 'PLR', 'NLR']
# scores() is evaluated once here; an extra bare call whose result was
# discarded has been removed.
df_score = pd.DataFrame(scores(y_true, y_pred), index=scoreid, columns=['LOT1']).round(4)
df_score

Unnamed: 0,LOT1
Accuracy,0.7706
Sensitivity,0.8679
Specificity,0.6786
PPV,0.7188
NPV,0.8444
PLR,2.7002
NLR,0.1946


In [44]:
""" print("#..........RAMJA Details..........#")
#......VAlies for both RAMJA and AIIMS
print("Total samples: ", data.shape[0])

#...Values for RAMJA
print("Null Samples RAMJA: ",data['RAMJA'].isnull().sum(axis=0))
print("Non-Null RAMJA: ", data['RAMJA'].count())

#...VAlues for RAMJA after dropping NaN
print("Contaminated Samples RAMJA: ", df_NonNaN_R['Contamination'].sum())
print("Other RAMJA: ", df_NonNaN_R['Others_R'].sum())
print("Positive RAMJA: ", df_NonNaN_R['Infection_R'].sum())
print("Negative RAMJA: ", df_NonNaN_R['Sterile'].sum())

print("#..........AIIMS Details..........#")
#...Values for AIIMS
print("Total samples: ", data.shape[0])
print("Null Samples AIIMS: ",data['AIIMS'].isnull().sum(axis=0))
print("Non-Null AIIMS: ", data['AIIMS'].count())

#...VAlues for RAMJA after dropping NaN
print("Contaminated Samples AIIMS: ", df_NonNaN_A['Contamination'].sum())
print("Other AIIMS: ", df_NonNaN_A['Others_A'].sum())
print("Positive AIIMS: ", df_NonNaN_A['Infection_A'].sum())
print("Negative AIIMS: ", df_NonNaN_A['Sterile'].sum())
 """

' print("#..........RAMJA Details..........#")\n#......VAlies for both RAMJA and AIIMS\nprint("Total samples: ", data.shape[0])\n\n#...Values for RAMJA\nprint("Null Samples RAMJA: ",data[\'RAMJA\'].isnull().sum(axis=0))\nprint("Non-Null RAMJA: ", data[\'RAMJA\'].count())\n\n#...VAlues for RAMJA after dropping NaN\nprint("Contaminated Samples RAMJA: ", df_NonNaN_R[\'Contamination\'].sum())\nprint("Other RAMJA: ", df_NonNaN_R[\'Others_R\'].sum())\nprint("Positive RAMJA: ", df_NonNaN_R[\'Infection_R\'].sum())\nprint("Negative RAMJA: ", df_NonNaN_R[\'Sterile\'].sum())\n\nprint("#..........AIIMS Details..........#")\n#...Values for AIIMS\nprint("Total samples: ", data.shape[0])\nprint("Null Samples AIIMS: ",data[\'AIIMS\'].isnull().sum(axis=0))\nprint("Non-Null AIIMS: ", data[\'AIIMS\'].count())\n\n#...VAlues for RAMJA after dropping NaN\nprint("Contaminated Samples AIIMS: ", df_NonNaN_A[\'Contamination\'].sum())\nprint("Other AIIMS: ", df_NonNaN_A[\'Others_A\'].sum())\nprint("Positive AIIM

#### Samples detail

In [75]:
# Summary counts; every value is a two-item list ordered [RAMJA, AIIMS].
# Entries that are not lab-specific repeat the same number twice.
samplesDetail = {
    "Total Samples": [len(data)] * 2,
    "Blank Results": [data[lab].isna().sum() for lab in ('RAMJA', 'AIIMS')],
    "Non-Blank Results": [data[lab].count() for lab in ('RAMJA', 'AIIMS')],
    "Contaminated Samples": [frame['Contamination'].sum() for frame in (df_NonNaN_R, df_NonNaN_A)],
    "Others": [df_NonNaN_R['Others_R'].sum(), df_NonNaN_A['Others_A'].sum()],
    "Common samples": [len(df_NonNaN_C)] * 2,
    "Positive": [df_NonNaN_R['Infection_R'].sum(), df_NonNaN_A['Infection_A'].sum()],
    "Negative": [frame['Sterile'].sum() for frame in (df_NonNaN_R, df_NonNaN_A)],
    "Common Positive": [tp] * 2,
    "Concordance [Common bacteria]": [df_C['concordance'].sum()] * 2,
    "Common Negative": [tn] * 2,
}

In [76]:
# One row per summary item, one column per lab.
df_samplesDetail = pd.DataFrame(samplesDetail, index=['RAMJA', 'AIIMS']).transpose()
df_samplesDetail

Unnamed: 0,RAMJA,AIIMS
Total Samples,131,131
Blank Results,6,17
Non-Blank Results,125,114
Contaminated Samples,0,17
Others,0,9
Common samples,112,112
Positive,70,58
Negative,55,56
Common Positive,46,46
Concordance [Common bacteria],25,25


#### Export matrices to excel

In [77]:
# Write each summary table to its own sheet of one workbook.
matrix_sheets = {
    'SamplesDetail': df_samplesDetail,
    'ConfusionMatrix': df_cm,
    'Scores': df_score,
}
with pd.ExcelWriter('matrices2.xlsx') as writer:
    for sheet_label, frame in matrix_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_label, index=True)
    


#### Sort the required dataset and export to excel

In [48]:
# Non-blank AIIMS rows flagged as contamination.
df_A_conta = df_NonNaN_A.loc[df_NonNaN_A['Contamination'] == 1, ['UHID', 'RAMJA', 'AIIMS']]
df_A_conta.shape

(14, 3)

In [49]:
# Non-blank AIIMS rows that matched none of the assigned categories.
df_A_others = df_NonNaN_A.loc[df_NonNaN_A['Others_A'] == 1, ['UHID', 'RAMJA', 'AIIMS']]
df_A_others.shape

(9, 3)

In [212]:
display(df_A_others)

Unnamed: 0,UHID,RAMJA,AIIMS
26,20240019044,E.coli & Klebsiella pneumonia,Candida spp.
70,20240034567,"A. baumannii, P.mirabilis, E. Faecalis",Insignificant growth
90,20240046542,P.Aurogenosa,Acintobacter spp.
96,20240074281,Sterile,Commensal Flora
104,20240028527,Sterile,Insignificant growth of colonizer
107,20240087671,"S.Aureus ,Saprohutics, and Mirababilis",Insignificant growth gram positive cocci
108,20230121277,E. coli and A.Baumini,Insignificant Gram negative bacilli
109,20240094465,"A. baumanni, P mirabilis, S saprophyticus",Candida spp. and Insignificant gram negative b...
110,20230156801,S.Saprophytics,Insignificant gram negative bacilli


In [52]:
# Samples flagged positive by both RAMJA and AIIMS.
both_positive = (df_C_clean['Infection_R'] == 1) & (df_C_clean['Infection_A'] == 1)
df_Pos_C = df_C_clean.loc[both_positive, ['UHID', 'RAMJA', 'AIIMS']]
df_Pos_C.shape

(46, 3)

In [53]:
# Samples flagged negative by both RAMJA and AIIMS.
both_negative = (df_C_clean['Infection_R'] == 0) & (df_C_clean['Infection_A'] == 0)
df_Neg_C = df_C_clean.loc[both_negative, ['UHID', 'RAMJA', 'AIIMS']]
df_Neg_C.shape

(27, 3)

In [54]:
# Samples where both labs reported at least one organism in common.
df_concordance = df_C.loc[df_C['concordance'] == 1, ['UHID','RAMJA','AIIMS']]
df_concordance.shape

(25, 3)

In [55]:
# Write each subset to its own sheet of one workbook, without the index column.
sorted_sheets = {
    'Undefined Samples_A': df_A_others,
    'Contaminated Samples_A': df_A_conta,
    'Common Positives': df_Pos_C,
    'Concordance': df_concordance,
    'Common Negatives': df_Neg_C,
}
with pd.ExcelWriter('sorteddata2.xlsx') as writer:
    for sheet_label, frame in sorted_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_label, index=False)

#### ................Finish.....................