# 1. Load up Excel file

In [67]:
import pandas as pd

resfinder_file = '../../resfinder.xlsx'

# Open the workbook inside a ``with`` block so the underlying file handle is
# closed as soon as every sheet has been read, instead of staying open for
# the lifetime of the kernel.
with pd.ExcelFile(resfinder_file) as resfinder_excel:
    # header=None: no row is treated as a header, so the columns of each
    # sheet are labelled with integers (0, 1, 2, ...).
    sheets_df_map_orig = {
        n: pd.read_excel(resfinder_excel, sheet_name=n, header=None)
        for n in resfinder_excel.sheet_names
    }
sheets_df_map_orig.keys()

dict_keys(['Aminoglycoside', 'B-lactam', 'Colistin', 'Fosfomycin', 'Fusidic acid', 'Glycopeptide', 'Macrolide', 'Nitroimidazole', 'Oxazolidinone', 'Phenicol', 'Quinolone', 'Sulphonamide', 'Rifampicin', 'Tetracycline', 'Trimethoprim', 'Non-functional'])

# 2. Add class names/column names

In [68]:
# Key every sheet by a normalised name: lower-case with all spaces removed.
sheets_df_map = {
    sheet_name.lower().replace(' ', ''): sheet_df
    for sheet_name, sheet_df in sheets_df_map_orig.items()
}

# Re-key the 'b-lactam' sheet as 'beta-lactam' (the entry moves to the end
# of the dict).
sheets_df_map['beta-lactam'] = sheets_df_map.pop('b-lactam')

# Give the first two positional columns descriptive names. ``rename`` returns
# new frames, so the frames held in sheets_df_map_orig are left untouched.
column_names = {0: 'gene_accession', 1: 'Drug'}
sheets_df_map = {
    class_name: class_df.rename(columns=column_names)
    for class_name, class_df in sheets_df_map.items()
}

# Record which sheet each row came from as the left-most column.
for class_name, class_df in sheets_df_map.items():
    class_df.insert(0, 'Class', class_name)

sheets_df_map['aminoglycoside'].head(3)

Unnamed: 0,Class,gene_accession,Drug,2
0,aminoglycoside,aac(6')-Ib_2_M23634,gentamicin,
1,aminoglycoside,aac(6')-Ib11_1_AY136758,gentamicin,
2,aminoglycoside,aac(6')-30-aac(6')-Ib'_1_AJ584652,gentamicin,


In [69]:
# Stack the per-class frames in alphabetical order of their key and give the
# combined table a fresh 0..n-1 index.
ordered_class_names = sorted(sheets_df_map)
sheets_list = [sheets_df_map[class_name] for class_name in ordered_class_names]

resfinder_df_orig = pd.concat(sheets_list, ignore_index=True)
resfinder_df_orig.head(3)

Unnamed: 0,Class,gene_accession,Drug,2,3,4,5
0,aminoglycoside,aac(6')-Ib_2_M23634,gentamicin,,,,
1,aminoglycoside,aac(6')-Ib11_1_AY136758,gentamicin,,,,
2,aminoglycoside,aac(6')-30-aac(6')-Ib'_1_AJ584652,gentamicin,,,,


## 2.1. Join together extra columns and see what they contain

In [70]:
# Everything except the three named columns: the unlabelled extra columns.
df = resfinder_df_orig.drop(['Class', 'gene_accession', 'Drug'],
                            axis='columns')

# Concatenate the extra cells of each row into one string (missing cells
# count as ''). Each cell is passed through str() first: Series.str.join
# yields NaN for any row whose list contains a non-string item, and
# value_counts() would then silently leave that row out of the summary.
# The index is set explicitly so the result lines up with the source rows.
ds = pd.Series(
    [''.join(map(str, cells)) for cells in df.fillna('').values.tolist()],
    index=df.index,
    dtype=object,
)

resfinder_df_orig2 = resfinder_df_orig.copy()
resfinder_df_orig2['Other'] = ds
resfinder_df_orig2['Other'].value_counts()

Other
                                                                                                                                                                                                                            3174
formerly aadB                                                                                                                                                                                                                 19
AMRfinderEcoNITROFURANTOINnitroreductase NfsA                                                                                                                                                                                  5
formerly strA                                                                                                                                                                                                                  2
AMRfinderShig/Ecoeffluxchromosomal wildtype pump                                              

# 3. Fix up columns

In [71]:
# Look at the combined gene/accession identifiers before splitting them.
resfinder_df_orig2.loc[:, 'gene_accession']

0                     aac(6')-Ib_2_M23634
1                 aac(6')-Ib11_1_AY136758
2       aac(6')-30-aac(6')-Ib'_1_AJ584652
3                  aac(6')-Iaj_1_AB709942
4                  aac(6')-Ian_1_AP014611
                      ...                
3260                    dfrA17_9_FJ807902
3261                   dfrA17_10_AM937244
3262                                 dfrI
3263                               dfrA48
3264                               dfrA51
Name: gene_accession, Length: 3265, dtype: object

In [72]:
# Identifiers that must not be split on their last underscore, mapped to the
# [gene, accession] pair to use for them verbatim.
gene_accession_exceptions = {
    "ampC_C-42T": ["ampC_C-42T", ""],
}

# Markers of accessions that themselves contain an underscore (NC_..., NG_...,
# NZ_...); for these the last TWO underscore-separated tokens form the accession.
_UNDERSCORED_ACCESSION_MARKERS = ("_NC_", "_NG_", "_NZ_")


def split_gene_accession(row):
    """Split a row's ``gene_accession`` value into its gene and accession parts.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything indexable by the key ``'gene_accession'``.

    Returns
    -------
    pandas.Series
        Two items, ``[gene, accession]``. The accession is ``""`` when the
        value has no underscore, when the value is listed in
        ``gene_accession_exceptions``, or when the value is not a string.
    """
    gene_accession = row['gene_accession']

    # A missing (NaN) or otherwise non-text cell cannot be split. Pass it
    # through unchanged with an empty accession instead of raising a
    # TypeError on the substring checks below.
    if not isinstance(gene_accession, str):
        return pd.Series([gene_accession, ""])

    # Hand-curated special cases win over every splitting rule.
    if gene_accession in gene_accession_exceptions:
        return pd.Series(gene_accession_exceptions[gene_accession])

    # Accessions with an embedded underscore: split off the last two tokens
    # and glue them back together as the accession.
    if any(marker in gene_accession for marker in _UNDERSCORED_ACCESSION_MARKERS):
        gene, accession_prefix, accession_suffix = gene_accession.rsplit("_", maxsplit=2)
        return pd.Series([gene, accession_prefix + "_" + accession_suffix])

    # General case: everything after the last underscore is the accession.
    gene, separator, accession = gene_accession.rpartition("_")
    if not separator:
        # No underscore at all: the whole value is the gene name.
        gene, accession = gene_accession, ""

    return pd.Series([gene, accession])

# Get rid of all "non-functional" entries. The filtered selection is copied
# once so the column assignments below never write into resfinder_df_orig2.
resfinder_df_orig3 = resfinder_df_orig2[resfinder_df_orig2['Class'] != 'non-functional'].copy()

# Split apart the gene and accession (split_gene_accession is called per row)
resfinder_df_orig3[['Gene', 'Accession']] = resfinder_df_orig3.apply(split_gene_accession, axis=1)
# Drop a leading underscore from any accession that has one
resfinder_df_orig3['Accession'] = resfinder_df_orig3['Accession'].str.replace(r'^_', '',
                                                                             regex=True)

resfinder_df = resfinder_df_orig3[['Class', 'Gene', 'Accession', 'Drug']].copy()

# Normalise the Drug column. ``regex`` is passed explicitly on every
# str.replace call because its default differs between pandas releases;
# only the whitespace pattern is a regular expression, the rest are literal.
resfinder_df['Drug'] = (
    resfinder_df['Drug']
    # Everything should be lower-case
    .str.lower()
    # Except I/R, which should be upper-case
    .str.replace('i/r', 'I/R', regex=False)
    # There are mixtures of spaces and commas separating drugs here.
    # First turn the commas into spaces
    .str.replace(',', ' ', regex=False)
    # Get rid of spaces at the very beginning or end
    .str.strip()
    # Now replace each run of whitespace with a single comma
    .str.replace(r'\s+', ',', regex=True)
    # Now fix up specific cases where there should be spaces
    .str.replace(',acid', ' acid', regex=False)
    .str.replace(',I/R', ' I/R', regex=False)
    .str.replace('(,decreased,susceptibility,to,meropenem)',
                 'decreased susceptibility to meropenem', regex=False)
    # Fix up lack of spaces or other issues
    .str.replace('inI/R', 'in I/R', regex=False)
    .str.replace('(I/R)', ' I/R', regex=False)
    # Spell the placeholder 'none' as 'None'. This is Series.replace, so it
    # only matches cells whose whole value is 'none'; the value stays text
    # and is not converted to a missing value.
    .replace('none', 'None')
)

resfinder_df

Unnamed: 0,Class,Gene,Accession,Drug
0,aminoglycoside,aac(6')-Ib_2,M23634,gentamicin
1,aminoglycoside,aac(6')-Ib11_1,AY136758,gentamicin
2,aminoglycoside,aac(6')-30-aac(6')-Ib'_1,AJ584652,gentamicin
3,aminoglycoside,aac(6')-Iaj_1,AB709942,gentamicin
4,aminoglycoside,aac(6')-Ian_1,AP014611,gentamicin
...,...,...,...,...
3260,trimethoprim,dfrA17_9,FJ807902,trimethoprim
3261,trimethoprim,dfrA17_10,AM937244,trimethoprim
3262,trimethoprim,dfrI,,trimethoprim
3263,trimethoprim,dfrA48,,trimethoprim


# 4. Save ResFinder resistance table

In [73]:
# Write the cleaned table as tab-separated text, without the row index.
resfinder_output_path = '../../staramr/databases/resistance/data/ARG_drug_key_resfinder.tsv'
resfinder_df.to_csv(resfinder_output_path, sep='\t', index=False)