# 1. Load up Excel file

In [6]:
import pandas as pd

pointfinder_file = '../../pointfinder.xlsx'

# Open the workbook once, then read each sheet as raw cells (header=None keeps
# the first row as data and gives the columns integer labels).
pointfinder_excel = pd.ExcelFile(pointfinder_file)
sheets_df_map_orig = {}
for sheet_name in pointfinder_excel.sheet_names:
    sheets_df_map_orig[sheet_name] = pd.read_excel(
        pointfinder_excel,
        sheet_name=sheet_name,
        header=None,
    )
sheets_df_map_orig.keys()

dict_keys(['Salmonella', 'Shigella E. coli', 'Campylobacter', 'NCBI AMRfinder'])

# 2. Add organism names/column names

In [7]:
# Organism key used in the output table -> name of the worksheet holding its data.
organism_to_sheet = {
    'salmonella': 'Salmonella',
    'escherichia_coli': 'Shigella E. coli',
    'campylobacter': 'Campylobacter',
}
# The sheets were read without a header row, so columns 0 and 1 are named here.
column_names = {0: 'gene_mutation', 1: 'Drug'}

sheets_df_map = {}
for organism, sheet_name in organism_to_sheet.items():
    # rename() returns a new frame, so the insert below never touches the raw sheet.
    labelled_sheet = sheets_df_map_orig[sheet_name].rename(columns=column_names)
    labelled_sheet.insert(0, 'Organism', organism)
    sheets_df_map[organism] = labelled_sheet

sheets_df_map['salmonella'].head(3)

Unnamed: 0,Organism,gene_mutation,Drug
0,salmonella,pmrA(15),colistin
1,salmonella,pmrA(53),colistin
2,salmonella,pmrA(81),colistin


In [8]:
# Stack the per-organism tables in a fixed order and renumber the rows 0..n-1.
organism_order = ['salmonella', 'escherichia_coli', 'campylobacter']
pointfinder_df = pd.concat(
    [sheets_df_map[organism] for organism in organism_order],
    ignore_index=True,
)

pointfinder_df.head(3)

Unnamed: 0,Organism,gene_mutation,Drug,2
0,salmonella,pmrA(15),colistin,
1,salmonella,pmrA(53),colistin,
2,salmonella,pmrA(81),colistin,


# 3. Fix up columns

In [9]:
# Entries of the form "gene(123)" are split into a gene name and a codon position.
# Rows that do not follow this pattern get NaN in both new columns.
extracted_columns = pointfinder_df['gene_mutation'].str.extract(r'^([^(]+)\((\d+)\)',
                                                                expand=True)
pointfinder_df_reduced = pointfinder_df.copy()
pointfinder_df_reduced[['Gene', 'Codon Pos.']] = extracted_columns

# Promoter entries are written as "ampCprom" followed by a negative number with no
# parentheses, so they are not caught by the pattern above and need their own.
promoter_columns = pointfinder_df['gene_mutation'].str.extract(r'^(ampCprom)(-\d+)',
                                                               expand=True)
promoter_columns = promoter_columns.rename({0: 'Gene', 1: 'Codon Pos.'},
                                           axis='columns')
is_promoter = promoter_columns['Gene'].notna()
extracted_columns2 = promoter_columns[is_promoter]

# Overwrite only the rows the promoter pattern actually matched. Selecting rows by
# a looser "starts with ampCprom" test would blank out (via index alignment) any
# such row that the promoter pattern did not match, and would fail outright on
# missing gene_mutation values.
pointfinder_df_reduced.loc[is_promoter, ['Gene', 'Codon Pos.']] = extracted_columns2

# .copy() makes the column subset an independent frame, so the column assignments
# below do not trigger SettingWithCopyWarning.
pointfinder_df_reduced = pointfinder_df_reduced[['Organism', 'Gene', 'Codon Pos.', 'Drug']].copy()
pointfinder_df_reduced['Gene'] = pointfinder_df_reduced['Gene'].str.replace(
    'ampCprom', 'ampC_promoter_size_53bp', regex=False)

# Normalise the drug list: trim, lower-case, then turn spaces into comma separators.
# The later substitutions undo that for multi-word names ("... acid", "... I/R") and
# restore the upper-case "I/R" that lower-casing removed. Order matters.
# All patterns are literal text, so regex=False keeps the result independent of the
# pandas version's default for `regex`.
drug_substitutions = [
    ('i/r', 'I/R'),
    (' ', ','),
    (',acid', ' acid'),
    (',I/R', ' I/R'),
    ('inI/R', 'in I/R'),
]
drug = pointfinder_df_reduced['Drug'].str.strip().str.lower()
for old_text, new_text in drug_substitutions:
    drug = drug.str.replace(old_text, new_text, regex=False)

# A drug value of exactly 'none' is written out as the literal string 'None'
# (it is deliberately not converted to a missing value).
pointfinder_df_reduced['Drug'] = drug.replace('none', 'None')
pointfinder_df_reduced

Unnamed: 0,Organism,Gene,Codon Pos.,Drug
0,salmonella,pmrA,15,colistin
1,salmonella,pmrA,53,colistin
2,salmonella,pmrA,81,colistin
3,salmonella,pmrB,14,colistin
4,salmonella,pmrB,22,colistin
...,...,...,...,...
173,campylobacter,,,"ciprofloxacin,nalidixic acid"
174,campylobacter,,,erythromycin
175,campylobacter,,,streptomycin
176,campylobacter,,,"ciprofloxacin,nalidixic acid"


# 4. Save pointfinder resistance table

In [10]:
# Write the cleaned table as a tab-separated file, without the row index column.
pointfinder_tsv = '../../staramr/databases/resistance/data/ARG_drug_key_pointfinder.tsv'
pointfinder_df_reduced.to_csv(
    pointfinder_tsv,
    sep='\t',
    index=False,
)