### Table of replicon annotations

In [1]:
import os
import pandas as pd

In [2]:
# Read in data
pf32_calls_dir = '/home/mf019/longread_pangenome/expanded_dataset_analysis/genotyping/replicons/20241211.1222/pf32/tables/'
wp_calls_dir = '/home/mf019/longread_pangenome/expanded_dataset_analysis/genotyping/replicons/20241211.1222/wp/tables/'
# Each isolate has its own file
pf32_tables = os.listdir(pf32_calls_dir)
wp_tables = os.listdir(wp_calls_dir)

# Combine data from all isolates into one df
pf32_calls = pd.DataFrame()
for table in pf32_tables:
    pf32_calls = pd.concat([pf32_calls,
                            pd.read_csv(pf32_calls_dir+table, sep='\t')
                           ])

# Combine data from all isolates into one df
wp_calls = pd.DataFrame()
for table in wp_tables:
    wp_calls = pd.concat([wp_calls,
                          pd.read_csv(wp_calls_dir+table, sep='\t')
                          ])

In [3]:
# Sort by percent identity and percent coverage, then keep only the plasmid name with the highest percent identity & coverage
prioritized_pf32_calls = pf32_calls.sort_values(
                                                by=['assembly_id', 'contig_id', 'overall_percent_identity', 'query_coverage_percent'], 
                                                ascending=[True, True, False, False]
                                  ).reset_index(drop=True)
best_pf32_calls = prioritized_pf32_calls.drop_duplicates(subset=['assembly_id', 'contig_id'], keep='first')
best_pf32_calls['call_method'] = 'pf32'
best_pf32_calls['call_method_priority_rank'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_pf32_calls['call_method'] = 'pf32'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_pf32_calls['call_method_priority_rank'] = 0


In [4]:
# Sort by percent identity and percent coverage, then keep only the plasmid name with the highest percent identity & coverage
prioritized_wp_calls = wp_calls.sort_values(
                                            by=['assembly_id', 'contig_id', 'overall_percent_identity', 'query_coverage_percent'], 
                                            ascending=[True, True, False, False]
                              ).reset_index(drop=True)
best_wp_calls = prioritized_wp_calls.drop_duplicates(subset=['assembly_id', 'contig_id'], keep='first')
best_wp_calls['call_method'] = 'wp'
best_wp_calls['call_method_priority_rank'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_wp_calls['call_method'] = 'wp'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_wp_calls['call_method_priority_rank'] = 1


In [17]:
# Create best calls 
best_calls = pd.concat([best_pf32_calls, best_wp_calls]).sort_values(
                                                                     by=['assembly_id', 'contig_id', 'call_method_priority_rank'],
                                                                     ascending=[True, True, True]
                                                       ).reset_index(drop=True)
best_calls = best_calls.drop_duplicates(subset=['assembly_id', 'contig_id'], keep='first')
best_calls = best_calls.drop(columns='call_method_priority_rank').reset_index(drop=True)
best_calls.to_csv('replicon_annotations.csv', index=False)

In [22]:
# If contig is less than 1kbp, drop annotation
idxs = best_calls['contig_len']<1000
best_calls_1000 = best_calls.loc[~idxs, :]
best_calls_1000.to_csv('replicon_annotations_1000bp.csv', index=False)