### Table of replicon annotations

In [1]:
import os
import pandas as pd

In [2]:
# Read in data
pf32_calls_dir = '/home/mf019/longread_pangenome/expanded_dataset_analysis/output/genotyping/replicons/calls_v10/pf32/tables/'
wp_calls_dir = '/home/mf019/longread_pangenome/expanded_dataset_analysis/output/genotyping/replicons/calls_v10/wp/tables/'
# Each isolate has its own file
pf32_tables = os.listdir(pf32_calls_dir)
wp_tables = os.listdir(wp_calls_dir)

# Combine data from all isolates into one df
pf32_calls = pd.DataFrame()
for table in pf32_tables:
    pf32_calls = pd.concat([pf32_calls,
                            pd.read_csv(pf32_calls_dir+table, sep='\t')
                           ])

# Combine data from all isolates into one df
wp_calls = pd.DataFrame()
for table in wp_tables:
    wp_calls = pd.concat([wp_calls,
                          pd.read_csv(wp_calls_dir+table, sep='\t')
                          ])

In [3]:
# Sort by percent identity and percent coverage, then keep only the plasmid name with the highest percent identity & coverage
prioritized_pf32_calls = pf32_calls.sort_values(
                                                by=['assembly_id', 'contig_id', 'overall_percent_identity', 'query_coverage_percent'], 
                                                ascending=[True, True, False, False]
                                  ).reset_index(drop=True)
best_pf32_calls = prioritized_pf32_calls.drop_duplicates(subset=['assembly_id', 'contig_id'], keep='first')
best_pf32_calls['call_method'] = 'pf32'
best_pf32_calls['call_method_priority_rank'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_pf32_calls['call_method'] = 'pf32'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_pf32_calls['call_method_priority_rank'] = 0


In [4]:
# Sort by percent identity and percent coverage, then keep only the plasmid name with the highest percent identity & coverage
prioritized_wp_calls = wp_calls.sort_values(
                                            by=['assembly_id', 'contig_id', 'overall_percent_identity', 'query_coverage_percent'], 
                                            ascending=[True, True, False, False]
                              ).reset_index(drop=True)
best_wp_calls = prioritized_wp_calls.drop_duplicates(subset=['assembly_id', 'contig_id'], keep='first')
best_wp_calls['call_method'] = 'wp'
best_wp_calls['call_method_priority_rank'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_wp_calls['call_method'] = 'wp'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_wp_calls['call_method_priority_rank'] = 1


In [10]:
# Create best calls 
best_calls = pd.concat([best_pf32_calls, best_wp_calls]).sort_values(
                                                                     by=['assembly_id', 'contig_id', 'call_method_priority_rank'],
                                                                     ascending=[True, True, True]
                                                       ).reset_index(drop=True)
best_calls = best_calls.drop_duplicates(subset=['assembly_id', 'contig_id'], keep='first')
best_calls = best_calls.drop(columns='call_method_priority_rank').reset_index(drop=True)
best_calls.to_csv('dataset_v5_best_replicon_hits.csv', index=False)

In [6]:
# If contig is longer than 100kb, call it chromosome 
idxs = best_calls['contig_len']>100000
best_calls.loc[idxs, 'plasmid_name'] = 'chromosome'

In [8]:
# If contig is less than 1kbp, drop annotation
idxs = best_calls['contig_len']<250
best_calls_1000 = best_calls.loc[~idxs, :]

In [9]:
# Write
best_calls_1000.to_csv('dataset_v5_replicon_annotations_250bp.csv', index=False)

In [15]:
prioritized_wp_calls[(prioritized_wp_calls['assembly_id']=='ESI26H') & (prioritized_wp_calls['plasmid_name']=='lp54')]

Unnamed: 0,assembly_id,contig_id,contig_len,plasmid_id,plasmid_name,strain,query_length,ref_length,overall_percent_identity,query_covered_length,ref_covered_length,covered_intervals,query_intervals,subject_hit_coords,query_coverage_percent
300,ESI26H,contig_13 [gcode=11] [topology=linear],3042,gb|CP124099.1|,lp54,NE_5261,3042,53730,100.0,3040,1520,"[(1, 1521)]","[(1522, 3042), (1, 1521)]","[(1, 1521), (1521, 1)]",99.934254
301,ESI26H,contig_13 [gcode=11] [topology=linear],3042,gb|CP124095.1|,lp54,NE_5267,3042,53730,99.934254,3040,1519,"[(1, 1520)]","[(1522, 3042), (1, 1521)]","[(1, 1520), (1520, 1)]",99.934254
302,ESI26H,contig_13 [gcode=11] [topology=linear],3042,gb|CP124103.1|,lp54,NE_5248,3042,53675,99.868508,3040,1519,"[(1, 1520)]","[(1522, 3042), (1, 1521)]","[(1, 1520), (1520, 1)]",99.934254
303,ESI26H,contig_13 [gcode=11] [topology=linear],3042,gb|AE000790.2|,lp54,B31,3042,53657,99.802502,3036,1518,"[(1, 1519)]","[(1524, 3042), (1, 1519)]","[(1, 1519), (1519, 1)]",99.802761
304,ESI26H,contig_13 [gcode=11] [topology=linear],3042,gb|CP031397.1|,lp54,MM1,3042,53798,99.518652,3318,1661,"[(1, 1662)]","[(1, 1660), (1383, 3042)]","[(1, 1662), (1662, 1)]",109.072978


In [16]:
prioritized_pf32_calls[(prioritized_pf32_calls['assembly_id']=='ESI26H') & (prioritized_pf32_calls['plasmid_name']=='lp54')]

Unnamed: 0,assembly_id,contig_id,contig_len,plasmid_id,plasmid_name,strain,query_length,ref_length,overall_percent_identity,query_covered_length,ref_covered_length,covered_intervals,query_intervals,subject_hit_coords,query_coverage_percent
