In [71]:
import pandas as pd
from glob import glob
import numpy as np
import re

In [72]:
# Display option only: with max_colwidth=None pandas does not truncate long
# cell text when a frame is rendered. It has no effect on the data itself.
pd.set_option('display.max_colwidth', None)

import data

In [73]:
# glob() returns matches in arbitrary (filesystem-dependent) order. The files
# are concatenated later, so their order influences which duplicate rows are
# kept and how ties are ordered; sorting keeps the result reproducible.
blast_aln_description_files = sorted(glob('/home/tobamo/analize/project-tobamo/analysis/whole_orf/results/blast/blast_luka_results/*Alignment-Descriptions_*.csv'))
blast_aln_hits_files = sorted(glob('/home/tobamo/analize/project-tobamo/analysis/whole_orf/results/blast/blast_luka_results/*Alignment-HitTable*.csv'))

In [74]:
contigs_on_tobamo_tree_path = '/home/tobamo/analize/project-tobamo/analysis/whole_orf/results/blast/contigs_on_tree-tobamo2.txt'
movement_only_contigs_path = '/home/tobamo/analize/project-tobamo/analysis/whole_orf/results/blast/movement_only_contigs.txt'
other_contigs_path = '/home/tobamo/analize/project-tobamo/analysis/whole_orf/results/blast/other_contigs.txt'
all_contigs_for_blast_path = '/home/tobamo/analize/project-tobamo/analysis/whole_orf/results/blast/all_contigs_for_blast.txt'


def _read_contig_list(path):
    """Return the whitespace-stripped, non-empty lines of the text file at ``path``.

    Blank lines are skipped so that an empty string never ends up being
    treated as a contig name further down the notebook.
    """
    with open(path) as file:
        stripped_lines = (line.strip() for line in file)
        return [name for name in stripped_lines if name]


contigs_on_tobamo_tree = _read_contig_list(contigs_on_tobamo_tree_path)
movement_only_contigs = _read_contig_list(movement_only_contigs_path)
other_contigs = _read_contig_list(other_contigs_path)
all_contigs_for_blast = _read_contig_list(all_contigs_for_blast_path)

define parsing functions

In [75]:
def read_hits_files(filenames):
    """Load BLAST hit-table CSVs and stack them into a single DataFrame.

    Each file is read as a header-less CSV and its columns are given the
    twelve names listed in ``column_names`` below.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the hit-table CSV files to read.

    Returns
    -------
    pandas.DataFrame
        Rows of all files in the order given, with a fresh 0..n-1 index.
    """
    column_names = [
        'qaccver', 'saccver', 'identity', 'alnlen', 'mismatch', 'gapopen',
        'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore',
    ]
    frames = []
    for path in filenames:
        frames.append(pd.read_csv(path, names=column_names, header=None))
    return pd.concat(frames, ignore_index=True)

# Read every hit-table file listed in blast_aln_hits_files into one frame.
df_hits = read_hits_files(blast_aln_hits_files)

In [76]:
def read_desc_files(filenames):
    """Load BLAST alignment-description CSVs and extract subject accessions.

    The files are read with their own header row and concatenated. The
    ``'Accession  '`` column (note the two trailing spaces in its name) is
    expected to hold a hyperlink whose URL path contains the accession; the
    ``accession.version`` token is pulled out of it into ``saccver``.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the description CSV files to read.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'Scientific Name'`` and ``'saccver'``. ``saccver`` is
        ``None`` where the cell is not a string or no accession is found.
    """
    frames = [pd.read_csv(filename) for filename in filenames]
    df = pd.concat(frames, ignore_index=True)

    # The underscore is required for RefSeq-style accessions such as
    # "NC_001367.1"; without it those rows would get no accession at all.
    accession_pattern = re.compile(r'/([A-Z0-9_]+\.[A-Z0-9]+)')

    def extract_accession_number(hyperlink):
        # Empty cells are read as NaN (a float); re would raise on them.
        if not isinstance(hyperlink, str):
            return None
        match = accession_pattern.search(hyperlink)
        return match.group(1) if match else None

    df['saccver'] = df['Accession  '].apply(extract_accession_number)
    return df[['Scientific Name', 'saccver']]

# Parse the description files and drop rows that are exact repeats of an
# earlier row.
df_desc = read_desc_files(blast_aln_description_files).drop_duplicates()

In [77]:
# Attach the scientific name to each hit via the subject accession. The join
# is an inner join, so hits whose accession has no description row are
# dropped. Rows are then ordered by query contig and bitscore, both
# descending, so the first rows of each contig are its best-scoring hits.
merged = (
    df_hits
    .merge(df_desc, how='inner', on='saccver')
    .drop_duplicates()
    .sort_values(by=['qaccver', 'bitscore'], ascending=[False, False])
)

In [78]:
def determine_status(group, top_n=10, min_identity=90):
    """Label one contig's hits as ``'known'`` or ``'check'``.

    Only the first ``top_n`` rows of ``group`` are inspected, in the order
    they arrive; the caller is responsible for sorting best hits first.

    Parameters
    ----------
    group : pandas.DataFrame
        Hits of a single query, with ``'Scientific Name'`` and ``'identity'``
        columns.
    top_n : int, default 10
        Number of leading rows to inspect.
    min_identity : float, default 90
        At least one inspected hit must have identity strictly above this.

    Returns
    -------
    str
        ``'known'`` when the inspected rows share exactly one scientific name
        (missing names are not counted) and at least one of them exceeds
        ``min_identity``; ``'check'`` otherwise.
    """
    top_hits = group.head(top_n)
    single_species = top_hits['Scientific Name'].nunique() == 1
    has_close_hit = (top_hits['identity'] > min_identity).any()
    if single_species and has_close_hit:
        return 'known'
    return 'check'

# Compute one status label per query contig, then copy that label onto every
# hit row belonging to the contig.
mapping = (
    merged
    .groupby('qaccver')
    .apply(determine_status)
    .to_dict()
)
merged['status'] = merged['qaccver'].map(mapping)

In [79]:
# Find the contigs submitted to BLAST that have no row in `merged`.
# A set gives constant-time membership tests; checking against the column's
# array would rescan every row for each contig.
qaccver_with_hits = set(merged['qaccver'])
missing_qaccver = [item for item in all_contigs_for_blast if item not in qaccver_with_hits]

# One row per missing contig; every other column is filled with NaN by concat.
missing_df = pd.DataFrame({'qaccver': missing_qaccver})

# Append the hit-less contigs below the hit rows and renumber the index.
mega = pd.concat([merged, missing_df], ignore_index=True)

In [80]:
# Vectorised membership tests: isin() hashes the list once rather than
# scanning it for every row.
# NOTE(review): every contig that is not in movement_only_contigs is labelled
# 'other'; other_contigs itself is never consulted here - confirm that the
# two lists together cover all contigs.
mega['batch'] = np.where(mega['qaccver'].isin(movement_only_contigs), 'movement_only', 'other')
# 1 when the contig is listed in contigs_on_tobamo_tree, else 0.
mega['on_tobamo_tree_palmprint'] = np.where(mega['qaccver'].isin(contigs_on_tobamo_tree), 1, 0)

save table

In [82]:
# Write the combined table to a CSV in the current working directory (the
# path is relative); index=False leaves the row index out of the file.
mega.to_csv('whole_orf_blast_out_luka_20-05-2024.csv', index=False)