In [1]:
from pathlib import Path
import scanpy as sc
import re, sys
import pandas as pd

In [15]:
def parse_sam(file):
	"""Collect the distinct (reference, cell barcode, UMI) triples of a SAM file.

	Parameters
	----------
	file : str or path-like
		Path of a tab-separated SAM text file.

	Returns
	-------
	set of str
		One record per distinct triple, formatted as reference name, cell
		barcode (``CB:Z:`` tag) and UMI (``UB:Z:`` tag) joined by tabs and
		terminated by a newline. A missing tag is reported as ``'-'``.

	Notes
	-----
	Records are always newline-terminated so they can be written verbatim,
	one per line, by a consumer that does not add its own line breaks.
	"""
	result = set()
	with open(file, 'r') as inputFile:
		# Stream the file line by line instead of loading it all with readlines().
		for line in inputFile:
			# Header lines start with '@' and describe no alignment.
			if line.startswith('@'):
				continue
			fields = line.rstrip('\r\n').split('\t')
			# Blank or truncated lines have no reference-name column.
			if len(fields) < 3:
				continue
			acc = fields[2]
			# Reset per record: otherwise a read lacking a tag would silently
			# inherit the previous read's value (or crash on the first read).
			barcode = umi = '-'
			# The SAM format has 11 mandatory columns; optional tags follow.
			for item in fields[11:]:
				if item.startswith('CB:Z:'):
					barcode = item[len('CB:Z:'):]
				elif item.startswith('UB:Z:'):
					umi = item[len('UB:Z:'):]
			result.add(f'{acc}\t{barcode}\t{umi}\n')
	return result

def generate_table(data, target_set, target_name, suffix):
	"""Count UMI records per (target name, cell barcode).

	Parameters
	----------
	data : iterable of str
		Tab-separated ``accession, barcode, UMI`` records; a trailing line
		break on a record is ignored.
	target_set : iterable of str
		Accessions to keep; every other record is dropped.
	target_name : mapping
		Accession -> display name. Must contain every kept accession
		(a missing key raises ``KeyError``).
	suffix : str
		Text appended to every barcode in the result.

	Returns
	-------
	pandas.DataFrame
		Columns ``Accession`` (display name), ``Barcode`` (with ``suffix``)
		and ``UMI`` (number of records in that group). Records whose barcode
		or UMI is ``'-'`` are excluded. Empty input yields an empty frame
		with the same columns.
	"""
	# Set membership is O(1); callers commonly pass a list.
	wanted = set(target_set)

	# Build the rows in memory. The previous scratch-file round trip
	# overwrote and deleted a fixed path in the working directory and only
	# worked when each record carried its own trailing newline.
	rows = []
	for item in data:
		fields = item.rstrip('\r\n').split('\t')
		if fields[0] in wanted:
			acc, barcode, umi = fields
			rows.append((target_name[acc], barcode, umi))

	table = pd.DataFrame(rows, columns=['Accession', 'Barcode', 'UMI'])
	# '-' marks a read without an assigned barcode / UMI.
	table = table[(table['Barcode'] != '-') & (table['UMI'] != '-')]
	table = table.groupby(['Accession', 'Barcode'])['UMI'].count().reset_index()
	table['Barcode'] = table['Barcode'] + suffix
	return table

def write2csv(adata, table, target_name, suffix):
	"""Write per-cell UMAP coordinates, cluster and UMI counts to a CSV file.

	Parameters
	----------
	adata : object
		Must provide ``obsm['X_umap']`` (two columns), ``obs_names`` and
		``obs['cluster']``.
	table : pandas.DataFrame
		Frame with ``Accession``, ``Barcode`` and ``UMI`` columns.
	target_name : iterable of str
		``Accession`` values to export; each becomes one count column,
		in the given order.
	suffix : str
		The output file is ``exp_<suffix>.csv`` in the working directory.

	Returns
	-------
	None
		The table is written to disk and printed.
	"""
	coords = pd.DataFrame(
		adata.obsm['X_umap'],
		index=adata.obs_names,
		columns=['UMAP_1', 'UMAP_2'],  # column names are free to choose
	)
	umap_df = pd.concat([adata.obs['cluster'], coords], axis=1).reset_index()
	umap_df.columns = ['Barcode', 'Cluster', 'UMAP_1', 'UMAP_2']

	# No scratch-file round trip: it overwrote and then deleted any
	# 'temp.csv' already present in the working directory.
	for name in target_name:
		counts = table.loc[table['Accession'] == name, ['Barcode', 'UMI']]
		counts = counts.rename(columns={'UMI': name})
		umap_df = umap_df.merge(counts, how='left', on='Barcode')
		# Only the new count column is filled: a cell without reads gets 0,
		# while missing cluster labels / coordinates are left untouched.
		umap_df[name] = umap_df[name].fillna(0)

	umap_df.to_csv(f'exp_{suffix}.csv', index=False)
	print(umap_df)

In [3]:
# Parse the alignment file once; `data` is the set of unique
# accession / barcode / UMI records returned by parse_sam.
data = parse_sam('exp_alb.sam')

In [4]:
# Accession -> display name. Two accessions may share one name, in which
# case generate_table reports them under that single name.
target_name = {
	'MW174761.1': 'Totichi',
	'ON949933.1': 'HKIFV',
	'PP510875.1': 'TMBPTLV', 'PP510876.1': 'TMBPTLV',
	'BK059489.1': 'TMBSTLV2', 'BK059490.1': 'TMBSTLV2',
}
# The accessions to keep are exactly the keys of the mapping, in the same order.
target_set = list(target_name)

table = generate_table(data, target_set, target_name, '')
table.Accession.unique()

array(['HKIFV', 'TMBPTLV', 'TMBSTLV2', 'Totichi'], dtype=object)

In [17]:
# Load the annotated object; write2csv reads obsm['X_umap'], obs_names
# and obs['cluster'] from it.
adata = sc.read('../../../UMAP_h5ad/alb.h5ad')
# Writes exp_alb.csv with one UMI-count column per listed name and prints the table.
write2csv(adata, table, ['Totichi', 'HKIFV', 'TMBPTLV', 'TMBSTLV2'], 'alb')

                             Barcode      Cluster     UMAP_1    UMAP_2  \
0      CTCACGTTC_AACGCTAGT_AACAAGTGG    EC-like-1   3.356602  4.737028   
1      AATCGCCAC_AACGTCCAA_AACAAGTGG           EC   0.227536  8.939644   
2      CTTACGCAG_AACGTCCAA_AACAAGTGG         EE-1  -5.031310  5.924231   
3      GAACGCTAT_AACGTCCAA_AACAAGTGG  ISC/EB-prol  -0.711623 -0.554777   
4      ACGAAGCTC_AAGGTGGTA_AACAAGTGG     Cardia-2   6.571848  1.559371   
...                              ...          ...        ...       ...   
17533  GTTACTGGT_TGGACTTGT_TTGTGTACG    EC-like-2   2.133067  2.314789   
17534  GATGGCTCA_TGGTCAGTT_TTGTGTACG     Cardia-2  11.537580  4.939885   
17535  ATCGACACG_TTGAGACAG_TTGTGTACG  ISC/EB-prol  -1.702858 -0.475249   
17536  GAGCAGCTT_TTGAGACAG_TTGTGTACG     Cardia-2  11.414620  4.270540   
17537  TACCTCTCC_TTGAGACAG_TTGTGTACG     Cardia-1   5.500094  3.699416   

       Totichi  HKIFV  TMBPTLV  TMBSTLV2  
0          0.0    0.0      0.0       3.0  
1          0.0    0.0    

In [25]:
# NOTE(review): `parc` is not defined in any visible cell (hidden kernel
# state) — presumably a table with a 'TMBSTLV2' count column such as the
# CSV written by write2csv; confirm and load it explicitly.
# Binarise the column (1 if the value is > 0, else 0) and count each class.
parc['TMBSTLV2'].map(lambda x: 1 if x > 0 else 0).value_counts()

TMBSTLV2
1    12793
0     4745
Name: count, dtype: int64