In [1]:
from pathlib import Path
import scanpy as sc
import re, sys
import pandas as pd

In [6]:
data = pd.read_csv('exp_aae_denv.csv', index_col=0)
data['DENV'].map(lambda x: 1 if x > 10 else 0).value_counts()

DENV
0    15697
1     4884
Name: count, dtype: int64

In [2]:
def parse_sam(file):
	result = set()
	with open(file, 'r') as inputFile:
		for line in inputFile.readlines():
			fields = line.split('\t')
			acc = fields[2]
			for item in fields[19:]:
				if item.find('CB:Z:') > -1:
					barcode = re.sub(r'CB:Z:', r'', item)
				if item.find('UB:Z:') > -1:
					umi = re.sub(r'UB:Z:', '', item)
			result.add(f'{acc}\t{barcode}\t{umi}')
	return result

def generate_table(data, target_set, target_name, suffix):
	filtered_data = []
	for item in data:
		if item.split('\t')[0] in target_set:
			filtered_data.append(item)

	with open('exp_aae_denv.txt', 'w') as outputFile:
		for item in filtered_data:
			outputFile.write(item)
	table = pd.read_table('exp_aae_denv.txt', header=None, delimiter='\t')
	table.columns = ['Accession', 'Barcode', 'UMI']
	table['Accession'] = table['Accession'].map(lambda x: target_name[x])
	table = table[table['Barcode'] != '-']
	table = table[table['UMI'] != '-']
	table = table.groupby(['Accession', 'Barcode'])['UMI'].count().reset_index()
	table['Barcode'] = table['Barcode'] + suffix
	Path('exp_aae_denv.txt').unlink()
	return table

def write2csv(adata, table, target_name, suffix):
	umap_df = pd.DataFrame(
	adata.obsm['X_umap'], 
	index=adata.obs_names,
	columns=['UMAP_1', 'UMAP_2']  # 列名可自定义
	)
	umap_df = pd.concat([adata.obs['cluster'], umap_df], axis=1).reset_index()
	umap_df.columns = ['Barcode', 'Cluster', 'UMAP_1', 'UMAP_2']
	umap_df.to_csv('temp.csv', header=True, index=False)
	umap_df = pd.read_csv('temp.csv', header=0)
	for name in target_name.values():
		temp_column = table[table['Accession'] == name][['Barcode', 'UMI']]
		umap_df = umap_df.merge(temp_column, how='left', on='Barcode').fillna(0)
	umap_df.columns = ['Barcode', 'Cluster', 'UMAP_1', 'UMAP_2'] + list(target_name.values())
	umap_df.to_csv(f'exp_{suffix}.csv', index=False)
	Path('temp.csv').unlink()
	print(umap_df)

In [4]:
data = parse_sam('exp_aae_denv.sam')

In [5]:
target_set = ['MW174761.1', 'JF327392.1', 'ON949933.1']
target_name = {
	'MW174761.1': 'Totichi',
	'JF327392.1': 'DENV',
	'ON949933.1': 'HKIFV'
}
table = generate_table(data, target_set, target_name, '-aae-dv')

In [6]:
adata = sc.read('../../../processed_h5ad/aae_bl_denv.h5ad')
adata = adata[adata.obs['batch'] == 'aae-dv']
write2csv(adata, table, target_name, 'aae_denv')

                                    Barcode      Cluster    UMAP_1     UMAP_2  \
0      TACCGTCTG_AACGCTAGT_AACAAGTGG-aae-dv           EC  3.111218  14.715935   
1      AGCTGAGTC_AACGTCCAA_AACAAGTGG-aae-dv  ISC/EB-prol  2.301203  -5.423813   
2      CGAGATAGT_AACGTCCAA_AACAAGTGG-aae-dv    EC-like-1  5.077836   7.294407   
3      GATGTTACG_AACGTCCAA_AACAAGTGG-aae-dv           EC  4.170012  15.500715   
4      TCTTGCTTG_AATCCGGTG_AACAAGTGG-aae-dv     ISC/EB-1  0.290944  -1.707308   
...                                     ...          ...       ...        ...   
20576  CGTTAGCGT_TTGAGACAG_TTGTGTACG-aae-dv    EC-like-2  1.316366   5.112323   
20577  ACGAATGGA_TTGCCGTCA_TTGTGTACG-aae-dv           EC  3.273036  12.975205   
20578  ACTGCCTAG_TTGCCGTCA_TTGTGTACG-aae-dv  ISC/EB-prol  4.153238  -3.547717   
20579  TCCGTATCA_TTGCCGTCA_TTGTGTACG-aae-dv    EC-like-2  3.380244   9.479116   
20580  GTGCGACTA_TTGGTGACC_TTGTGTACG-aae-dv     ISC/EB-1  1.347174  -2.024645   

       Totichi  DENV  HKIFV

In [None]:
adata = sc.read('../../../processed_h5ad/aae_bl_denv.h5ad')
adata = adata[adata.obs['batch'] == 'aae-dv']
umap_df = pd.DataFrame(
	adata.obsm['X_umap'], 
	index=adata.obs_names,
	columns=['UMAP_1', 'UMAP_2']  # 列名可自定义
).reset_index()
umap_df.columns = ['Barcode', 'UMAP_1', 'UMAP_2']
for name in target_name.values():
	temp_column = table[table['Accession'] == name][['Barcode', 'UMI']]
	umap_df = umap_df.merge(temp_column, how='left', on='Barcode').fillna(0)
umap_df.columns = ['Barcode', 'UMAP_1', 'UMAP_2'] + list(target_name.values())
# DENV = table[table['Accession'] == 'DENV'][['Barcode', 'UMI']]
# HKIFV = table[table['Accession'] == 'HKIFV'][['Barcode', 'UMI']]
# umap_df = umap_df.merge(DENV, how='left', on='Barcode').fillna(0)
# umap_df = umap_df.merge(HKIFV, how='left', on='Barcode').fillna(0)
# umap_df.columns = ['Barcode', 'UMAP_1', 'UMAP_2',]
umap_df.to_csv('exp_aae_denv.csv')

Unnamed: 0,Barcode,UMAP_1,UMAP_2,Totichi,DENV,HKIFV
0,TACCGTCTG_AACGCTAGT_AACAAGTGG-aae-dv,3.111218,14.715935,0.0,1.0,5.0
1,AGCTGAGTC_AACGTCCAA_AACAAGTGG-aae-dv,2.301203,-5.423813,0.0,1.0,0.0
2,CGAGATAGT_AACGTCCAA_AACAAGTGG-aae-dv,5.077836,7.294407,0.0,2.0,6.0
3,GATGTTACG_AACGTCCAA_AACAAGTGG-aae-dv,4.170012,15.500715,1.0,67.0,0.0
4,TCTTGCTTG_AATCCGGTG_AACAAGTGG-aae-dv,0.290944,-1.707308,0.0,0.0,1.0
...,...,...,...,...,...,...
20576,CGTTAGCGT_TTGAGACAG_TTGTGTACG-aae-dv,1.316366,5.112323,0.0,7.0,2.0
20577,ACGAATGGA_TTGCCGTCA_TTGTGTACG-aae-dv,3.273036,12.975205,0.0,0.0,2.0
20578,ACTGCCTAG_TTGCCGTCA_TTGTGTACG-aae-dv,4.153238,-3.547717,0.0,3.0,1.0
20579,TCCGTATCA_TTGCCGTCA_TTGTGTACG-aae-dv,3.380244,9.479116,0.0,5.0,0.0


In [104]:
adata = sc.read('../../../processed_h5ad/aae_bl_denv.h5ad')
adata = adata[adata.obs['batch'] == 'aae-dv']
write2csv(adata, table)

                                    Barcode    UMAP_1     UMAP_2  Totichi  \
0      TACCGTCTG_AACGCTAGT_AACAAGTGG-aae-dv  3.111218  14.715935      0.0   
1      AGCTGAGTC_AACGTCCAA_AACAAGTGG-aae-dv  2.301203  -5.423813      0.0   
2      CGAGATAGT_AACGTCCAA_AACAAGTGG-aae-dv  5.077836   7.294407      0.0   
3      GATGTTACG_AACGTCCAA_AACAAGTGG-aae-dv  4.170012  15.500715      1.0   
4      TCTTGCTTG_AATCCGGTG_AACAAGTGG-aae-dv  0.290944  -1.707308      0.0   
...                                     ...       ...        ...      ...   
20576  CGTTAGCGT_TTGAGACAG_TTGTGTACG-aae-dv  1.316366   5.112323      0.0   
20577  ACGAATGGA_TTGCCGTCA_TTGTGTACG-aae-dv  3.273036  12.975205      0.0   
20578  ACTGCCTAG_TTGCCGTCA_TTGTGTACG-aae-dv  4.153238  -3.547717      0.0   
20579  TCCGTATCA_TTGCCGTCA_TTGTGTACG-aae-dv  3.380244   9.479116      0.0   
20580  GTGCGACTA_TTGGTGACC_TTGTGTACG-aae-dv  1.347174  -2.024645      0.0   

       DENV  HKIFV  
0       1.0    5.0  
1       1.0    0.0  
2       2.0 