In [3]:
import pandas as pd

### Convert the .tsv to .fasta format

In [2]:
input_filename = "human_mpra/K562_clean.tsv"
data = pd.read_csv(input_filename, sep = "\t", header=0)
print(data.head())

               seq_id                                                seq  \
0              peak10  AGGACCGGATCAACTTGTCGCCTTAATCCAAGCCTACGTTTTTACA...   
1    peak10_Reversed:  AGGACCGGATCAACTAGTATGAGGAGGGTTGTGGAGTGGAAGTGAA...   
2            peak1000  AGGACCGGATCAACTCATCTACATAGAAGTCGCCCTGTCCGTGATG...   
3  peak1000_Reversed:  AGGACCGGATCAACTTTGCTCCTTAACACAGGCTAAGGACCAGCTT...   
4           peak10000  AGGACCGGATCAACTATGGCAGCAGCAGCCCGTCTAGGGTGGCCAC...   

   mean_value  fold  
0      -0.490     6  
1      -0.131     6  
2       0.628     8  
3       1.263     8  
4      -0.131     7  


In [None]:
with open(input_filename[:-4]+".fasta", "w") as f:
    for index, row in data.iterrows():
        #print(f"\>{row["seq_id"]} \n {row["seq"]}")
        f.write(">" + row["seq_id"] + "\n" + row["seq"] + "\n")

### Run the BLAST+ tool

jbsub -cores 8+0  -mem 40g -q x86_12h ./blastn -query ../../human_mpra/K562_clean.fasta -db human_genome_db -out ../../human_mpra/K562_clean_outfmt6.out  -max_target_seqs 1 -outfmt 6


In [4]:
aligned_filename = "human_mpra/K562_clean_outfmt6.out"
df_aligned = pd.read_csv(aligned_filename, sep="\t", header=None)
print(df_aligned.head())

  df = pd.read_csv(aligned_filename, sep="\t", header=None)


                 0  1        2    3   4   5   6    7         8         9   \
0            peak10  1  100.000  200   0   0  16  215    569690    569889   
1  peak10_Reversed:  1  100.000  202   0   0  14  215    569891    569690   
2          peak1000  1  100.000  202   0   0  14  215  16840990  16841191   
3          peak1000  1   99.015  203   2   0  14  216  16993652  16993854   
4          peak1000  1   98.086  209   4   0  14  222  17066802  17066594   

              10     11  
0  1.930000e-100  370.0  
1  1.490000e-101  374.0  
2  1.490000e-101  374.0  
3   8.970000e-99  364.0  
4   8.970000e-99  364.0  


### Count the statistics of each chunk...

In [5]:
df_aligned.columns = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
df_aligned = df_aligned.sort_values(['qseqid', 'pident'], ascending=False)
print(df_aligned.head())
#print(df.groupby("qseqid").first())

              qseqid sseqid  pident  length  mismatch  gapopen  qstart  qend  \
16096991   range_9_4     12   100.0     201         0        0      16   216   
16101161  range_99_7      2   100.0     200         0        0      16   215   
16101160  range_98_3     17   100.0     202         0        0      16   217   
16101158  range_97_5      2   100.0     202         0        0      14   215   
16101159  range_97_5      2    82.5      80        12        2      26   103   

             sstart       send         evalue  bitscore  
16096991   57973521   57973721  5.360000e-101     372.0  
16101161  167087782  167087981  1.930000e-100     370.0  
16101160    6607809    6608010  1.490000e-101     374.0  
16101158  242586493  242586694  1.490000e-101     374.0  
16101159  134679418  134679497   7.880000e-10      69.4  


In [None]:
df_grp_ct = df_aligned.groupby("qseqid")["sseqid"].count()
print(df_grp_ct)

In [None]:
print(df_aligned.loc[df_aligned["qseqid"]=="BCL11A_1",:])
import matplotlib.pyplot as plt

plt.hist(df_grp_ct, bins=50)
print(max(df_grp_ct))

In [None]:
df_grp_ct = df_aligned.groupby("qseqid").first()["pident"]
print(df_grp_ct)
import matplotlib.pyplot as plt

plt.hist(df_grp_ct, bins=50)

In [None]:
df_grp_ct = df_aligned.groupby("qseqid").first()["gapopen"]
print(df_grp_ct)
plt.hist(df_grp_ct, bins=50)

In [8]:
df_grp_first = df_aligned.groupby("qseqid").first()
print(df_grp_first.sseqid.value_counts())

sseqid
2             14635
11            13936
1             11793
1             10579
X              9769
              ...  
GL000241.1        2
GL000246.1        2
GL000196.1        2
GL000200.1        2
GL000219.1        1
Name: count, Length: 71, dtype: int64


In [11]:
df_grp_first.head()
df_grp_first[['sseqid', 'qstart', 'qend', 'sstart', 'send']].to_csv(aligned_filename[:-4]+"processed_.out", sep="\t")

In [4]:
# Select the first 
import csv

import tqdm

with open(aligned_filename[:-4]+"processed.out", "w") as f:
    spamwriter = csv.writer(f, delimiter='\t',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    prev_id = ""
    spamwriter.writerow(['id', 'chr', 'q_st', 'q_end', 's_st', 's_end'])
    for ind, row in tqdm.tqdm(df_aligned.iterrows(), total=df_aligned.shape[0]):
        #print(ind, row)
        if id==prev_id:
            continue
        writerow = [id, row[1], row[6], row[7], row[8], row[9]]
        id = row[0]
        
        spamwriter.writerow(writerow)
        prev_id = id


100%|██████████| 25827809/25827809 [27:49<00:00, 15465.96it/s]


### This is for post-processing the file to add labels to the chunks with SNP imputed after Hongyang's script.

In [2]:
label_file = '/dccstor/bmfm-targets/data/omics/genome/MPGA/human_mpra/K562_clean.tsv'
input_file  = '/dccstor/bmfm-targets/users/hongyang/bmfm-targets/data/K562_biallele_sequence_200.txt'
output_file = '/dccstor/bmfm-targets/data/omics/genome/MPGA/human_mpra/K562_biallele_sequence_200.csv'


In [12]:
df0 = pd.read_csv(input_file,sep=',', header=None)
df0.columns = ['seq_id', 'chunk']
print(df0.shape, df0.head(2))

df_label = pd.read_csv(label_file,sep='\t')
print(df_label.shape, df_label.head(2))

print("retriving the labels...")
df0 = df0.merge(df_label, how='left', left_on='seq_id', right_on='seq_id')

print(df0.shape, df0.head(2))
#df0[['seq_id', 'chunk', 'mean_value', 'fold']].to_csv(output_file, index=None)


(215461, 2) Index(['seq_id', 'chunk'], dtype='object')
(226253, 4) Index(['seq_id', 'seq', 'mean_value', 'fold'], dtype='object')
retriving the labels...
(215461, 5) Index(['seq_id', 'chunk', 'seq', 'mean_value', 'fold'], dtype='object')


In [14]:
df0['fold'] = df0['fold'].astype(int)
df0['fold'].value_counts()

fold
1    26987
4    26858
3    23523
0    23341
9    19312
5    19265
2    19192
6    19060
8    19027
7    18896
Name: count, dtype: int64

In [17]:
from pathlib import Path

output_file = '/dccstor/bmfm-targets/data/omics/genome/MPGA/human_mpra/K562_ref_sequence_200.csv'
output_dir = Path('/dccstor/bmfm-targets/data/omics/genome/MPGA/human_mpra/K562_ref_sequence_200')
output_dir.mkdir(parents=True, exist_ok=True)

df0 = pd.read_csv(output_file,sep=',', header=0)
df0 = df0.astype({"fold":'int16'})
print(df0.columns, df0.dtypes)

train_df = df0.loc[df0['fold'].isin(range(6)),:]
print(train_df.columns)
print(train_df.shape, train_df.head(1))
train_df[['chunk','mean_value']].to_csv(Path(output_dir / "train.csv"), index=False)


test_df = df0.loc[df0['fold'].isin([8,9]),:]
print(test_df.columns)
print(test_df.shape, test_df.head(1))
test_df[['chunk','mean_value']].to_csv(Path(output_dir / "test.csv"), index=False)


dev_df = df0.loc[df0['fold'].isin([6,7]),:]
print(dev_df.columns)
print(dev_df.shape, dev_df.head(1))
dev_df[['chunk','mean_value']].to_csv(Path(output_dir / "dev.csv"), index=False)


Index(['seq_id', 'chunk', 'mean_value', 'fold'], dtype='object') seq_id         object
chunk          object
mean_value    float64
fold            int16
dtype: object
Index(['seq_id', 'chunk', 'mean_value', 'fold'], dtype='object')
(139166, 4)             seq_id                                              chunk  \
0  ENSG00000000457  AGGCTGTGGCCACTACACCCACAATCTTCTGGGGGCCGGGTTTCTC...   

   mean_value  fold  
0       0.515     0  
Index(['seq_id', 'chunk', 'mean_value', 'fold'], dtype='object')
(38339, 4)             seq_id                                              chunk  \
6  ENSG00000000971  ATATCACCAGCTGCTGATTTGCACATACCAAGAACATGAACATTTT...   

   mean_value  fold  
6      -0.866     8  
Index(['seq_id', 'chunk', 'mean_value', 'fold'], dtype='object')
(37956, 4)             seq_id                                              chunk  \
4  ENSG00000000938  AATTTCTTGCAGAACACACAGCCCATTCCAGGTTCCCTGCTACAGA...   

   mean_value  fold  
4      -0.929     7  


In [12]:
df0['fold'] = df0['fold'].astype(int)
df0['fold'].value_counts()

fold
1       26987
4       26858
3       23523
0       23341
9       19312
5       19265
2       19192
6       19060
8       19027
7       18896
fold       21
Name: count, dtype: int64