In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict

import regex as re

import my_utils as utils
from my_utils import create_logger

In [2]:
# get sequences
dict_seq = utils.get_sequences('mm10', X=True, Y=True, M=True)

In [2]:
# get chroms and chrom sizes and ...j 
chroms = utils.get_chroms('mouse', X=True, Y=True, M=True, with_chr=False)
df_chrom_sizes = utils.get_chrom_sizes('mm10', X=True, Y=True, M=True)
df_chrom_sizes.head()

Unnamed: 0,size
1,195471971
2,182113224
3,160039680
4,156508116
5,151834684


In [4]:
# check if the imported sequences are correct
for chrom, seq in dict_seq.items():
    if len(seq) != df_chrom_sizes.loc[chrom, 'size']:
        raise ValueError("Chromosome size doesn't match: chr{}".format(chrom))
print('Imported genome sequences have the correct sizes!')

Imported genome sequences have the correct sizes!


In [6]:
# get all CpGs: primary key (chr, pos)
# 1-based



# for chrom, seq in dict_seq.items():

dict_list = []
pattern = 'CG'
for chrom in chroms:
    # No need to overlap with CG, but need to overlap with CC
    for match in re.finditer(pattern, dict_seq[chrom], overlapped=False): 
        pos = match.span()[0] + 1   # 1-based
        dict_list.append({'chr': chrom, 'pos': pos, 
                          'strand': '+', 'context': dict_seq[chrom][pos-1: pos+2]})
        dict_list.append({'chr': chrom, 'pos': pos+1, 
                          'strand': '-', 'context': utils.complement(dict_seq[chrom][pos-2: pos+1][::-1])})
        
df_cg = pd.DataFrame(dict_list)
print(df_cg.shape)
df_cg.head()

(43735674, 4)


Unnamed: 0,chr,context,pos,strand
0,1,CGT,3000827,+
1,1,CGG,3000828,-
2,1,CGG,3001007,+
3,1,CGA,3001008,-
4,1,CGT,3001018,+


In [7]:
df_cg = df_cg[['chr', 'pos', 'strand', 'context']]
df_cg.to_csv('/cndd/Public_Datasets/CEMBA/snmCSeq/References/Genome/mm10_all_cg.tsv', 
             sep='\t', na_rep='NA', header=True, index=False)

In [23]:
k = 'abc'
k[::-1]

'cba'

In [None]:
# get sequece-features of each CpG site (deprecated, see 00.sequence2features.py)

# very slow
ext_lengs = [5, 10, 20, 50, 100, 200, 500]

dict_list = []
for idx, row in df_cg.iterrows():
    feature_dict = OrderedDict() 
    feature_dict['chr'] = row.chr
    feature_dict['pos'] = row.pos
    
    for leng in ext_lengs: 
        seq = dict_seq[row.chr][row.pos-leng-1 : row.pos+leng]
        feature_dict['n_cpg_{}'.format(str(leng))] = seq.count('CG')
        feature_dict['n_gnc_{}'.format(str(leng))] = (seq.count('G') + seq.count('C'))/float(2*leng + 1)
        
    dict_list.append(feature_dict)
    
    if (idx % 1000000) == 0:
        print(idx, idx/df_cg.shape[0])
    
df_cpg_info = pd.DataFrame(dict_list)

In [None]:
# examine features

feature_f = './data/features/features_mm10_cpg_v1.tsv'
df_cpg_info = pd.read_table(feature_f, index_col=['chr', 'pos'], dtype={'chr': object})
print(df_cpg_info.shape)
df_cpg_info.head()



In [5]:
# gencode annotation 
# df_chrom_sizes.sum()
f = '/cndd/Public_Datasets/CEMBA/snmCSeq/References/Annotation/gencode.vM16.annotation_genes.tsv'
df_genes = pd.read_table(f, index_col='gene_id')
print(df_genes.shape)
df_genes.head()

(53379, 6)


Unnamed: 0_level_0,gene_name,chr,start,end,strand,gene_type
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSMUSG00000102693.1,4933401J01Rik,chr1,3073253,3074322,+,TEC
ENSMUSG00000064842.1,Gm26206,chr1,3102016,3102125,+,snRNA
ENSMUSG00000051951.5,Xkr4,chr1,3205901,3671498,-,protein_coding
ENSMUSG00000102851.1,Gm18956,chr1,3252757,3253236,+,processed_pseudogene
ENSMUSG00000103377.1,Gm37180,chr1,3365731,3368549,-,TEC


In [16]:
(df_genes.loc[df_genes['gene_type']=='protein_coding', 'end'] 
 - df_genes.loc[df_genes['gene_type']=='protein_coding', 'start']).sum()/df_chrom_sizes.sum()

size    0.391702
dtype: float64

In [7]:
df_chrom_sizes.sum()

size    2725537669
dtype: int64

In [14]:
df_genes_new = df_genes[(~df_genes['gene_name'].str.contains('^Gm')) & (df_genes['gene_type'] == 'protein_coding')]

In [15]:
(df_genes_new.loc[:, 'end'] 
 - df_genes.loc[:, 'start']).sum()/df_chrom_sizes.sum()

size    0.379122
dtype: float64

In [21]:
overlaps_all = 0
for chrom, df_sub in df_genes.groupby('chr'):
    overlaps = 0
    for i, (idx, row) in enumerate(df_sub.iterrows()):
        if i == 0:
            last_end = row.end
        else:
            overlap = last_end - row.start
            if overlap > 0:
                overlaps += overlap

            last_end = row.end
    overlaps_all += overlaps
        
    
    

In [22]:
overlaps_all/df_chrom_sizes.sum()

size    0.192927
dtype: float64