## Determining the CpG ratio per gene function in the geoduck data
### Files needed
GO-slim with unique data

"analyses/Geoduck-transcriptome-v2-GO-SlimUnique.csv"

fasta file with contigs

"analyses/Geoduck-transcriptome-v2.fasta"

### Importing libraries

In [1]:
from Bio import SeqIO
from pandas import Series, DataFrame
import pandas as pd
import pylab
from Bio.SeqUtils import GC
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from pylab import *
import numpy as np

Checking working directory and files

In [2]:
# Show the current working directory; the relative 'analyses/' paths used in this notebook resolve against it.
!pwd

/Users/migueldelrio/Desktop/panopea/panopea_data


In [3]:
# List the contents of analyses/ to confirm the GO-slim csv and the fasta file are present.
!ls analyses/

Dheilly-Fastafile_Microarray.fa
Dheilly-Microarray.nhr
Dheilly-Microarray.nin
Dheilly-Microarray.nsq
Dheilly_Geo_blast_analises.xlsx
Dheilly_blastx_Geoduck.out
Geo-pep_tblastn_Dheilly.out
Geo_blast_BacteriaEukaryota.xlsx
Geoduck-transcriptome-v2-GO-SlimUnique.csv
[31mGeoduck-transcriptome-v2.fasta[m[m
Geoduck-transcriptome-v2.nhr
Geoduck-transcriptome-v2.nin
Geoduck-transcriptome-v2.nsq
[34mGeoduck-transcriptome-v3[m[m
Geoduck-tranv2-blastx_sprot.sorted
Geoduck-tranv2-minus_direction.tab
Geoduck-tranv3-blastx_sprot.sorted
Geoduck-v2-GOslim_only%.txt
Geoduck-v2-GOslim_only.sorted
Geoduck-v2-GOslim_only.txt
Geoduck-v2-protein.phr
Geoduck-v2-protein.pin
Geoduck-v2-protein.psq
Geoduck-v3_blastn_Cgigas_cdna.out
Geoduck-v3_blastn_Cgigas_cdna_e10.out
Geoduck-v3_blastn_GIGAton_e10.out
Geoduck-v3_blastn_RuphiBase.out
Geoduck-v3_blastn_RuphiBase_e10.out
Geoduck-v3_blastn_RuphiBase_e10.xlsx
Geoduck-v3_blastn_RuphiBase_e60.out
Geoduck-v3_blastn_RuphiBase_e60.xls

In [4]:
# Peek at the beginning of the GO-slim table before loading it with pandas.
!head "analyses/Geoduck-transcriptome-v2-GO-SlimUnique.csv"

comp141380_c2_seq35,sp,Q9H295,D^C


## Step 1: Obtaining GO slim information (gene function)

In [5]:
# Load the GO-slim annotation table; the next cell reads its 'Column1' and 'GOSlim_bin' columns.
f = pd.read_csv('analyses/Geoduck-transcriptome-v2-GO-SlimUnique.csv')

Create a temporary dataframe *f1* with the sequence 'id' and its 'GOSlim_bin'

In [6]:
# Keep only the contig identifier and its GO-slim bin, renamed to the column
# names shared with the CpG table built later ('id' is the merge key).
f1 = DataFrame({'id': f['Column1'], 'GOSlimbin': f['GOSlim_bin']})
# DataFrame.sort() was removed from pandas; sort_values() is the replacement
# and returns a new frame ordered by 'id'.
f1 = f1.sort_values('id')
f1

Unnamed: 0,GOSlimbin,id
4145,transport,comp100097_c0_seq1
4146,protein metabolism,comp100104_c2_seq1
4147,RNA metabolism,comp100105_c1_seq1
4148,transport,comp100108_c1_seq1
4149,other biological processes,comp100109_c0_seq1
4150,other biological processes,comp100113_c0_seq1
4152,protein metabolism,comp100113_c0_seq2
4151,protein metabolism,comp100113_c1_seq1
4153,other biological processes,comp100129_c0_seq1
4154,other metabolic processes,comp100141_c0_seq1


## Step 2: Reading the fasta file

In [7]:
# Fasta file used to calculate the CpG content per contig.
# Despite its name, `handle` is a plain path string, not an open file object;
# it is passed to SeqIO.parse in the loop below.
handle = "analyses/Geoduck-transcriptome-v2.fasta"

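### Using Biopython's fasta parsing routines to count "C", "G" and "CG" in each contig, together with the sequence length
### The CpG ratio is then calculated as (CG count / (C count × G count)) × (L² / (L − 1)), where L is the sequence length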

Creates temporary variables

In [8]:
# Compute the observed/expected CpG ratio for every contig in the fasta file:
#
#     CpG o/e = (#CG / (#C * #G)) * (L**2 / (L - 1))
#
# where L is the sequence length. The results are collected in two parallel
# lists (contig id and ratio) that the next cell turns into a DataFrame.
#
# NOTE(review): Seq.count() is case-sensitive, so lowercase (soft-masked)
# bases would not be counted -- confirm the fasta is uppercase.
record_id = []
record_cpg = []
for record in SeqIO.parse(handle, "fasta"):
    g = record.seq.count("G")
    c = record.seq.count("C")
    cg = record.seq.count("CG")
    lar = len(record.seq)
    if g * c == 0:
        # No G or no C: the ratio is undefined (division by zero), so store
        # 0.0. The previous `try: g*c==0` was a bare comparison that never
        # raised, which left this case to crash with ZeroDivisionError.
        # A non-zero g*c also implies lar >= 2, so (lar - 1) below is safe.
        cpg = 0.0
    else:
        cpg = round(cg / (g * c) * (lar**2 / (lar - 1)), 4)
    print(record.id, cpg)
    record_id.append(record.id)
    record_cpg.append(cpg)

comp7_c0_seq1 1.3882
comp30_c0_seq1 0.9665
comp35_c0_seq1 0.6306
comp36_c0_seq1 0.0
comp49_c0_seq1 0.837
comp59_c0_seq1 0.5102
comp60_c0_seq1 0.2415
comp65_c0_seq1 0.3077
comp66_c0_seq1 0.7564
comp69_c0_seq1 0.4943
comp71_c0_seq1 0.4552
comp90_c0_seq1 0.1674
comp93_c0_seq1 1.1423
comp95_c0_seq1 0.6196
comp105_c0_seq1 0.6544
comp115_c0_seq1 0.6782
comp117_c0_seq1 0.0
comp128_c0_seq1 0.325
comp135_c0_seq1 0.0
comp137_c0_seq1 0.7535
comp142_c0_seq1 0.1629
comp146_c0_seq1 1.2283
comp148_c0_seq1 0.3136
comp171_c0_seq1 1.4192
comp171_c0_seq2 1.3113
comp176_c0_seq1 0.1412
comp177_c0_seq1 1.1289
comp178_c0_seq1 0.7138
comp182_c0_seq1 0.6048
comp190_c0_seq1 1.0541
comp195_c0_seq1 0.9776
comp216_c0_seq1 1.0158
comp217_c0_seq1 0.3013
comp234_c0_seq1 0.0
comp235_c0_seq1 0.357
comp242_c0_seq1 1.409
comp246_c0_seq1 0.0
comp250_c0_seq1 0.7553
comp259_c0_seq1 1.3844
comp268_c0_seq1 0.8356
comp275_c0_seq1 0.1819
comp277_c0_seq1 0.2047
comp277_c1_seq1 0.2581
comp279_c0_seq1 0.1171
comp282_c0_seq1 0.2003

### Uses temporary variables to obtain the CpG and sequence.id as a dataframe, which will be used later

In [9]:
# Combine the parallel id / CpG lists from the parsing loop into one table,
# using the same 'id' column name as the GO-slim frame so the two can be merged.
records = DataFrame({'id': record_id, 'CpG': record_cpg})
# DataFrame.sort() was removed from pandas; sort_values() is the replacement
# and returns a new frame ordered by 'id'.
records = records.sort_values('id')
records

Unnamed: 0,CpG,id
37873,0.9190,comp100000_c0_seq1
37874,0.2642,comp100001_c1_seq1
37875,0.2336,comp100001_c2_seq1
37876,0.9859,comp100002_c0_seq1
37877,0.4392,comp100004_c1_seq1
37878,0.6894,comp100007_c0_seq1
37879,1.1367,comp100010_c0_seq1
37880,0.5625,comp100011_c0_seq1
37881,0.1521,comp100012_c0_seq1
37882,0.5719,comp100014_c0_seq1


In [10]:
records.describe()

Unnamed: 0,CpG
count,154407.0
mean,0.549013
std,0.324474
min,0.0
25%,0.3239
50%,0.4929
75%,0.73
max,3.5143


### Joins the temporary dataframes into a single dataframe and saves it as a csv file

In [11]:
# Outer join of the GO-slim bins with the CpG values on their shared 'id'
# column; contigs that appear in only one of the two tables are kept.
f2 = f1.merge(records, on='id', how='outer')
f2

Unnamed: 0,GOSlimbin,id,CpG
0,transport,comp100097_c0_seq1,0.4868
1,protein metabolism,comp100104_c2_seq1,0.5637
2,RNA metabolism,comp100105_c1_seq1,0.5038
3,transport,comp100108_c1_seq1,0.5092
4,other biological processes,comp100109_c0_seq1,0.6331
5,other biological processes,comp100113_c0_seq1,1.1629
6,protein metabolism,comp100113_c0_seq2,0.9827
7,protein metabolism,comp100113_c1_seq1,1.4217
8,other biological processes,comp100129_c0_seq1,0.3669
9,other metabolic processes,comp100141_c0_seq1,0.3492


### Save the merged table as a csv file

In [12]:
# Write the merged GO-slim / CpG table into the same analyses/ directory as the input files.
f2.to_csv('analyses/Geoduck-transcriptome-CpG_GOSlim.csv')

### Plotting

In [13]:
# Histogram of the CpG values in the merged table.
f2['CpG'].hist()
# Same function as the star-imported pylab show(), called through the
# explicit pyplot namespace.
plt.show()

In [14]:
# Horizontal bar chart of the mean CpG value per GO-slim bin, cycling through
# yellow, blue and green bars.
f2.groupby('GOSlimbin')['CpG'].mean().plot(kind='barh', color=['y', 'b', 'g'])
# Fix the visible window: mean CpG between 0.4 and 0.7 on x, bar positions 0-15 on y.
plt.xlim(0.4, 0.7)
plt.ylim(0, 15)
plt.show()

![TEKTA1](img/figure_1histo.png)

In [15]:
# Kernel density estimate of the CpG values, drawn with a thick line.
f2['CpG'].plot(kind='kde', linewidth=3);
# Fix the visible window: CpG from 0 to 1.5 on x, density from 0 to 1.9 on y.
plt.xlim(0, 1.5)
plt.ylim(0, 1.9)
plt.show()

![TEKTA1](img/figure_1dens.png)