# Download TCGA-OV Data

In [1]:
%load_ext rpy2.ipython

In [2]:
%%R
#source("https://bioconductor.org/biocLite.R")
#biocLite("TCGAbiolinks")
#library("TCGAbiolinks")
#devtools::install_github(repo = "BioinformaticsFMRP/TCGAbiolinks")
library("TCGAbiolinks")
sessionInfo()

# Uses the example from 
# https://bioconductor.org/packages/release/bioc/vignettes/TCGAbiolinks/inst/doc/tcgaBiolinks.html#mirna-expression-data-downstream-analysis-brca
CancerProject <- "TCGA-OV"
DataDirectory <- paste0("GDC/",gsub("-","_",CancerProject))
FileNameData <- paste0(DataDirectory, "_","miRNA_gene_quantification",".rda")

query.miR <- GDCquery(project = CancerProject, 
                      data.category = "Gene expression",
                      data.type = "miRNA gene quantification",
                      file.type = "hg19.mirna",
                      legacy = TRUE)

samplesDown.miR <- query.miR$results[[1]]$cases

dataSmTP.miR <- TCGAquery_SampleTypes(barcode = samplesDown.miR, typesample = 'TP')

queryDown.miR <- GDCquery(project = CancerProject, 
                          data.category = "Gene expression",
                          data.type = "miRNA gene quantification",
                          file.type = "hg19.mirna",
                          legacy = TRUE,
                          barcode = c(dataSmTP.miR))

GDCdownload(query = queryDown.miR,
            directory = DataDirectory)

dataAssy.miR <- GDCprepare(query = queryDown.miR, 
                           save = TRUE, 
                           save.filename = FileNameData, 
                           summarizedExperiment = TRUE,
                           directory =DataDirectory )
rownames(dataAssy.miR) <- dataAssy.miR$miRNA_ID

# using read_counts data 
#read_countData <-  colnames(dataAssy.miR)[grep("count", colnames(dataAssy.miR))]
#dataAssy.miR <- dataAssy.miR[,read_countData]
#colnames(dataAssy.miR) <- gsub("read_count_","", colnames(dataAssy.miR))

# using normalized reads per million data
read_rpmData <-  colnames(dataAssy.miR)[grep("reads_per_million_miRNA_mapped", colnames(dataAssy.miR))]
dataAssy.miR <- dataAssy.miR[,read_rpmData]
colnames(dataAssy.miR) <- gsub("reads_per_million_miRNA_mapped_","", colnames(dataAssy.miR))

#write.table(data.frame("miRNA"=rownames(dataAssy.miR),dataAssy.miR),"miRNA_quantifications.txt", row.names=FALSE)
#dataAssy.miR.transposed = t(dataAssy.miR)
#write.table(data.frame("TCGA_barcode"=rownames(dataAssy.miR.transposed),dataAssy.miR.transposed),"miRNA_quantifications_transposed.txt", row.names=FALSE)

mirna.rpm <- dataAssy.miR
#rownames(mirna.rpm) <-gsub("(?<=[A-Z])-","",gsub("HSA-LET-","MIRLET",gsub("HSA-MIR-","MIR",toupper(rownames(mirna.rpm)))), perl=TRUE)
write.table(data.frame("miRNA"=rownames(mirna.rpm),mirna.rpm),"miRNA_quantifications.reads_per_million_miRNA_mapped.txt", row.names=FALSE, sep = "\t")
mirna.rpm.transposed <- data.frame(t(mirna.rpm))

patient_barcode_from_sample_barcode <- function (sample_barcode) {
  return(paste(strsplit(sample_barcode,"-")[[1]][1:3], collapse="-")[1])
}

#rownames(mirna.rpm.transposed) <- lapply(rownames(mirna.rpm.transposed), patient_barcode_from_sample_barcode)
#mirna.rpm.transposed$bcr_patient_barcode <- rownames(mirna.rpm.transposed)
mirna.rpm.transposed$bcr_patient_barcode <- sapply(rownames(mirna.rpm.transposed), patient_barcode_from_sample_barcode)
write.table(data.frame("miRNA"=rownames(mirna.rpm.transposed),mirna.rpm.transposed),"miRNA_quantifications.reads_per_million_miRNA_mapped.transposed.txt", row.names=FALSE, sep = "\t")

clin.query <- GDCquery(project = "TCGA-OV", data.category = "Clinical")
json  <- tryCatch(GDCdownload(clin.query), 
                  error = function(e) GDCdownload(clin.query, method = "client"))
clinical.patient <- GDCprepare_clinic(clin.query, clinical.info = "patient")
clinical.patient.followup <- GDCprepare_clinic(clin.query, clinical.info = "follow_up")
clinical.patient.drug <- GDCprepare_clinic(clin.query, clinical.info = "drug")
clinical.patient.radiation <- GDCprepare_clinic(clin.query, clinical.info = "radiation")
clinical.patient.new_tumor_event <- GDCprepare_clinic(clin.query, clinical.info = "new_tumor_event")
clinical.patient.admin <- GDCprepare_clinic(clin.query, clinical.info = "admin")
clinical.patient.stage_event <- GDCprepare_clinic(clin.query, clinical.info = "stage_event")
clinical.index <- GDCquery_clinic("TCGA-OV")
write.table(clinical.patient,"clinical.patient.txt", row.names=FALSE, sep = "\t")

colnames(clinical.patient.followup) <- gsub("form_completion","form_completion_followup",colnames(clinical.patient.followup))
write.table(clinical.patient.followup,"clinical.patient.followup.txt", row.names=FALSE, sep = "\t")

colnames(clinical.patient.drug) <- gsub("form_completion","form_completion_drug",colnames(clinical.patient.drug))
write.table(clinical.patient.drug,"clinical.patient.drug.txt", row.names=FALSE, sep = "\t")

colnames(clinical.patient.radiation) <- gsub("form_completion","form_completion_radiation",colnames(clinical.patient.new_tumor_event))
write.table(clinical.patient.radiation,"clinical.patient.radiation.txt", row.names=FALSE, sep = "\t")

colnames(clinical.patient.new_tumor_event) <- gsub("form_completion","form_completion_new_tumor_event",colnames(clinical.patient.new_tumor_event))
write.table(clinical.patient.new_tumor_event,"clinical.patient.new_tumor_event.txt", row.names=FALSE, sep = "\t")

colnames(clinical.patient.stage_event) <- gsub("form_completion","form_completion_stage_event",colnames(clinical.patient.stage_event))
write.table(clinical.patient.stage_event,"clinical.patient.stage_event.txt", row.names=FALSE, sep = "\t")

colnames(clinical.patient.admin) <- gsub("form_completion","form_completion_admin",colnames(clinical.patient.admin))
write.table(clinical.patient.admin,"clinical.patient.admin.txt", row.names=FALSE, sep = "\t")

clinical.patient <- merge(clinical.patient,clinical.patient.drug, all=TRUE, by = "bcr_patient_barcode")

clinical.patient <- merge(clinical.patient,clinical.patient.followup, all=TRUE, by = "bcr_patient_barcode")

clinical.patient <- merge(clinical.patient,clinical.patient.admin, all=TRUE, by = "bcr_patient_barcode")

clinical.patient <- merge(clinical.patient,clinical.patient.stage_event, all=TRUE, by = "bcr_patient_barcode")

#clinical.patient <- merge(clinical.patient,clinical.patient.radiation, all=TRUE, by = "bcr_patient_barcode")

clinical.patient <- merge(clinical.patient,clinical.patient.new_tumor_event, all=TRUE, by = "bcr_patient_barcode")

write.table(clinical.patient,"clinical.patient.drug.followup.admin.stage_event.new_tumor_event.txt", row.names=FALSE, sep = "\t")

clinical.patient.dhs_mirna_rpm <- clinical.patient

clinical.patient.mirna_rpm <- merge(clinical.patient, mirna.rpm.transposed, all=TRUE, by = "bcr_patient_barcode")

write.table(clinical.patient.mirna_rpm,"clinical.patient.drug.followup.admin.stage_event.new_tumor_event.mirna_rpm.txt", row.names=FALSE, sep = "\t")

dhs.ensg_ids <- read.table("allGeneCorrelations100000.p05_v3.MIR.ENSG_IDs.txt",stringsAsFactors = FALSE)
ensembl_like_names <- gsub("(?<=[A-Z])-","",gsub("HSA-LET-","MIRLET",gsub("HSA-MIR-","MIR",toupper(rownames(mirna.rpm)))), perl=TRUE)
dhs.mirna.rpm <- mirna.rpm[ensembl_like_names %in% intersect(ensembl_like_names, dhs.ensg_ids[,1]),]
dhs.mirna.rpm.transposed <- data.frame(t(dhs.mirna.rpm))
dhs.mirna.rpm.transposed$bcr_patient_barcode <- sapply(rownames(dhs.mirna.rpm.transposed), patient_barcode_from_sample_barcode)
write.table(data.frame("miRNA"=rownames(dhs.mirna.rpm.transposed),dhs.mirna.rpm.transposed),"DHS_subset.miRNA_quantifications.reads_per_million_miRNA_mapped.transposed.txt", row.names=FALSE, sep = "\t")

clinical.patient.dhs_mirna_rpm <- merge(clinical.patient.dhs_mirna_rpm, dhs.mirna.rpm.transposed, all=TRUE, by = "bcr_patient_barcode")
write.table(clinical.patient.dhs_mirna_rpm,"clinical.patient.drug.followup.admin.stage_event.new_tumor_event.dhs_subset_mirna_rpm.txt", row.names=FALSE, sep = "\t")



Pathview is an open source software package distributed under GNU General
Public License version 3 (GPLv3). Details of GPLv3 is available at
http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
formally cite the original Pathview paper (not just mention it) in publications
or products. For details, do citation("pathview") within R.

The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
license agreement (details at http://www.kegg.jp/kegg/legal.html).
##############################################################################
























=> drugs: drug 
=> follow_ups: follow_up 
=> radiations: radiation




Error in file(file, "rt") : cannot open the connection




 



# Get miRNAs with DHS

In [3]:
%%bash
wget -O allGeneCorrelations100000.p05_v3.txt.gz http://big.databio.org/RED/allGeneCorrelations100000.p05_v3.txt.gz
zcat allGeneCorrelations100000.p05_v3.txt.gz|cut -f8,10|grep '^MIR\|^hsa-mir'|uniq|sort|uniq > allGeneCorrelations100000.p05_v3.MIR.ENSG_IDs.txt
head <(zcat allGeneCorrelations100000.p05_v3.txt.gz)
head allGeneCorrelations100000.p05_v3.MIR.ENSG_IDs.txt

dhs_chr	dhs_start	dhs_end	dhs_id	gene_chr	gene_start	gene_end	gene_name	metaprobeset_id	ensemblID	cor	pval
chrX	99861860	99862010	2861605	chrX	99885757	99891766	TSPAN6	1	ENSG00000000003	-0.5445678	0.006
chrX	99870060	99870210	2861609	chrX	99885757	99891766	TSPAN6	1	ENSG00000000003	 0.3683153	0.993
chrX	99880960	99881110	2861619	chrX	99885757	99891766	TSPAN6	1	ENSG00000000003	-0.4026298	0.020
chrX	99891065	99891215	2861626	chrX	99885757	99891766	TSPAN6	1	ENSG00000000003	 0.2857770	0.988
chrX	99891260	99891410	2861627	chrX	99885757	99891766	TSPAN6	1	ENSG00000000003	 0.2643461	0.983
chrX	99891740	99891890	2861629	chrX	99885757	99891766	TSPAN6	1	ENSG00000000003	 0.4594339	0.998
chrX	99891900	99892050	2861630	chrX	99885757	99891766	TSPAN6	1	ENSG00000000003	 0.4258412	0.997
chrX	99899060	99899210	2861638	chrX	99885757	99891766	TSPAN6	1	ENSG00000000003	 0.4342573	0.998
chrX	99904080	99904230	2861647	chrX	99885757	99891766	TSPAN6	1	ENSG00000000003	 0.3240553	0.990
hsa-mir-220a	ENSG00000207655


--2017-03-12 20:07:11--  http://big.databio.org/RED/allGeneCorrelations100000.p05_v3.txt.gz
Resolving big.databio.org (big.databio.org)... 192.64.119.17
Connecting to big.databio.org (big.databio.org)|192.64.119.17|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: http://www.biomedical-sequencing.at/bocklab/nsheffield/RED/allGeneCorrelations100000.p05_v3.txt.gz [following]
--2017-03-12 20:07:12--  http://www.biomedical-sequencing.at/bocklab/nsheffield/RED/allGeneCorrelations100000.p05_v3.txt.gz
Resolving www.biomedical-sequencing.at (www.biomedical-sequencing.at)... 149.148.226.60
Connecting to www.biomedical-sequencing.at (www.biomedical-sequencing.at)|149.148.226.60|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.biomedical-sequencing.at/bocklab/nsheffield/RED/allGeneCorrelations100000.p05_v3.txt.gz [following]
--2017-03-12 20:07:14--  https://www.biomedical-sequencing.at/bocklab/nsheffield/RED/allGeneC

# Get DHS to MIR Lookup

In [4]:
%%bash
zcat allGeneCorrelations100000.p05_v3.txt.gz|awk '{print $8"\t"$1":"$2"-"$3}'|grep '^MIR'|sort|uniq|sort \
> MIRNA_to_DHS_Region.txt
head MIRNA_to_DHS_Region.txt

MIR100	chr11:121933580-121933730
MIR100	chr11:121947520-121947670
MIR100	chr11:121953060-121953210
MIR100	chr11:121955100-121955250
MIR100	chr11:121956080-121956230
MIR100	chr11:121959860-121960010
MIR100	chr11:121962720-121962870
MIR100	chr11:121963520-121963670
MIR100	chr11:121964560-121964710
MIR100	chr11:121966780-121966930


# Try to reformat so that a DHS region has only one line with all related genes
Might come in handy.

In [5]:
%%R
MIRNA_to_DHS_Region <- read.table('MIRNA_to_DHS_Region.txt', stringsAsFactors = FALSE, 
                                  col.names = c("Gene","DHS_Region"))
head(MIRNA_to_DHS_Region)

    Gene                DHS_Region
1 MIR100 chr11:121933580-121933730
2 MIR100 chr11:121947520-121947670
3 MIR100 chr11:121953060-121953210
4 MIR100 chr11:121955100-121955250
5 MIR100 chr11:121956080-121956230
6 MIR100 chr11:121959860-121960010


In [6]:
%%R
MIRNA_to_DHS_Region.aggregate <- aggregate(Gene ~ DHS_Region, data = MIRNA_to_DHS_Region, paste, collapse = ' ')
colnames(MIRNA_to_DHS_Region.aggregate) <- c("DHS_Region", "Genes")
head(MIRNA_to_DHS_Region.aggregate)
write.table(MIRNA_to_DHS_Region.aggregate,"MIRNA_to_DHS_Region.aggregate.txt",sep="\t",row.names=FALSE,quote=FALSE)

In [7]:
%%bash
head MIRNA_to_DHS_Region.aggregate.txt

DHS_Region	Genes
chr10:100055780-100055930	MIR1287
chr10:100062800-100062950	MIR1287
chr10:100069885-100070035	MIR1287
chr10:100074460-100074610	MIR1287
chr10:100078720-100078870	MIR1287
chr10:100083120-100083270	MIR1287
chr10:100098180-100098330	MIR1287
chr10:100098700-100098850	MIR1287
chr10:100104360-100104510	MIR1287


In [8]:
%%bash
cat MIRNA_to_DHS_Region.aggregate.txt|cut -f1|tail -n +2|sort|uniq|sort|sed 's|:|\t|'|sed 's|-|\t|' \
|awk 'BEGIN{FS=OFS="\t"}{print $1,$2-1,$3,$1":"$2"-"$3}'|awk '{print $0":+\t0\t+";print $0":-\t1\t-";}' \
|sort -k1,1 -k2,2n \
> MIRNA_to_DHS_Region.aggregate.bothstrands.bed
head MIRNA_to_DHS_Region.aggregate.bothstrands.bed
cat MIRNA_to_DHS_Region.aggregate.bothstrands.bed|awk '$6=="+"' \
> MIRNA_to_DHS_Region.aggregate.plus_strand.bed

chr1	1003204	1003355	chr1:1003205-1003355:+	0	+
chr1	1003204	1003355	chr1:1003205-1003355:-	1	-
chr1	1004124	1004275	chr1:1004125-1004275:+	0	+
chr1	1004124	1004275	chr1:1004125-1004275:-	1	-
chr1	1004284	1004435	chr1:1004285-1004435:+	0	+
chr1	1004284	1004435	chr1:1004285-1004435:-	1	-
chr1	1004524	1004675	chr1:1004525-1004675:+	0	+
chr1	1004524	1004675	chr1:1004525-1004675:-	1	-
chr1	1004684	1004835	chr1:1004685-1004835:+	0	+
chr1	1004684	1004835	chr1:1004685-1004835:-	1	-


In [9]:
%%bash
>hg19.fa
hg19_url_prefix="ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/"
for chr_num in $(seq 22) M X Y
do
    hg19_chr_filename="chr""$chr_num"".fa.gz"
    wget -O "$hg19_chr_filename" "$hg19_url_prefix""$hg19_chr_filename"
    zcat "$hg19_chr_filename" >> hg19.fa
done
ls hg19.fa

hg19.fa


--2017-03-12 20:07:34--  ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/chr1.fa.gz
           => ‘chr1.fa.gz’
Resolving hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)... 128.114.119.163
Connecting to hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)|128.114.119.163|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /goldenPath/hg19/chromosomes ... done.
==> SIZE chr1.fa.gz ... 73773666
==> PASV ... done.    ==> RETR chr1.fa.gz ... done.
Length: 73773666 (70M) (unauthoritative)

     0K .......... .......... .......... .......... ..........  0%  548K 2m11s
    50K .......... .......... .......... .......... ..........  0% 1001K 1m42s
   100K .......... .......... .......... .......... ..........  0% 1.15M 88s
   150K .......... .......... .......... .......... ..........  0% 1.79M 76s
   200K .......... .......... .......... .......... ..........  0% 2.14M 67s
   250K .......... .......... ....

In [10]:
%%bash
#http://some-hints.blogspot.com/2013/07/how-to-convert-fasta-file-to.html
rm chr*.fa.gz
cat hg19.fa|awk '{ if ($0 !~ />/) {print toupper($0)} else {print $0} }' \
>hg19.upper.fa
samtools faidx hg19.upper.fa
rm hg19.fa

In [39]:
%%bash
fasta-get-markov -m 1 -dna hg19.upper.fa hg19.upper.markov_background

25 16571 249250621 123827759.3 3095693983


processed: 0.0%processed: 2.2%processed: 4.4%processed: 6.7%processed: 8.9%processed: 11.2%processed: 13.4%processed: 15.6%processed: 17.9%processed: 20.1%processed: 22.4%processed: 24.6%processed: 26.9%processed: 29.2%processed: 31.4%processed: 33.7%processed: 35.9%processed: 38.2%processed: 40.3%processed: 42.5%processed: 44.6%processed: 46.8%processed: 48.9%processed: 51.2%processed: 53.4%processed: 55.6%processed: 57.8%processed: 60.0%processed: 62.3%processed: 64.5%processed: 66.6%processed: 68.9%processed: 71.1%processed: 73.1%processed: 75.2%processed: 77.4%processed: 79.5%processed: 81.7%processed: 83.9%processed: 86.1%processed: 88.2%processed: 90.3%processed: 92.4%processed: 94.5%processed: 96.7%processed: 99.0%processed: 100.0%[K

In [11]:
%%bash
#bedtools getfasta -fi hg19.upper.fa -bed MIRNA_to_DHS_Region.aggregate.bothstrands.bed -s -name \
#-fo MIRNA_to_DHS_Region.aggregate.bothstrands.hg19.upper.fa

In [12]:
%%bash
#bedtools getfasta -fi hg19.upper.fa -bed MIRNA_to_DHS_Region.aggregate.plus_strand.bed -s -name \
#-fo MIRNA_to_DHS_Region.aggregate.plus_strand.hg19.upper.fa

In [13]:
# HOMER vertebrate motif search over all DHS for miRNA with DHS

In [14]:
%%bash

#findMotifsGenome.pl MIRNA_to_DHS_Region.aggregate.plus_strand.bed hg19 MIRNA_to_DHS_Region.aggregate.plus_strand -size given -preparse -mset vertebrates

# Get gene names of important genes

In [15]:
%%R
get_important_mirna_names <- function(gene_importance_file_path, output_important_gene_names_file_path){
  gene_importance <- read.csv(gene_importance_file_path, row.names = 1)
  gene_importance_threshold <- abs(min(0, min(gene_importance)))
  gene_importance_threshold
  important_genes <- gene_importance[which(gene_importance > gene_importance_threshold), 1, FALSE]
  important_genes
  important_gene_names <- rownames(important_genes)
  important_gene_names
  important_gene_ensembl_like_names <- gsub("(?<=[A-Z])-","",gsub("HSA-LET-","MIRLET",gsub("HSA-MIR-","MIR",toupper(gsub("\\.","-",(important_gene_names))))), perl=TRUE)
  important_gene_ensembl_like_names
  cat(important_gene_ensembl_like_names , file = output_important_gene_names_file_path, sep = "\n", fill = FALSE, labels = NULL,append = FALSE)
  return(important_gene_ensembl_like_names)
}

In [16]:
%%R
get_important_mirna_names("os_gene_importance.csv", "os_gene_importance.important_genes.txt")
get_important_mirna_names("stage_gene_importance.csv", "stage_gene_importance.important_genes.txt")

character(0)


In [17]:
%%bash
wc -l "os_gene_importance.important_genes.txt"
cat "os_gene_importance.important_genes.txt"
wc -l "stage_gene_importance.important_genes.txt"
cat "stage_gene_importance.important_genes.txt"

14 os_gene_importance.important_genes.txt
MIRLET7B
MIRLET7G
MIR130B
MIR143
MIR187
MIR18A
MIR191
MIR194-1
MIR203
MIR24-2
MIR3187
MIR877
MIR96
MIR98
1 stage_gene_importance.important_genes.txt



# Get BED of DHS of MIRNA

In [18]:
%%bash
cat MIRNA_to_DHS_Region.aggregate.txt \
|tail -n +2 \
|sort|uniq|sort \
|sed 's|:|\t|'|sed 's|-|\t|' \
|awk 'BEGIN{FS=OFS="\t"}{print $1,$2-1,$3,$1":"$2"-"$3":+:"$4"::"}' \
|sed 's| |::|g' \
|sort -k1,1 -k2,2n|uniq \
> MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed


# Get FASTA of DHS of MIRNA

In [19]:
%%bash
bedtools getfasta \
-fi hg19.upper.fa \
-bed MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
-s -name \
-fo MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.fa


# HOMER vertebrate transcription factor motif search over all DHS for miRNA with DHS, no duplicate sequences

In [20]:
%%bash
findMotifsGenome.pl \
MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
hg19 \
MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.homer \
-size given \
-preparse \
-mset vertebrates


	Position file = MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed
	Genome = hg19
	Output Directory = MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.homer
	Using actual sizes of regions (-size given)
	Fragment size set to given
	Peak/BED file conversion summary:
		BED/Header formatted lines: 4130
		peakfile formatted lines: 0

	Peak File Statistics:
		Total Peaks: 4130
		Redundant Peak IDs: 0
		Peaks lacking information: 0 (need at least 5 columns per peak)
		Peaks with misformatted coordinates: 0 (should be integer)
		Peaks with misformatted strand: 0 (should be either +/- or 0/1)

	Peak file looks good!

	Background fragment size set to 150 (avg size of targets)
	Genome preparsing was FORCED.
	Preparsing genome for 150 bp fragments...(will probably take 1-5 min)
	preparse size set to 150

	By default, using /home/ndelossantos/homer/.//data/genomes/hg19//hg19.tss for reference positions
	Output files will be placed in /home/ndelossantos/homer/.//data/genomes/hg

# HOMER vertebrate transcription factor motif search for DHS of miRNAs important to Overall Survival, no duplicate sequences

In [21]:
%%bash
important_gene_list="os_gene_importance.important_genes.txt"
cat MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
|grep -Fwf \
<(cat "$important_gene_list"|awk '{print ":"$1":"}') \
> $(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed

findMotifsGenome.pl \
$(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
hg19 \
$(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.homer \
-size given \
-preparse \
-mset vertebrates


	Position file = os_gene_importance.important_genes.txt.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed
	Genome = hg19
	Output Directory = os_gene_importance.important_genes.txt.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.homer
	Using actual sizes of regions (-size given)
	Fragment size set to given
	Peak/BED file conversion summary:
		BED/Header formatted lines: 394
		peakfile formatted lines: 0

	Peak File Statistics:
		Total Peaks: 394
		Redundant Peak IDs: 0
		Peaks lacking information: 0 (need at least 5 columns per peak)
		Peaks with misformatted coordinates: 0 (should be integer)
		Peaks with misformatted strand: 0 (should be either +/- or 0/1)

	Peak file looks good!

	Background fragment size set to 150 (avg size of targets)
	Genome preparsing was FORCED.
	Preparsing genome for 150 bp fragments...(will probably take 1-5 min)
	preparse size set to 150

	By default, using /home/ndelossantos/homer/.//data/genomes/hg19//hg19.tss for reference positions

In [43]:
%%bash
important_gene_list="os_gene_importance.important_genes.txt"
cat MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
|grep -Fwf \
<(cat "$important_gene_list"|awk '{print ":"$1":"}') \
> $(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed

cat MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
|grep -vFwf \
<(cat "$important_gene_list"|awk '{print ":"$1":"}') \
> $(basename "$important_gene_list").background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed


findMotifsGenome.pl \
$(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
hg19 \
$(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.homer_with_background \
-size given \
-preparse \
-mset vertebrates \
-bg $(basename "$important_gene_list").background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed


	Position file = os_gene_importance.important_genes.txt.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed
	Genome = hg19
	Output Directory = os_gene_importance.important_genes.txt.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.homer_with_background
	Using actual sizes of regions (-size given)
	Fragment size set to given
	background position file: os_gene_importance.important_genes.txt.background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed
	Peak/BED file conversion summary:
		BED/Header formatted lines: 394
		peakfile formatted lines: 0

	Peak File Statistics:
		Total Peaks: 394
		Redundant Peak IDs: 0
		Peaks lacking information: 0 (need at least 5 columns per peak)
		Peaks with misformatted coordinates: 0 (should be integer)
		Peaks with misformatted strand: 0 (should be either +/- or 0/1)

	Peak file looks good!

	Peak/BED file conversion summary:
		BED/Header formatted lines: 3736
		peakfile formatted lines: 0
	Max distance to merge: direct 

In [46]:
%%bash
important_gene_list="os_gene_importance.important_genes.txt"
findMotifsGenome.pl \
$(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
hg19 \
$(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.homer_with_background.hg \
-size given \
-preparse \
-mset vertebrates \
-bg $(basename "$important_gene_list").background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
-h


	Position file = os_gene_importance.important_genes.txt.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed
	Genome = hg19
	Output Directory = os_gene_importance.important_genes.txt.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.homer_with_background.hg
	Using actual sizes of regions (-size given)
	Fragment size set to given
	background position file: os_gene_importance.important_genes.txt.background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed
	Using hypergeometric distribution for p-values
	Peak/BED file conversion summary:
		BED/Header formatted lines: 394
		peakfile formatted lines: 0

	Peak File Statistics:
		Total Peaks: 394
		Redundant Peak IDs: 0
		Peaks lacking information: 0 (need at least 5 columns per peak)
		Peaks with misformatted coordinates: 0 (should be integer)
		Peaks with misformatted strand: 0 (should be either +/- or 0/1)

	Peak file looks good!

	Peak/BED file conversion summary:
		BED/Header formatted lines: 3736
		peakfile

# FIMO JASPAR vertebrate transcripton factor binding motif search over all DHS for miRNA with DHS, no duplicate sequences

## Download motifs databases

In [23]:
%%bash
wget -O motif_databases.12.15.tar.gz http://meme-suite.org/meme-software/Databases/motifs/motif_databases.12.15.tgz
tar -zxvf motif_databases.12.15.tar.gz

motif_databases/
motif_databases/ARABD/
motif_databases/CIS-BP/
motif_databases/CISBP-RNA/
motif_databases/ECOLI/
motif_databases/EUKARYOTE/
motif_databases/FLY/
motif_databases/HUMAN/
motif_databases/JASPAR/
motif_databases/log.txt
motif_databases/MALARIA/
motif_databases/MIRBASE/
motif_databases/motif_db.csv
motif_databases/MOUSE/
motif_databases/PROKARYOTE/
motif_databases/RNA/
motif_databases/TFBSshape/
motif_databases/tmp
motif_databases/WORM/
motif_databases/YEAST/
motif_databases/YEAST/macisaac_yeast.v1.meme
motif_databases/YEAST/scpd_matrix.meme
motif_databases/YEAST/SwissRegulon_s_cer.meme
motif_databases/YEAST/yeast_uniprobe_GR09.meme
motif_databases/YEAST/YEASTRACT_20130918.meme
motif_databases/WORM/uniprobe_worm.meme
motif_databases/TFBSshape/TFBSshape_JASPAR.meme
motif_databases/TFBSshape/TFBSshape_UniPROBE.meme
motif_databases/RNA/Ray2013_rbp_All_Species.dna_encoded.meme
motif_databases/RNA/Ray2013_rbp_All_Species.meme
motif_databases/RNA/Ray2013_rbp_Arabidopsis_thaliana.

--2017-03-12 20:52:22--  http://meme-suite.org/meme-software/Databases/motifs/motif_databases.12.15.tgz
Resolving meme-suite.org (meme-suite.org)... 54.68.135.202
Connecting to meme-suite.org (meme-suite.org)|54.68.135.202|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12563298 (12M)
Saving to: ‘motif_databases.12.15.tar.gz’

     0K .......... .......... .......... .......... ..........  0%  541K 23s
    50K .......... .......... .......... .......... ..........  0% 1.14M 16s
   100K .......... .......... .......... .......... ..........  1% 13.2M 11s
   150K .......... .......... .......... .......... ..........  1% 8.83M 9s
   200K .......... .......... .......... .......... ..........  2% 1.19M 9s
   250K .......... .......... .......... .......... ..........  2% 6.12M 8s
   300K .......... .......... .......... .......... ..........  2% 9.94M 7s
   350K .......... .......... .......... .......... ..........  3% 2.56M 6s
   400K .......... .......... .....

In [35]:
%%bash
fimo \
--max-stored-scores $((X=2**63-1)) \
--oc MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.fimo_out \
motif_databases/JASPAR/JASPAR_CORE_2016_vertebrates.meme \
MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.fa


Using motif +MA0002.2 of width 11.
Using motif -MA0002.2 of width 11.
Computing q-values.
Estimating pi_0 from a uniformly sampled set of 10000 p-values.
Estimating pi_0.
Estimated pi_0=0.967073
Using motif +MA0003.3 of width 11.
Using motif -MA0003.3 of width 11.
Computing q-values.
Estimating pi_0 from a uniformly sampled set of 10000 p-values.
Estimating pi_0.
Estimated pi_0=0.8125
Using motif +MA0004.1 of width 6.
Using motif -MA0004.1 of width 6.
Computing q-values.
Using motif +MA0006.1 of width 6.
Using motif -MA0006.1 of width 6.
Computing q-values.
Using motif +MA0007.3 of width 17.
Using motif -MA0007.3 of width 17.
Computing q-values.
Estimating pi_0 from a uniformly sampled set of 10000 p-values.
Estimating pi_0.
Estimated pi_0=0.991739
Using motif +MA0009.2 of width 16.
Using motif -MA0009.2 of width 16.
Computing q-values.
Estimating pi_0 from a uniformly sampled set of 10000 p-values.
Estimating pi_0.
Estimated pi_0=0.977757
Using motif +MA0014.2 of width 19.
Using motif

# MCAST JASPAR vertebrate transcripton factor binding motif search over all DHS for miRNA with DHS, no duplicate sequences

In [36]:
%%bash
mcast \
--oc MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.mcast_out \
motif_databases/JASPAR/JASPAR_CORE_2016_vertebrates.meme \
MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.fa


Creating HMM from motif array.
Building HMM: (0) 1 2-3-4-5-6-7-8-9-10-11-12 | 13-14-15-16-17-18-19-20-21-22-23 | 24-25-26-27-28-29-30-31-32-33-34 | 35-36-37-38-39-40-41-42-43-44-45 | 46-47-48-49-50-51 | 52-53-54-55-56-57 | 58-59-60-61-62-63 | 64-65-66-67-68-69 | 70-71-72-73-74-75-76-77-78-79-80-81-82-83-84-85-86 | 87-88-89-90-91-92-93-94-95-96-97-98-99-100-101-102-103 | 104-105-106-107-108-109-110-111-112-113-114-115-116-117-118-119 | 120-121-122-123-124-125-126-127-128-129-130-131-132-133-134-135 | 136-137-138-139-140-141-142-143-144-145-146-147-148-149-150-151-152-153-154 | 155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173 | 174-175-176-177-178-179-180-181-182-183-184-185-186 | 187-188-189-190-191-192-193-194-195-196-197-198-199 | 200-201-202-203-204-205-206-207 | 208-209-210-211-212-213-214-215 | 216-217-218-219-220-221-222-223-224-225-226-227 | 228-229-230-231-232-233-234-235-236-237-238-239 | 240-241-242-243-244-245-246-247-248-249-250-251 | 252-253-254-2

# MAST JASPAR vertebrate transcripton factor binding motif search over all DHS for miRNA with DHS, no duplicate sequences

In [37]:
%%bash
mast \
--oc MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.mast_out \
motif_databases/JASPAR/JASPAR_CORE_2016_vertebrates.meme \
MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.fa


sequences:    100 sequences:    200 sequences:    300 sequences:    400 sequences:    500 sequences:    600 sequences:    700 sequences:    800 sequences:    900 sequences:   1000 sequences:   1100 sequences:   1200 sequences:   1300 sequences:   1400 sequences:   1500 sequences:   1600 sequences:   1700 sequences:   1800 sequences:   1900 sequences:   2000 sequences:   2100 sequences:   2200 sequences:   2300 sequences:   2400 sequences:   2500 sequences:   2600 sequences:   2700 sequences:   2800 sequences:   2900 sequences:   3000 sequences:   3100 sequences:   3200 sequences:   3300 sequences:   3400 sequences:   3500 sequences:   3600 sequences:   3700 sequences:   3800 sequences:   3900 sequences:   4000 sequences:   4100 Writing results to output directory 'MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.mast_out'.



In [48]:
%%bash
ame \
--oc MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.ame_out \
MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.fa \
motif_databases/JASPAR/JASPAR_CORE_2016_vertebrates.meme

Writing results to output directory 'MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.ame_out'.


# AME and MEME using non-important DHS as background

In [3]:
%%bash
important_gene_list="os_gene_importance.important_genes.txt"
cat MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
|grep -Fwf \
<(cat "$important_gene_list"|awk '{print ":"$1":"}') \
> $(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed

cat MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
|grep -vFwf \
<(cat "$important_gene_list"|awk '{print ":"$1":"}') \
> $(basename "$important_gene_list").background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed

bedtools getfasta \
-fi hg19.upper.fa \
-bed $(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
-s -name \
-fo $(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.fa

bedtools getfasta \
-fi hg19.upper.fa \
-bed $(basename "$important_gene_list").background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.bed \
-s -name \
-fo $(basename "$important_gene_list").background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.fa

fasta-get-markov -m 3 \
-dna $(basename "$important_gene_list").background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.fa \
$(basename "$important_gene_list").background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.markov_background

ame \
--bgformat 2 \
--bgfile $(basename "$important_gene_list").background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.markov_background \
--oc $(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.ame_out \
$(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.fa \
motif_databases/JASPAR/JASPAR_CORE_2016_vertebrates.meme

meme \
$(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.fa  \
-dna \
-bfile $(basename "$important_gene_list").background.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.markov_background \
-oc $(basename "$important_gene_list").MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.meme_out \
-nmotifs $(cat "$important_gene_list"| wc -l) \
-revcomp


3736 151 371 151.6 566196


processed: 0.0%processed: 100.0%[KWriting results to output directory 'os_gene_importance.important_genes.txt.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.ame_out'.
Writing results to output directory 'os_gene_importance.important_genes.txt.MIRNA_to_DHS_Region.aggregate.MIRNA_DHS_named.plus_strand.hg19.upper.meme_out'.
Initializing the motif probability tables for 2 to 394 sites...
nsites = 2nsites = 3nsites = 4nsites = 5nsites = 6nsites = 7nsites = 8nsites = 9nsites = 10nsites = 11nsites = 12nsites = 13nsites = 14nsites = 15nsites = 16nsites = 17nsites = 18nsites = 19nsites = 20nsites = 21nsites = 22nsites = 23nsites = 24nsites = 25nsites = 26nsites = 27nsites = 28nsites = 29nsites = 30nsites = 31nsites = 32nsites = 33nsites = 34nsites = 35nsites = 36nsites = 37nsites = 38nsites = 39nsites = 40nsites = 41nsites = 42nsites = 43nsites = 44nsites = 45nsites = 46nsites = 47nsites = 48nsites = 49nsites = 50nsites 

# HOMER vertebrate motif search over all DHS for miRNA with DHS

# HOMER vertebrate transcription factor motif search for DHS of miRNAs important to Overall Survival

In [None]:
%%bash
join \
-t$'\t' \
<(sort -k1,1 "os_gene_importance.important_genes.txt") \
<(sort -k1,1 MIRNA_to_DHS_Region.txt) \
|awk 'BEGIN{FS=OFS="\t"}{print $2,$1}' \
|sed 's|:|\t|'|sed 's|-|\t|' \
|awk 'BEGIN{FS=OFS="\t"}{print $1,$2-1,$3,$4"|DHS|"$1":"$2"-"$3,0,"+"}' \
|sort|uniq \
|sort -k1,1 -k2,2n \
> os_gene_importance.important_genes.MIRNA_to_DHS_Region.bed

findMotifsGenome.pl os_gene_importance.important_genes.MIRNA_to_DHS_Region.bed \
hg19 \
os_gene_importance.important_genes.MIRNA_to_DHS_Region \
-size given -preparse \
-mset vertebrates


[Homer Results for Overall Survival](os_gene_importance.important_genes.MIRNA_to_DHS_Region/homerResults.html)

# HOMER vertebrate transcription factor motif search for DHS of miRNAs important to Stage

In [None]:
%%bash
join \
-t$'\t' \
<(sort -k1,1 "stage_gene_importance.important_genes.txt") \
<(sort -k1,1 MIRNA_to_DHS_Region.txt) \
|awk 'BEGIN{FS=OFS="\t"}{print $2,$1}' \
|sed 's|:|\t|'|sed 's|-|\t|' \
|awk 'BEGIN{FS=OFS="\t"}{print $1,$2-1,$3,$4"|DHS|"$1":"$2"-"$3,0,"+"}' \
|sort|uniq \
|sort -k1,1 -k2,2n \
> stage_gene_importance.important_genes.MIRNA_to_DHS_Region.bed

findMotifsGenome.pl stage_gene_importance.important_genes.MIRNA_to_DHS_Region.bed \
hg19 \
stage_gene_importance.important_genes.MIRNA_to_DHS_Region \
-size given -preparse \
-mset vertebrates


[Homer Results for Stage](stage_gene_importance.important_genes.MIRNA_to_DHS_Region/homerResults.html)

# FIMO analysis setup

## Download FIMO motif database

In [None]:
%%bash
wget -O motif_databases.12.15.tar.gz http://meme-suite.org/meme-software/Databases/motifs/motif_databases.12.15.tgz
tar -zxvf motif_databases.12.15.tar.gz
ls motif_databases/HUMAN/

## Get sequence of all miRNA DHS

In [None]:
%%bash
sort -k1,1 MIRNA_to_DHS_Region.txt \
|awk 'BEGIN{FS=OFS="\t"}{print $2,$1}' \
|sed 's|:|\t|'|sed 's|-|\t|' \
|awk 'BEGIN{FS=OFS="\t"}{print $1,$2-1,$3,$4"|DHS|"$1":"$2"-"$3":+",0,"+";print $1,$2-1,$3,$4"|DHS|"$1":"$2"-"$3":-",1,"-"}' \
|sort|uniq \
|sort -k1,1 -k2,2n \
> all_mirna.both_strands.MIRNA_to_DHS_Region.bed
head all_mirna.both_strands.MIRNA_to_DHS_Region.bed
cat all_mirna.both_strands.MIRNA_to_DHS_Region.bed|awk '$6=="+"' \
> all_mirna.plus_strand.MIRNA_to_DHS_Region.bed
head all_mirna.plus_strand.MIRNA_to_DHS_Region.bed

In [None]:
%%bash
sort -k1,1 MIRNA_to_DHS_Region.txt \
|awk 'BEGIN{FS=OFS="\t"}{print $2,$1}' \
|sed 's|:|\t|'|sed 's|-|\t|' \
|awk 'BEGIN{FS=OFS="\t"}{print $1,$2-1,$3,$1":"$2"-"$3" "$4"|DHS"}' \
> all_mirna.plus_strand.coordinate_named.MIRNA_to_DHS_Region.bed
bedtools getfasta -fi hg19.upper.fa -bed all_mirna.plus_strand.coordinate_named.MIRNA_to_DHS_Region.bed -s -name \
-fo all_mirna.plus_strand.coordinate_named.MIRNA_to_DHS_Region.hg19.upper.fa
head all_mirna.plus_strand.coordinate_named.MIRNA_to_DHS_Region.hg19.upper.fa

In [None]:
%%bash
bedtools getfasta -fi hg19.upper.fa -bed all_mirna.both_strands.MIRNA_to_DHS_Region.bed -s -name \
-fo all_mirna.both_strands.MIRNA_to_DHS_Region.hg19.upper.fa
wc -l all_mirna.both_strands.MIRNA_to_DHS_Region.bed
cat all_mirna.both_strands.MIRNA_to_DHS_Region.hg19.upper.fa|grep '^>'|wc -l
head all_mirna.both_strands.MIRNA_to_DHS_Region.hg19.upper.fa

In [None]:
%%bash
bedtools getfasta -fi hg19.upper.fa -bed all_mirna.plus_strand.MIRNA_to_DHS_Region.bed -s -name \
-fo all_mirna.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fa
wc -l all_mirna.plus_strand.MIRNA_to_DHS_Region.bed
cat all_mirna.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fa|grep '^>'|wc -l
head all_mirna.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fa

## Run FIMO on JASPAR vertebrate transcription factor Motifs for all MIRNA strands

In [None]:
%%bash
fimo \
--max-stored-scores $((X=2**63-1)) \
-oc all_mirna.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fimo_out \
motif_databases/JASPAR/JASPAR_CORE_2016_vertebrates.meme \
all_mirna.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fa

In [None]:
%%bash
ls all_mirna.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fimo_out/fimo.txt
head all_mirna.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fimo_out/fimo.txt

[FIMO Results for all MIRNA DHS on JASPAR vertebrate transcription factor motifs](all_mirna.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fimo_out/fimo.html)

# FIMO vertebrate transcription factor motif search for DHS of miRNAs important to Overall Survival

In [None]:
%%bash
join \
-t$'\t' \
<(sort -k1,1 "os_gene_importance.important_genes.txt") \
<(sort -k1,1 MIRNA_to_DHS_Region.txt) \
|awk 'BEGIN{FS=OFS="\t"}{print $2,$1}' \
|sed 's|:|\t|'|sed 's|-|\t|' \
|awk 'BEGIN{FS=OFS="\t"}{print $1,$2-1,$3,$4"|DHS|"$1":"$2"-"$3":+",0,"+";print $1,$2-1,$3,$4"|DHS|"$1":"$2"-"$3":-",1,"-"}' \
|sort|uniq \
|sort -k1,1 -k2,2n \
> os_gene_importance.important_genes.both_strands.MIRNA_to_DHS_Region.bed
head os_gene_importance.important_genes.both_strands.MIRNA_to_DHS_Region.bed
cat os_gene_importance.important_genes.both_strands.MIRNA_to_DHS_Region.bed|awk '$6=="+"' \
> os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.bed
head os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.bed

bedtools getfasta -fi hg19.upper.fa -bed os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.bed -s -name \
-fo os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fa
wc -l os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.bed
cat os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fa|grep '^>'|wc -l
head os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fa

fimo \
--max-stored-scores $((X=2**63-1)) \
-oc os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fimo_out \
motif_databases/JASPAR/JASPAR_CORE_2016_vertebrates.meme \
os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fa


In [None]:
%%bash
ls os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fimo_out/fimo.txt
head os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fimo_out/fimo.txt

In [None]:
%%bash
cat os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fimo_out/fimo.txt \
|awk '(NR==1)||($8 < 0.01)'|cut -f

[FIMO Results for DHS of miRNA important to Overall Survival on JASPAR vertebrate transcription factor motifs](os_gene_importance.important_genes.plus_strand.MIRNA_to_DHS_Region.hg19.upper.fimo_out/fimo.html)

# TODO

Discuss previous literature and motivation  
Discuss biological relevance of what you found (your results)  
Discuss how HOMER and randomforest relates to class material  
Compare overall results of the project against something that's already been published (in this case, on transcriptional regulation of microRNAs, need not be ovarian cancer)  
How does the literature compare the results, is what we found justifiable  
Find a way to justify results - compare against other methods for motif search, compare against kmer content, compare against results from random draw of enhancers. Something to say this is not a one-off result from a one-off method.  
How much background do I assume the reader has in genomics and transcriptional regulation?  
Differences from approaches discussed in the proposal should be mentioned. E.g. switching from alignment of enhancers to kmer content of enhancers. Try alignment of motif consensus sequence against enhancers (eval mechanism, compare against homer method).  