Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Comparing changes

Choose two branches to see what's changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
base fork: madprime/MSCC-Reads-Processing
base: master
...
head fork: jhannah/MSCC-Reads-Processing
compare: master
Checking mergeability… Don't worry, you can still create the pull request.
  • 8 commits
  • 11 files changed
  • 0 commit comments
  • 1 contributor
View
27 AddDist.pl
@@ -0,0 +1,27 @@
+#!/usr/bin/perl
+
+# This program is listed as a one-liner in matching.html. One-liners are hard to debug,
+# though, so I'm pulling it out into a normal program. --jhannah 20100311
+
+use strict;
+
+my (@data, @olddata, $dist);
+while (<>) {
+ @data = split;
+ if ($data[3] eq "-") {
+ if ($data[1] ne $olddata[1]) {
+ if (@olddata) {
+ print "@olddata[0..5] 1000 $data[6]\n";
+ }
+ print "@data[0..5] 1000 $data[6]\n"
+ } else {
+ print "@olddata[0..5] $dist $data[6]\n@data[0..5] $dist $data[6]\n";
+ }
+ } else {
+ $dist = @olddata ? $data[2] - $olddata[2] : 1000;
+ }
+ @olddata = @data;
+}
+print "@olddata[0..5] 1000 $data[6]\n";
+
+
View
0  CountMatches.pl 100644 → 100755
File mode changed
View
0  GenerateVariants_3MM.pl 100644 → 100755
File mode changed
View
0  GetCutSites.pl 100644 → 100755
File mode changed
View
0  GetTags.pl 100644 → 100755
File mode changed
View
61 MatchFinder.pl 100644 → 100755
@@ -7,42 +7,43 @@
$end = $ARGV[1];
$INCREMENT = $ARGV[2];
-my $file1 = "/home/pm23/CCGG_mouse/find_unique_tags/CCGG_tags_18_withdist_shuffled";
-my $file2 = "/home/pm23/CCGG_mouse/find_unique_tags/CCGG_tags_18_withdist_alphasort";
+my $base_dir = '/home/biocore/jhannah/src/MSCC-Reads-Processing';
+my $work_dir = "$base_dir/work";
+my $file1 = "$work_dir/CCGG_tags_18_withdist_shuffled";
+my $file2 = "$work_dir/CCGG_tags_18_withdist_alphasort";
-my $out = "CCGG_tags_18_upto2MM_${start}_${end}";
+my $out = "$start.$end.CCGG_tags_18_upto2MM";
my ($tagfilename, $variantfilename, $sortedvariantfilename, $matchfilename);
-$tagfilename = "reads_" . $start . "_" . $end;
-$variantfilename = $tagfilename . "_variants";
-$sortedvariantfilename = $variantfilename . "_sorted";
-$matchfilename = $tagfilename . "_matches";
+$tagfilename = "$start.$end.reads";
+$variantfilename = $tagfilename . ".variants";
+$sortedvariantfilename = $variantfilename . ".sorted";
+$matchfilename = $tagfilename . ".matches";
+my $cmd;
if ($end < $start + $INCREMENT) {
- my $tail = $end - $start;
- `time tail -$tail $file1 > /scratch/$tagfilename`;
+ my $tail = $end - $start;
+ $cmd = "tail -$tail $file1 > $work_dir/$tagfilename";
} else {
- `time head -$end $file1 | tail -$INCREMENT > /scratch/$tagfilename`;
+ $cmd = "head -$end $file1 | tail -$INCREMENT > $work_dir/$tagfilename";
}
-print "/home/pm23/CCGG_mouse/find_unique_tags/GenerateVariants_3MM.pl /scratch/$tagfilename $MISMATCH > /scratch/$variantfilename\n";
-`time /home/pm23/CCGG_mouse/find_unique_tags/GenerateVariants_3MM.pl /scratch/$tagfilename $MISMATCH > /scratch/$variantfilename`;
-print "sort /scratch/$variantfilename > /scratch/$sortedvariantfilename\n";
-`time sort /scratch/$variantfilename > /scratch/$sortedvariantfilename`;
-print "rm /scratch/$variantfilename\n";
-`rm /scratch/$variantfilename`;
-print "join $file2 /scratch/$sortedvariantfilename > /scratch/$matchfilename\n";
-`time join $file2 /scratch/$sortedvariantfilename > /scratch/$matchfilename`;
-print "rm /scratch/$sortedvariantfilename\n";
-`rm /scratch/$sortedvariantfilename`;
-print "sort --key=9,9 --key=10n,10 --key=11,11 --key=17n,17 /scratch/$matchfilename > /scratch/${out}_sorted\n";
-`time sort --key=9,9 --key=10n,10 --key=11,11 --key=17n,17 /scratch/$matchfilename > /scratch/${out}_sorted`;
-print "/home/pm23/CCGG_mouse/find_unique_tags/CountMatches.pl /scratch/${out}_sorted > /scratch/${out}_counts\n";
-`time /home/pm23/CCGG_mouse/find_unique_tags/CountMatches.pl /scratch/${out}_sorted > /scratch/${out}_counts`;
-`cp /scratch/${out}_counts /home/pm23/CCGG_mouse/find_unique_tags/${out}_counts`;
-
-`rm /scratch/$tagfilename`;
-`rm /scratch/$matchfilename`;
-`rm /scratch/${out}_sorted`;
-`rm /scratch/${out}_counts`;
+foreach my $cmd (
+ $cmd,
+ "$base_dir/GenerateVariants_3MM.pl $work_dir/$tagfilename $MISMATCH > $work_dir/$variantfilename",
+ "sort $work_dir/$variantfilename > $work_dir/$sortedvariantfilename",
+ "rm $work_dir/$variantfilename",
+ "join $file2 $work_dir/$sortedvariantfilename > $work_dir/$matchfilename",
+ "rm $work_dir/$sortedvariantfilename",
+ "sort --key=9,9 --key=10n,10 --key=11,11 --key=17n,17 $work_dir/$matchfilename > $work_dir/$out.sorted",
+ "$base_dir/CountMatches.pl $work_dir/$out.sorted > $work_dir/$out.counts",
+ "rm $work_dir/$tagfilename",
+ "rm $work_dir/$matchfilename",
+ "rm $work_dir/$out.sorted",
+) {
+ print "$cmd\n";
+ `time $cmd`;
+}
+
+
View
12 SelectRepeats.pl
@@ -0,0 +1,12 @@
+#!/usr/bin/perl
+
+use strict;
+
+while (<>) {
+ chomp;
+ my @line = split / /;
+ if ($line[5] == 1 && $line[6] >= 50 && $line[7] eq 'NA') {
+ print "$_\n";
+ }
+}
+
View
0  MetaMatchFinder.pl → bsub/MetaMatchFinder.pl
File renamed without changes
View
16 matching.html
@@ -76,12 +76,12 @@
</pre>
<p>
This shows the tag sequence, the chromosome ID, position, strand (upstream or downstream tag), tag length, if it's repeat-masked, and MmeI site presence. The last column, MmeI site presence, is "NA" if no conflicting MmeI site is found in the flanking sequence or, if one is detected, the distance in bp to the target site. This is recorded here because if there is an MmeI site in genomic sequence it could conflict with the tag construction and so sites with MmeI sites that are too close should be excluded in later analysis (although they are potentially generated and so should be in our set of potential matches).
-<p>
-Because we'll want to exclude analysis of sites too close to each other (and so they would interfere with each other) we want to add a column to the data indicating distance to the neighboring site. We did this with command line perl code like the following (note that this code assumes that the upstream sites precede the downstream sites in the input file):
-</p>
-<p>
-cat CCGG_tags_18 | perl -ne 'BEGIN {$dist = 1000;} @data = split; if ($data[3] eq "-") {if ($data[1] ne $olddata[1]) { if (@olddata) { print "@olddata[0..5] 1000 $data[6]\n";}; print "@data[0..5] 1000 $data[6]\n";} else {$dist = $data[2] - $olddata[2]; print "@olddata[0..5] $dist $data[6]\n@data[0..5] $dist $data[6]\n";} }; @olddata = @data; END {print "@olddata[0..5] 1000 $data[6]\n";}' &gt CCGG_tags_18_withdist
+</p><p>
+Because we'll want to exclude analysis of sites too close to each other (and so they would interfere with each other) we want to add a column to the data indicating distance to the neighboring site.
</p>
+<pre>
+perl <A HREF=AddDist.txt>AddDist.pl</A> CCGG_tags_18 &gt CCGG_tags_18_withdist
+</pre>
<p>
This ends up looking like this:
</p>
@@ -155,6 +155,12 @@
head -32000 reads_all | tail -2000 > reads_subset
</pre>
<p>
+Or, if you have the unix command "split", you can create all your subset files with a single command:
+</p>
+<pre>
+split -l 2000 -d -a 4 CCGG_tags_18_withdist_shuffled
+</pre>
+<p>
Once a subset is made, it is matched against the target library with the following steps:
</p>
<ol>
View
57 qsub/MetaMatchFinder.pl
@@ -0,0 +1,57 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+use Template;
+
+my $INCREMENT = 2000;
+
+my $base_dir = "/home/biocore/jhannah/src/MSCC-Reads-Processing";
+my $work_file = "$base_dir/work/CCGG_tags_18_withdist_shuffled";
+my ($lines) = `wc -l $work_file`;
+$lines =~ s/ .*//s;
+
+for (my $i = 0; $i < $lines; $i+=$INCREMENT) {
+ my $start = $i;
+ my $end = $i + $INCREMENT;
+
+ if ($end > $lines) {
+ $end = $lines;
+ }
+
+ my $file = "$base_dir/work/$start.$end.pbs";
+ my $cmd = "$base_dir/MatchFinder.pl $start $end $INCREMENT";
+ print "$file\n$cmd\n";
+ create_pbs_file($file, $cmd);
+ qsub($file);
+}
+
+
+exit;
+
+# ---------
+# END MAIN
+# ---------
+
+sub qsub {
+ my ($file) = @_;
+ my $dir = $file;
+ $dir =~ s#/[^/]+$##;
+ my $cmd = "cd $dir; qsub -l nodes=1 $file";
+ print "$cmd\n";
+ `$cmd`;
+}
+
+
+sub create_pbs_file {
+ my ($file, $cmd) = @_;
+ my $config = {};
+ my $template = Template->new($config);
+ my $vars = {
+ cmd => $cmd,
+ };
+ $template->process('run.pbs.tt', $vars, $file)
+ || die $template->error();
+}
+
+
View
8 qsub/run.pbs.tt
@@ -0,0 +1,8 @@
+#!/bin/bash
+cd $PBS_O_WORKDIR
+echo "Started `date`"
+cat $PBS_NODEFILE
+NPROCS=`wc -l < $PBS_NODEFILE`
+echo $NPROCS
+[% cmd %]
+echo "Ended `date`"

No commit comments for this range

Something went wrong with that request. Please try again.