Permalink
Browse files

Now the oligotyping pipeline generates Gephi

(https://gephi.org/) compatible XML files for network analysis (Gephi
is open-source software similar to Cytoscape but, in my opinion,
more promising in terms of usability). Version bumped to 0.95.
  • Loading branch information...
meren committed Mar 25, 2013
1 parent 8813d46 commit 9ead76024a27a72a53d5138204deea19effc5e8a
@@ -7,6 +7,10 @@ A simple change log file to keep track on what is new.
Versions
========
* 0.95
* GEXF support. The oligotyping pipeline now generates Gephi (https://gephi.org/) compatible XML files for network analysis.
* Bug fixes.
* 0.9
* Multithreading support. Oligotyping pipeline runs processes in parallel whenever it is possible.
* More comprehensible HTML output.
@@ -10,7 +10,7 @@
#
# Please read the COPYING file.
__version__ = '0.9'
__version__ = '0.95'
import os
import sys
@@ -43,8 +43,10 @@
from Oligotyping.utils.utils import append_reads_to_FASTA
from Oligotyping.utils.utils import check_input_alignment
from Oligotyping.utils.utils import generate_MATRIX_files
from Oligotyping.utils.utils import get_sample_mapping_dict
from Oligotyping.utils.utils import mapping_file_simple_check
from Oligotyping.utils.utils import generate_ENVIRONMENT_file
from Oligotyping.utils.utils import generate_gexf_network_file
from Oligotyping.utils.utils import get_unit_counts_and_percents
from Oligotyping.utils.utils import get_units_across_datasets_dicts
from Oligotyping.utils.utils import mask_defline_whitespaces_in_FASTA
@@ -88,6 +90,7 @@ def __init__(self, args = None):
self.log_file_path = None
self.skip_check_input_file = False
self.skip_basic_analyses = False
self.skip_gexf_network_file = False
self.no_threading = False
self.number_of_threads = None
@@ -124,13 +127,15 @@ def __init__(self, args = None):
self.sample_mapping = args.sample_mapping
self.skip_check_input_file = args.skip_check_input_file
self.skip_basic_analyses = args.skip_basic_analyses
self.skip_gexf_network_file = args.skip_gexf_network_file
self.no_threading = args.no_threading
self.number_of_threads = args.number_of_threads
self.run = Run()
self.progress = Progress()
self.datasets_dict = {}
self.sample_mapping_dict = {}
self.excluded_read_ids_tracker = {}
self.representative_sequences_per_oligotype = {}
self.across_datasets_sum_normalized = {}
@@ -344,6 +349,9 @@ def run_all(self):
self.progress.end()
self.column_entropy = [int(x.strip().split()[0]) for x in open(self.entropy).readlines()]
if self.sample_mapping:
self.sample_mapping_dict = get_sample_mapping_dict(self.sample_mapping)
self.run.info('project', self.project)
self.run.info('run_date', get_date())
@@ -395,6 +403,7 @@ def run_all(self):
self._contrive_abundant_oligos()
self._refine_datasets_dict()
self._get_unit_counts_and_percents()
self._get_units_across_datasets_dicts()
self._generate_random_colors()
self._generate_FASTA_file()
@@ -404,7 +413,6 @@ def run_all(self):
self._store_read_distribution_table()
if self.generate_sets:
self._get_units_across_datasets_dicts()
self._generate_MATRIX_files_for_units_across_datasets()
self._agglomerate_oligos_based_on_cosine_similarity()
self._generate_MATRIX_files_for_oligotype_sets()
@@ -427,6 +435,9 @@ def run_all(self):
if ((not self.no_figures) and (not self.quick)) and self.sample_mapping:
self._generate_exclusive_figures()
if (not self.skip_gexf_network_file) and (not self.quick):
self._generate_gexf_network_file()
# store the final information about oligos
self.run.info('final_oligos', self.abundant_oligos, quiet = True)
@@ -752,9 +763,9 @@ def _refine_datasets_dict(self):
self.progress.end()
number_of_reads_in_datasets_dict = sum([sum(self.datasets_dict[dataset].values()) for dataset in self.datasets_dict])
self.num_sequences_after_qc = sum([sum(self.datasets_dict[dataset].values()) for dataset in self.datasets_dict])
self.run.info('num_sequences_after_qc', number_of_reads_in_datasets_dict)
self.run.info('num_sequences_after_qc', self.num_sequences_after_qc)
if len(datasets_to_remove):
self.run.info('datasets_removed_after_qc', datasets_to_remove)
@@ -1403,6 +1414,40 @@ def _generate_exclusive_figures(self):
self.run.info('exclusive_figures_dict_file_path', exclusive_figures_dict_file_path)
def _generate_gexf_network_file(self):
    """Generate a Gephi-compatible GEXF network file for the abundant oligotypes.

    An oligotype makes it into the network only when its final read count
    exceeds 1/10,000 of the post-QC read total. If that filter eliminates
    every oligotype, generation is skipped and a note is logged.
    """
    self.gexf_network_file_path = self.generate_output_destination("NETWORK.gexf")
    self.progress.new('GEXF Network File')

    # abundance cutoff: 0.01% of all reads that survived quality control
    abundance_cutoff = self.num_sequences_after_qc / 10000.0
    network_oligos = [o for o in self.final_oligo_counts_dict
                          if self.final_oligo_counts_dict[o] > abundance_cutoff]

    if not network_oligos:
        # nothing left to draw; bail out without creating a file
        self.logger.info('GEXF network file generation failed: all oligotypes were eliminated (oligo_abundance_threshold: %f)' % (abundance_cutoff))
        self.progress.end()
        return None

    self.logger.info('GEXF network file will be generated for %d oligos' % (len(network_oligos)))

    generate_gexf_network_file(self.datasets_dict,
                               network_oligos,
                               self.datasets,
                               self.across_datasets_sum_normalized,
                               self.gexf_network_file_path,
                               sample_mapping_dict = self.sample_mapping_dict,
                               min_sum_normalized_percent = 0,
                               project = self.project)

    self.progress.end()
    self.run.info('gexf_network_file_path', self.gexf_network_file_path)
def _generate_html_output(self):
if self.no_figures:
sys.stdout.write('\n\n\t"--no-figures" parameter is given, skipping HTML output...\n\n')
@@ -87,5 +87,6 @@
'read_distribution_table_path': 'Read distribution among datasets table',
'node_representatives_file_path': 'Representative sequences per node',
'sample_mapping': 'Mapping file',
'gexf_network_file_path': 'GEXF file for network analysis',
'skip_basic_analyses': 'Skip performing basic analyses'
}
@@ -190,6 +190,9 @@ def copy_as(source, dest_name):
except:
html_dict['entropy_figure'] = copy_as(os.path.join(run_info_dict['entropy'] + '.png'), 'entropy.png')
if run_info_dict['gexf_network_file_path']:
html_dict['gexf_network_file_path'] = copy_as(run_info_dict['gexf_network_file_path'], 'network.gexf')
if run_info_dict['sample_mapping']:
html_dict['sample_mapping'] = copy_as(run_info_dict['sample_mapping'], 'sample_mapping.txt')
else:
@@ -171,6 +171,9 @@
{% if sample_mapping %}
<tr><td id="l">{{pretty_names.sample_mapping}}</td><td id="r"><a href="{{sample_mapping}}">sample_mapping.txt</a></td></tr>
{% endif %}
{% if gexf_network_file_path %}
<tr><td id="l">{{pretty_names.gexf_network_file_path}}</td><td id="r"><a href="{{gexf_network_file_path}}">network.gexf</a></td></tr>
{% endif %}
<tr><td id="l">{{pretty_names.environment_file_path}}</td><td id="r"><a href="{{environment_file_path}}">environment.txt</a></td></tr>
<tr><td id="l">{{pretty_names.oligos_fasta_file_path}}</td><td id="r"><a href="{{oligos_fasta_file_path}}">oligos.fa.txt</a></td></tr>
<tr><td id="l">{{pretty_names.oligos_nexus_file_path}}</td><td id="r"><a href="{{oligos_nexus_file_path}}">oligos.nex.txt</a></td></tr>
@@ -211,6 +211,8 @@ def oligotyping():
parser.add_argument('--skip-basic-analyses', action = 'store_true', default = False,
help = 'When set, basic analyses, such as basic NMDS plots and clustering, will be\
skipped')
parser.add_argument('--skip-gexf-network-file', action = 'store_true', default = False,
help = 'When set, GEXF network file will not be generated')
parser.add_argument('-T', '--no-threading', action = 'store_true', default = False,
help = 'When set, oligotyping will not spawn multiple threads. Default behavior is\
multi-threaded whenever possible.')
@@ -323,6 +323,71 @@ def get_vectors_from_oligotypes_across_datasets_matrix(file_path):
return (oligos, vectors)
def generate_gexf_network_file(samples_dict, oligos, samples, across_datasets_sum_normalized, output_file, sample_mapping_dict = None, min_sum_normalized_percent = 0, project = None):
    """Write a Gephi-compatible GEXF 1.2draft XML network file.

    Samples and oligotypes become nodes; an edge connects an oligotype to a
    sample whenever the oligotype's sum-normalized percent in that sample
    exceeds `min_sum_normalized_percent` (the percent becomes the edge weight).

    Parameters:
        samples_dict -- kept for interface compatibility; not read here
        oligos -- oligotype ids to include as (small) nodes
        samples -- sample ids to include as (large) nodes; their order must
                   match the per-oligo vectors in across_datasets_sum_normalized
        across_datasets_sum_normalized -- dict: oligo -> list of percents,
                   one entry per sample (same order as `samples`)
        output_file -- path the GEXF XML is written to
        sample_mapping_dict -- optional dict: category -> {sample: value};
                   each category becomes a string node attribute
        min_sum_normalized_percent -- edges at or below this weight are skipped
        project -- optional project name recorded in the file metadata
    """
    mapping_categories = sorted(sample_mapping_dict.keys()) if sample_mapping_dict else None

    # context manager guarantees the handle is closed even if a write fails
    with open(output_file, 'w') as output:
        output.write('''<?xml version="1.0" encoding="UTF-8"?>\n''')
        # FIX: the viz namespace URI was malformed ('http:///...') and pointed
        # at 1.1draft while the document namespace is 1.2draft.
        output.write('''<gexf xmlns:viz="http://www.gexf.net/1.2draft/viz" xmlns="http://www.gexf.net/1.2draft" version="1.2">\n''')
        output.write('''<meta lastmodifieddate="2010-01-01+23:42">\n''')
        output.write(''' <creator>Oligotyping pipeline</creator>\n''')
        if project:
            output.write(''' <creator>Network description for %s</creator>\n''' % (project))
        output.write('''</meta>\n''')
        output.write('''<graph type="static">\n\n''')

        # declare one string-typed node attribute per mapping category
        if sample_mapping_dict:
            output.write('''<attributes class="node" type="static">\n''')
            for i, category in enumerate(mapping_categories):
                output.write(''' <attribute id="%d" title="%s" type="string"/>\n''' % (i, category))
            output.write('''</attributes>\n\n''')

        output.write('''<nodes>\n''')

        # sample nodes: drawn large, annotated with their mapping values
        for sample in samples:
            output.write(''' <node id="%s" label="%s">\n''' % (sample, sample))
            output.write(''' <viz:size value="40"/>\n''')
            if sample_mapping_dict:
                output.write(''' <attvalues>\n''')
                for i, category in enumerate(mapping_categories):
                    output.write(''' <attvalue id="%d" value="%s"/>\n''' % (i, sample_mapping_dict[category][sample]))
                output.write(''' </attvalues>\n''')
            output.write(''' </node>\n''')

        # oligotype nodes: drawn small; mapping categories do not apply,
        # so each attribute is filled with a '__NA__' placeholder
        for oligo in oligos:
            output.write(''' <node id="%s">\n''' % (oligo))
            output.write(''' <viz:size value="5"/>\n''')
            if sample_mapping_dict:
                output.write(''' <attvalues>\n''')
                for i in range(len(mapping_categories)):
                    output.write(''' <attvalue id="%d" value="__NA__"/>\n''' % (i))
                output.write(''' </attvalues>\n''')
            output.write(''' </node>\n''')

        output.write('''</nodes>\n''')

        # one weighted edge per (oligo, sample) pair above the threshold;
        # edge ids are assigned sequentially as edges are emitted
        edge_id = 0
        output.write('''<edges>\n''')
        for i, sample in enumerate(samples):
            for oligo in oligos:
                if across_datasets_sum_normalized[oligo][i] > min_sum_normalized_percent:
                    output.write(''' <edge id="%d" source="%s" target="%s" weight="%f" />\n''' % (edge_id, oligo, sample, across_datasets_sum_normalized[oligo][i]))
                    edge_id += 1
        output.write('''</edges>\n''')

        output.write('''</graph>\n''')
        output.write('''</gexf>\n''')
def get_qual_stats_dict(quals_dict, output_file_path = None, verbose = True):
"""This function takes quals dict (which can be obtained by calling the
utils.utils.get_quals_dict function) and returns a dictionary that

0 comments on commit 9ead760

Please sign in to comment.