-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Generating JSON file from VEP variant effects
- Loading branch information
Cristina Yenyxe Gonzalez Garcia
committed
May 7, 2014
1 parent
bd67bed
commit 3b92d0c
Showing
6 changed files
with
453 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
260 changes: 260 additions & 0 deletions
260
cellbase-build/src/main/java/org/opencb/cellbase/build/transform/VariantEffectParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,260 @@ | ||
package org.opencb.cellbase.build.transform; | ||
|
||
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.lang.StringUtils;

import org.opencb.biodata.models.variant.effect.ConsequenceType;
import org.opencb.biodata.models.variant.effect.ConsequenceTypeMappings;
import org.opencb.biodata.models.variant.effect.ProteinSubstitutionScores;
import org.opencb.biodata.models.variant.effect.VariantEffect;
import org.opencb.cellbase.build.transform.serializers.json.JsonSerializer;
|
||
|
||
/** | ||
* | ||
* @author Cristina Yenyxe Gonzalez Garcia <cyenyxe@ebi.ac.uk> | ||
*/ | ||
public class VariantEffectParser { | ||
|
||
private JsonSerializer serializer; | ||
|
||
public VariantEffectParser(JsonSerializer serializer) { | ||
this.serializer = serializer; | ||
} | ||
|
||
public int parse(Path file) throws IOException { | ||
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file.toFile()))); | ||
String line = null; | ||
VariantEffect currentEffect = null; | ||
String currentAllele = null; | ||
|
||
int numEffectsWritten = 0; | ||
|
||
while((line = reader.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; // Header will just be ignored | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
String[] positionFields = fields[0].split("[\\_\\/]"); | ||
|
||
if (positionFields.length < 4) { | ||
// Only entries chr_pos_ref/alt will be parsed, ie, 1_909238_G/C or 3_361464_A/- | ||
// Entries like 5_121187650_duplication will be ignored | ||
continue; | ||
} | ||
|
||
if (isNewVariant(positionFields[0], Integer.parseInt(positionFields[1]), positionFields[2], positionFields[3], currentEffect, currentAllele)) { | ||
if (currentEffect != null && serializer != null) { | ||
if (serializer.write(currentEffect)) { | ||
numEffectsWritten++; | ||
} | ||
} | ||
|
||
currentEffect = new VariantEffect(positionFields[0], Integer.parseInt(positionFields[1]), Integer.parseInt(positionFields[1]), positionFields[2]); | ||
currentAllele = positionFields[3]; | ||
} else if (isNewAllele(positionFields[0], Integer.parseInt(positionFields[1]), positionFields[2], positionFields[3], currentEffect, currentAllele)) { | ||
currentAllele = positionFields[3]; | ||
} | ||
|
||
parseLine(fields, currentEffect, currentAllele); | ||
} | ||
|
||
// Don't forget to serialize the last effect read! | ||
if (currentEffect != null && serializer != null) { | ||
if (serializer.write(currentEffect)) { | ||
numEffectsWritten++; | ||
} | ||
} | ||
|
||
return numEffectsWritten; | ||
} | ||
|
||
private boolean isNewVariant(String chromosome, int start, String referenceAllele, String alternateAllele, | ||
VariantEffect current, String currentAllele) { | ||
if (current == null) { | ||
return true; | ||
} | ||
|
||
return !chromosome.equals(current.getChromosome()) | ||
|| start != current.getStart() | ||
|| !referenceAllele.equals(current.getReferenceAllele()) | ||
// || !alternateAllele.equals(currentAllele) | ||
; | ||
} | ||
|
||
private boolean isNewAllele(String chromosome, int start, String referenceAllele, String alternateAllele, | ||
VariantEffect current, String currentAllele) { | ||
if (current == null) { | ||
return true; | ||
} | ||
|
||
return chromosome.equals(current.getChromosome()) | ||
&& start == current.getStart() | ||
&& referenceAllele.equals(current.getReferenceAllele()) | ||
&& !alternateAllele.equals(currentAllele); | ||
} | ||
|
||
private void parseLine(String[] fields, VariantEffect effect, String alternateAllele) { | ||
ConsequenceType ct = new ConsequenceType(alternateAllele); | ||
effect.addConsequenceType(alternateAllele, ct); | ||
|
||
// Gene and feature information | ||
ct.setGeneId(fields[3]); | ||
ct.setFeatureId(fields[4]); | ||
ct.setFeatureType(fields[5]); | ||
|
||
// List of consequence types as SO codes | ||
String[] consequencesName = fields[6].split(","); | ||
int[] consequencesSo = new int[consequencesName.length]; | ||
for (int i = 0; i < consequencesName.length; i++) { | ||
Integer so = ConsequenceTypeMappings.termToAccession.get(consequencesName[i]); | ||
if (so != null) { | ||
consequencesSo[i] = so; | ||
} else { | ||
Logger.getLogger(JsonSerializer.class.getName()).log(Level.WARNING, "{0} is not a valid consequence type", consequencesName[i]); | ||
} | ||
} | ||
ct.setConsequenceTypes(consequencesSo); | ||
|
||
// Fields related to position can be empty (marked with "-") | ||
if (!"-".equals(fields[7]) && StringUtils.isNumeric(fields[7])) { | ||
ct.setcDnaPosition(Integer.parseInt(fields[7])); | ||
} | ||
if (!"-".equals(fields[8]) && StringUtils.isNumeric(fields[8])) { | ||
ct.setCdsPosition(Integer.parseInt(fields[8])); | ||
} | ||
if (!"-".equals(fields[9]) && StringUtils.isNumeric(fields[9])) { | ||
ct.setProteinPosition(Integer.parseInt(fields[9])); | ||
} | ||
|
||
// Fields related to AA and codon changes can also be empty (marked with "-") | ||
if (!"-".equals(fields[10])) { | ||
ct.setAminoacidChange(fields[10]); | ||
} | ||
if (!"-".equals(fields[11])) { | ||
ct.setCodonChange(fields[11]); | ||
} | ||
|
||
// Variant ID | ||
if (!"-".equals(fields[12])) { | ||
ct.setVariationId(fields[12]); | ||
} | ||
|
||
parseExtraFields(fields[13], effect, ct); | ||
} | ||
|
||
private void parseExtraFields(String extra, VariantEffect effect, ConsequenceType ct) { | ||
for (String field : extra.split(";")) { | ||
String[] keyValue = field.split("="); | ||
|
||
switch (keyValue[0].toLowerCase()) { | ||
case "aa_maf": | ||
effect.getFrequencies().setMafNhlbiEspAfricanAmerican(Float.parseFloat(keyValue[1])); | ||
break; | ||
case "afr_maf": | ||
effect.getFrequencies().setMaf1000GAfrican(Float.parseFloat(keyValue[1])); | ||
break; | ||
case "amr_maf": | ||
effect.getFrequencies().setMaf1000GAmerican(Float.parseFloat(keyValue[1])); | ||
break; | ||
case "asn_maf": | ||
effect.getFrequencies().setMaf1000GAsian(Float.parseFloat(keyValue[1])); | ||
break; | ||
case "biotype": | ||
ct.setFeatureBiotype(keyValue[1]); | ||
break; | ||
case "canonical": | ||
ct.setCanonical(keyValue[1].equalsIgnoreCase("YES") || keyValue[1].equalsIgnoreCase("Y")); | ||
break; | ||
case "ccds": | ||
ct.setCcdsId(keyValue[1]); | ||
break; | ||
case "cell_type": | ||
effect.getRegulatoryEffect().setCellType(keyValue[1]); | ||
break; | ||
case "clin_sig": | ||
ct.setClinicalSignificance(keyValue[1]); | ||
break; | ||
case "distance": | ||
ct.setVariantToTranscriptDistance(Integer.parseInt(keyValue[1])); | ||
break; | ||
case "domains": | ||
ct.setProteinDomains(keyValue[1]); | ||
break; | ||
case "ea_maf": | ||
effect.getFrequencies().setMafNhlbiEspEuropeanAmerican(Float.parseFloat(keyValue[1])); | ||
break; | ||
case "ensp": | ||
ct.setProteinId(keyValue[1]); | ||
break; | ||
case "eur_maf": | ||
effect.getFrequencies().setMaf1000GEuropean(Float.parseFloat(keyValue[1])); | ||
break; | ||
case "exon": | ||
ct.setExonNumber(keyValue[1]); | ||
break; | ||
case "gmaf": // Format is GMAF=G:0.2640 | ||
String[] gmafFields = keyValue[1].split(":"); | ||
effect.getFrequencies().setAllele1000g(gmafFields[0]); | ||
effect.getFrequencies().setMaf1000G(Float.parseFloat(gmafFields[1])); | ||
break; | ||
case "hgvsc": | ||
ct.setHgvsc(keyValue[1]); | ||
break; | ||
case "hgvsp": | ||
ct.setHgvsp(keyValue[1]); | ||
break; | ||
case "high_inf_pos": | ||
effect.getRegulatoryEffect().setHighInformationPosition(keyValue[1].equalsIgnoreCase("YES") || keyValue[1].equalsIgnoreCase("Y")); | ||
break; | ||
case "intron": | ||
ct.setIntronNumber(keyValue[1]); | ||
break; | ||
case "motif_name": | ||
effect.getRegulatoryEffect().setMotifName(keyValue[1]); | ||
break; | ||
case "motif_pos": | ||
effect.getRegulatoryEffect().setMotifPosition(Integer.parseInt(keyValue[1])); | ||
break; | ||
case "motif_score_change": | ||
effect.getRegulatoryEffect().setMotifScoreChange(Float.parseFloat(keyValue[1])); | ||
break; | ||
case "polyphen": // Format is PolyPhen=possibly_damaging(0.859) | ||
String[] polyphenFields = keyValue[1].split("[\\(\\)]"); | ||
effect.getProteinSubstitutionScores().setPolyphenEffect(ProteinSubstitutionScores.PolyphenEffect.valueOf(polyphenFields[0].toUpperCase())); | ||
effect.getProteinSubstitutionScores().setPolyphenScore(Float.parseFloat(polyphenFields[1])); | ||
break; | ||
case "pubmed": | ||
ct.setPubmed(keyValue[1].split(",")); | ||
break; | ||
case "sift": // Format is SIFT=tolerated(0.07) | ||
String[] siftFields = keyValue[1].split("[\\(\\)]"); | ||
effect.getProteinSubstitutionScores().setSiftEffect(ProteinSubstitutionScores.SiftEffect.valueOf(siftFields[0].toUpperCase())); | ||
effect.getProteinSubstitutionScores().setSiftScore(Float.parseFloat(siftFields[1])); | ||
break; | ||
case "strand": | ||
ct.setFeatureStrand(keyValue[1]); | ||
break; | ||
case "sv": | ||
ct.setStructuralVariantsId(keyValue[1].split(",")); | ||
break; | ||
case "symbol": | ||
ct.setGeneName(keyValue[1]); | ||
break; | ||
case "symbol_source": | ||
ct.setGeneNameSource(keyValue[1]); | ||
break; | ||
default: | ||
// ALLELE_NUM, FREQS, IND, ZYG | ||
break; | ||
} | ||
} | ||
|
||
} | ||
} |
113 changes: 113 additions & 0 deletions
113
...ld/src/main/java/org/opencb/cellbase/build/transform/serializers/json/JsonSerializer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
package org.opencb.cellbase.build.transform.serializers.json; | ||
|
||
import com.fasterxml.jackson.core.JsonFactory; | ||
import com.fasterxml.jackson.core.JsonGenerator; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
import java.io.FileOutputStream; | ||
import java.io.IOException; | ||
import java.io.OutputStream; | ||
import java.nio.file.Path; | ||
import java.nio.file.Paths; | ||
import java.util.List; | ||
import java.util.logging.Level; | ||
import java.util.logging.Logger; | ||
import java.util.zip.GZIPOutputStream; | ||
import org.opencb.commons.io.DataWriter; | ||
|
||
/** | ||
* | ||
* @author Cristina Yenyxe Gonzalez Garcia <cyenyxe@ebi.ac.uk> | ||
*/ | ||
public class JsonSerializer<T> implements DataWriter<T> { | ||
|
||
private Path outdir; | ||
private Path file; | ||
|
||
protected JsonFactory factory; | ||
protected ObjectMapper jsonObjectMapper; | ||
protected JsonGenerator generator; | ||
private OutputStream stream; | ||
|
||
public JsonSerializer(Path outdir, Path file) { | ||
this.outdir = outdir; | ||
this.file = file; | ||
this.factory = new JsonFactory(); | ||
this.jsonObjectMapper = new ObjectMapper(this.factory); | ||
} | ||
|
||
@Override | ||
public boolean open() { | ||
try { | ||
stream = new GZIPOutputStream(new FileOutputStream( | ||
Paths.get(outdir.toString(), file.getFileName().toString()).toAbsolutePath().toString() + ".json.gz")); | ||
} catch (IOException ex) { | ||
Logger.getLogger(JsonSerializer.class.getName()).log(Level.SEVERE, null, ex); | ||
return false; | ||
} | ||
return true; | ||
} | ||
|
||
@Override | ||
public boolean pre() { | ||
try { | ||
generator = factory.createGenerator(stream); | ||
} catch (IOException ex) { | ||
Logger.getLogger(JsonSerializer.class.getName()).log(Level.SEVERE, null, ex); | ||
return false; | ||
} | ||
|
||
return true; | ||
} | ||
|
||
@Override | ||
public boolean write(T elem) { | ||
try { | ||
generator.writeObject(elem); | ||
generator.writeRaw('\n'); | ||
} catch (IOException ex) { | ||
Logger.getLogger(JsonSerializer.class.getName()).log(Level.SEVERE, elem.toString(), ex); | ||
return false; | ||
} | ||
|
||
return true; | ||
} | ||
|
||
@Override | ||
public boolean write(List<T> batch) { | ||
for (T elem : batch) { | ||
try { | ||
generator.writeObject(elem); | ||
generator.writeRaw('\n'); | ||
} catch (IOException ex) { | ||
Logger.getLogger(JsonSerializer.class.getName()).log(Level.SEVERE, elem.toString(), ex); | ||
return false; | ||
} | ||
} | ||
|
||
return true; | ||
} | ||
|
||
@Override | ||
public boolean post() { | ||
try { | ||
stream.flush(); | ||
generator.flush(); | ||
} catch (IOException ex) { | ||
Logger.getLogger(JsonSerializer.class.getName()).log(Level.SEVERE, null, ex); | ||
return false; | ||
} | ||
return true; | ||
} | ||
|
||
@Override | ||
public boolean close() { | ||
try { | ||
generator.close(); | ||
} catch (IOException ex) { | ||
Logger.getLogger(JsonSerializer.class.getName()).log(Level.SEVERE, null, ex); | ||
return false; | ||
} | ||
return true; | ||
} | ||
|
||
} |
Oops, something went wrong.