diff --git a/test/test_cosmic_mutations.py b/test/test_cosmic_mutations.py
index 67a9f96..61cd2d9 100644
--- a/test/test_cosmic_mutations.py
+++ b/test/test_cosmic_mutations.py
@@ -1,6 +1,7 @@
+from pyensembl import EnsemblRelease
 from varcode import (
-    VariantAnnotator,
     Variant,
+    # transcript effects
     Substitution,
     Deletion,
     Insertion,
@@ -9,15 +10,15 @@
     FrameShiftTruncation,
 )
 
-annot = VariantAnnotator(75)
+ensembl = EnsemblRelease(75)
 
 def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id):
-    variant = Variant(chrom, pos, dna_ref, dna_alt)
-    result = annot.effect(variant)
-    assert transcript_id in result.transcript_effects, \
+    variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl)
+    effect_collection = variant.effects()
+    assert transcript_id in effect_collection.transcript_effect_dict, \
         "Expected transcript ID %s for variant %s not found in %s" % (
             transcript_id, variant, result)
-    return result.transcript_effects[transcript_id]
+    return effect_collection.transcript_effect_dict[transcript_id]
 
 def _substitution(chrom, pos, dna_ref, dna_alt, transcript_id, aa_ref, aa_alt):
     effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
diff --git a/test/test_dbnsfp_validation.py b/test/test_dbnsfp_validation.py
index 2d990f1..07c7a14 100644
--- a/test/test_dbnsfp_validation.py
+++ b/test/test_dbnsfp_validation.py
@@ -13,20 +13,22 @@
 # limitations under the License.
 
 import pandas as pd
-from varcode import VariantAnnotator, Substitution, Variant
+from pyensembl import EnsemblRelease
+from varcode import Substitution, Variant
 
-annot = VariantAnnotator(75)
+ensembl = EnsemblRelease(75)
 
 def validate_transcript_mutation(
 	ensembl_transcript,
         chrom, dna_position,
         dna_ref, dna_alt,
         aa_pos, aa_alt):
-    result = annot.effect(
-        Variant(chrom, dna_position, dna_ref, dna_alt))
-    assert ensembl_transcript in result.transcript_effects, \
+    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)
+    result = variant.effects()
+
+    assert ensembl_transcript in result.transcript_effect_dict, \
         "%s not found in %s" % (ensembl_transcript, result)
-    effect = result.transcript_effects[ensembl_transcript]
+    effect = result.transcript_effect_dict[ensembl_transcript]
     assert (
         isinstance(effect, Substitution) and
         effect.aa_pos + 1 == aa_pos and
diff --git a/test/test_drop_duplicates.py b/test/test_drop_duplicates.py
index 6df011b..0b42208 100644
--- a/test/test_drop_duplicates.py
+++ b/test/test_drop_duplicates.py
@@ -12,15 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from pyensembl import EnsemblRelease
 from varcode import Variant, VariantCollection
 
 def test_drop_duplicates():
-    v1 = Variant("1", 3000, "A", "G")
-    v1_copy = Variant("1", 3000, "A", "G")
-    v2 = Variant("2", 10, "G", "T")
+    ensembl = EnsemblRelease(78)
+    v1 = Variant("1", 3000, "A", "G", ensembl=ensembl)
+    v1_copy = Variant("1", 3000, "A", "G", ensembl=ensembl)
+    v2 = Variant("2", 10, "G", "T", ensembl=ensembl)
     collection_with_duplicates = VariantCollection(
-        variants=[v1, v1, v1_copy, v2],
-        reference_name="hg19")
+        variants=[v1, v1, v1_copy, v2])
     assert len(collection_with_duplicates) == 4
     collection_without_duplicates = collection_with_duplicates.drop_duplicates()
     assert len(collection_without_duplicates) == 2
diff --git a/test/test_maf.py b/test/test_maf.py
index 375882b..9d76a72 100644
--- a/test/test_maf.py
+++ b/test/test_maf.py
@@ -1,15 +1,19 @@
-from varcode import load_variants, VariantCollection, Variant
 from nose.tools import eq_
+from pyensembl import EnsemblRelease
+from varcode import load_maf, VariantCollection, Variant
 
 def test_maf():
-    variant_collection_from_maf = load_variants("data/tcga_ov.head.maf")
-    eq_(variant_collection_from_maf.reference_name, "GRCh37")
+    ensembl = EnsemblRelease(75)
+    variant_collection_from_maf = load_maf("data/tcga_ov.head.maf")
     expected_variants = [
-        Variant(1, 1650797, "A", "G"),
-        Variant(1, 231401797, "A", "C"),
-        Variant(1, 23836447, "C", "A"),
-        Variant(11,124617502, "C", "G"),
+        Variant(1, 1650797, "A", "G", ensembl),
+        Variant(1, 231401797, "A", "C", ensembl),
+        Variant(1, 23836447, "C", "A", ensembl),
+        Variant(11,124617502, "C", "G", ensembl),
     ]
     eq_(len(variant_collection_from_maf), len(expected_variants))
-    for v1, v2 in zip(expected_variants, variant_collection_from_maf):
-        eq_(v1, v2)
+    for v_expect, v_maf in zip(expected_variants, variant_collection_from_maf):
+        eq_(v_expect, v_maf)
+        gene_name = v_maf.info['Hugo_Symbol']
+        assert any(gene.name == gene_name for gene in v_maf.genes()), \
+            "Expected gene name %s but got %s" % (gene_name, v_maf.genes())
diff --git a/test/test_mutation_effects.py b/test/test_mutation_effects.py
index cf49b1a..9a05cc2 100644
--- a/test/test_mutation_effects.py
+++ b/test/test_mutation_effects.py
@@ -1,7 +1,8 @@
 from varcode import (
-    VariantAnnotator,
     Variant,
-    infer_transcript_effect,
+    #
+    # transcript effects
+    #
     NoncodingTranscript,
     IncompleteTranscript,
     FivePrimeUTR,
@@ -17,8 +18,9 @@
     FrameShift,
     # TODO: SpliceDonor, SpliceReceptor
 )
+from pyensembl import EnsemblRelease
 
-annot = VariantAnnotator(ensembl_release=77)
+ensembl = EnsemblRelease(release=78)
 
 def test_incomplete():
     # transcript EGFR-009 (ENST00000450046 in Ensembl 77)
@@ -27,10 +29,10 @@ def test_incomplete():
     # first exon begins: ATCATTCCTTTGGGCCTAGGA
 
     # change the first nucleotide of the 5' UTR A>T
-    variant = Variant("7", 55109723, "A", "T")
+    variant = Variant("7", 55109723, "A", "T", ensembl=ensembl)
 
-    transcript = annot.ensembl.transcript_by_id("ENST00000450046")
-    effect = infer_transcript_effect(variant, transcript)
+    transcript = ensembl.transcript_by_id("ENST00000450046")
+    effect = variant.transcript_effect(transcript)
     assert isinstance(effect, IncompleteTranscript), \
         "Expected %s on %s to be IncompleteTranscript, got %s" % (
             variant, transcript, effect)
@@ -43,9 +45,9 @@ def test_start_loss():
     # which is 55,019,034 + 244 of chr7 = 55019278
     # change the first nucleotide of the 5' UTR A>T
     # making what used to be a start codon into TTG (Leucine)
-    variant = Variant("7", 55019278, "A", "T")
-    transcript = annot.ensembl.transcript_by_id("ENST00000420316")
-    effect = infer_transcript_effect(variant, transcript)
+    variant = Variant("7", 55019278, "A", "T", ensembl=ensembl)
+    transcript = ensembl.transcript_by_id("ENST00000420316")
+    effect = variant.transcript_effect(transcript)
     assert isinstance(effect, StartLoss), \
         "Expected StartLoss, got %s for %s on %s" % (
             effect, variant, transcript, )
\ No newline at end of file
diff --git a/test/test_trim_shared.py b/test/test_string_helpers.py
similarity index 94%
rename from test/test_trim_shared.py
rename to test/test_string_helpers.py
index fa06769..510b88c 100644
--- a/test/test_trim_shared.py
+++ b/test/test_string_helpers.py
@@ -1,6 +1,6 @@
 from nose.tools import eq_
 
-from varcode.common import trim_shared_flanking_strings
+from varcode.string_helpers import trim_shared_flanking_strings
 
 def test_trim_shared_string_endings():
     # empty strings
diff --git a/test/test_vcf.py b/test/test_vcf.py
index 6a88f05..23e6e0c 100644
--- a/test/test_vcf.py
+++ b/test/test_vcf.py
@@ -12,33 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from varcode import load_variants
+from varcode import load_vcf
 
 VCF_FILENAME = "data/somatic_hg19_14muts.vcf"
 
 def test_vcf_reference_name():
-    variants = load_variants(VCF_FILENAME)
-    # the raw reference name can be a file path to the hg19 FASTA file
-    assert variants.reference_path and "hg19" in variants.reference_path, \
-        "Expected hg19 reference, got %s" % (variants.reference_path,)
+    variants = load_vcf(VCF_FILENAME)
     # after normalization, hg19 should be remapped to GRCh37
-    assert variants.reference_name == "GRCh37"
+    assert variants.reference_names() ==  { "GRCh37" }
 
 def test_vcf_number_entries():
     # there are 14 mutations listed in the VCF, make sure they are all parsed
-    variants = load_variants(VCF_FILENAME)
+    variants = load_vcf(VCF_FILENAME)
     assert len(variants) == 14, \
         "Expected 14 mutations, got %d" % (len(variants),)
 
-def _check_effect_gene_name(effect):
-    variant = effect.variant
+def _check_variant_gene_name(variant):
     expected_gene_names = variant.info['GE']
-    gene_names = [gene.name for gene in effect.genes]
-    assert expected_gene_names == gene_names, \
+    assert variant.gene_names() == expected_gene_names, \
         "Expected gene name %s for variant %s, got %s" % (
-            expected_gene_name, variant, gene_names)
+            expected_gene_name, variant, variant.gene_names())
 
 def test_vcf_gene_names():
-    variants = load_variants(VCF_FILENAME)
-    for effect in variants.variant_effects():
-        yield (_check_effect_gene_name, effect)
+    variants = load_vcf(VCF_FILENAME)
+    for variant in variants:
+        yield (_check_variant_gene_name, variant)
diff --git a/varcode/__init__.py b/varcode/__init__.py
index eefbeb9..0dbad7c 100644
--- a/varcode/__init__.py
+++ b/varcode/__init__.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .core_logic import infer_transcript_effect
+from .effects import *
 from .effect_ordering import effect_priority, top_priority_transcript_effect
-from .load_variants import load_variants
-from .transcript_mutation_effects import *
+from .maf import load_maf, load_maf_dataframe
 from .variant import Variant
-from .variant_annotator import VariantAnnotator
 from .variant_collection import VariantCollection
+from .vcf import load_vcf
\ No newline at end of file
diff --git a/varcode/core_logic.py b/varcode/coding_effect.py
similarity index 54%
rename from varcode/core_logic.py
rename to varcode/coding_effect.py
index c3ebea8..b2de72b 100644
--- a/varcode/core_logic.py
+++ b/varcode/coding_effect.py
@@ -15,20 +15,8 @@
 from __future__ import print_function, division, absolute_import
 
 import logging
-import math
-from collections import namedtuple
 
-from .common import (
-    reverse_complement,
-    trim_shared_flanking_strings,
-)
-from .variant import Variant
-from .transcript_mutation_effects import (
-    NoncodingTranscript,
-    IncompleteTranscript,
-    FivePrimeUTR,
-    ThreePrimeUTR,
-    Intronic,
+from .effects import (
     Silent,
     Insertion,
     Deletion,
@@ -39,10 +27,9 @@
     FrameShift,
     FrameShiftTruncation
 )
+from .string_helpers import trim_shared_flanking_strings
 
 from Bio.Seq import Seq, CodonTable
-from pyensembl import (Transcript, find_nearest_locus)
-from pyensembl.biotypes import is_coding_biotype
 
 def mutate(sequence, position, variant_ref, variant_alt):
     """
@@ -72,90 +59,29 @@ def mutate(sequence, position, variant_ref, variant_alt):
     suffix = sequence[position+n_variant_ref:]
     return prefix + variant_alt + suffix
 
-def first_transcript_offset(start_pos, end_pos, transcript):
-    """
-    Given a variant and a particular transcript,
-    return the start/end offsets of the variant relative to the
-    chromosomal positions of the transcript.
-    """
-    # ensure that start_pos:end_pos overlap with transcript positions
-    assert start_pos <= end_pos, \
-        "start_pos %d shouldn't be greater than end_pos %d" % (
-            start_pos, end_pos)
-    assert start_pos <= transcript.end, \
-        "Range %d:%d starts after transcript %s (%d:%d)" % (
-            start_pos,
-            end_pos,
-            transcript,
-            transcript.start,
-            transcript.end)
-    assert end_pos >= transcript.start, \
-        "Range %d:%d ends before transcript %s (%d:%d)" % (
-            start_pos,
-            end_pos,
-            transcript,
-            transcript.start,
-            transcript.end)
-    # trim the start position to the beginning of the transcript
-    if start_pos < transcript.start:
-        start_pos = transcript.start
-    # trim the end position to the end of the transcript
-    if end_pos > transcript.end:
-        end_pos = transcript.end
-
-    # offsets into the spliced transcript
-    offsets = [
-        transcript.spliced_offset(start_pos),
-        transcript.spliced_offset(end_pos)
-    ]
-
-    start_offset_with_utr5 = min(offsets)
-
-    assert start_offset_with_utr5 >= 0, \
-        "Position %d is before start of transcript %s" % (
-            start_offset_with_utr5, transcript)
-
-    return start_offset_with_utr5
-
-def normalize_variant_fields(variant, transcript):
-    """
-    Given a Variant (with fields `ref`, `alt`, `pos` and `end_pos`),
-    compute the offset the variant position relative
-    to the position and direction of the given transcript.
-    Also reverse the nucleotide strings if the transcript is on the
-    backwards strand and trim any shared substrings from the beginning
-    or end of `ref` and `alt`.
-
-    Example:
-    If Variant(ref="ATGGC", alt="TCGC", pos=3000, end_pos=3005) is normalized
-    relative to Transcript(start=2000, end=4000) then the result of this
-    function will be:
-        ("G", "C", 1002)
-    where the elements of this tuple are (ref, alt, offset).
-    """
-    if transcript.on_backward_strand:
-        ref = reverse_complement(variant.ref)
-        alt = reverse_complement(variant.alt)
-    else:
-        ref = variant.ref
-        alt = variant.alt
-
-    start_offset_with_utr5 = first_transcript_offset(
-        variant.pos, variant.end_pos, transcript)
-
-    # in case nucleotide strings share prefix (e.g. ref="C", alt="CC")
-    # bump the offsets and make the strings disjoint (ref="", alt="C")
-    ref, alt, prefix, _ = trim_shared_flanking_strings(ref, alt)
-    n_same = len(prefix)
-    start_offset_with_utr5 += n_same
-    return ref, alt, start_offset_with_utr5
-
 def infer_coding_effect(
         ref,
         alt,
         cds_offset,
-        variant,
-        transcript):
+        transcript,
+        variant):
+    """
+    Given a minimal ref/alt nucleotide string pair and an offset into a given
+    transcript, determine the coding effect of this nucleotide substitution
+    onto the translated protein.
+
+    Parameters
+    ----------
+    ref : str
+
+    alt : str
+
+    cds_offset : int
+
+    transcript : Transcript
+
+    variant : Variant
+    """
     # Don't need a pyfaidx.Sequence object here, just convert it to the an str
     cds_seq = str(transcript.coding_sequence)
 
@@ -348,123 +274,3 @@ def infer_coding_effect(
             aa_pos=aa_pos,
             aa_ref=aa_ref,
             aa_alt=aa_alt)
-
-def infer_exonic_effect(variant, transcript):
-    ref, alt, start_offset_with_utr5 = normalize_variant_fields(
-        variant, transcript)
-
-    utr5_length = min(transcript.start_codon_spliced_offsets)
-
-    if utr5_length > start_offset_with_utr5:
-        # TODO: what do we do if the variant spans the beginning of
-        # the coding sequence?
-        assert utr5_length > start_offset_with_utr5 + len(ref), \
-            "Variant which span the 5' UTR and CDS not yet supported: %s" % (
-                variant)
-        return FivePrimeUTR(variant, transcript)
-
-    utr3_offset = max(transcript.stop_codon_spliced_offsets) + 1
-
-    if start_offset_with_utr5 >= utr3_offset:
-        return ThreePrimeUTR(variant, transcript)
-
-    cds_offset = start_offset_with_utr5 - utr5_length
-    return infer_coding_effect(
-        ref,
-        alt,
-        cds_offset,
-        variant,
-        transcript)
-
-def infer_intronic_effect(variant, transcript, nearest_exon):
-    """
-    Infer effect of variant which does not overlap any exon of
-    the given transcript.
-    """
-    distance_to_exon = nearest_exon.distance_to_interval(
-        variant.pos, variant.end_pos)
-    assert distance_to_exon > 0, \
-        "Expected intronic effect to have distance_to_exon > 0, got %d" % (
-            distance_to_exon,)
-    before_forward_exon = (
-        transcript.strand == "+" and
-        variant.pos < nearest_exon.start)
-    before_backward_exon = (
-        transcript.strand == "-" and
-        variant.end_pos > nearest_exon.end)
-    before_exon = before_forward_exon or before_backward_exon
-
-    if distance_to_exon <= 2:
-        if before_exon:
-            # 2 last nucleotides of intron before exon are the splice acceptor
-            # site, typically "AG"
-            effect_class = SpliceAcceptor
-        else:
-            # 2 first nucleotides of intron after exon are the splice donor
-            # site, typically "GT"
-            effect_class = SpliceDonor
-    elif not before_exon and distance_to_exon <= 6:
-        # variants in nucleotides 3-6 at start of intron aren't as certain
-        # to cause problems as nucleotides 1-2 but still implicated in
-        # alternative splicing
-        effect_class = IntronicSpliceSite
-    elif before_exon and distance_to_exon <= 4:
-        # nucleotides -4 and -3 before exon are part of the 3' splicing motif
-        # but allow for more degeneracy than the -2, -1 nucleotides
-        effect_class = IntronicSpliceSite
-    else:
-        assert distance_to_exon > 6, \
-            "Looks like we didn't cover all possible splice site mutations"
-        # intronic mutation unrelated to splicing
-        effect_class = Intronic
-    return effect_class(
-            variant=variant,
-            transcript=transcript,
-            nearest_exon=nearest_exon,
-            distance_to_exon=distance_to_exon)
-
-def infer_transcript_effect(variant, transcript):
-    """
-    Generate a transcript effect (such as FrameShift) by applying a genomic
-    variant to a particular transcript.
-
-    Parameters
-    ----------
-    variant : Variant
-        Genomic variant
-
-    transcript :  Transcript
-        Transcript we're going to mutate
-    """
-
-    if not isinstance(variant, Variant):
-        raise TypeError(
-            "Expected %s : %s to have type Variant" % (
-                variant, type(variant)))
-
-    if not isinstance(transcript, Transcript):
-        raise TypeError(
-            "Expected %s : %s to have type Transcript" % (
-                transcript, type(transcript)))
-
-    # check for non-coding transcripts first, since
-    # every non-coding transcript is "incomplete".
-    if not is_coding_biotype(transcript.biotype):
-        return NoncodingTranscript(variant, transcript)
-
-    if not transcript.complete:
-        return IncompleteTranscript(variant, transcript)
-
-    distance_to_exon, nearest_exon = find_nearest_locus(
-        start=variant.pos,
-        end=variant.end_pos,
-        loci=transcript.exons)
-
-    if distance_to_exon > 0:
-        return infer_intronic_effect(
-            variant=variant,
-            transcript=transcript,
-            nearest_exon=nearest_exon)
-
-    # TODO: exonic splice site mutations
-    return infer_exonic_effect(variant, transcript)
diff --git a/varcode/common.py b/varcode/common.py
index 84136cc..04f3282 100644
--- a/varcode/common.py
+++ b/varcode/common.py
@@ -1,76 +1,20 @@
+# Copyright (c) 2014. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function, division, absolute_import
 
 from collections import defaultdict
-import pyfaidx
-
-def reverse_complement(x):
-    """
-    Reverse complement of a nucleotide string.
-
-    Parameters
-    ----------
-
-    x : str
-        Original nucleotide string
-    """
-    return pyfaidx.Sequence(seq=x).reverse.complement.seq
-
-
-def trim_shared_prefix(ref, alt):
-    """
-    Sometimes mutations are given with a shared prefix between the reference
-    and alternate strings. Examples: C>CT (nucleotides) or GYFP>G (amino acids).
-
-    This function trims the common prefix and returns the disjoint ref
-    and alt strings, along with the shared prefix.
-    """
-    n_ref = len(ref)
-    n_alt = len(alt)
-    n_min = min(n_ref, n_alt)
-    i = 0
-    while i < n_min and ref[i] == alt[i]:
-        i += 1
-
-    # guaranteed that ref and alt agree on all the characters
-    # up to i'th position, so it doesn't matter which one we pull
-    # the prefix out of
-    prefix = ref[:i]
-    ref_suffix = ref[i:]
-    alt_suffix = alt[i:]
-    return ref_suffix, alt_suffix, prefix
-
-def trim_shared_suffix(ref, alt):
-    """
-    Reuse the `trim_shared_prefix` function above to implement similar
-    functionality for string suffixes.
-
-    Given ref='ABC' and alt='BC', we first revese both strings:
-        reverse_ref = 'CBA'
-        reverse_alt = 'CB'
-    and then the result of calling trim_shared_prefix will be:
-        ('A', '', 'CB')
-    We then reverse all three of the result strings to get back
-    the shared suffix and both prefixes leading up to it:
-        ('A', '', 'BC')
-    """
-    reverse_ref = ref[::-1]
-    reverse_alt = alt[::-1]
-    results = trim_shared_prefix(reverse_ref, reverse_alt)
-    return tuple(s[::-1] for s in results)
-
-def trim_shared_flanking_strings(ref, alt):
-    """
-    Given two nucleotide or amino acid strings, identify
-    if they have a common prefix, a common suffix, and return
-    their unique components along with the prefix and suffix.
-
-    For example, if the input ref = "SYFFQGR" and alt = "SYMLLFIFQGR"
-    then the result will be:
-        ("F", "MLLFI", "SY", "FQGR")
-    """
-    ref, alt, prefix = trim_shared_prefix(ref, alt)
-    ref, alt, suffix = trim_shared_suffix(ref, alt)
-    return ref, alt, prefix, suffix
 
 def group_by(records, field_name):
     """
diff --git a/varcode/effect_ordering.py b/varcode/effect_ordering.py
index a58ed78..cc87da4 100644
--- a/varcode/effect_ordering.py
+++ b/varcode/effect_ordering.py
@@ -1,4 +1,4 @@
-from .transcript_mutation_effects import *
+from .effects import *
 
 transcript_effect_priority_list = [
     IncompleteTranscript,
diff --git a/varcode/transcript_mutation_effects.py b/varcode/effects.py
similarity index 99%
rename from varcode/transcript_mutation_effects.py
rename to varcode/effects.py
index b1ca65c..10cfe1d 100644
--- a/varcode/transcript_mutation_effects.py
+++ b/varcode/effects.py
@@ -97,7 +97,7 @@ class IntronicSpliceSite(Intronic, SpliceSite):
     affect splicing and are given their own effect classes below.
     """
     def __init__(self, *args, **kwargs):
-        Intronic.__init__(self, *self, **kwargs)
+        Intronic.__init__(self, *args, **kwargs)
 
     def short_description(self):
         return "intronic-splice-site"
@@ -107,7 +107,7 @@ class SpliceDonor(IntronicSpliceSite):
     Mutation in the first two intron residues.
     """
     def __init__(self, *args, **kwargs):
-        Intronic.__init__(self, *self, **kwargs)
+        Intronic.__init__(self, *args, **kwargs)
 
     def short_description(self):
         return "splice-donor"
diff --git a/varcode/filter_variants.py b/varcode/filter_variants.py
deleted file mode 100644
index 99fd760..0000000
--- a/varcode/filter_variants.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from __future__ import print_function, division, absolute_import
-
-import argparse
-from collections import namedtuple
-
-from . import maf
-from .variant_annotator import VariantAnnotator
-from .variant_collection import VariantCollection
-
-import numpy as np
-import pyensembl
-
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument("input",
-    help="VCF input file")
-
-parser.add_argument("--output-csv",
-    help="Comma-separated output with chr/pos/ref/alt from VCF and annotation")
-
-parser.add_argument("--output-tsv",
-    help="Tab-separated output with chr/pos/ref/alt from VCF and annotation")
-
-
-
-if __name__ == "__main__":
-    # TODO: determine ensembl release from VCF metadata
-    annot = VariantAnnotator(ensembl_release=75)
-
-    args = parser.parse_args()
-
-    for record in VariantCollection(args.input):
-        chrom, pos = record.contig, record.pos
-        ref, alt = record.ref, record.alt
-        print(chrom, pos, ref, alt)
-        print("--", annot.describe_variant(chrom, pos, ref, alt))
-        """
-        genes_to_transcripts = {}
-        for transcript_name in transcript_names:
-            gene = ensembl.gene_name_of_transcript_name(transcript_name)
-            if gene in genes_to_transcripts:
-                genes_to_transcripts[gene].append(transcript_name)
-            else:
-                genes_to_transcripts[gene] = [transcript_name]
-        for (gene, transcript_names) in genes_to_transcripts.iteritems():
-            print "\t", gene
-            for transcript_name in transcript_names:
-                print "\t\t", transcript_name
-        """
diff --git a/varcode/load_variants.py b/varcode/load_variants.py
deleted file mode 100644
index 2c1aff4..0000000
--- a/varcode/load_variants.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) 2014. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .maf import load_maf
-from .vcf import load_vcf
-
-def load_variants(filename):
-    """
-    Creates VariantCollection from name of either VCF or MAF file
-    """
-    assert isinstance(filename, str), \
-        "Expected filename to be str, got %s : %s" % (
-            filename, type(filename))
-
-    if filename.endswith(".vcf"):
-        return load_vcf(filename)
-    elif filename.endswith(".maf"):
-        return load_maf(filename)
-    else:
-        raise ValueError("Unrecognized file type: %s" % (filename,))
diff --git a/varcode/maf.py b/varcode/maf.py
index f10c718..cf4b358 100755
--- a/varcode/maf.py
+++ b/varcode/maf.py
@@ -16,10 +16,15 @@
 import logging
 
 from .nucleotides import normalize_nucleotide_string
+from .reference_name import (
+    infer_reference_name,
+    ensembl_release_number_for_reference_name
+)
 from .variant import Variant
 from .variant_collection import VariantCollection
 
 import pandas
+from pyensembl import EnsemblRelease
 
 TCGA_PATIENT_ID_LENGTH = 12
 
@@ -50,7 +55,11 @@ def load_maf_dataframe(filename, nrows=None, verbose=False):
     """
     Load the guaranteed columns of a TCGA MAF file into a DataFrame
     """
-    logging.info("Opening %s" % filename)
+
+    if not isinstance(filename, str):
+        raise ValueError(
+            "Expected filename to be str, got %s : %s" % (
+                filename, type(filename)))
 
     # skip comments and optional header
     with open(filename) as f:
@@ -77,21 +86,7 @@ def load_maf(filename):
     if len(maf_df) == 0:
         raise ValueError("Empty MAF file %s" % filename)
 
-    ncbi_builds = maf_df.NCBI_Build.unique()
-
-    if len(ncbi_builds) == 0:
-        raise ValueError("No NCBI builds for MAF file %s" % filename)
-    elif len(ncbi_builds) > 1:
-        raise ValueError(
-            "Multiple NCBI builds (%s) for MAF file %s" % (ncbi_builds, filename))
-
-    ncbi_build = ncbi_builds[0]
-
-    if isinstance(ncbi_build, int):
-        reference_name = "B%d" % ncbi_build
-    else:
-        reference_name = str(ncbi_build)
-
+    ensembl_objects = {}
     variants = []
     for _, x in maf_df.iterrows():
         contig = x.Chromosome
@@ -99,21 +94,61 @@ def load_maf(filename):
         end_pos = x.End_Position
         ref = x.Reference_Allele
 
+        # it's possible in a MAF file to have multiple Ensembl releases
+        # mixed in a single MAF file (the genome assembly is
+        # specified by the NCBI_Build column)
+        ncbi_build = x.NCBI_Build
+        if ncbi_build in ensembl_objects:
+            ensembl = ensembl_objects[ncbi_build]
+        else:
+            if isinstance(ncbi_build, int):
+                reference_name = "B%d" % ncbi_build
+            else:
+                reference_name = str(ncbi_build)
+
+            reference_name = infer_reference_name(reference_name)
+            ensembl_release = ensembl_release_number_for_reference_name(
+                reference_name)
+            ensembl = EnsemblRelease(release=ensembl_release)
+            ensembl_objects[ncbi_build] = ensembl
+
+        # have to try both Tumor_Seq_Allele1 and Tumor_Seq_Allele2
+        # to figure out which is different from the reference allele
         if x.Tumor_Seq_Allele1 != ref:
             alt = x.Tumor_Seq_Allele1
         else:
-            assert x.Tumor_Seq_Allele2 != ref, \
-                "Both tumor alleles agree with reference: %s" % (x,)
+            if x.Tumor_Seq_Allele2 == ref:
+                raise ValueError(
+                    "Both tumor alleles agree with reference %s: %s" % (
+                        ref, x,))
             alt = x.Tumor_Seq_Allele2
 
-        assert end_pos == start_pos + len(ref) - 1, \
-            "Expected variant %s:%s %s>%s to end at %d but got end_pos=%d" % (
+        if end_pos != start_pos + len(ref) - 1:
+            raise ValueError(
+                "Expected variant %s:%s %s>%s to end at %d but got end=%d" % (
                 contig, start_pos, ref, alt,
-                start_pos + len(ref) - 1, end_pos)
-
-        variants.append(Variant(contig, start_pos, ref, alt))
+                start_pos + len(ref) - 1, end_pos))
+
+        # keep metadata about the variant and its TCGA annotation
+        info = {
+            'Hugo_Symbol' : x.Hugo_Symbol,
+            'Center' : x.Center,
+            'Strand' : x.Strand,
+            'Variant_Classification' : x.Variant_Classification,
+            'Variant_Type' : x.Variant_Type,
+            'dbSNP_RS' : x.dbSNP_RS,
+            'dbSNP_Val_Status' : x.dbSNP_Val_Status,
+            'Tumor_Sample_Barcode' : x.Tumor_Sample_Barcode,
+            'Matched_Norm_Sample_Barcode' : x.Matched_Norm_Sample_Barcode,
+        }
+
+        variant = Variant(
+            contig, start_pos, ref, alt,
+            ensembl=ensembl,
+            info=info)
+
+        variants.append(variant)
 
     return VariantCollection(
         variants=variants,
-        original_filename=filename,
-        reference_name=reference_name)
+        original_filename=filename)
diff --git a/varcode/string_helpers.py b/varcode/string_helpers.py
new file mode 100644
index 0000000..41f3064
--- /dev/null
+++ b/varcode/string_helpers.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2014. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division, absolute_import
+
+import pyfaidx
+
+def reverse_complement(x):
+    """
+    Reverse complement of a nucleotide string.
+
+    Parameters
+    ----------
+
+    x : str
+        Original nucleotide string
+    """
+    return pyfaidx.Sequence(seq=x).reverse.complement.seq
+
+
+def trim_shared_prefix(ref, alt):
+    """
+    Sometimes mutations are given with a shared prefix between the reference
+    and alternate strings. Examples: C>CT (nucleotides) or GYFP>G (amino acids).
+
+    This function trims the common prefix and returns the disjoint ref
+    and alt strings, along with the shared prefix.
+    """
+    n_ref = len(ref)
+    n_alt = len(alt)
+    n_min = min(n_ref, n_alt)
+    i = 0
+    while i < n_min and ref[i] == alt[i]:
+        i += 1
+
+    # guaranteed that ref and alt agree on all the characters
+    # up to i'th position, so it doesn't matter which one we pull
+    # the prefix out of
+    prefix = ref[:i]
+    ref_suffix = ref[i:]
+    alt_suffix = alt[i:]
+    return ref_suffix, alt_suffix, prefix
+
+def trim_shared_suffix(ref, alt):
+    """
+    Reuse the `trim_shared_prefix` function above to implement similar
+    functionality for string suffixes.
+
+    Given ref='ABC' and alt='BC', we first revese both strings:
+        reverse_ref = 'CBA'
+        reverse_alt = 'CB'
+    and then the result of calling trim_shared_prefix will be:
+        ('A', '', 'CB')
+    We then reverse all three of the result strings to get back
+    the shared suffix and both prefixes leading up to it:
+        ('A', '', 'BC')
+    """
+    reverse_ref = ref[::-1]
+    reverse_alt = alt[::-1]
+    results = trim_shared_prefix(reverse_ref, reverse_alt)
+    return tuple(s[::-1] for s in results)
+
+def trim_shared_flanking_strings(ref, alt):
+    """
+    Given two nucleotide or amino acid strings, identify
+    if they have a common prefix, a common suffix, and return
+    their unique components along with the prefix and suffix.
+
+    For example, if the input ref = "SYFFQGR" and alt = "SYMLLFIFQGR"
+    then the result will be:
+        ("F", "MLLFI", "SY", "FQGR")
+    """
+    ref, alt, prefix = trim_shared_prefix(ref, alt)
+    ref, alt, suffix = trim_shared_suffix(ref, alt)
+    return ref, alt, prefix, suffix
diff --git a/varcode/transcript_helpers.py b/varcode/transcript_helpers.py
new file mode 100644
index 0000000..7dbfb39
--- /dev/null
+++ b/varcode/transcript_helpers.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2014. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division, absolute_import
+
+def interval_offset_on_transcript(start, end, transcript):
+    """
+    Given an interval [start:end] and a particular transcript,
+    return the start offset of the interval relative to the
+    chromosomal positions of the transcript.
+    """
+    # ensure that start_pos:end_pos overlap with transcript positions
+    if start > end:
+        raise ValueError(
+            "start_pos %d shouldn't be greater than end_pos %d" % (
+                start, end))
+    if start > transcript.end:
+        raise ValueError(
+            "Range %d:%d starts after transcript %s (%d:%d)" % (
+                start,
+                end,
+                transcript,
+                transcript.start,
+                transcript.end))
+    if end < transcript.start:
+        raise ValueError(
+            "Range %d:%d ends before transcript %s (%d:%d)" % (
+                start,
+                end,
+                transcript,
+                transcript.start,
+                transcript.end))
+    # trim the start position to the beginning of the transcript
+    if start < transcript.start:
+        start = transcript.start
+    # trim the end position to the end of the transcript
+    if end > transcript.end:
+        end = transcript.end
+
+    # return earliest offset into the spliced transcript
+    return min(transcript.spliced_offset(start), transcript.spliced_offset(end))
\ No newline at end of file
diff --git a/varcode/variant.py b/varcode/variant.py
index 3390580..4c629e3 100644
--- a/varcode/variant.py
+++ b/varcode/variant.py
@@ -1,39 +1,106 @@
+# Copyright (c) 2014. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function, division, absolute_import
 
+from .coding_effect import infer_coding_effect
+from .common import group_by
 from .nucleotides import normalize_nucleotide_string
+from .string_helpers import reverse_complement, trim_shared_flanking_strings
+from .transcript_helpers import interval_offset_on_transcript
+from .effects import (
+    NoncodingTranscript,
+    IncompleteTranscript,
+    FivePrimeUTR,
+    ThreePrimeUTR,
+    Intronic,
+    IntronicSpliceSite,
+    SpliceAcceptor,
+    SpliceDonor,
+)
+from .variant_effect_collection import VariantEffectCollection
 
-from memoized_property import memoized_property
+from pyensembl import Transcript, find_nearest_locus, EnsemblRelease
 from pyensembl.locus import normalize_chromosome
+from pyensembl.biotypes import is_coding_biotype
 
 class Variant(object):
-    def __init__(self, contig, pos, ref, alt, info=None):
+    def __init__(self, contig, pos, ref, alt, ensembl, info=None):
+        """
+        Construct a Variant object.
+
+        Parameters
+        ----------
+        contig : str
+            Chromosome that this variant is on
+
+        pos : int
+            1-based position on the chromosome of first reference nucleotide
+
+        ref : str
+            Reference nucleotide(s)
+
+        alt : str
+            Alternate nucleotide(s)
+
+        ensembl : EnsemblRelease
+            Ensembl object used for determining gene/transcript annotations
+        """
         self.contig = normalize_chromosome(contig)
-        self.pos = int(pos)
         self.ref = normalize_nucleotide_string(ref)
         self.alt = normalize_nucleotide_string(alt)
+        self.pos = int(pos)
+        self.end = self.pos + len(self.ref) - 1
+
+        if not isinstance(ensembl, EnsemblRelease):
+            raise TypeError("Expected EnsemblRelease, got %s : %s" % (
+                ensembl, type(ensembl)))
+        self.ensembl = ensembl
+
         self.info = {} if info is None else info
 
-    @memoized_property
-    def end_pos(self):
-        return self.pos + len(self.ref) - 1
+    @property
+    def reference_name(self):
+        return self.ensembl.reference.reference_name
 
     def __str__(self):
-        return "Variant(contig=%s, pos=%d, ref=%s, alt=%s)" % (
-            self.contig, self.pos, self.ref, self.alt)
+        fields = self.fields()
+        return "Variant(contig=%s, pos=%d, ref=%s, alt=%s, genome=%s)" % fields
 
     def __repr__(self):
         return str(self)
 
     def __hash__(self):
-        return hash((self.contig, self.pos, self.ref, self.alt))
+        return hash(self.fields())
+
+    def fields(self):
+        """
+        All identifying fields of a variant (contig, pos, ref, alt, genome)
+        in a single tuple. This makes for cleaner printing, hashing, and
+        comparisons.
+        """
+        return (
+            self.contig,
+            self.pos,
+            self.ref,
+            self.alt,
+            self.reference_name)
 
     def __eq__(self, other):
         return (
             isinstance(other, Variant) and
-            self.contig == other.contig and
-            self.pos == other.pos and
-            self.ref == other.ref and
-            self.alt == other.alt)
+            self.fields() == other.fields())
 
     def short_description(self):
         chrom, pos, ref, alt = self.contig, self.pos, self.ref, self.alt
@@ -46,3 +113,255 @@ def short_description(self):
                 chrom, pos + len(alt), pos + len(ref), ref[len(alt):])
         else:
             return "chr%s g.%d %s>%s" % (chrom, pos, ref, alt)
+
+    def transcripts(self):
+        return self.ensembl.transcripts_at_locus(
+            self.contig, self.pos, self.end)
+
+    def coding_transcripts(self):
+        """
+        Protein coding transcripts
+        """
+        return [
+            transcript for transcript in self.transcripts()
+            if is_coding_biotype(transcript.biotype)
+        ]
+
+    def genes(self):
+        """
+        Return Gene object for all genes which overlap this variant.
+        """
+        return self.ensembl.genes_at_locus(
+            self.contig, self.pos, self.end)
+
+    def gene_names(self):
+        """
+        Return names of all genes which overlap this variant. Calling
+        this method is significantly cheaper than calling `Variant.genes()`,
+        which has to issue many more queries to construct each Gene object.
+        """
+        return self.ensembl.gene_names_at_locus(
+            self.contig, self.pos, self.end)
+
+    def coding_genes(self):
+        """
+        Protein coding transcripts
+        """
+        return [
+            gene for gene in self.genes()
+            if is_coding_biotype(gene.biotype)
+        ]
+
+    def effects(self, only_coding_transcripts=False, raise_on_error=True):
+        """
+        Determine the effects of a variant on any transcripts it overlaps.
+        Returns a VariantEffectCollection object.
+        """
+
+        overlapping_transcripts = self.transcripts()
+
+        if only_coding_transcripts:
+            # remove non-coding transcripts
+            overlapping_transcripts = [
+                transcript
+                for transcript
+                in overlapping_transcripts
+                if is_coding_biotype(transcript.biotype)
+            ]
+
+        if len(overlapping_transcripts) == 0:
+            # intergenic variant
+            return VariantEffectCollection(
+                variant=self,
+                gene_effect_groups={},
+                errors={})
+
+        # group transcripts by their gene ID
+        overlapping_transcript_groups = group_by(
+            overlapping_transcripts, field_name='gene_id')
+
+        # dictionary from gene ID to list of transcript effects
+        gene_transcript_effects_groups = {}
+
+        # mapping from Transcript objects to errors encountered
+        # while trying to annotated them
+        errors = {}
+
+        for (gene_id, transcripts) in overlapping_transcript_groups.items():
+            effects = []
+            for transcript in transcripts:
+                try:
+                    effects.append(self.transcript_effect(transcript))
+                except (AssertionError, ValueError) as error:
+                    if raise_on_error:
+                        raise
+                    else:
+                        logging.warn(
+                            "Encountered error annotating %s for %s: %s",
+                            self,
+                            transcript,
+                            error)
+                    errors[transcript] = error
+            gene_transcript_effects_groups[gene_id] = effects
+
+        return VariantEffectCollection(
+            variant=self,
+            gene_effect_groups=gene_transcript_effects_groups,
+            errors=errors)
+
+    def transcript_effect(self, transcript):
+        """
+        Return the transcript effect (such as FrameShift) that results from
+        applying this genomic variant to a particular transcript.
+
+        Parameters
+        ----------
+
+        transcript :  Transcript
+            Transcript we're going to apply mutation to.
+        """
+
+        if not isinstance(transcript, Transcript):
+            raise TypeError(
+                "Expected %s : %s to have type Transcript" % (
+                    transcript, type(transcript)))
+
+        # check for non-coding transcripts first, since
+        # every non-coding transcript is "incomplete".
+        if not is_coding_biotype(transcript.biotype):
+            return NoncodingTranscript(self, transcript)
+
+        if not transcript.complete:
+            return IncompleteTranscript(self, transcript)
+
+        distance_to_exon, nearest_exon = find_nearest_locus(
+            start=self.pos,
+            end=self.end,
+            loci=transcript.exons)
+
+        if distance_to_exon > 0:
+            return self._intronic_transcript_effect(
+                transcript=transcript,
+                nearest_exon=nearest_exon)
+
+        # TODO: exonic splice site mutations
+        return self._exonic_transcript_effect(transcript)
+
+
+    def _intronic_transcript_effect(self, transcript, nearest_exon):
+        """
+        Infer effect of variant which does not overlap any exon of
+        the given transcript.
+        """
+        distance_to_exon = nearest_exon.distance_to_interval(
+            self.pos, self.end)
+
+        assert distance_to_exon > 0, \
+            "Expected intronic effect to have distance_to_exon > 0, got %d" % (
+                distance_to_exon,)
+
+        before_forward_exon = (
+            transcript.strand == "+" and
+            self.pos < nearest_exon.start)
+
+        before_backward_exon = (
+            transcript.strand == "-" and
+            self.end > nearest_exon.end)
+
+        before_exon = before_forward_exon or before_backward_exon
+
+        if distance_to_exon <= 2:
+            if before_exon:
+                # 2 last nucleotides of intron before exon are the splice acceptor
+                # site, typically "AG"
+                effect_class = SpliceAcceptor
+            else:
+                # 2 first nucleotides of intron after exon are the splice donor
+                # site, typically "GT"
+                effect_class = SpliceDonor
+        elif not before_exon and distance_to_exon <= 6:
+            # variants in nucleotides 3-6 at start of intron aren't as certain
+            # to cause problems as nucleotides 1-2 but still implicated in
+            # alternative splicing
+            effect_class = IntronicSpliceSite
+        elif before_exon and distance_to_exon <= 4:
+            # nucleotides -4 and -3 before exon are part of the 3' splicing motif
+            # but allow for more degeneracy than the -2, -1 nucleotides
+            effect_class = IntronicSpliceSite
+        else:
+            assert distance_to_exon > 6, \
+                "Looks like we didn't cover all possible splice site mutations"
+            # intronic mutation unrelated to splicing
+            effect_class = Intronic
+
+        return effect_class(
+                variant=self,
+                transcript=transcript,
+                nearest_exon=nearest_exon,
+                distance_to_exon=distance_to_exon)
+
+
+    def _offset_on_transcript(self, transcript):
+        """
+        Given a Variant (with fields ref, alt, pos, and end), compute the offset
+        of the variant position relative to the position and direction of the
+        transcript.
+
+        Returns a tuple (offset, ref, alt), where ref and alt are this variant's
+        ref and alt nucleotide strings with shared prefixes and suffixes removed
+        and the bases reversed if the transcript is on the backward strand.
+
+        For example,
+
+        >> variant = Variant(ref="ATGGC", alt="TCGC", pos=3000, end=3005)
+        >> variant.offset_on_transcript(Transcript(start=2000, end=4000))
+        (1002, "G", "C")
+        """
+        if transcript.on_backward_strand:
+            ref = reverse_complement(self.ref)
+            alt = reverse_complement(self.alt)
+        else:
+            ref = self.ref
+            alt = self.alt
+
+        # in case nucleotide strings share prefix (e.g. ref="C", alt="CC")
+        # bump the offsets and make the strings disjoint (ref="", alt="C")
+        ref, alt, prefix, _ = trim_shared_flanking_strings(ref, alt)
+        n_same = len(prefix)
+        start_offset_with_utr5 = interval_offset_on_transcript(
+            self.pos, self.end, transcript)
+        offset =  start_offset_with_utr5 + n_same
+        return offset, ref, alt
+
+    def _exonic_transcript_effect(self, transcript):
+        """
+        Effect of this variant on a Transcript, assuming we already know
+        that this variant overlaps some exon of the transcript.
+        """
+        offset_with_utr5, ref, alt = self._offset_on_transcript(transcript)
+
+        utr5_length = min(transcript.start_codon_spliced_offsets)
+
+        if utr5_length > offset_with_utr5:
+            # TODO: what do we do if the variant spans the beginning of
+            # the coding sequence?
+            if utr5_length < offset_with_utr5 + len(ref):
+                raise ValueError(
+                    "Variant which span 5' UTR and CDS not supported: %s" % (
+                        self,))
+            return FivePrimeUTR(self, transcript)
+
+        utr3_offset = max(transcript.stop_codon_spliced_offsets) + 1
+
+        if offset_with_utr5 >= utr3_offset:
+            return ThreePrimeUTR(self, transcript)
+
+        cds_offset = offset_with_utr5 - utr5_length
+
+        return infer_coding_effect(
+            ref,
+            alt,
+            cds_offset,
+            variant=self,
+            transcript=transcript)
+
diff --git a/varcode/variant_annotator.py b/varcode/variant_annotator.py
deleted file mode 100644
index 63c6168..0000000
--- a/varcode/variant_annotator.py
+++ /dev/null
@@ -1,77 +0,0 @@
-from __future__ import print_function, division, absolute_import
-import logging
-
-from .common import group_by
-from .core_logic import infer_transcript_effect
-from .variant import Variant
-from .variant_effect import VariantEffect
-
-import pyensembl
-
-class VariantAnnotator(object):
-    def __init__(self, ensembl_release):
-        self.ensembl = pyensembl.EnsemblRelease(ensembl_release)
-
-    def effect(self, variant, raise_on_error=True):
-        """
-        Determine the effects of a variant on any transcripts it overlaps.
-        Returns a VariantEffect object.
-        """
-        contig = variant.contig
-        pos = variant.pos
-        end_pos = variant.end_pos
-        ref = variant.ref
-        alt = variant.alt
-
-        overlapping_transcripts = self.ensembl.transcripts_at_locus(
-            contig, pos, end_pos)
-
-        if len(overlapping_transcripts) == 0:
-            # intergenic variant
-            return VariantEffect(
-                variant=variant,
-                genes=[],
-                gene_transcript_effects={},
-                errors={})
-
-        # group transcripts by their gene ID
-        overlapping_transcript_groups = group_by(
-            overlapping_transcripts, field_name='gene_id')
-
-        # dictionary from gene ID to list of transcript effects
-        gene_transcript_effects_groups = {}
-
-        # mapping from Transcript objects to errors encountered
-        # while trying to annotated them
-        errors = {}
-
-        for (gene_id, transcripts) in overlapping_transcript_groups.items():
-            effects = []
-            for transcript in transcripts:
-                try:
-                    effects.append(infer_transcript_effect(variant, transcript))
-                except (AssertionError, ValueError) as error:
-                    if raise_on_error:
-                        raise
-                    else:
-                        logging.warn(
-                            "Encountered error annotating %s for %s: %s",
-                            variant,
-                            transcript,
-                            error)
-                    errors[transcript] = error
-            gene_transcript_effects_groups[gene_id] = effects
-
-        # construct Gene objects for all the genes containing
-        # all the transcripts this variant overlaps
-        overlapping_genes = [
-            self.ensembl.gene_by_id(gene_id)
-            for gene_id
-            in overlapping_transcript_groups.keys()
-        ]
-
-        return VariantEffect(
-            variant=variant,
-            genes=overlapping_genes,
-            gene_transcript_effects=gene_transcript_effects_groups,
-            errors=errors)
diff --git a/varcode/variant_collection.py b/varcode/variant_collection.py
index 122482a..537f057 100644
--- a/varcode/variant_collection.py
+++ b/varcode/variant_collection.py
@@ -13,23 +13,15 @@
 # limitations under the License.
 
 from __future__ import print_function, division, absolute_import
+from collections import Counter
+
+from .effects import Substitution
+from .effect_ordering import effect_priority, transcript_effect_priority_dict
 
-from .effect_ordering import effect_priority
-from .reference_name import (
-    infer_reference_name,
-    ensembl_release_number_for_reference_name
-)
-from .variant_annotator import VariantAnnotator
 
 class VariantCollection(object):
 
-    def __init__(
-            self,
-            variants,
-            reference_path=None,
-            reference_name=None,
-            ensembl_release=None,
-            original_filename=None):
+    def __init__(self, variants, original_filename=None):
         """
         Construct a VariantCollection from a list of Variant records and
         the name of a reference genome.
@@ -43,39 +35,8 @@ def __init__(
         original_filename : str, optional
             File from which we loaded variants, though the current
             VariantCollection may only contain a subset of them.
-
-        reference_path : str, optional
-            Path to reference FASTA file.
-
-        reference_name : str, optional
-            Name of reference genome (e.g. "GRCh37", "hg18"). If not given
-            infer from reference path or from ensembl_release.
-
-        ensembl_release : int, optional
-            If not specified, infer Ensembl release from reference_name
         """
         self.variants = variants
-        self.reference_path = reference_path
-        if reference_name:
-            # convert from e.g. "hg19" to "GRCh37"
-            #
-            # TODO: actually handle the differences between these references
-            # instead of just treating them as interchangeable
-            self.reference_name = infer_reference_name(reference_name)
-        else:
-            if reference_path:
-                self.reference_name = infer_reference_name(reference_path)
-            else:
-                raise ValueError(
-                    "Must specify one of reference_path or reference_name")
-
-        if ensembl_release:
-            self.ensembl_release = ensembl_release
-        else:
-            self.ensembl_release = ensembl_release_number_for_reference_name(
-                self.reference_name)
-
-        self.annot = VariantAnnotator(ensembl_release=self.ensembl_release)
         self.original_filename = original_filename
 
     def __len__(self):
@@ -87,14 +48,13 @@ def __iter__(self):
     def __eq__(self, other):
         return (
             isinstance(other, VariantCollection) and
-            self.reference_name == other.reference_name and
             len(self.variants) == len(other.variants) and
             all(v1 == v2 for (v1, v2) in zip(self.variants, other.variants)))
 
     def __str__(self):
         fields = [
             ("n_variants", len(self.variants)),
-            ("reference", self.reference_name)
+            ("reference", ", ".join(self.reference_names()))
         ]
 
         if self.original_filename:
@@ -110,83 +70,120 @@ def __str__(self):
     def __repr__(self):
         return str(self)
 
-    def clone(self, new_variants=None):
+    def reference_names(self):
+        """
+        All unique reference names associated with variants in this collection.
+        """
+        return {variant.reference_name for variant in self.variants}
+
+    def _clone_metadata(self, new_variants):
         """
         Create copy of VariantCollection with same metadata but possibly
-        different Variant entries. If no variants provided, then just make a
-        copy of self.variants.
+        different Variant entries.
         """
-        if new_variants is None:
-            new_variants = self.variants
         return VariantCollection(
-            variants=list(new_variants),
-            original_filename=self.original_filename,
-            reference_path=self.reference_path,
-            reference_name=self.reference_name,
-            ensembl_release=self.ensembl_release)
+            variants=new_variants,
+            original_filename=self.original_filename)
 
     def drop_duplicates(self):
         """
         Create a new VariantCollection without any duplicate variants.
         """
-        return self.clone(set(self.variants))
+        return self._clone_metadata(set(self.variants))
 
-    def variant_effects(self, raise_on_error=True):
+    def variant_effects(
+            self,
+            high_impact=False,
+            only_coding_transcripts=False,
+            raise_on_error=True):
         """
-        Determine the impact of each variant, return a list of
-        VariantEffect objects.
+        Returns a list containing one VariantEffectCollection object for each
+        variant in this VariantCollection.
 
         Parameters
         ----------
+        high_impact : bool, optional
+            Only show effects which make changes to coding transcripts that
+            are predicted to be deleterious (amino acid sequence changes,
+            changes to essential splice sites, frameshifts, &c)
 
-        raise_on_error : bool, optional
-            Raise exception if error is encountered while annotating
-            transcripts, otherwise track errors in VariantEffect.errors
-            dictionary (default=True).
+        only_coding_transcripts : bool, optional
+            Only annotate variant effects on coding transcripts.
 
+        raise_on_error : bool, optional
+            If exception is raised while determining effect of variant on a
+            transcript, should it be raised? This default is True, meaning
+            errors result in raised exceptions. If raise_on_error=False then
+            exceptions are logged in VariantEffectCollection.errors.
         """
-        return [
-            self.annot.effect(
-                variant=variant,
+        results = []
+
+        if high_impact:
+            min_priority = transcript_effect_priority_dict[Substitution]
+        else:
+            min_priority = -1
+
+        for variant in self.variants:
+            variant_effect_collection = variant.effects(
+                only_coding_transcripts=only_coding_transcripts,
                 raise_on_error=raise_on_error)
-            for variant
-            in self.variants
-        ]
 
-    def effects_to_string(self, *args, **kwargs):
+            if only_coding_transcripts and len(variant_effect_collection) == 0:
+                # if we only want coding transcripts, then skip all
+                # intergenic and non-coding gene variants
+                continue
+
+            best_effect = variant_effect_collection.highest_priority_effect
+            # either this variant is intergenic and there's no minimum
+            # threshold for effect priority or the highest impact effect
+            # is higher priority than the min_priority
+            if ((best_effect is None and min_priority < 0) or
+                    (effect_priority(best_effect) >= min_priority)):
+                results.append(variant_effect_collection)
+        return results
+
+    def effect_summary(self, *args, **kwargs):
         """
         Create a long string with all transcript effects for each mutation,
         grouped by gene (if a mutation affects multiple genes).
 
-        Arguments are passed on to variant_effects(*args, **kwargs).
+        Arguments are passed on to self.variant_effects(*args, **kwargs).
         """
         lines = []
-        for variant_effect in self.variant_effects(*args, **kwargs):
-            transcript_effect_count = 0
-            lines.append("\n%s" % variant_effect.variant)
-            transcript_effect_lists = variant_effect.gene_transcript_effects
-            for gene, transcript_effects in transcript_effect_lists.iteritems():
-                gene_name = self.annot.ensembl.gene_name_of_gene_id(gene)
-                lines.append("  Gene: %s (%s)" % (gene_name, gene))
-                # print transcript effects with more significant impact
+        for effect_collection in self.variant_effects(*args, **kwargs):
+            variant = effect_collection.variant
+            lines.append("\n%s" % variant)
+            transcript_effect_lists = effect_collection.gene_effect_groups
+            for gene_id, effects in transcript_effect_lists.iteritems():
+                gene_name = variant.ensembl.gene_name_of_gene_id(gene_id)
+                lines.append("  Gene: %s (%s)" % (gene_name, gene_id))
+                # place transcript effects with more significant impact
                 # on top (e.g. FrameShift should go before NoncodingTranscript)
-                for transcript_effect in sorted(
-                        transcript_effects,
+                for effect in sorted(
+                        effects,
                         key=effect_priority,
                         reverse=True):
-                    transcript_effect_count += 1
-                    lines.append("  -- %s" % transcript_effect)
+                    lines.append("  -- %s" % effect)
             # if we only printed one effect for this gene then
             # it's redundant to print it again as the highest priority effect
-            if transcript_effect_count > 1:
-                best = variant_effect.highest_priority_effect
+            if len(effect_collection) > 1:
+                best = effect_collection.highest_priority_effect
                 lines.append("  Highest Priority Effect: %s" % best)
         return "\n".join(lines)
 
-    def print_effects(self, *args, **kwargs):
+    def reference_names(self):
+        """
+        All distinct reference names used by Variants in this
+        collection.
         """
-        Print all variants and their transcript effects (grouped by gene).
+        return { variant.reference_name for variant in self.variants }
 
-        Arguments are passed on to effects_to_string(*args, **kwargs).
+    def gene_counts(self, only_coding=False):
         """
-        print(self.effects_to_string(*args, **kwargs))
+        Count how many variants overlap each gene name.
+        """
+        counter = Counter()
+        for variant in self.variants:
+            for gene_name in variant.gene_names():
+                counter[gene_name] += 1
+        return counter
diff --git a/varcode/variant_effect.py b/varcode/variant_effect.py
deleted file mode 100644
index c10f563..0000000
--- a/varcode/variant_effect.py
+++ /dev/null
@@ -1,81 +0,0 @@
-from __future__ import print_function, division, absolute_import
-
-from .effect_ordering import top_priority_transcript_effect
-
-from pyensembl.biotypes import is_coding_biotype
-
-class VariantEffect(object):
-    """
-    A VariantEffect object is a container for all the TranscriptEffects of
-    a particular mutation, as well as some properties that attempt to
-    summarize the impact of the mutation across all genes/transcripts.
-    """
-
-    def __init__(
-            self,
-            variant,
-            genes,
-            gene_transcript_effects,
-            errors={}):
-        """
-        variant : Variant
-
-        genes : list
-            List of Gene objects
-
-        gene_transcript_effects : dict
-            Dictionary from gene ID to list of transcript variant effects
-
-        errors : dict, optional
-            Mapping from transcript objects to strings explaining error which
-            was encountered while trying to annotate them.
-        """
-        self.variant = variant
-        self.genes = genes
-        self.gene_transcript_effects = gene_transcript_effects
-
-        # dictionary mapping from transcript IDs to transcript mutation effects
-        self.transcript_effects = {}
-        for (_, transcript_effects) in self.gene_transcript_effects.items():
-            for effect in transcript_effects:
-                self.transcript_effects[effect.transcript.id] = effect
-
-        # if our variant overlaps any genes, then choose the highest
-        # priority transcript variant, otherwise call the variant "Intergenic"
-        if len(self.transcript_effects) > 0:
-            self.highest_priority_effect = top_priority_transcript_effect(
-                self.transcript_effects.values())
-            highest_priority_class = self.highest_priority_effect.__class__
-            self.variant_summary = highest_priority_class.__name__
-        else:
-            self.highest_priority_effect = None
-            self.variant_summary = "Intergenic"
-
-        self.errors = errors
-
-    @property
-    def coding_genes(self):
-        """
-        The `genes` property includes all sorts of esoteric entities
-        like anti-sense annotations and pseudogenes.
-
-        This property gives you just protein coding genes and functional RNAs
-        """
-        return [
-            gene for gene in self.genes
-            if is_coding_biotype(gene.biotype)
-        ]
-
-    def __str__(self):
-        fields = [
-            ("variant", self.variant.short_description()),
-            ("genes", [gene.name for gene in self.genes]),
-            ("transcript_effects", self.transcript_effects)
-        ]
-        if self.errors:
-            fields.append( ("errors", self.errors) )
-        return "VariantEffect(%s)" % (
-            ", ".join(["%s=%s" % (k,v) for (k,v) in fields]))
-
-    def __repr__(self):
-        return str(self)
diff --git a/varcode/variant_effect_collection.py b/varcode/variant_effect_collection.py
new file mode 100644
index 0000000..5516b28
--- /dev/null
+++ b/varcode/variant_effect_collection.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2014. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division, absolute_import
+
+from .effect_ordering import top_priority_transcript_effect
+
+from pyensembl.biotypes import is_coding_biotype
+
+class VariantEffectCollection(object):
+    """
+    Collection of all the TranscriptEffect objects associated with
+    each transcript overlapping a given variant, as well as some properties
+    that attempt to summarize the impact of the mutation across all
+    genes/transcripts.
+    """
+    def __init__(
+            self,
+            variant,
+            gene_effect_groups,
+            errors={}):
+        """
+        variant : Variant
+
+        gene_effect_groups : dict
+            Dictionary from gene ID to list of transcript variant effects
+
+        errors : dict, optional
+            Mapping from transcript objects to strings explaining error which
+            was encountered while trying to annotate them.
+        """
+        self.variant = variant
+        self.gene_effect_groups = gene_effect_groups
+
+        # dictionary mapping from transcript IDs to transcript mutation effects
+        self.transcript_effect_dict = {}
+        for (_, effect_list) in self.gene_effect_groups.items():
+            for effect in effect_list:
+                self.transcript_effect_dict[effect.transcript.id] = effect
+
+        # if our variant overlaps any genes, then choose the highest
+        # priority transcript variant, otherwise call the variant "Intergenic"
+        if len(self.transcript_effect_dict) > 0:
+            self.highest_priority_effect = top_priority_transcript_effect(
+                self.transcript_effect_dict.values())
+            highest_priority_class = self.highest_priority_effect.__class__
+            self.variant_summary = highest_priority_class.__name__
+        else:
+            self.highest_priority_effect = None
+            self.variant_summary = "Intergenic"
+
+        self.errors = errors
+
+    def __str__(self):
+        fields = [
+            ("variant", self.variant.short_description()),
+            ("genes", self.gene_names()),
+            ("effects", self.effects())
+        ]
+        if self.errors:
+            fields.append( ("errors", self.errors) )
+        return "VariantEffectCollection(%s)" % (
+            ", ".join(["%s=%s" % (k,v) for (k,v) in fields]))
+
+    def __repr__(self):
+        return str(self)
+
+    def __len__(self):
+        """
+        Length of a VariantEffectCollection is the number of effect objects
+        it contains.
+        """
+        return len(self.transcript_effect_dict)
+
+    def effects(self):
+        """
+        Returns all TranscriptMutationEffect objects contained in this
+        collection.
+        """
+        return self.transcript_effect_dict.values()
+
+    def transcripts(self):
+        """
+        Returns list of Transcript objects for all transcripts which are
+        annotated with effects in this collection.
+        """
+        return [effect.transcript for effect in self.effects()]
+
+    def genes(self):
+        """
+        Construct Gene objects for all the genes which contain the
+        transcripts in this effect collection.
+        """
+        return [
+            self.variant.ensembl.gene_by_id(gene_id)
+            for gene_id
+            in self.gene_effect_groups
+        ]
+
+    def gene_names(self):
+        """
+        Fetch names of all the genes which contain the
+        transcripts in this effect collection. This is significantly
+        cheaper than constructing a Gene object and fetching its name.
+        """
+        return [
+            self.variant.ensembl.gene_name_of_gene_id(gene_id)
+            for gene_id
+            in self.gene_effect_groups
+        ]
diff --git a/varcode/vcf.py b/varcode/vcf.py
index 282278d..19e5366 100644
--- a/varcode/vcf.py
+++ b/varcode/vcf.py
@@ -15,13 +15,22 @@
 # required so that 'import vcf' gets the global PyVCF package,
 # rather than our local vcf module
 from __future__ import absolute_import
+
+from .reference_name import (
+    infer_reference_name,
+    ensembl_release_number_for_reference_name
+)
 from .variant import Variant
 from .variant_collection import VariantCollection
 
+from pyensembl import EnsemblRelease
 # PyVCF
 import vcf
 
-def load_vcf(filename, reference_path_field='reference'):
+def load_vcf(
+        filename,
+        reference_path_field='reference',
+        ensembl_release=None):
     """
     Load reference name and Variant objects from the given VCF filename.
     Drop any entries whose FILTER field is not one of "." or "PASS".
@@ -34,19 +43,39 @@ def load_vcf(filename, reference_path_field='reference'):
     reference_path_field : str, optional
         Name of metadata field which contains path to reference FASTA
         file (default = 'reference')
+
+    ensembl_release : int, optional
+        Which release of Ensembl to use for annotation, by default inferred
+        from the reference path.
     """
+
+    if not isinstance(filename, str):
+        raise ValueError(
+            "Expected filename to be str, got %s : %s" % (
+                filename, type(filename)))
+
     vcf_reader = vcf.Reader(filename=filename)
-    reference_name = vcf_reader.metadata[reference_path_field]
+
+    if not ensembl_release:
+        reference_path = vcf_reader.metadata[reference_path_field]
+        reference_name = infer_reference_name(reference_path)
+        ensembl_release = ensembl_release_number_for_reference_name(
+                reference_name)
+
+    ensembl = EnsemblRelease(release=ensembl_release)
 
     variants = [
         Variant(
-            x.CHROM, x.POS,
-            x.REF, x.ALT[0].sequence,
-            x.INFO)
+            contig=x.CHROM,
+            pos=x.POS,
+            ref=x.REF,
+            alt=x.ALT[0].sequence,
+            info=x.INFO,
+            ensembl=ensembl)
         for x in vcf_reader
         if not x.FILTER or x.FILTER == "PASS"
     ]
+
     return VariantCollection(
         variants=variants,
-        original_filename=filename,
-        reference_path=reference_name)
+        original_filename=filename)
\ No newline at end of file