In [13]:
import pysam as ps
import pymongo

In [14]:
client = pymongo.MongoClient("localhost", 27017)
db = client.hospital
mongo_variants = db.variants
mongo_variants.drop()

In [15]:
file = "/Users/mattsdwatson/Documents/KU Leuven/Courses/Semester 2" \
       " 2020/Management of Large-Scale Omics Data/Practical/Practical_3/chr1.vcf.gz"
data = ps.VariantFile(file)

In [16]:
def filter_variants(vcf):
    list = []
    for i, record in enumerate(vcf):
        for sample_id, val in record.samples.items():
            genotype = val["GT"]
            if genotype != (0, 0) and sample_id is not None:
                list.append({sample_id: genotype, "chr": record.chrom, "pos": record.pos, 
                             "ref": record.ref, "alt": record.alts})
    return list

In [17]:
def into_mongodb(collection, file, num_records=1000000):
    data = ps.VariantFile(file)
    filtered = filter_variants(data)[:num_records]
    collection.insert_many(filtered)
    

In [18]:
into_mongodb(mongo_variants, file)

In [19]:
print(mongo_variants.count_documents({}))

1000000


In [20]:
print(mongo_variants.find_one({"pos": {"$gt": 10_000_000}}))

{'_id': ObjectId('5ebc6afb7fc0758414ad94c1'), 'HG00096': [1, 0], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}


In [21]:
def get_variants(collection, chrom, position, genotype=None):
    """Implement a function that returns all info on variants at `chrom` and `pos`.
    Args:
        collection    MongoDB collection
        chrom         chromosome
        pos           position
        genotype      pair like (1, 0) or (1, 1). If None all are returned
    """
    if genotype is not None:
        genotypes = [genotype, genotype[::-1]]
        for x in collection.find({"$and": [{"pos": {'$eq': position}}, {"chr": {'$eq': chrom}}, {"genotype": {'$in': genotypes}}]}):
            print(x)
    else:
        for x in collection.find({"$and": [{"pos": {'$eq': position}}, {"chr": {'$eq': chrom}}]}):
            print(x)

In [23]:
get_variants(mongo_variants, chrom="1", position=10000400)

{'_id': ObjectId('5ebc6afb7fc0758414ad94c1'), 'HG00096': [1, 0], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c2'), 'HG00099': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c3'), 'HG00100': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c4'), 'HG00101': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c5'), 'HG00102': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c6'), 'HG00103': [0, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c7'), 'HG00105': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c8'), 'HG00106': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94

In [24]:
# this query does not return any records at the position, as expected
get_variants(mongo_variants, chrom="1", position=10000300)

In [25]:
def get_variants_in_range(collection, chrom, start, end):
    """Returns list of variant informations at `chrom` and from position `start` to `end`.
    Args:
        collection    MongoDB collection
        chrom         chromosome
        start         starting position (inclusive)
        end           end position (inclusive)
    """
    for x in collection.find({"$and": [{"pos": {"$gte": start, "$lte": end}}, {"chr": {'$eq': chrom}}]}):
        print(x)

In [26]:
get_variants_in_range(mongo_variants, chrom='1', start=10_000_000, end=10_003_000)

{'_id': ObjectId('5ebc6afb7fc0758414ad94c1'), 'HG00096': [1, 0], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c2'), 'HG00099': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c3'), 'HG00100': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c4'), 'HG00101': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c5'), 'HG00102': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c6'), 'HG00103': [0, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c7'), 'HG00105': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c8'), 'HG00106': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94

In [27]:
db.variants.create_index([("chr", pymongo.ASCENDING)])
db.variants.create_index([("pos", pymongo.ASCENDING)])
print(db.variants.index_information())

{'_id_': {'v': 2, 'key': [('_id', 1)], 'ns': 'hospital.variants'}, 'chr_1': {'v': 2, 'key': [('chr', 1)], 'ns': 'hospital.variants'}, 'pos_1': {'v': 2, 'key': [('pos', 1)], 'ns': 'hospital.variants'}}


In [28]:
%time var10m = get_variants_in_range(mongo_variants, chrom='1', start=10_000_000, end=10_003_000)

{'_id': ObjectId('5ebc6afb7fc0758414ad94c1'), 'HG00096': [1, 0], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c2'), 'HG00099': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c3'), 'HG00100': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c4'), 'HG00101': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c5'), 'HG00102': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c6'), 'HG00103': [0, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c7'), 'HG00105': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94c8'), 'HG00106': [1, 1], 'chr': '1', 'pos': 10000400, 'ref': 'T', 'alt': ['A']}
{'_id': ObjectId('5ebc6afb7fc0758414ad94