<a href="https://colab.research.google.com/github/miqrom29/WGSExtract-Dev-Experimental/blob/master/coordsExtractorB37.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
import os

# 1. Upload VCF file
print("Select your VCF file (.vcf or .vcf.gz):")
uploaded = files.upload()

# `uploaded` maps file names to contents. Fail with a clear message instead of
# an IndexError when nothing was uploaded (e.g. the dialog was dismissed).
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and select a VCF file.")

# Only one VCF is processed downstream; make that explicit rather than
# silently ignoring any additional uploads.
if len(uploaded) > 1:
    print(f"Warning: {len(uploaded)} files uploaded; only the first one will be processed.")

input_file_name = next(iter(uploaded))

# 2. User configuration
INPUT_VCF = input_file_name
OUTPUT_TSV = 'csvs_coords.tsv'

# Example: ['1', '2', 'X'] to keep only some chromosomes, or None for all
CHROMS_FILTER = None

# Example: 'stop_gained' to keep only variants with that consequence in ANN/CSQ, or None for all
CONSEQUENCE_KEYWORD = None

print(f"\nUploaded file: {INPUT_VCF}")


Select your VCF file (.vcf or .vcf.gz):


Saving trio4viewGT1_1.vcf to trio4viewGT1_1.vcf

Uploaded file: trio4viewGT1_1.vcf


In [3]:
import gzip
from pathlib import Path

def open_maybe_gzip(path):
    """Open a VCF for reading as text, transparently handling gzip/bgzip input.

    Compression is detected from the first two bytes of the file (the gzip
    magic number ``1f 8b``) instead of from the file name. A compressed file
    that lacks a ``.gz`` suffix is therefore still decompressed, and an
    uncompressed file that happens to be named ``*.gz`` is still readable.

    Args:
        path: Location of the file, as ``str`` or ``pathlib.Path``.

    Returns:
        A text-mode file object decoded as UTF-8; usable as a context manager.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    path = str(path)

    # Peek at the header in binary mode; the probe handle is closed before the
    # real handle is opened so no descriptor is leaked.
    with open(path, 'rb') as probe:
        is_gzip = probe.read(2) == b'\x1f\x8b'

    # Explicit encoding so the result does not depend on the locale default.
    if is_gzip:
        return gzip.open(path, 'rt', encoding='utf-8')
    return open(path, 'r', encoding='utf-8')

def _strip_chr_prefix(name):
    """Return *name* as a string without a leading 'chr' (e.g. 'chr1' -> '1')."""
    name = str(name)
    return name[3:] if name.startswith('chr') else name


def variant_passes_filters(columns, chroms_filter=None, consequence_keyword=None):
    """
    Decide whether one VCF data line passes the optional filters.

    Filters applied:
      - optional chromosome filter (compared without a leading 'chr')
      - optional keyword searched as a substring of the INFO column

    Args:
        columns: Tab-split fields of the line; columns[0] = CHROM,
            columns[1] = POS, columns[7] = INFO (when present).
        chroms_filter: None to accept every chromosome, otherwise an iterable
            of chromosome names. Entries may be given with or without a 'chr'
            prefix and may be ints (e.g. [1, 2, 'X']). A single string is
            treated as one chromosome name, not as a sequence of characters.
        consequence_keyword: None to skip the check, otherwise a string that
            must occur somewhere in the INFO column.

    Returns:
        True if the line passes every active filter, False otherwise.
    """
    if chroms_filter is not None:
        # A bare string would otherwise be matched by substring
        # (e.g. '1' in '12'), silently letting the wrong chromosome through.
        if isinstance(chroms_filter, str):
            chroms_filter = [chroms_filter]
        # Normalise both sides the same way so 'chr1', '1' and 1 all agree.
        allowed = {_strip_chr_prefix(entry) for entry in chroms_filter}
        if _strip_chr_prefix(columns[0]) not in allowed:
            return False

    if consequence_keyword is None:
        return True

    # Lines with fewer than 8 columns have no INFO field to search.
    info = columns[7] if len(columns) > 7 else ''
    return consequence_keyword in info


In [4]:
# Collected regions, one "CHROM:POS-POS" string per accepted variant line.
coords = []
vcf_path = Path(INPUT_VCF)

if not vcf_path.exists():
    raise FileNotFoundError(f'VCF file not found: {vcf_path}')

print("Parsing VCF...")
with open_maybe_gzip(vcf_path) as vcf_handle:
    for record in vcf_handle:
        # Meta-information and header lines all begin with '#'.
        if record.startswith('#'):
            continue

        fields = record.rstrip().split('\t')

        # Need at least CHROM and POS; blank or malformed lines are skipped.
        has_position = len(fields) >= 2
        if has_position and variant_passes_filters(fields, CHROMS_FILTER, CONSEQUENCE_KEYWORD):
            # CSVS / Ensembl-style single-base region
            coords.append(f'{fields[0]}:{fields[1]}-{fields[1]}')

print(f'✓ Extracted {len(coords)} coordinates.')

# Write a one-column TSV: header row followed by one region per line.
with open(OUTPUT_TSV, 'w') as tsv_handle:
    tsv_handle.writelines(f'{row}\n' for row in ['coord', *coords])

print(f'✓ Saved to: {OUTPUT_TSV}')


Parsing VCF...
✓ Extracted 15777 coordinates.
✓ Saved to: csvs_coords.tsv


In [5]:
from google.colab import files
import pandas as pd

# Guard first: with nothing extracted there is nothing to preview or download.
if not coords:
    print("No variants matched the selected filters.")
else:
    print("Preview of the first 10 coordinates:")
    # One region per output line, at most ten of them.
    print('\n'.join(coords[:10]))

    # Hand the TSV written earlier to the browser as a download.
    files.download(OUTPUT_TSV)


Preview of the first 10 coordinates:
1:861808-861808
1:998395-998395
1:1097335-1097335
1:1258246-1258246
1:1269554-1269554
1:1664124-1664124
1:1684472-1684472
1:1706136-1706136
1:1865298-1865298
1:1946591-1946591


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>