<a href="https://colab.research.google.com/github/mlcsmits/Alternative-Splicing-MBI03/blob/main/REAL_python_code_for_pathogenicity_determination_INDELPHI_FAST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Run this cell by pressing the play button.

In [None]:
# @title
# One-time environment setup: only needs to run the first time in a fresh runtime.
# Install the Python packages for this notebook into the runtime.
!pip install beautifulsoup4 requests aiohttp nest_asyncio lxml selenium webdriver-manager biopython

# Refresh the apt package index, then install the Chromium browser package.
!apt-get update
!apt install -y chromium-browser
# NOTE(review): selenium and webdriver-manager are already listed in the first
# pip command above, so these two quiet installs look redundant.
!pip install -q selenium
!pip install -q webdriver-manager

Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py

Don't run these three cells

In [None]:
# @title
import csv
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import nest_asyncio
import requests
import time
import os
filename = input("insert file name:")

# Drop the temp file left over from a previous run, if there is one.
_stale_temp_csv = '/content/temp.csv'
try:
    os.remove(_stale_temp_csv)
except FileNotFoundError:
    print(f"File '{_stale_temp_csv}' not found.")

# Fix for nested event loops (necessary in Jupyter)
nest_asyncio.apply()

# Debugging function for easier printing
def debug(msg):
    """Print ``msg`` on stdout, prefixed with the ``[DEBUG]`` tag."""
    print("[DEBUG] {}".format(msg))
# Start from a fresh URL list: delete the old file, then write the header line.
_input_urls_csv = '/content/input_urls.csv'
try:
    os.remove(_input_urls_csv)
except FileNotFoundError:
    print(f"File '{_input_urls_csv}' not found.")
with open(_input_urls_csv, 'a') as bla:
    bla.write('Link for Clinical Significance\n')

# Function to read SNP locations from a CSV file
def read_snp_locations(file_path):
    """Return the stripped last-column value of every data row in a CSV file.

    The first row is treated as a header and skipped. Rows that are empty, or
    whose last cell is empty or whitespace-only, are ignored so that no empty
    location string is ever returned.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of non-empty location strings, in file order. An empty file
        yields an empty list.
    """
    snp_locations = []
    # newline='' lets the csv module handle line endings itself.
    with open(file_path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file
        for row in reader:
            if not row:
                continue
            location = row[-1].strip()
            if location:  # Check after stripping so "   " is not kept as ""
                snp_locations.append(location)
    return snp_locations

# Function to write URLs to a CSV file
def write_urls_to_csv(file_path, urls):
    """Append each URL in ``urls`` to ``file_path`` as its own single-column CSV row."""
    with open(file_path, 'a', newline='') as out_file:
        csv.writer(out_file).writerows([url] for url in urls)

# Read SNP locations and write them to the temp CSV
snp_locations = read_snp_locations(f'/content/{filename}.csv')
with open('/content/temp.csv', 'w', newline='') as o:
    writer = csv.writer(o)
    writer.writerow(['location of SNP'])
    writer.writerows([loc] for loc in snp_locations)

# Generate one ClinVar search URL per SNP location. Each URL searches a window of
# 3 positions on either side of the parsed position. The URL list file is opened
# once for the whole loop instead of once per row.
rules = []
with open('/content/temp.csv', 'r', newline='') as t, \
        open('/content/input_urls.csv', 'a') as bla:
    reader = csv.reader(t)
    next(reader, None)  # Skip header
    for rule in reader:
        if not rule:
            continue
        # Expected form is "chr<N>:<position>" with an optional "+"/"-" marker.
        parts = rule[0].replace('chr', '').split(':')
        chrom = parts[0]  # not named `chr`, which would shadow the builtin
        chrpos = parts[1].strip() if len(parts) > 1 else ''
        # Unconditional: str.replace is a no-op when the marker is absent.
        chrpos = chrpos.replace("+", "").replace("-", "")
        try:
            chrpos = int(chrpos)
        except ValueError:
            # Skip malformed rows instead of aborting the whole cell.
            print(f"Invalid chromosome position: {chrpos!r} for SNP: {rule[0]}")
            continue
        chrpos1 = chrpos - 3
        chrpos2 = chrpos + 3
        url = f'https://www.ncbi.nlm.nih.gov/clinvar/?term={chrom}%5BChr%5D+AND+{chrpos1}%3A{chrpos2}%5BChrPos%5D'
        bla.write(f'{url}\n')
        rules.append(url)

# Asynchronous function to fetch a single URL
async def fetch_url(session, url):
    """Fetch ``url`` and return the ClinVar rs-links found on the page.

    Every anchor whose ``href`` starts with ``/clinvar/rs`` is turned into an
    absolute URL with a ``#clinical_significance`` fragment. Any exception is
    reported through ``debug`` and results in an empty list.
    """
    try:
        async with session.get(url) as response:
            response.raise_for_status()
            page = BeautifulSoup(await response.text(), "html.parser")

            product_links = []
            for anchor in page.find_all("a", href=True):
                if anchor['href'].startswith('/clinvar/rs'):
                    product_links.append(
                        f'https://www.ncbi.nlm.nih.gov{anchor["href"]}#clinical_significance'
                    )
            return product_links
    except Exception as e:
        debug(f"Request failed for {url}: {e}")
        return []

# Asynchronous function to fetch all URLs
async def fetch_all_urls(urls):
    """Fetch all ``urls`` concurrently in one session and return the flattened link list."""
    async with aiohttp.ClientSession() as session:
        per_page_links = await asyncio.gather(*(fetch_url(session, url) for url in urls))
        flattened = []
        for links in per_page_links:
            flattened.extend(links)
        return flattened

# Main function for fetching ClinVar URLs
async def main():
    """Collect the rs-links for every search URL in ``rules`` and append them to the URL list file."""
    write_urls_to_csv('/content/input_urls.csv', await fetch_all_urls(rules))

# Run the main coroutine
await main()

# Function to fetch pathogenic data synchronously
def fetch_url_sync(url, retries=3, delay=2, timeout=10):
    """Fetch a page and collect the bold spans that mention a clinical term.

    Args:
        url: Page to download.
        retries: Maximum number of download attempts.
        delay: Seconds to wait after the first failed attempt; doubled after
            every further failure.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot hang the cell.

    Returns:
        A list of ``"<url>:,<span text>"`` strings, one per matching span
        (empty if nothing matched or every attempt failed).
    """
    clinical_terms = ["Pathogenic", "Benign", "Conflicting Interpretations Of Pathogenicity", "Likely benign", "Likely pathogenic", "Uncertain significance"]
    debug(f"Processing URL: {url}")
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            pathogenic_data = []
            # Find all spans with class "bold"
            for span in soup.find_all("span", class_="bold"):
                inner_text = span.get_text().strip()
                # any() records a span once even when it contains several terms
                # (e.g. "Pathogenic/Likely pathogenic"), avoiding duplicate rows.
                if any(term in inner_text for term in clinical_terms):
                    pathogenic_data.append(f"{url}:,{inner_text}")

            return pathogenic_data
        except requests.RequestException as e:
            debug(f"Request failed for {url}: {e}")
            if attempt < retries - 1:  # no point waiting after the final attempt
                time.sleep(delay)
                delay *= 2  # Exponential backoff
    return []
# Remove the result file of an earlier run so that appended rows start from a clean file.
_previous_output_csv = f'/content/output_{filename}.csv'
try:
    os.remove(_previous_output_csv)
except FileNotFoundError:
    print(f"File '{_previous_output_csv}' not found.")
# Function to fetch all URLs synchronously
def fetch_all_urls_sync(urls):
    """Call ``fetch_url_sync`` for each URL in order and return all records as one flat list."""
    return [record for url in urls for record in fetch_url_sync(url)]

# Main function to process pathogenic data
def main_sync():
    """Read the URL list, fetch every page, and write the findings to the output CSV.

    The first line of the URL list is skipped as a header and blank lines are
    ignored. The output file receives a header line followed by one line per
    record.
    """
    with open('/content/input_urls.csv', 'r') as inp:
        stripped_lines = (raw.strip() for raw in list(inp)[1:])
        urls = [candidate for candidate in stripped_lines if candidate]  # Skip empty lines
    pathogenic_data = fetch_all_urls_sync(urls)

    with open(f'/content/output_{filename}.csv', 'a', newline='') as out_file:
        out_file.write('URL:,Clinical significance\n')
        out_file.writelines(f'{data}\n' for data in pathogenic_data)

# Run the synchronous main function
main_sync()


In [None]:
# @title
###### WORKS WITH RANGE

import csv
import requests
from bs4 import BeautifulSoup
import time
import os

filename = input("insert input file name:")
deviation = input("insert deviation from cut site:")


def debug(msg):
    """Print ``msg`` on stdout, prefixed with the ``[DEBUG]`` tag."""
    print("[DEBUG] {}".format(msg))


# Remove the result file of an earlier run, if present.
_previous_output_csv = f'/content/output_{filename}.csv'
try:
    os.remove(_previous_output_csv)
except FileNotFoundError:
    print(f"File '{_previous_output_csv}' not found.")

# Function to read SNP locations from a CSV file (assuming gene and locus data are included in this file)
def read_snp_locations(file_path):
    """Read ``(gene, labelled locus, SNP location)`` triples from a CSV file.

    The first row is treated as a header and skipped. For every later row with
    at least two cells, the gene is taken from the second-to-last cell and the
    SNP location from the last cell. The labelled locus is the SNP location
    followed by `` ±<deviation>``, where ``deviation`` is the module-level value
    entered by the user.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of ``(gene, locus, snp_location)`` tuples; empty for an empty file.
    """
    snp_data = []
    with open(file_path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file
        for row in reader:
            if len(row) >= 2:  # gene = second-to-last cell, SNP location = last cell
                gene = row[-2].strip()
                snp_location = row[-1].strip()
                locus = f'{snp_location} ±{deviation}'
                snp_data.append((gene, locus, snp_location))
    return snp_data

# Function to write final output to a CSV file in the format 'gene,locus,url,result'
def write_output_to_csv(file_path, data):
    """Append result rows to a CSV file, writing the header only into an empty file.

    Args:
        file_path: Destination CSV path; created if it does not exist.
        data: Iterable of ``(gene, locus, url, result)`` rows.
    """
    with open(file_path, 'a', newline='') as out_file:
        writer = csv.writer(out_file)
        # The file is opened in append mode, so an unconditional header would be
        # repeated on every call; only emit it when nothing has been written yet.
        if os.path.getsize(file_path) == 0:
            writer.writerow(['gene', 'locus', 'url', 'result'])
        writer.writerows(data)

# Generate URLs for ClinVar based on SNP data
def generate_clinvar_urls(snp_data):
    """Build one ClinVar search URL per SNP entry.

    Each SNP string is expected to look like ``chr<N>:<position>`` with an
    optional ``+``/``-`` marker, which is stripped. The URL searches the range
    ``position - deviation`` to ``position + deviation``, where ``deviation`` is
    the module-level value entered by the user. Entries whose position cannot
    be parsed as an integer are reported and skipped.

    Args:
        snp_data: Iterable of ``(gene, locus, snp)`` tuples.

    Returns:
        A list of ``(gene, locus, url)`` tuples.
    """
    urls = []
    for gene, locus, snp in snp_data:
        parts = snp.replace('chr', '').split(':')
        chrom = parts[0]  # not named `chr`, which would shadow the builtin
        raw_pos = parts[1].strip() if len(parts) > 1 else ''
        # Unconditional: str.replace is a no-op when the marker is absent.
        raw_pos = raw_pos.replace("+", "").replace("-", "")
        try:
            chrpos = int(raw_pos)
        except ValueError:
            print(f"Invalid chromosome position: {raw_pos!r} for SNP: {snp}")
            continue
        chrpos1 = chrpos - int(deviation)
        chrpos2 = chrpos + int(deviation)
        url = f'https://www.ncbi.nlm.nih.gov/clinvar/?term={chrom}%5BChr%5D+AND+{chrpos1}%3A{chrpos2}%5BChrPos%5D'
        urls.append((gene, locus, url))  # Store gene and locus with URL
    return urls

# Function to fetch pathogenic data synchronously
def fetch_url_sync(gene, locus, url, retries=3, delay=2, timeout=10):
    """Fetch a ClinVar page and summarise the clinical terms found in bold spans.

    Args:
        gene: Gene label carried through to the result.
        locus: Locus label carried through to the result.
        url: Page to download.
        retries: Maximum number of download attempts.
        delay: Seconds to wait after the first failed attempt; doubled after
            every further failure.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot hang the cell.

    Returns:
        ``(gene, locus, url, result)`` where ``result`` joins the matching span
        texts with ``"; "``, or ``None`` when nothing matched or every attempt
        failed.
    """
    clinical_terms = ["Pathogenic", "Benign", "Conflicting Interpretations Of Pathogenicity", "Likely benign", "Likely pathogenic", "Uncertain significance"]
    debug(f"Processing URL: {url}")
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Keep each bold span once if it mentions at least one clinical term.
            pathogenic_data = []
            for span in soup.find_all("span", class_="bold"):
                inner_text = span.get_text().strip()
                if any(term in inner_text for term in clinical_terms):
                    pathogenic_data.append(inner_text)

            if pathogenic_data:
                return (gene, locus, url, "; ".join(pathogenic_data))
            return None  # Page loaded but contained no matching span

        except requests.RequestException as e:
            print(f"Request failed for {url}: {e}")
            if attempt < retries - 1:  # no point waiting after the final attempt
                time.sleep(delay)
                delay *= 2  # Exponential backoff
    return None

# Main function to process pathogenic data synchronously
def main_sync():
    """Run the pipeline: read SNPs, build the ClinVar URLs, fetch each one, and write the hits."""
    snp_data = read_snp_locations(f'/content/{filename}.csv')

    fetched = (
        fetch_url_sync(gene, locus, url)
        for gene, locus, url in generate_clinvar_urls(snp_data)
    )
    # Only keep results that have significant data
    output_data = [hit for hit in fetched if hit]

    write_output_to_csv(f'/content/output_{filename}.csv', output_data)

# Run the synchronous main function
main_sync()


In [None]:
# @title
import os
import csv
import requests
import time
import gzip  # Nieuwe import toegevoegd
from lxml import html
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

# Installeer de benodigde pakketten (indien nodig)
# Uncomment de volgende regels als je deze pakketten nog niet hebt geïnstalleerd
# !apt-get update
# !apt install -y chromium-browser
# !pip install -q selenium
# !pip install -q webdriver-manager

# Ask for the input file name and the deviation
filename = input("Insert input file name (zonder .csv): ")
deviation_input = input("Insert deviation from cut site: ")

# Convert the deviation to an integer and validate it
try:
    deviation = int(deviation_input)
    if deviation < 0:
        raise ValueError("Deviation must be a non-negative integer.")
except ValueError as ve:
    print(f"Invalid deviation input: {ve}")
    # exit() is the interactive-shell helper and is not meant for use in programs;
    # raising SystemExit directly stops the cell so the code below never runs
    # with a missing or stale `deviation`.
    raise SystemExit(1) from None

def debug(msg):
    """Print ``msg`` on stdout, prefixed with the ``[DEBUG]`` tag."""
    print("[DEBUG] " + format(msg))

# Define the paths of the output files
output_csv_path = f'output_{filename}.csv'
output_sequences_path = f'output_sequences_{filename}.csv'

# Remove output files left over from a previous run
for path in (output_csv_path, output_sequences_path):
    try:
        os.remove(path)
    except FileNotFoundError:
        print(f"File '{path}' not found. Proceeding...")
    else:
        print(f"Removed existing file: {path}")

# Read SNP locations from a CSV file
def read_snp_locations(file_path):
    """Return ``(gene, labelled locus, SNP location)`` triples from a CSV file.

    The first row is skipped as a header. For each later row with at least two
    cells, the gene comes from the second-to-last cell and the SNP location
    from the last cell; the labelled locus is the SNP location followed by
    `` ±<deviation>`` (module-level ``deviation``).
    """
    snp_data = []
    with open(file_path, 'r', newline='', encoding='utf-8') as f:
        rows = csv.reader(f)
        next(rows, None)  # Skip header
        for row in rows:
            if len(row) < 2:
                continue
            gene = row[-2].strip()
            snp_location = row[-1].strip()
            snp_data.append((gene, f'{snp_location} ±{deviation}', snp_location))
    return snp_data

# Write output rows to a CSV file
def write_output_to_csv(file_path, data):
    """Append ``data`` rows to ``file_path``; the header row is written only while the file is empty."""
    with open(file_path, 'a', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        file_is_empty = os.path.getsize(file_path) == 0
        if file_is_empty:
            writer.writerow(['gene', 'locus', 'url', 'result'])
        writer.writerows(data)

# Generate ClinVar URLs from the SNP data
def generate_clinvar_urls(snp_data):
    """Build one ClinVar range-search URL per SNP entry.

    Each SNP string is expected to look like ``chr<N>:<position>`` with an
    optional ``+`` or ``-`` marker. A ``+`` marker shifts the position by -3, a
    ``-`` marker by +3. The URL searches ``position ± deviation`` (module-level
    ``deviation``), with the lower bound clamped to 1. Entries whose position
    is not an integer, or is not positive after the shift, are reported and
    skipped.

    Args:
        snp_data: Iterable of ``(gene, locus, snp)`` tuples.

    Returns:
        A list of ``(gene, locus, url)`` tuples.
    """
    urls = []
    for gene, locus, snp in snp_data:
        chr_num, _, chr_pos = snp.replace('chr', '').partition(':')
        chr_num = chr_num.strip()

        # NOTE(review): these shifts come from the original code; a later cell in
        # this notebook uses +17 for "+" instead — confirm which one is intended.
        if "+" in chr_pos:
            shift = -3
        elif "-" in chr_pos:
            shift = 3
        else:
            shift = 0

        try:
            # Parse inside the try so a malformed signed position (e.g. "abc+")
            # is skipped like any other bad value instead of raising.
            chr_pos = int(chr_pos.replace("+", "").replace("-", "")) + shift
        except ValueError:
            print(f"Invalid chromosome position: {chr_pos} for SNP: {snp}")
            continue
        if chr_pos <= 0:
            print(f"Invalid chromosome position (non-positive): {chr_pos} for SNP: {snp}")
            continue

        # Keep the lower bound of the search window at 1 or above
        chrpos1 = max(chr_pos - deviation, 1)
        chrpos2 = chr_pos + deviation
        url = f'https://www.ncbi.nlm.nih.gov/clinvar/?term={chr_num}%5BChr%5D+AND+{chrpos1}%3A{chrpos2}%5BChrPos%5D'
        urls.append((gene, locus, url))  # Store gene and locus with the URL
    return urls

# Fetch ClinVar data synchronously
def fetch_clinvar_data(gene, locus, url, retries=3, delay=2, timeout=10):
    """Download a ClinVar page and summarise the clinical terms found in bold spans.

    Args:
        gene: Gene label carried through to the result.
        locus: Locus label carried through to the result.
        url: Page to download.
        retries: Maximum number of download attempts.
        delay: Seconds to wait after the first failed attempt; doubled after
            every further failure.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot hang the cell.

    Returns:
        ``(gene, locus, url, result)`` where ``result`` joins the matching span
        texts with ``"; "``, or ``None`` when nothing matched, parsing failed,
        or every attempt failed.
    """
    clinical_terms = ["Pathogenic", "Benign", "Conflicting Interpretations Of Pathogenicity",
                      "Likely benign", "Likely pathogenic", "Uncertain significance"]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/58.0.3029.110 Safari/537.3"
    }
    debug(f"Processing URL: {url}")
    for attempt in range(retries):
        html_content = b""  # bound up front so the ValueError handler can always print it
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            html_content = response.content  # Use bytes rather than a decoded string
            tree = html.fromstring(html_content)

            # Keep each span with class "bold" once if it mentions a clinical term
            pathogenic_data = []
            for span in tree.xpath("//span[@class='bold']"):
                inner_text = span.text_content().strip()
                if any(term in inner_text for term in clinical_terms):
                    pathogenic_data.append(inner_text)

            if pathogenic_data:
                return (gene, locus, url, "; ".join(pathogenic_data))
            return None  # No significant data found

        except requests.RequestException as e:
            print(f"Request failed for {url}: {e}")
            if attempt < retries - 1:  # no point waiting after the final attempt
                time.sleep(delay)
                delay *= 2  # Exponential backoff
        except ValueError as ve:
            print(f"Parsing failed for {url}: {ve}")
            print("HTML content causing the issue:")
            print(html_content[:500])  # First 500 bytes of the HTML, for debugging
            break  # Retrying does not help after a parsing error
    return None

# Main function: process the ClinVar data synchronously
def main_sync():
    """Read the SNPs, build the ClinVar URLs, fetch each one, and write the hits to the output CSV."""
    snp_data = read_snp_locations(f'{filename}.csv')

    fetched = (
        fetch_clinvar_data(gene, locus, url)
        for gene, locus, url in generate_clinvar_urls(snp_data)
    )
    # Only keep results that contain significant data
    output_data = [hit for hit in fetched if hit]

    write_output_to_csv(output_csv_path, output_data)

# Run the synchronous ClinVar pipeline
main_sync()

# Create the sequences output file containing only its header row
_sequence_header = ['gene', 'strand', 'locus', 'sequence']
with open(output_sequences_path, 'w', newline='', encoding='utf-8') as seq:
    writer = csv.writer(seq)
    writer.writerow(_sequence_header)

# Generate chromosome FASTA URLs and the positions to extract
def generate_chromosome_urls(output):
    """Yield one FASTA request per data row of the ClinVar output CSV.

    The file is parsed with ``csv.reader`` (it is produced by ``csv.writer``),
    so quoted fields that contain commas are handled correctly. The first row
    is skipped as a header. The locus column is expected to look like
    ``chr<N>:<position><+|-> ...``; a ``+`` marker shifts the position by +17
    and a ``-`` marker by +3. Rows that cannot be parsed are reported and
    skipped.

    Args:
        output: Path of the CSV file with ``gene`` in the first column and
            ``locus`` in the second.

    Yields:
        ``(fasta_url, chrpos1, chrpos2, chrpos3, chr_num, gene, strand)`` where
        ``chrpos2`` is the shifted position, ``chrpos1`` is 100 below it
        (clamped to 1) and ``chrpos3`` is 101 above it.
    """
    with open(output, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header
        for column in reader:
            if len(column) < 2:
                print(f"Invalid line format: {column}")
                continue

            gene = column[0].strip()
            locus_info = column[1].strip()

            try:
                chr_part, pos_part = locus_info.split(':', 1)
                chr_num = chr_part.replace('chr', '').strip()
                pos_str = pos_part.strip().split(' ')[0]

                if "+" in pos_str:
                    strand, shift = "+", 17
                elif "-" in pos_str:
                    strand, shift = "-", 3
                else:
                    print(f"Invalid chromosome position format: {pos_str}")
                    continue

                chrpos = int(pos_str.replace("+", "").replace("-", "")) + shift

                if chrpos <= 0:
                    print(f"Invalid chrpos (non-positive): {chrpos} for locus: {locus_info}")
                    continue

            except (ValueError, IndexError) as e:
                print(f"Error parsing locus info '{locus_info}': {e}")
                continue

            chrpos1 = max(chrpos - 100, 1)  # never below the first position
            chrpos2 = chrpos  # the shifted position itself
            chrpos3 = chrpos + 101

            fasta_url = f'http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr{chr_num}.fa.gz'

            yield (fasta_url, chrpos1, chrpos2, chrpos3, chr_num, gene, strand)

# Download FASTA files and extract the flanking sequences
def download_fasta(fasta_urls):
    """Download each chromosome FASTA and append two flanking sequences per entry.

    For every entry the gzip file is downloaded to the working directory, read
    into one string (header lines starting with ``>`` are dropped), and the
    slices ``[pos1:pos2]`` and ``[pos2:pos3]`` are appended as two rows to the
    file at module-level ``output_sequences_path``. The downloaded file is
    always deleted again, including when extraction fails.

    Args:
        fasta_urls: Iterable of
            ``(url, pos1, pos2, pos3, chr_num, gene, strand)`` tuples.
    """
    for url, pos1, pos2, pos3, chr_num, gene, strand in fasta_urls:
        print(f"Downloading FASTA from: {url}")
        try:
            # Timeout so a stalled connection cannot hang the cell indefinitely.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to download {url}: {e}")
            continue

        fasta_filename = f'chr{chr_num}.fa.gz'
        with open(fasta_filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {fasta_filename}.")

        # Extract the sequence from the downloaded FASTA
        try:
            with gzip.open(fasta_filename, 'rt', encoding='utf-8') as f:
                stripped_lines = (line.strip() for line in f)
                # Join once instead of growing a string line by line.
                sequence = ''.join(s for s in stripped_lines if not s.startswith('>'))
        except (OSError, EOFError) as e:
            # EOFError is what gzip raises for a truncated download.
            print(f"Failed to extract {fasta_filename}: {e}")
            continue
        finally:
            # Delete the download on every path to save disk space.
            os.remove(fasta_filename)

        # Make sure pos3 does not fall outside the sequence
        if pos3 > len(sequence):
            print(f"Position {pos3} exceeds sequence length for chr{chr_num}. Skipping.")
            continue

        # NOTE(review): the positions are used directly as 0-based slice bounds;
        # confirm this matches the intended genomic (1-based) coordinates.
        seq1 = sequence[pos1:pos2]  # Sequence between pos1 and pos2
        seq2 = sequence[pos2:pos3]  # Sequence between pos2 and pos3

        # Append both sequences to the sequences output file
        with open(output_sequences_path, 'a', newline='', encoding='utf-8') as seq_file:
            writer = csv.writer(seq_file)
            writer.writerow([gene, strand, f'chr{chr_num}:{pos1}-{pos2}', seq1])  # First sequence
            writer.writerow([gene, strand, f'chr{chr_num}:{pos2}-{pos3}', seq2])  # Second sequence

        print(f"Processed and removed {fasta_filename}.")

# Turn every ClinVar hit into a chromosome FASTA request, then download and extract
fasta_urls = [*generate_chromosome_urls(output_csv_path)]
download_fasta(fasta_urls)

print("Process completed.")


Run this cell after uploading your file to the main (root) folder in Colab and filling in the text fields.

In [None]:
import os
import csv
import requests
import time
import gzip
from lxml import html
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict

# Vraag om inputbestand en afwijking
file_name = "insert filename" #@param {type: "string"}
deviation = 1 #@param {type: "integer"}

# Convert the deviation to an integer and validate it
try:
    deviation = int(deviation)
    if deviation < 0:
        raise ValueError("Deviation must be a non-negative integer.")
except (TypeError, ValueError) as ve:  # TypeError covers a non-numeric form value such as None
    print(f"Invalid deviation input: {ve}")
    # exit() is the interactive-shell helper and is not meant for use in programs;
    # raising SystemExit directly stops the cell so the code below never runs
    # with an invalid `deviation`.
    raise SystemExit(1) from None

# Define the paths of the output files
output_csv_path = f'output_{file_name}.csv'
output_sequences_path = f'output_sequences_{file_name}.csv'

# Remove output files left over from a previous run
for path in (output_csv_path, output_sequences_path):
    try:
        os.remove(path)
    except FileNotFoundError:
        print(f"File '{path}' not found. Proceeding...")
    else:
        print(f"Removed existing file: {path}")

# Read SNP locations from a CSV file
def read_snp_locations(file_path):
    """Return ``(gene, labelled locus, SNP location)`` triples from a CSV file.

    The first row is skipped as a header. For each later row with at least two
    cells, the gene comes from the second-to-last cell and the SNP location
    from the last cell; the labelled locus is the SNP location followed by
    `` ±<deviation>`` (module-level ``deviation``).
    """
    with open(file_path, 'r', newline='', encoding='utf-8') as f:
        rows = csv.reader(f)
        next(rows, None)  # Skip header
        usable_rows = [row for row in rows if len(row) >= 2]

    snp_data = []
    for row in usable_rows:
        gene, snp_location = row[-2].strip(), row[-1].strip()
        snp_data.append((gene, f'{snp_location} ±{deviation}', snp_location))
    return snp_data

# Write output rows to a CSV file
def write_output_to_csv(file_path, data, header=None):
    """Append ``data`` rows to ``file_path``.

    ``header`` is written first, but only when one is given and the file does
    not exist yet or is still empty.
    """
    file_is_new_or_empty = (not os.path.exists(file_path)) or os.path.getsize(file_path) == 0

    with open(file_path, 'a', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        if file_is_new_or_empty and header:
            writer.writerow(header)
        writer.writerows(data)

# Generate ClinVar URLs from the SNP data
def generate_clinvar_urls(snp_data):
    """Build one ClinVar range-search URL per SNP entry.

    Each SNP string is expected to look like ``chr<N>:<position>`` with an
    optional ``+`` or ``-`` marker. A ``+`` marker shifts the position by +17, a
    ``-`` marker by +3. The URL searches ``position ± deviation`` (module-level
    ``deviation``), with the lower bound clamped to 1. Entries whose position
    is not an integer, or is not positive after the shift, are reported and
    skipped.

    Args:
        snp_data: Iterable of ``(gene, locus, snp)`` tuples.

    Returns:
        A list of ``(gene, locus, url, chr_num, chr_pos)`` tuples, where
        ``chr_pos`` is the shifted integer position.
    """
    urls = []
    for gene, locus, snp in snp_data:
        chr_num, _, chr_pos = snp.replace('chr', '').partition(':')
        chr_num = chr_num.strip()

        if "+" in chr_pos:
            shift = 17
        elif "-" in chr_pos:
            shift = 3
        else:
            shift = 0

        try:
            # Parse inside the try so a malformed signed position (e.g. "abc+")
            # is skipped like any other bad value instead of raising.
            chr_pos = int(chr_pos.replace("+", "").replace("-", "")) + shift
        except ValueError:
            print(f"Invalid chromosome position: {chr_pos} for SNP: {snp}")
            continue
        if chr_pos <= 0:
            print(f"Invalid chromosome position (non-positive): {chr_pos} for SNP: {snp}")
            continue

        # Keep the lower bound of the search window at 1 or above
        chrpos1 = max(chr_pos - deviation, 1)
        chrpos2 = chr_pos + deviation
        url = f'https://www.ncbi.nlm.nih.gov/clinvar/?term={chr_num}%5BChr%5D+AND+{chrpos1}%3A{chrpos2}%5BChrPos%5D'
        urls.append((gene, locus, url, chr_num, chr_pos))
    return urls

# Fetch ClinVar data for one entry
def fetch_clinvar_data(entry, retries=3, delay=2):
    """Download the ClinVar page of ``entry`` and summarise the clinical terms in bold spans.

    ``entry`` is a ``(gene, locus, url, chr_num, chr_pos)`` tuple. Request
    errors are retried up to ``retries`` times with a doubling ``delay``; a
    ``ValueError`` stops the retries.

    Returns:
        ``(gene, locus, url, result)`` where ``result`` joins the matching span
        texts with ``" - "``, or ``None`` when nothing matched or the download
        or parsing failed.
    """
    gene, locus, url, _chr_num, _chr_pos = entry
    clinical_terms = [
        "Pathogenic", "Benign", "Conflicting Interpretations Of Pathogenicity",
        "Likely benign", "Likely pathogenic", "Uncertain significance", "Oncogenic"
    ]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/58.0.3029.110 Safari/537.3"
    }
    print(f"Processing URL: {url}")
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            tree = html.fromstring(response.content)

            # Keep each span with class "bold" once if it mentions a clinical term
            span_texts = (
                span.text_content().strip()
                for span in tree.xpath("//span[@class='bold']")
            )
            pathogenic_data = [
                text for text in span_texts
                if any(term in text for term in clinical_terms)
            ]

            if not pathogenic_data:
                return None  # No significant data found
            return (gene, locus, url, " - ".join(pathogenic_data))

        except requests.RequestException as e:
            print(f"Request failed for {url}: {e}")
            time.sleep(delay)
            delay *= 2  # Exponential backoff
        except ValueError as ve:
            print(f"Parsing failed for {url}: {ve}")
            break  # Retrying does not help after a parsing error
    return None

# Main function: process the ClinVar data in parallel
def main_sync():
    """Fetch the ClinVar data for every SNP with a thread pool and write the hits.

    The output CSV is first initialised with its header. Results are collected
    in the order of the input entries, so the output file is the same from run
    to run. A worker that raises an unexpected exception is reported and
    skipped, so one bad page does not discard the results of all others.
    """
    snp_data = read_snp_locations(f'{file_name}.csv')
    urls = generate_clinvar_urls(snp_data)

    header = ['gene', 'locus', 'url', 'result']
    write_output_to_csv(output_csv_path, [], header=header)  # Initialize with header

    output_data = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        # Submit everything first so the downloads still run concurrently,
        # then read the futures back in submission order.
        submitted = [(entry, executor.submit(fetch_clinvar_data, entry)) for entry in urls]
        for entry, future in submitted:
            try:
                result = future.result()
            except Exception as exc:
                print(f"Unexpected error for {entry[2]}: {exc}")
                continue
            if result:
                output_data.append(result)

    # Write all results at once
    write_output_to_csv(output_csv_path, output_data)

# Initialise the sequences output file with its header
def initialize_sequences_csv():
    """Create (or truncate) the file at ``output_sequences_path`` and write only the header row."""
    header_row = ['gene', 'strand', 'locus', 'sequence']
    with open(output_sequences_path, 'w', newline='', encoding='utf-8') as seq:
        csv.writer(seq).writerow(header_row)

# Group the SNPs of the ClinVar output by chromosome
def generate_chromosome_snp_map(output):
    """Read the ClinVar output CSV and group its loci by chromosome.

    The ``locus`` column is expected to look like ``chr<N>:<position><+|-> ...``.
    A ``+`` marker shifts the position by +17 and a ``-`` marker by +3; rows
    without a marker, with a non-positive shifted position, or that cannot be
    parsed are reported and skipped.

    Returns:
        A ``defaultdict(list)`` mapping the chromosome label to a list of dicts
        with the keys ``gene``, ``strand``, ``chrpos1`` (shifted position - 100,
        at least 1), ``chrpos2`` (shifted position), ``chrpos3`` (shifted
        position + 101) and ``locus``.
    """
    snp_map = defaultdict(list)
    with open(output, 'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            locus = row['locus']
            gene = row['gene']
            # Parse the locus into chromosome label and position
            try:
                chr_part, pos_part = locus.split(':', 1)
                chr_num = chr_part.replace('chr', '').strip()
                pos_str = pos_part.strip().split(' ')[0]

                if "+" in pos_str:
                    strand, shift = "+", 17
                elif "-" in pos_str:
                    strand, shift = "-", 3
                else:
                    print(f"Invalid chromosome position format: {pos_str}")
                    continue

                chrpos = int(pos_str.replace("+", "").replace("-", "")) + shift
                if chrpos <= 0:
                    print(f"Invalid chrpos (non-positive): {chrpos} for locus: {locus}")
                    continue

                snp_map[chr_num].append({
                    'gene': gene,
                    'strand': strand,
                    'chrpos1': max(chrpos - 100, 1),  # never below position 1
                    'chrpos2': chrpos,  # the shifted position itself
                    'chrpos3': chrpos + 101,
                    'locus': locus
                })

            except (ValueError, IndexError) as e:
                print(f"Error parsing locus info '{locus}': {e}")
                continue
    return snp_map

# Download one chromosome FASTA file and extract the sequences around its SNPs
def download_and_process_fasta(chr_num, snps):
    """Download a UCSC hg38 chromosome FASTA and write two sequences per SNP.

    The gzipped FASTA is streamed to ``chr<chr_num>.fa.gz`` in the current
    working directory, decompressed into one in-memory string (header lines
    dropped), and the file is always deleted afterwards. For every SNP two
    slices are collected, ``sequence[chrpos1:chrpos2]`` and
    ``sequence[chrpos2:chrpos3]``, and all rows are handed to
    ``write_output_to_csv(output_sequences_path, ...)`` in a single call.

    Download and decompression failures are reported on stdout and make the
    function return early without writing anything.

    Args:
        chr_num: Chromosome name without the ``chr`` prefix.
        snps: Iterable of dicts with the keys ``gene``, ``strand``,
            ``chrpos1``, ``chrpos2``, ``chrpos3`` and ``locus``.

    Returns:
        None.
    """
    import zlib  # local import: only needed to recognise corrupt-gzip errors

    fasta_url = f'http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr{chr_num}.fa.gz'
    fasta_filename = f'chr{chr_num}.fa.gz'

    print(f"Downloading FASTA from: {fasta_url}")
    try:
        # Stream to disk so the compressed chromosome is never buffered
        # completely in memory.
        with requests.get(fasta_url, timeout=30, stream=True) as response:
            response.raise_for_status()
            with open(fasta_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        print(f"Downloaded {fasta_filename}.")
    except requests.RequestException as e:
        print(f"Failed to download {fasta_url}: {e}")
        # A connection dropped mid-transfer can leave a partial file behind.
        if os.path.exists(fasta_filename):
            os.remove(fasta_filename)
        return

    # Extract sequences from the downloaded FASTA
    try:
        with gzip.open(fasta_filename, 'rt', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            # Skip header lines and concatenate the rest into one string
            sequence = ''.join(
                text for text in stripped_lines if not text.startswith('>')
            )
    except (OSError, EOFError, zlib.error) as e:
        # Truncated or corrupt archives surface as EOFError / zlib.error,
        # which are not OSError subclasses.
        print(f"Failed to extract {fasta_filename}: {e}")
        return
    finally:
        # Remove the downloaded FASTA file to save disk space, on success and
        # on failure alike.
        os.remove(fasta_filename)

    print(f"Processed and removed {fasta_filename}.")

    # Extract and collect sequences
    sequences = []
    for snp in snps:
        gene = snp['gene']
        strand = snp['strand']
        pos1 = snp['chrpos1']
        pos2 = snp['chrpos2']
        pos3 = snp['chrpos3']
        locus = snp['locus']

        # Make sure pos3 does not fall outside the sequence
        if pos3 > len(sequence):
            print(f"Position {pos3} exceeds sequence length for chr{chr_num}. Skipping SNP at {locus}.")
            continue

        # The positions are used directly as Python slice bounds (0-based,
        # end-exclusive); no shift is applied.
        # NOTE(review): if the chrpos values are 1-based coordinates, each
        # slice starts one base after the labelled start - confirm intent.
        seq1 = sequence[pos1:pos2]  # Sequence between pos1 and chrpos
        seq2 = sequence[pos2:pos3]  # Sequence between chrpos and pos3

        sequences.append([gene, strand, f'chr{chr_num}:{pos1}-{pos2}', seq1])  # First sequence
        sequences.append([gene, strand, f'chr{chr_num}:{pos2}-{pos3}', seq2])  # Second sequence

    # Write all sequences to the CSV file in one call.
    # NOTE(review): this function runs in several worker threads at once;
    # verify that write_output_to_csv is safe for concurrent writers.
    write_output_to_csv(output_sequences_path, sequences)

# Download and process the FASTA files of all chromosomes in parallel
def process_fasta_sequences(snp_map):
    """Run ``download_and_process_fasta`` once per chromosome on a thread pool.

    ``initialize_sequences_csv()`` is called first, then one task per
    ``(chromosome, snps)`` entry of ``snp_map`` is submitted to a pool of five
    worker threads.

    Args:
        snp_map: Mapping of chromosome name to the list of SNP dicts for it.

    Raises:
        Any exception raised inside a worker is re-raised here.
    """
    initialize_sequences_csv()
    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = [
            pool.submit(download_and_process_fasta, chromosome, chromosome_snps)
            for chromosome, chromosome_snps in snp_map.items()
        ]
        # Asking each finished task for its result surfaces worker exceptions
        for finished in as_completed(pending):
            finished.result()

# Entry point that runs the whole process from start to finish
def main():
    """Run the pipeline in three stages and report the elapsed time.

    The stages are ``main_sync()`` (announced as the ClinVar data retrieval),
    grouping the rows of ``output_csv_path`` per chromosome, and the FASTA
    download and sequence extraction. Progress messages and the total
    wall-clock duration are printed to stdout.
    """
    started_at = time.time()

    print("Starting ClinVar data retrieval...")
    main_sync()
    print("ClinVar data retrieval completed.")

    print("Grouping SNPs by chromosome for FASTA processing...")
    chromosome_snps = generate_chromosome_snp_map(output_csv_path)
    print(f"Grouped SNPs into {len(chromosome_snps)} chromosomes.")

    print("Starting FASTA downloads and sequence extraction...")
    process_fasta_sequences(chromosome_snps)
    print("FASTA processing completed.")

    elapsed_time = time.time() - started_at
    print(f"Process completed in {elapsed_time:.2f} seconds.")

# Run the full pipeline when this code is executed as the main module
# (it does not run when the code is imported from elsewhere).
if __name__ == "__main__":
    main()


File 'output_All chromosomes.csv' not found. Proceeding...
File 'output_sequences_All chromosomes.csv' not found. Proceeding...
Starting ClinVar data retrieval...
Processing URL: https://www.ncbi.nlm.nih.gov/clinvar/?term=1%5BChr%5D+AND+198699578%3A198699618%5BChrPos%5DProcessing URL: https://www.ncbi.nlm.nih.gov/clinvar/?term=1%5BChr%5D+AND+155235235%3A155235275%5BChrPos%5D

Processing URL: https://www.ncbi.nlm.nih.gov/clinvar/?term=2%5BChr%5D+AND+47800613%3A47800653%5BChrPos%5D
Processing URL: https://www.ncbi.nlm.nih.gov/clinvar/?term=3%5BChr%5D+AND+179218300%3A179218340%5BChrPos%5D
Processing URL: https://www.ncbi.nlm.nih.gov/clinvar/?term=4%5BChr%5D+AND+1805641%3A1805681%5BChrPos%5D
Processing URL: https://www.ncbi.nlm.nih.gov/clinvar/?term=5%5BChr%5D+AND+223506%3A223546%5BChrPos%5D
Processing URL: https://www.ncbi.nlm.nih.gov/clinvar/?term=6%5BChr%5D+AND+43007262%3A43007302%5BChrPos%5DProcessing URL: https://www.ncbi.nlm.nih.gov/clinvar/?term=7%5BChr%5D+AND+117530972%3A117531012%

This code runs in under 100 seconds on CPU when the input file has 100 lines and a single pathogenic variant is found on each chromosome.