<a href="https://colab.research.google.com/github/nibaskumar93n-debug/Morphoinformatics/blob/main/NGS_after_trim_tumor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get update -qq
!apt-get install -y -qq bwa samtools

In [None]:
%%bash
# Download the UCSC hg38 FASTA and decompress it to
# /content/GRCh38.primary_assembly.fa, the path the indexing and alignment
# cells use.
#
# NOTE(review): the file fetched is UCSC's hg38.fa.gz; the local name says
# "primary_assembly" — confirm the UCSC file contains only the sequences you
# want to align against.

# Abort on the first failing command so a broken download is never
# decompressed or reported as complete.
set -euo pipefail

# Absolute paths: a later cell changes the kernel's working directory to
# /content/output, so a relative name would land in the wrong place on re-run.
REF_DIR="/content"
OUTPUT_FILE="${REF_DIR}/GRCh38.primary_assembly.fa.gz"
REF_FASTA="${OUTPUT_FILE%.gz}"

# Skip the (large) download when a non-empty reference is already present.
if [ -s "$REF_FASTA" ]; then
    echo "Reference already present, skipping download: $REF_FASTA"
    exit 0
fi

# Download the compressed FASTA file
echo "Downloading GRCh38 primary assembly..."
wget -O "$OUTPUT_FILE" https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz

# Decompress the file (-f: overwrite any leftover output instead of prompting)
echo "Decompressing the file..."
gunzip -f "$OUTPUT_FILE"

echo "Download and decompression complete. File name: $REF_FASTA"

In [None]:
# Build the BWA index for the reference FASTA. The alignment cell passes this
# same path to `bwa mem`, so this must have completed before aligning.
!bwa index /content/GRCh38.primary_assembly.fa

In [None]:
# Workspace setup: mount Drive for inputs and prepare a local output folder.
import os

from google.colab import drive

# Drive is mounted only so the input files can be read from it.
drive.mount('/content/drive')

# Drive folder holding the input files.
# NOTE(review): the alignment cell later reassigns input_dir to a path that
# differs in letter case ('breast_cancer_ngs') — confirm which spelling
# matches the actual Drive folder.
input_dir = '/content/drive/MyDrive/Breast_cancer_ngs'

# Results are written to the Colab VM's local disk rather than to Drive.
output_dir = '/content/output'

# Ensure the output folder exists, then make it the working directory.
os.makedirs(output_dir, exist_ok=True)
os.chdir(output_dir)

print(f"Working in: {os.getcwd()}")
print(f"Input files from: {input_dir}")
print(f"Output files to: {output_dir}")

In [None]:
# Sample configuration — EDIT THESE TO MATCH YOUR OWN FILE NAMES.

# Sample ID; the alignment cell uses it in the read group and the BAM name.
sample_name = "Patient1441T"

# File name of the reference FASTA.
reference = "GRCh38.primary_assembly.fa"

# The two tumor FASTQ files handed to the aligner (first and second read file).
tumor_r1, tumor_r2 = (
    "SRR31264098_1_paired.fq.gz",
    "SRR31264098_2_paired.fq.gz",
)

print(f"Processing sample: {sample_name}")
print(f"Tumor files: {tumor_r1}, {tumor_r2}")

In [None]:
# Step 1: Align TUMOR sample with BWA-MEM
import os

# Input directory (Google Drive where your reference and FASTQ files are)
input_dir = "/content/drive/MyDrive/breast_cancer_ngs"

# Output directory (Colab local storage - faster)
output_dir = "/content/output"

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
reference_path = "/content/GRCh38.primary_assembly.fa"
print(f"Input directory: {input_dir}")
print(f"Output directory: {output_dir}")
tumor_r1_path = f"{input_dir}/{tumor_r1}"
tumor_r2_path = f"{input_dir}/{tumor_r2}"
output_bam_tumor = f"{output_dir}/{sample_name}_tumor.bam"

print("Aligning normal sample...")
!bwa mem -t 2 \
  -R '@RG\tID:{sample_name}_tumor\tSM:{sample_name}_normal\tLB:lib1\tPL:ILLUMINA' \
  {reference_path} \
  {tumor_r1_path} \
  {tumor_r2_path} \
  | samtools view -Sb - > {output_bam_tumor}

print("Normal alignment complete!")
!ls -lh {output_bam_tumor}


In [None]:
# Sort the BAM produced by the alignment cell and write it to a new file.
# NOTE(review): both paths are hard-coded. The input name corresponds to
# sample_name = "Patient1441T"; the output name drops the trailing "T" and is
# what the index and download cells expect. Update all of them together if
# the sample ID changes.
!samtools sort -o /content/output/Patient1441_tumor_sorted.bam /content/output/Patient1441T_tumor.bam

In [None]:
# Index the sorted BAM. The download cell expects the resulting
# Patient1441_tumor_sorted.bam.bai alongside the BAM.
!samtools index /content/output/Patient1441_tumor_sorted.bam

In [None]:
# Send the sorted BAM and its index to the local machine via the browser.
from google.colab import files

sorted_bam = '/content/output/Patient1441_tumor_sorted.bam'

# BAM first, then its .bai index.
for artifact in (sorted_bam, sorted_bam + '.bai'):
    files.download(artifact)