# Cellranger count of gene expression data
Required before guide calling.

In [3]:
import sys
# adding notebooks to the system path
sys.path.insert(0, '/home/southark/notebooks')

from guide_calling import *
import os

import os.path

from BarcodeCaller import *

import pandas as pd
import numpy as np
import csv

import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns
from scipy.signal import argrelextrema
from scipy.stats import gaussian_kde
from scipy.io import mmread
from joblib import Parallel, delayed


import scipy.io
import gzip
import codecs

%matplotlib inline  
%load_ext autoreload
%autoreload 2

sns.set_style('white')
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [4]:
def create_and_set_wdir(working_dir):
    """Make ``working_dir`` the process working directory, creating it if needed.

    Uses plain ``os`` calls instead of the ``%mkdir`` / ``%cd`` IPython magics so
    the function is valid Python outside a notebook kernel, creates missing
    parent directories, and handles paths that contain spaces.

    Args:
        working_dir (str): Directory to create (if absent) and change into.
    """
    if os.path.isdir(working_dir):
        print('moving to the working directory')
    else:
        # exist_ok guards against another process creating the directory
        # between the isdir() check and this call.
        os.makedirs(working_dir, exist_ok=True)
        print('created working directory')
        print('moving to the working directory')

    os.chdir(working_dir)
    # Echo the resulting directory, as the %cd magic did.
    print(os.getcwd())

## Set experiment cellranger output directory

In [5]:
#set cellranger output dir
working_dir = "/data/norman/southark/rpe1_tfs/23127_RPE1_TFs_CRa"        
create_and_set_wdir(working_dir)

created working directory
moving to the working directory
/lila/data/norman/southark/rpe1_tfs/23127_RPE1_TFs_CRa


# Create guide barcode csv

In [6]:
## create guide barcode csv
# Import the list of expected guides.
# `skipfooter` is not supported by pandas' default C parser; without an explicit
# engine pandas emits a ParserWarning and silently falls back to the python
# parser. Request the python engine directly so the cell runs without warnings.
# NOTE(review): skipfooter=3 assumes the last three lines of each file are not
# guide rows -- confirm against the CSV files.
guides = pd.read_csv('/data/norman/southark/guide_libraries/CRISPRa_guide_selection/final_selected/final_combined_paralogs_morf_CRISPRa_library_with_adapters.csv', skipfooter=3, engine='python')

# Import the control guides and give them 'barcode' / 'id' columns.
ctrls = pd.read_csv('/data/norman/southark/guide_libraries/CRISPRa_guide_selection/final_selected/Weissman_v3_CRISPRa_filtered_controls_NHDF-Ad_05172023.csv', skipfooter=3, engine='python')
ctrls = ctrls.rename(columns = {'protospacer': 'barcode', 'Unnamed: 0': 'id'})

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


In [7]:
# Give the library guides and the controls the same column names
# ('sequence' for the protospacer, 'name' for the gene).
guides = guides.rename(columns={'protospacer': 'sequence', 'gene': 'name'})
ctrls = ctrls.rename(columns={'barcode': 'sequence', 'gene': 'name'})

# Upper-case the library protospacers (the control sequences are left as loaded).
guides['sequence'] = guides['sequence'].str.upper()

# Drop columns that are not needed downstream. Reassigning instead of using
# inplace=True avoids mutating frames that earlier cells still reference.
guides = guides.drop(columns=['full_sequence'])
ctrls = ctrls.drop(columns=['transcript', 'sgID', 'source', 'original_rank', 'entrezgene',
                            'n_offtargets', 'n_exact_matches', 'state', 'n_off_strict'])

# Library guides first, then the controls (original row labels are kept).
guides = pd.concat([guides, ctrls])

# na=False: a missing name/id must not be treated as a match. Without it
# str.contains returns NaN for those rows, which np.where evaluates as truthy
# and would label as 'Non-Targeting'.
name_is_non_targeting = guides['name'].str.contains('non_targeting', na=False)
id_is_non_targeting = guides['id'].str.contains('non_targeting', na=False)

guides = guides.assign(
    target_gene_id=np.where(name_is_non_targeting, 'Non-Targeting', guides['name']),
    target_gene_name=np.where(id_is_non_targeting, 'Non-Targeting', guides['name']),
)

guides

Unnamed: 0,id,name,sequence,target_gene_id,target_gene_name
0,AATF_GGAAGCGCGCAGAAGGTTGA,AATF,GGAAGCGCGCAGAAGGTTGA,AATF,AATF
1,AATF_GCGTGCGAGTGCGCGGGAAG,AATF,GCGTGCGAGTGCGCGGGAAG,AATF,AATF
2,AATF_GCGCAGAAGGTTGAAGGGAT,AATF,GCGCAGAAGGTTGAAGGGAT,AATF,AATF
3,AATF_GGGCGTTGCTAGCATGAAGG,AATF,GGGCGTTGCTAGCATGAAGG,AATF,AATF
4,AATF_GTGAAGGGATTGGAGCCGTA,AATF,GTGAAGGGATTGGAGCCGTA,AATF,AATF
...,...,...,...,...,...
73,non_targeting_GAACGGGCCGTGATCGGACC,non_targeting,GAACGGGCCGTGATCGGACC,Non-Targeting,Non-Targeting
74,non_targeting_GGCCCTCCCCACCGGCACGA,non_targeting,GGCCCTCCCCACCGGCACGA,Non-Targeting,Non-Targeting
75,non_targeting_GTCGTTATCTCGCTATTTCG,non_targeting,GTCGTTATCTCGCTATTTCG,Non-Targeting,Non-Targeting
76,non_targeting_GTGGTTATACCCGACTAGAC,non_targeting,GTGGTTATACCCGACTAGAC,Non-Targeting,Non-Targeting


 ## Find sequencing files for each sample

In [10]:
from glob import glob

#set file name search strings
SAMPLE_GLOB = '*_GEX*'

FB_GLOB = '*_guides*'


def list_sample_names(fastq_root, pattern):
    """Return the sorted, unique sample names found under ``fastq_root``.

    Every path below ``fastq_root`` whose final component matches ``pattern`` is
    reduced to a sample name: the final path component, cut at the first '_S'
    and then at the first 'Sample_'. Entries whose name starts with 'Sample_'
    therefore reduce to an empty string; these are discarded here, so no
    separate ``.remove('')`` is needed (that call raises ValueError when no
    such entry was matched).

    Args:
        fastq_root (str): Directory searched recursively.
        pattern (str): Glob pattern applied to the final path component.

    Returns:
        list: Sorted unique sample names; empty if nothing matched.
    """
    matches = glob(os.path.join(fastq_root, '**', pattern), recursive=True)
    names = {os.path.basename(match).split('_S')[0].split('Sample_')[0] for match in matches}
    names.discard('')
    return sorted(names)


#perform for each project folder (i.e. if more sequencing was ordered)

input_fastq_rna_1="/data/norman/southark/datasets/Project_15055/"

unique_samples_1 = list_sample_names(input_fastq_rna_1, SAMPLE_GLOB)
unique_samples_fb_1 = list_sample_names(input_fastq_rna_1, FB_GLOB)


input_fastq_rna_2="/data/norman/southark/datasets/Project_15055_B/"

unique_samples_2 = list_sample_names(input_fastq_rna_2, SAMPLE_GLOB)
unique_samples_fb_2 = list_sample_names(input_fastq_rna_2, FB_GLOB)

In [11]:
unique_samples_1

['10A_GEX_IGO_15055_11',
 '10B_GEX_IGO_15055_16',
 '11A_GEX_IGO_15055_12',
 '11B_GEX_IGO_15055_17',
 '12A_GEX_IGO_15055_13',
 '12B_GEX_IGO_15055_18',
 '13A_GEX_IGO_15055_14',
 '13B_GEX_IGO_15055_19',
 '14A_GEX_IGO_15055_15',
 '14B_GEX_IGO_15055_20',
 '2A_GEX_IGO_15055_1',
 '2B_GEX_IGO_15055_6',
 '3A_GEX_IGO_15055_2',
 '3B_GEX_IGO_15055_7',
 '4A_GEX_IGO_15055_3',
 '4B_GEX_IGO_15055_8',
 '5A_GEX_IGO_15055_4',
 '5B_GEX_IGO_15055_9',
 '6A_GEX_IGO_15055_5',
 '6B_GEX_IGO_15055_10']

In [12]:
unique_samples_fb_1

['10A_guides_IGO_15055_31',
 '10B_guides_IGO_15055_36',
 '11A_guides_IGO_15055_32',
 '11B_guides_IGO_15055_37',
 '12A_guides_IGO_15055_33',
 '12B_guides_IGO_15055_38',
 '13A_guides_IGO_15055_34',
 '13B_guides_IGO_15055_39',
 '14A_guides_IGO_15055_35',
 '14B_guides_IGO_15055_40',
 '2A_guides_IGO_15055_21',
 '2B_guides_IGO_15055_26',
 '3A_guides_IGO_15055_22',
 '3B_guides_IGO_15055_27',
 '4A_guides_IGO_15055_23',
 '4B_guides_IGO_15055_28',
 '5A_guides_IGO_15055_24',
 '5B_guides_IGO_15055_29',
 '6A_guides_IGO_15055_25',
 '6B_guides_IGO_15055_30']

In [13]:
unique_samples_2

['10A_GEX_IGO_15055_B_11',
 '10B_GEX_IGO_15055_B_16',
 '11A_GEX_IGO_15055_B_12',
 '11B_GEX_IGO_15055_B_17',
 '12A_GEX_IGO_15055_B_13',
 '12B_GEX_IGO_15055_B_18',
 '13A_GEX_IGO_15055_B_14',
 '13B_GEX_IGO_15055_B_19',
 '14A_GEX_IGO_15055_B_15',
 '14B_GEX_IGO_15055_B_20',
 '2A_GEX_IGO_15055_B_1',
 '2B_GEX_IGO_15055_B_6',
 '3A_GEX_IGO_15055_B_2',
 '3B_GEX_IGO_15055_B_7',
 '4A_GEX_IGO_15055_B_3',
 '4B_GEX_IGO_15055_B_8',
 '5A_GEX_IGO_15055_B_4',
 '5B_GEX_IGO_15055_B_9',
 '6A_GEX_IGO_15055_B_5',
 '6B_GEX_IGO_15055_B_10']

In [14]:
unique_samples_fb_2

['10A_guides_IGO_15055_B_31',
 '10B_guides_IGO_15055_B_36',
 '11A_guides_IGO_15055_B_32',
 '11B_guides_IGO_15055_B_37',
 '12A_guides_IGO_15055_B_33',
 '12B_guides_IGO_15055_B_38',
 '13A_guides_IGO_15055_B_34',
 '13B_guides_IGO_15055_B_39',
 '14B_guides_IGO_15055_B_40',
 '2B_guides_IGO_15055_B_26',
 '3B_guides_IGO_15055_B_27',
 '4B_guides_IGO_15055_B_28',
 '5B_guides_IGO_15055_B_29',
 '6B_guides_IGO_15055_B_30']

In [15]:
import os
import glob

def get_directory_paths(path: str, file_names: list) -> dict:
    """
    Find, for each sample name, the directories under `path` that hold its files.

    A directory is recorded for a name when at least one file directly inside it
    starts with ``name + '_'``. A prefix match is used rather than a substring
    match so that a name such as '2A_x_1' does not also pick up the files of
    '12A_x_1'. Each directory is listed at most once per name.

    Args:
    path (str): The directory to walk recursively.
    file_names (list): A list of unique sample-name prefixes.

    Returns:
    dict: Every entry of `file_names` as a key, mapped to the list of directories
    containing matching files (an empty list when none were found).
    """
    result = {file_name: [] for file_name in file_names}
    for root, _dirs, files in os.walk(path):
        for file_name in result:
            prefix = file_name + '_'
            # One matching file is enough to record this directory.
            match = next((f for f in files if f.startswith(prefix)), None)
            if match is not None:
                result[file_name].append(os.path.dirname(os.path.join(root, match)))

    return result


def format_fastq_df(path, samples, lib_type):
    """Build a table with one row per (sample, fastq directory) pair.

    Args:
        path (str): Directory searched via ``get_directory_paths``.
        samples (list): Sample names to look up.
        lib_type (str): Value written to the ``library_type`` column of every row.

    Returns:
        pandas.DataFrame: Columns ``sample``, ``fastqs``, ``library_type`` and
        ``sample_id`` (the part of ``sample`` before its first underscore).
        The frame is empty, but still has all four columns, when no
        directories were found.
    """
    sample_paths = get_directory_paths(path, samples)

    records = [
        {'sample': sample, 'fastqs': fastq_dir}
        for sample, fastq_dirs in sample_paths.items()
        for fastq_dir in fastq_dirs
    ]
    # Pin the columns so an empty result still has 'sample' / 'fastqs';
    # otherwise the sample_id line below raises KeyError.
    fastq_df = pd.DataFrame(records, columns=['sample', 'fastqs'])
    fastq_df['library_type'] = lib_type
    fastq_df['sample_id'] = fastq_df['sample'].str.split('_').str[0]

    return fastq_df



# First project folder: one table for the '*_GEX*' samples, one for the '*_guides*' samples
gex_1 = format_fastq_df(input_fastq_rna_1, unique_samples_1, 'Gene Expression')
gbc_1 = format_fastq_df(input_fastq_rna_1, unique_samples_fb_1, 'CRISPR Guide Capture')

# Second project folder, same two library types
gex_2 = format_fastq_df(input_fastq_rna_2, unique_samples_2, 'Gene Expression')
gbc_2 = format_fastq_df(input_fastq_rna_2, unique_samples_fb_2, 'CRISPR Guide Capture')

In [16]:
gex_1

Unnamed: 0,sample,fastqs,library_type,sample_id
0,10A_GEX_IGO_15055_11,/data/norman/southark/datasets/Project_15055/F...,Gene Expression,10A
1,10A_GEX_IGO_15055_11,/data/norman/southark/datasets/Project_15055/F...,Gene Expression,10A
2,10A_GEX_IGO_15055_11,/data/norman/southark/datasets/Project_15055/F...,Gene Expression,10A
3,10B_GEX_IGO_15055_16,/data/norman/southark/datasets/Project_15055/F...,Gene Expression,10B
4,10B_GEX_IGO_15055_16,/data/norman/southark/datasets/Project_15055/F...,Gene Expression,10B
5,11A_GEX_IGO_15055_12,/data/norman/southark/datasets/Project_15055/F...,Gene Expression,11A
6,11A_GEX_IGO_15055_12,/data/norman/southark/datasets/Project_15055/F...,Gene Expression,11A
7,11B_GEX_IGO_15055_17,/data/norman/southark/datasets/Project_15055/F...,Gene Expression,11B
8,11B_GEX_IGO_15055_17,/data/norman/southark/datasets/Project_15055/F...,Gene Expression,11B
9,12A_GEX_IGO_15055_13,/data/norman/southark/datasets/Project_15055/F...,Gene Expression,12A


In [17]:
# Stack the gene-expression and guide-capture tables from both project folders
df = pd.concat([gex_1, gbc_1, gex_2, gbc_2])

# One group per 'sample_id' value
grouped = df.groupby('sample_id')

# Columns written to each per-sample CSV, in this order
libraries_columns = ['fastqs', 'sample', 'library_type']

# Write one feature_extract_lib_<sample_id>.csv per group into the current directory
for group_name, group_data in grouped:
    print(f"Group {group_name}:")
    group_df = pd.DataFrame(group_data)
    csv_name = 'feature_extract_lib_{0}.csv'.format(group_name)
    group_df[libraries_columns].to_csv(csv_name, index = False)

Group 10A:
Group 10B:
Group 11A:
Group 11B:
Group 12A:
Group 12B:
Group 13A:
Group 13B:
Group 14A:
Group 14B:
Group 2A:
Group 2B:
Group 3A:
Group 3B:
Group 4A:
Group 4B:
Group 5A:
Group 5B:
Group 6A:
Group 6B:


In [18]:
!ls

feature_extract_lib_10A.csv  feature_extract_lib_2A.csv
feature_extract_lib_10B.csv  feature_extract_lib_2B.csv
feature_extract_lib_11A.csv  feature_extract_lib_3A.csv
feature_extract_lib_11B.csv  feature_extract_lib_3B.csv
feature_extract_lib_12A.csv  feature_extract_lib_4A.csv
feature_extract_lib_12B.csv  feature_extract_lib_4B.csv
feature_extract_lib_13A.csv  feature_extract_lib_5A.csv
feature_extract_lib_13B.csv  feature_extract_lib_5B.csv
feature_extract_lib_14A.csv  feature_extract_lib_6A.csv
feature_extract_lib_14B.csv  feature_extract_lib_6B.csv


In [24]:
pd.read_csv('feature_extract_lib_10A.csv')

Unnamed: 0,fastqs,sample,library_type
0,/data/norman/southark/datasets/Project_15055/F...,10A_GEX_IGO_15055_11,Gene Expression
1,/data/norman/southark/datasets/Project_15055/F...,10A_GEX_IGO_15055_11,Gene Expression
2,/data/norman/southark/datasets/Project_15055/F...,10A_GEX_IGO_15055_11,Gene Expression
3,/data/norman/southark/datasets/Project_15055/F...,10A_guides_IGO_15055_31,CRISPR Guide Capture
4,/data/norman/southark/datasets/Project_15055_B...,10A_GEX_IGO_15055_B_11,Gene Expression
5,/data/norman/southark/datasets/Project_15055_B...,10A_GEX_IGO_15055_B_11,Gene Expression
6,/data/norman/southark/datasets/Project_15055_B...,10A_guides_IGO_15055_B_31,CRISPR Guide Capture


In [20]:
#check files

In [21]:
pd.read_csv('feature_extract_lib_10A.csv').fastqs[5]

'/data/norman/southark/datasets/Project_15055_B/FAUCI_0055/Sample_10A_GEX_IGO_15055_B_11'

# Set up bash script for bsub

In [1]:
#write script to generate bsub file with ref locations etc

In [34]:
# Set the reference path
ref_path = "/your/reference/path"

# Path to the folder containing CSV files
csv_folder = working_dir

# Path to the control CSV file
control_csv_path = "/path/to/your/rpe1_tfs_crispra_w_controls.csv"

# Memory allocation, wall time, and number of cores settings
# if total mem requested changes you should modify the run_cellranger_count.sh script
memory_allocation = 4  # You can change this value
wall_time = "48:00"  # You can change this value
num_cores = 64  # You can change this value

#calc total mem requested
total_mem = memory_allocation*num_cores
print(f"requesting the following amount of memory: {total_mem} GB")

# Output script file
script_file = "submit_jobs.sh"

# Open the script file for writing
with open(script_file, "w") as script_file_writer:
    # os.listdir returns entries in arbitrary order; sort them so the generated
    # script is identical from run to run.
    for csv_file in sorted(os.listdir(csv_folder)):
        # Only the per-sample library CSVs (feature_extract_lib_<sample_id>.csv)
        if csv_file.endswith(".csv") and csv_file.startswith("feature_extract_lib_"):
            # Extract sample_id from the CSV file name
            sample_id = csv_file.split("_")[-1].split(".")[0]

            # One bsub command per sample; run_cellranger_count.sh is resolved
            # relative to the -cwd directory.
            bsub_command = f"bsub -o {sample_id}.o -e {sample_id}.e -n {num_cores} -R 'rusage[mem={memory_allocation}]' -W {wall_time} -cwd {working_dir} /usr/bin/bash run_cellranger_count.sh {sample_id} {os.path.join(csv_folder, csv_file)} {ref_path} {control_csv_path}\n"
            script_file_writer.write(bsub_command)

# Inform the user
print(f"Script file '{script_file}' created. You can submit the jobs later using 'bash {script_file}'.")

requesting the following amount of memory: 256 GB
Script file 'submit_jobs.sh' created. You can submit the jobs later using 'bash submit_jobs.sh'.


In [35]:
#creation/modification of run_cellranger_count.sh script

In [36]:
# Template for the per-sample runner. {0} is replaced by the core count and {1}
# by the memory handed to cellranger. LOCALCORES / LOCALMEM are defined inside
# the script so the echo lines report real values (they were previously unset),
# and shell variables are quoted so paths with spaces survive.
script_content = """#!/usr/bin/bash

# Usage: run_cellranger_count.sh NAME CSV_FILE GENOME GUIDE_CSV
# LOCALCORES and LOCALMEM are fixed when this script is generated.

NAME=$1
CSV_FILE=$2
GENOME=$3
GUIDE_CSV=$4
LOCALCORES={0}
LOCALMEM={1}

echo "Name: $NAME"
echo "CSV File: $CSV_FILE"
echo "Genome: $GENOME"
echo "Guide CSV: $GUIDE_CSV"
echo "Local Cores: $LOCALCORES"
echo "Local Memory: $LOCALMEM"

export PATH=/data/norman/southark/software/cellranger-7.1.0:$PATH

# Uncomment the following line if you want to perform a sitecheck
# cellranger sitecheck > sitecheck.txt

cellranger count --id="$NAME" --libraries="$CSV_FILE" --transcriptome="$GENOME" --feature-ref="$GUIDE_CSV" --localcores="$LOCALCORES" --localmem="$LOCALMEM" --include-introns=false
""".format(num_cores, int(total_mem*0.9)) #cellranger should only have 90% of total available

# Specify the file path
script_file_path = "run_cellranger_count.sh"

# Write the content to the script file. The handle gets its own name so it does
# not overwrite `script_file`, which holds the submit-script file name.
with open(script_file_path, "w") as run_script_handle:
    run_script_handle.write(script_content)

# Display a message indicating that the file has been created
print(f"Script file '{script_file_path}' created.")

Script file 'run_cellranger_count.sh' created.


# Job submission instructions

log on to lilac and submit jobs with the following code snippets

```
#submit jobs with this (every line of submit_jobs.sh is already a complete bsub command):
bash /working_dir/submit_jobs.sh

#live updates of submission status:
watch -n0.01 bjobs

#get list of running/submitted jobs
bjobs

#kill currently running jobs (if something is not right)
bjobs -o "jobid stat" | awk '{print $1}' | tail -n +2 | xargs bkill

#info mem/usage of current job
bjobs -l <job_id>
