# Notebook to find the number of unique guides in a CRISPRi library
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/milnus/unique_CRISPRi_segments/blob/main/unique_CRISPRi_notebook.ipynb)


To use the notebook, you need to provide two pieces of information:
1. A file(s) with sequence(s) that are to be searched for guides.
2. The sequence that is just after the guide in the plasmid.

To give the required inputs: first, enter the sequence downstream of the primers in the input window below. Second, run the code in the first code window below by pushing the 'play' button in the window's upper left corner. A required program will install, which takes a couple of minutes and requires a restart of the underlying 'computer' running the code. When this is done, go to `Runtime` and then select `Run all`. In the second code window from the top, a 'Choose files' button will appear at the bottom. Using this, you can select all your sequencing files. The script will then run, and the results will be reported at the bottom of the notebook.

In [None]:
#@title Install required program
# Bootstrap conda inside the Colab runtime: install the condacolab helper
# with pip, then let it install Miniconda.
# NOTE(review): the notebook instructions say this step takes a couple of
# minutes and restarts the runtime - confirm against the condacolab docs.
!pip install -q condacolab
import condacolab
condacolab.install_miniconda()

# Install seqkit from the bioconda channel (-y skips the confirmation prompt)
!conda install bioconda::seqkit -y

In [None]:
#@title Input sequence(s)
# Import used packages
from subprocess import Popen, PIPE
from shlex import split
import pandas as pd
from google.colab import files

# Get inputs
#@markdown Change the sequence below to be the one that is just next to the guide sequence
sequence_next_to_guides = 'GTTTTAGAGCTAGAAA' #@param {type:"string"}

# NOTE(review): the spacer-counting cell extracts region '-21:-1' (the bases
# before the matched sequence, per seqkit's flanking coordinates) for
# "Up-stream" and '1:21' (the bases after it) for "Down-stream". The option
# therefore appears to describe where the guide sits relative to the sequence
# above, while the form text below reads the other way round - confirm and
# reword if needed.
#@markdown Set if the sequence above is just up- or down-stream of the guide sequence
guide_placement = "Up-stream" #@param ["Up-stream", "Down-stream"]

#@markdown Now run this cell to get a 'Choose files' button, where you can select a folder, file, or files to be analysed.
# Opens Colab's upload widget. Later cells index the result by file name and
# decode each entry as UTF-8, so it is used as a mapping of name -> bytes.
uploaded_files = files.upload()

# Remove all white-space (spaces, tabs, line breaks) from the input sequence so
# that stray characters pasted into the form do not end up in the search pattern
sequence_next_to_guides = "".join(sequence_next_to_guides.split())

# TODO: Use of a forward and reverse primer??

In [None]:
#@title #### Checking input with seqkit
### Check that the input sequence and every uploaded file are valid DNA fasta
### according to seqkit


def _seqkit_validation_error(fasta_bytes):
  """Return seqkit's complaint about *fasta_bytes*, or None when it is valid.

  The data is sent to ``seqkit seq -t dna -v`` on stdin instead of being
  passed through ``echo``: quotes in the data cannot break the command line,
  no argument-size limit applies, and ``communicate`` drains the pipes while
  writing so large inputs cannot stall the process.

  Args:
    fasta_bytes: fasta-formatted data as bytes.

  Returns:
    None if seqkit exits with status 0, otherwise the text seqkit wrote to
    stderr (reduced to the part after "seq: " when that marker is present).
  """
  validator = Popen(["seqkit", "seq", "-t", "dna", "-v"], stdin=PIPE, stdout=PIPE, stderr=PIPE)
  _, stderr_bytes = validator.communicate(input=fasta_bytes)
  if validator.returncode == 0:
    return None

  message = stderr_bytes.decode("utf-8", errors="replace").strip()
  # Keep only the informative tail after "seq: " when seqkit's message has
  # one; fall back to the full message so nothing is lost otherwise.
  _, marker, detail = message.partition("seq: ")
  return detail if marker else message


# Check the input sequence
if not sequence_next_to_guides:
  raise TypeError("Input sequence next to the guide is empty. Enter a sequence in the input cell and run it again.")

seqkit_error = _seqkit_validation_error(f">test_fasta\n{sequence_next_to_guides}\n".encode("utf-8"))
if seqkit_error is not None:
  raise TypeError(f'''Input sequence next to the guide contains characters that are non-valid in fasta format.\nOutput from seqkit:\n{seqkit_error}''')


# Check the input folder or files:
for file, file_content in uploaded_files.items():
  seqkit_error = _seqkit_validation_error(file_content)
  if seqkit_error is not None:
    # Name the offending file so the user knows which upload to fix
    raise TypeError(f'''Uploaded file '{file}' contains characters that are non-valid in fasta format.\nOutput from seqkit:\n{seqkit_error}''')

print("All files are validated by Seqkit")

In [None]:
#@title Find number of times sequences occur
# Goals for this cell:
#   - Output percentage of sequences with guide
#   - Output an interactive table that can be sorted
#   - Include sequences that do not have a hit - this is important for biological understanding
# NOTE(review): no interactive table is produced below; results are printed as plain text.

# Label stored for files in which no spacer is found
NO_SPACER = 'No spacer detected'

# Number of bases next to the matched sequence that are reported as the spacer
SPACER_WINDOW = 21


def _pipe_through(command, data):
  """Run *command* with *data* on stdin and return what it writes to stdout.

  ``communicate`` writes stdin and drains stdout/stderr together, so the
  child process cannot block on a full pipe regardless of the data size.

  Args:
    command: the program and its arguments as a list of strings.
    data: bytes sent to the program's stdin.

  Returns:
    The program's stdout as bytes.

  Raises:
    RuntimeError: if the program exits with a non-zero status. Without this
      check a failing tool would silently be counted as "no spacer".
  """
  process = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
  stdout_bytes, stderr_bytes = process.communicate(input=data)
  if process.returncode != 0:
    details = stderr_bytes.decode('utf-8', errors='replace').strip()
    raise RuntimeError(f"Command '{' '.join(command)}' failed with exit code {process.returncode}:\n{details}")
  return stdout_bytes


if not uploaded_files:
  # Without files the percentage below would divide by zero
  raise ValueError("No files were uploaded. Run the input cell again and choose at least one file.")

# Construct dictionary with the spacer found in each file (file name -> spacer)
spacer_dict = {}

# Determine if the up- or down-stream region is the guide, based on primer direction.
# In seqkit's flanking mode (-f) positive coordinates count bases after the
# match and negative coordinates count bases before it.
if guide_placement == "Down-stream":
  extract_region = f'1:{SPACER_WINDOW}'
else:
  extract_region = f'-{SPACER_WINDOW}:-1'

# Arguments are passed as a list so the user-supplied sequence is never re-parsed as shell text
amplicon_command = ["seqkit", "amplicon", "-F", sequence_next_to_guides, "-f", "-r", extract_region]

for file, file_content in uploaded_files.items():
  flanking_fasta = _pipe_through(amplicon_command, file_content)

  if flanking_fasta.strip():
    # Convert the fasta record(s) to a table and keep the sequence column
    table = _pipe_through(["seqkit", "fx2tab", "-s"], flanking_fasta)
    sequence_column = _pipe_through(["cut", "-f2"], table)
    spacer = sequence_column.decode('utf-8').strip()
  else:
    spacer = ''

  # Check if no spacer is found
  spacer_dict[file] = spacer if spacer else NO_SPACER


# Group the file names by spacer once, so the reports below need no rescans
# (files keep their upload order within each group)
files_by_spacer = {}
for file, spacer in spacer_dict.items():
  files_by_spacer.setdefault(spacer, []).append(file)

# Count spacers
spacer_count_dict = {spacer: len(file_names) for spacer, file_names in files_by_spacer.items()}

spacer_series = pd.Series(spacer_count_dict)

# Print informations of interest
# Exact label match, so a spacer can never be mistaken for the "no spacer" entry
num_no_spacer = spacer_series[spacer_series.index == NO_SPACER]
num_uniq_spacers = len(spacer_series) - len(num_no_spacer)
print(f"There are {num_uniq_spacers} unique spacers in supplied files.")
print(f'\t{num_uniq_spacers/len(uploaded_files)*100} % of spacers are unique.')

print("\n\n----------------------------------\n\n")

if len(num_no_spacer):
  print(f"{num_no_spacer.iloc[0]} files have no identified spacers:")
  for file in files_by_spacer[NO_SPACER]:
    print(f'\t{file}')
else:
  print("All sequences had an inserted guide")

# Find the spacers that appear multiple times and find the files that contain them
multi_occur_spacers = [spacer for spacer, count in spacer_count_dict.items() if count > 1 and spacer != NO_SPACER]

print("\n\n----------------------------------\n\n")
if multi_occur_spacers:
  for spacer in multi_occur_spacers:
    print(f"Spacer: {spacer} was found multiple times.\nThe spacer was found in files:")
    for file in files_by_spacer[spacer]:
      print(f"\t{file}")
else:
  print("All spacers occured only once")
