# Genome Mining Notebook

## Configuration

Here are the parameters that you should edit for each run. **runName** is the name of the run, **pattern** is the regular expression that candidate lasso peptide sequences are matched against, **cutoffRank** is the minimum rank a hit must reach to be registered, **genomeDir** is the directory where the FASTA and FNA files to be mined are stored, **database** is the path of the SQLite database file that results are written to, and **outputLogs** is where the output should be written to.

**motifs** is a list of model files to generate motifs for and search with.

**memeDir** is the directory where the meme suite was installed on this computer, and **localMotifDir** is some empty or nonexistent directory that this program can use as scratch space.

In [None]:
import yaml
# Run configuration, parsed from the inline YAML document below.
# The literal is a raw string (r''') so that the backslash in the regex
# `pattern` reaches the YAML parser untouched instead of being read by Python
# as an (invalid) escape sequence.
config = yaml.safe_load(r'''
# Setup parameters for a genome mining run

## Parameters for each run
runName: "test"
pattern: 'M[A-Z]{15,45}T[A-Z][A-Z]{6,8}[DE][A-Z]{5,30}\*'
cutoffRank: 0.4 # minimum rank to register as a hit
genomeDir: "/home/nathan/Desktop/lassomining/genomes/" # keep the trailing /
database: "/home/nathan/Desktop/lassomining/output/matches.db" # no trailing /
motifs:
  - "/home/nathan/Desktop/lassomining/models/memeb.xml"
  - "/home/nathan/Desktop/lassomining/models/memec.xml"

## Parameters you should not have to change
memeDir: "/home/nathan/meme"

''')

Most of the logic behind the mining lives in a series of functions (imported from the `mining` module below) that are used in the following code blocks.

## Mining
Here are the commands for actually running the genome mining software. Output is printed below this block as it progresses, and this is the slowest running part of the program.

In [None]:
import traceback
import sys
import os
import time
import shutil
import json
from pathlib import Path
from mining import mine

In [None]:
# Unpack the run configuration into the notebook-level names used by the
# mining cell below.
runName = config["runName"]
pattern = config["pattern"]
cutoffRank = config["cutoffRank"]
genomeDir = config["genomeDir"]
databaseDir = config["database"]  # despite the name, this is the database *file* path
motifs = config["motifs"]
memeDir = config["memeDir"]

# Echo the settings for this run. f-strings are used so that a value YAML
# parsed as a non-string (e.g. an unquoted numeric run name) is still printed
# instead of raising TypeError on string concatenation.
print(f"Beginning run {runName}")
print(f"cutting off hits below {cutoffRank}")
print(f"searching for pattern {pattern}")
print("Using these motifs:")
print(motifs)
print(f"Genomes being read from {genomeDir}")
print(f"writing output to {databaseDir}")

In [None]:
# Prepare the input/output locations, then run the mining. Any failure is
# reported (with its full traceback) rather than re-raised, so the rest of the
# notebook can still be executed against an existing database.
try:
    # start a timer so the total mining time can be reported at the end
    t0 = time.time()

    # store meta data about the particular run
    # (not read again in this cell; kept so the name stays defined)
    runStatus = {
        "name": runName,
        "pattern": pattern,
        "input": [],
        "progress": 0.0,
        "peptides": 0,
        "cutoff": cutoffRank
    }

    ## create filepaths
    # create genome folder if not already there
    if not os.path.exists(genomeDir):
        print("could not find " + genomeDir + ", attempting to make it")
        os.makedirs(genomeDir, exist_ok=True)
    # create the folder holding the output database if not already there.
    # os.path.dirname returns "" for a bare filename; that means "current
    # directory", which needs no creation (os.makedirs("") would raise).
    databaseFolder = os.path.dirname(databaseDir)
    if databaseFolder and not os.path.exists(databaseFolder):
        print("creating database directory " + databaseFolder)
        os.makedirs(databaseFolder, exist_ok=True)
    # create an empty database file if not already there
    if not os.path.exists(databaseDir):
        print("Could not find " + databaseDir + ", attempting to create...")
        Path(databaseDir).touch()

    mine(genomeDir, runName, pattern, cutoffRank, databaseDir, memeDir, motifs)
    print("finished all the runs for " + runName)
    print("total mining time: %.1f seconds" % (time.time() - t0))

except Exception:
    print("An error occured while mining")
    # print_exc reports the traceback together with the exception type and message
    traceback.print_exc()

## Lasso Viewer

A rudimentary method of looking at lassos within this notebook.

In [None]:
import sqlite3
import pandas as pd
from demjson import decode
import matplotlib.pyplot as plt

In [None]:
import re


def regexp(expr, item):
    """Regular-expression search, registered with SQLite as the REGEXP function.

    Args:
        expr: the regular expression pattern.
        item: the value to search in; None (SQL NULL) never matches.

    Returns:
        True if `expr` matches anywhere inside `item`, otherwise False.
    """
    # A NULL column value arrives as None; re.search would raise TypeError on it.
    if item is None:
        return False
    return re.search(expr, item) is not None

# Open the results database named in the run configuration.
database_path = config["database"]
conn = sqlite3.connect(database_path)
# Expose the Python regexp helper to SQL as a two-argument REGEXP function.
conn.create_function("REGEXP", 2, regexp)

In [None]:
# List every genome that has at least one hit recorded for the configured run.
# The run name is bound as a SQL parameter instead of being spliced into the
# statement, so a name containing a quote cannot break or alter the query.
selection_string = "SELECT DISTINCT genome FROM lassopeptides WHERE runname is ?"
c = conn.cursor()
try:
    distinct_genomes = [row[0] for row in c.execute(selection_string, (config["runName"],))]
finally:
    # release the cursor even if the query fails
    c.close()
print(distinct_genomes)

In [None]:
import json
# Load every distinct hit of the first genome for this run, best rank first.
genome = distinct_genomes[0]
# Run and genome names are bound as parameters. Embedding them in double
# quotes (as string-built SQL would) makes SQLite try to resolve them as
# column names first, so a value equal to a column name would silently
# compare column to column.
selection_string = (
    "SELECT DISTINCT sequence, start, end, overallLength, rank, orf, closestOrfs "
    "FROM lassopeptides WHERE runname is ? AND genome is ?"
)
c = conn.cursor()
try:
    lasso_peptides = [
        {
            "sequence": row[0],
            "start": row[1],
            "end": row[2],
            "overallLength": row[3],
            "rank": row[4],
            "orf": row[5],
            # the stored value is decoded twice: json.loads first, then demjson's decode
            "closestOrfs": decode(json.loads(row[6])),
        }
        for row in c.execute(selection_string, (config["runName"], genome))
    ]
finally:
    # release the cursor even if the query or decoding fails
    c.close()


def sortFunct(lasso):
    """Sort key: the rank of a lasso peptide record."""
    return lasso["rank"]


# highest rank first
lasso_peptides.sort(key=sortFunct, reverse=True)

In [None]:
# Draw one panel per top-ranked peptide: the peptide itself as a red segment
# at y=1, and each of its closest ORFs as a coloured segment stacked above it.
num_to_show = 10
# Only as many panels as there are peptides, so no empty axes are drawn.
top_lassos = lasso_peptides[:num_to_show]

color_arr = ['blue', 'green', 'yellow', 'purple', 'brown', 'cyan']
if not top_lassos:
    print("no lasso peptides to plot")
else:
    # squeeze=False keeps `axs` two-dimensional even for a single row, so the
    # same indexing works for any number of panels; 3 inches of height per
    # panel reproduces the 15x30 figure for ten panels.
    fig, axs = plt.subplots(
        len(top_lassos), 1,
        figsize=(15, 3 * len(top_lassos)),
        constrained_layout=True,
        squeeze=False,
    )
    for ax, lasso in zip(axs[:, 0], top_lassos):
        ax.plot([lasso["start"], lasso["end"]], [1, 1], label='a', marker='o', markerfacecolor='red', markersize=5, color='red', linewidth=4)
        for inner_count, closeOrf in enumerate(lasso['closestOrfs']):
            # each ORF gets its own height so overlapping spans stay visible
            height = 3 + (2 * inner_count)
            ax.plot([closeOrf['start'], closeOrf['end']], [height, height], label=closeOrf["motifType"], marker='o', markerfacecolor='blue', markersize=5, color=color_arr[inner_count % len(color_arr)], linewidth=4)
        ax.set_title(f'{lasso["sequence"]}, rank: {lasso["rank"]}')
        ax.legend()

    plt.show()

In [None]:
# Report the 1-based position and rank of every hit whose sequence contains
# the hard-coded fragment below.
ind = 0  # stays 0 when there are no peptides at all
for ind, lasso in enumerate(lasso_peptides, start=1):
    if "MEKIETHEDL" not in lasso["sequence"]:
        continue
    print("found the real!")
    print(lasso["sequence"])
    print(f'index: {ind}, rank: {lasso["rank"]}')

## Graphical Genome-wide representations
Plots the rank distribution of the peptides found - a straight line indicates that no discrimination occurred while ranking, whereas a steep cutoff indicates that the "good" hits were strongly separated from noise.

In [None]:
import matplotlib.pyplot as plt
import sqlite3
import pandas as pd
import re


def regexp(expr, item):
    """Regular-expression search, registered with SQLite as the REGEXP function.

    Args:
        expr: the regular expression pattern.
        item: the value to search in; None (SQL NULL) never matches.

    Returns:
        True if `expr` matches anywhere inside `item`, otherwise False.
    """
    # A NULL column value arrives as None; re.search would raise TypeError on it.
    if item is None:
        return False
    return re.search(expr, item) is not None

# A connection opened by an earlier cell (or an earlier run of this one) may
# still be open; close it before reconnecting so repeated runs do not pile up
# open handles on the database file. Closing an already-closed connection is
# harmless.
if "conn" in globals():
    conn.close()
conn = sqlite3.connect(config["database"])
# Expose the Python regexp helper to SQL as a two-argument REGEXP function.
conn.create_function("REGEXP", 2, regexp)

In [None]:
# List every genome that has at least one hit recorded for the configured run.
# The run name is bound as a SQL parameter instead of being spliced into the
# statement, so a name containing a quote cannot break or alter the query.
selection_string = "SELECT DISTINCT genome FROM lassopeptides WHERE runname is ?"
c = conn.cursor()
try:
    distinct_genomes = [row[0] for row in c.execute(selection_string, (config["runName"],))]
finally:
    # release the cursor even if the query fails
    c.close()
print(distinct_genomes)

In [None]:
# For each genome, collect the `rank` of all of its hits in this run,
# sorted from highest to lowest.
# Values are bound as SQL parameters: a double-quoted value inside the
# statement text is resolved by SQLite as a column name first, and any quote
# character in a name would break the statement.
selection_string = "SELECT rank FROM lassopeptides WHERE runname is ? AND genome is ?"
genome_ranks = {}
c = conn.cursor()
try:
    for genome in distinct_genomes:
        ranks = [row[0] for row in c.execute(selection_string, (config["runName"], genome))]
        ranks.sort(reverse=True)
        genome_ranks[genome] = ranks
finally:
    # release the cursor even if a query fails
    c.close()

In [None]:
# Disabled variant that draws one panel per genome, kept for reference:
#
# # Multiple genomes at once
# fig, axs = plt.subplots(len(distinct_genomes), 1, figsize=(10,15))
# ind = 0
# for genome in genome_ranks.keys():
#     y = genome_ranks[genome]
#     x = range(0, len(y))
#     axs[ind].plot(x, y)
#     axs[ind].set_title(genome)
#     axs[ind].set(xlabel="number of peptides with this rank or higher", ylabel="rank")
#     ind += 1

# Rank curve for the first genome only: the ranks are already sorted from
# highest to lowest, so position i on the x axis is the number of hits ranked
# above the i-th one.
genome = distinct_genomes[0]
y = genome_ranks[genome]
x = range(len(y))
plt.plot(x, y)
plt.title(genome)
plt.xlabel("number of peptides with this rank or higher")
plt.ylabel("rank")

In [None]:
# List every genome that has at least one hit recorded for the configured run.
# The run name is bound as a SQL parameter instead of being spliced into the
# statement, so a name containing a quote cannot break or alter the query.
selection_string = "SELECT DISTINCT genome FROM lassopeptides WHERE runname is ?"
c = conn.cursor()
try:
    distinct_genomes = [row[0] for row in c.execute(selection_string, (config["runName"],))]
finally:
    # release the cursor even if the query fails
    c.close()
print(distinct_genomes)

In [None]:
# For each genome, collect the `secondaryRank` of all of its hits in this run,
# sorted from highest to lowest.
# Values are bound as SQL parameters: a double-quoted value inside the
# statement text is resolved by SQLite as a column name first, and any quote
# character in a name would break the statement.
selection_string = "SELECT secondaryRank FROM lassopeptides WHERE runname is ? AND genome is ?"
genome_ranks = {}
c = conn.cursor()
try:
    for genome in distinct_genomes:
        ranks = [row[0] for row in c.execute(selection_string, (config["runName"], genome))]
        ranks.sort(reverse=True)
        genome_ranks[genome] = ranks
finally:
    # release the cursor even if a query fails
    c.close()

In [None]:
# Disabled variant that draws one panel per genome, kept for reference:
#
# # Multiple genomes at once
# fig, axs = plt.subplots(len(distinct_genomes), 1, figsize=(10,15))
# ind = 0
# for genome in genome_ranks.keys():
#     y = genome_ranks[genome]
#     x = range(0, len(y))
#     axs[ind].plot(x, y)
#     axs[ind].set_title(genome)
#     axs[ind].set(xlabel="number of peptides with this rank or higher", ylabel="rank")
#     ind += 1

# Secondary-rank curve for the first genome only: the values are already
# sorted from highest to lowest, so position i on the x axis is the number of
# hits ranked above the i-th one.
genome = distinct_genomes[0]
y = genome_ranks[genome]
x = range(len(y))
plt.plot(x, y)
plt.title(genome)
plt.xlabel("number of peptides with this rank or higher")
plt.ylabel("secondary rank")

## Export
Export data to CSVs and Firebase

In [None]:
import os

In [None]:
from mining import export_to_csv
# Folder that receives the exported CSV files.
csv_dir = os.path.join('output', 'csvs')
# makedirs also creates the parent 'output' folder and accepts an existing
# directory; a plain os.mkdir fails when 'output' does not exist yet.
os.makedirs(csv_dir, exist_ok=True)
export_to_csv(config["runName"], config["database"], csv_dir)

In [None]:
from mining import export_to_firebase, clear_firebase

In [None]:
# JSON file handed to clear_firebase (presumably the Firebase credentials -
# confirm against the mining module). An optional `firebaseCredFile` entry in
# the run configuration overrides the machine-specific default path, which is
# kept so existing setups behave as before.
firebase_cred_file = config.get(
    "firebaseCredFile",
    '/home/nalam/lassomining/output/lasso-peptides-51ce2e6250b9.json',
)
clear_firebase(firebase_cred_file)

In [None]:
# Push the hits of the configured run from the local database to Firebase.
# NOTE(review): cred_file is False here although clear_firebase is given a
# credentials path - confirm this is intended.
firebase_db_path = config["database"]
firebase_run_name = config["runName"]
export_to_firebase(db_dir=firebase_db_path, run_name=firebase_run_name, cred_file=False)