Using the targets file produced by 
~~~~
merrycrispr prep-sequences \
    --gtf ~/workspace//mc_human_files/Homo_sapiens.GRCh38.97.gtf \
    --fasta ~/workspace/mc_human_files/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
    --library_type knockout \
    --output ~/workspace/mc_human_files/pml.fa \
    -n PML
~~~~

and 

~~~~
merrycrispr create-library \
    -i mc_human_files/pml.fa \
    -p mc_human_files/pml.csv \
    -n SpCas9 \
    -r bowtie/GCA_000001405.15_GRCh38_no_alt_analysis_set \
    -c 8
    ~~~~

In [1]:
import pandas as pd
import numpy as np
import pyfaidx
import regex

spacers = "/Users/milessmith/workspace/mc_human_files/pml_targets.fa"
otrf = "/Users/milessmith/workspace/mc_human_files/bowtie_results.csv"
mmpos_re = regex.compile("[0-9]{1,}")

In [2]:
bowtie_results = pd.read_csv(
        otrf,
        header=None,
        names=[
            "hash",
            "strand",
            "refseq",
            "position",
            "seq",
            "readquality",
            "aligncount",
            "mismatches",
        ],
        usecols=['hash', 
                 'mismatches'],
        dtype={
            "hash": "int64",
            "mismatches": "str",
        },
        na_filter=False,
        skip_blank_lines=True,
        sep="\t",
        memory_map=True,
    )

In [3]:
spacers_fa = pyfaidx.Fasta(spacers)
spacers_dict = {k:v for k,v in zip(spacers_fa.keys(), 
                                   [spacers_fa[_][:].seq for _ in spacers_fa.keys()])}
spacers_df = pd.DataFrame.from_dict(data=spacers_dict, 
                                    orient='index', 
                                    columns=["spacer"])\
    .reset_index(drop=False)\
    .rename(index=str, 
            columns={"index":"hash"})
spacers_df = spacers_df.astype(dtype={"hash":"int64",
                                      "spacer":"str"})

In [4]:
bowtie_results = bowtie_results.fillna(0)

spacers_df["off_target_score"] = np.repeat(0, spacers_df.shape[0])
spacers_df["number_matching"] = np.repeat(0, spacers_df.shape[0])

spacers_df = spacers_df[spacers_df["hash"].isin(bowtie_results["hash"])]

In [5]:
results_count = bowtie_results.groupby('hash').agg('count').reset_index()
filtered_results = results_count[results_count['mismatches'] < 500]

In [6]:
print(f"before filtering: {bowtie_results.shape[0]}\n"
      f"after filtering: {bowtie_results[bowtie_results['hash'].isin(filtered_results['hash'])].shape[0]}")

before filtering: 46187665
after filtering: 346519


In [7]:
filtered_results.head()

Unnamed: 0,hash,mismatches
3,-9209457168488907513,275
9,-9171222395245968596,145
11,-9160792478378761553,186
21,-9133264647069694812,218
22,-9129542392350055743,150


In [None]:
from merrycrispr.off_target_scoring import sumofftargets
bowtie_results = bowtie_results.groupby('hash')
start = 0
end = 20
off_target_count_threshold = None

In [None]:
for i in spacers_df['hash'].unique():
    matching_locations = bowtie_results.get_group(i).reset_index(drop=True)

    # if the number of mismatches is above a threshold, remove the spacer
    # if there are more than one perfect matches
    if len(matching_locations[matching_locations["mismatches"] == 0].index) > 1:
        score = 0
    # if there is only one entry - no offtargets, assign a score of 0
    elif matching_locations.shape[0] == 1:
        score = 100
    # elif there are mismatch positions, get the positions, make a list
    # holding lists of those positions, and score
    else:
        mmpos = [
            mmpos_re.findall(str(j[1]["mismatches"]))
            for j in matching_locations.iterrows()
        ]
        score = sumofftargets(
            mmpos, start=start, end=end
        )
    results_count.loc[i, "off_target_score"] = score

Note the above run time: `1h 7m 11s`.  And that is just for *one* gene.

In [None]:
results_count.shape

In [None]:
part1 = results_count.iloc[:4980,]
part2 = results_count.iloc[4980:,]

In [None]:
part1.head()

In [None]:
part2.head()

In [None]:
off_target_scores = part2.drop(columns=['hash','mismatches']).reset_index().rename(index=str, columns={'index':'hash'})

In [None]:
scored_off_targets = bowtie_results.agg('count').reset_index().merge(off_target_scores)

In [None]:
scored_off_targets.sort_values(axis=0, 
                               by="off_target_score",
                               ascending=False).head()

In [None]:
scored_off_targets.sort_values(axis=0, 
                               by="off_target_score",
                               ascending=True).head()

In [None]:
bowtie_results.get_group(7331002894579019723)

In [None]:
bowtie_results.get_group(6469466269579083296)

In [None]:
mmpos = [
            mmpos_re.findall(str(j[1]["mismatches"]))
            for j in bowtie_results.get_group(7331002894579019723).iterrows()
        ]

In [None]:
mmpos

In [None]:
sumofftargets(mmpos, start=0, end=20)

Would like to perform this in parallel.  Trying the `swifter` package

In [71]:
import swifter



Hmmm... swifter actually increases the run time.

In [6]:
filtered_bowtie = bowtie_results[bowtie_results['hash'].isin(filtered_results['hash'])]

In [7]:
mmpos = regex.compile("[0-9]{1,}")
filtered_bowtie['locations'] = filtered_bowtie['mismatches'].apply(lambda x: mmpos.findall(x))
filtered_bowtie.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,hash,mismatches,locations
524,-1245412300131519133,,[]
525,-1245412300131519133,9:A>C,[9]
526,-1245412300131519133,9:C>G,[9]
527,-1245412300131519133,"1:C>T,4:A>C","[1, 4]"
528,-1245412300131519133,"4:T>C,10:A>G","[4, 10]"


In [8]:
filtered_bowtie.groupby('hash').apply(lambda x: x['locations'].values).head()

hash
-9209457168488907513    [[], [2, 12], [9, 18], [2, 14], [0, 2], [8, 18...
-9171222395245968596    [[], [18], [6, 14], [12, 18], [12, 18], [0, 15...
-9160792478378761553    [[], [4, 17], [15, 17], [4, 17], [3, 17], [4, ...
-9133264647069694812    [[], [5, 17], [5, 13], [4, 11], [4, 8], [5, 9]...
-9129542392350055743    [[], [14], [2, 9], [3, 4], [12, 16], [12, 16],...
dtype: object

In [17]:
filtered_bowtie.dtypes

hash           int64
mismatches    object
locations     object
dtype: object

In [9]:
from merrycrispr.off_target_scoring import sumofftargets

In [10]:
filtered_bowtie.groupby('hash').apply(lambda x: sumofftargets(x['locations'].values, start=0, end=20)).head()

hash
-9209457168488907513    31.261999
-9171222395245968596    55.416705
-9160792478378761553    41.274608
-9133264647069694812    34.989926
-9129542392350055743    39.480243
dtype: float64

In [26]:
collapsed_filtered_bowtie = filtered_bowtie.groupby('hash').apply(lambda x: x['locations'].values).reset_index()

In [27]:
collapsed_filtered_bowtie.head()

Unnamed: 0,hash,0
0,-9209457168488907513,"[[], [2, 12], [9, 18], [2, 14], [0, 2], [8, 18..."
1,-9171222395245968596,"[[], [18], [6, 14], [12, 18], [12, 18], [0, 15..."
2,-9160792478378761553,"[[], [4, 17], [15, 17], [4, 17], [3, 17], [4, ..."
3,-9133264647069694812,"[[], [5, 17], [5, 13], [4, 11], [4, 8], [5, 9]..."
4,-9129542392350055743,"[[], [14], [2, 9], [3, 4], [12, 16], [12, 16],..."


collapsed_filtered_bowtie.head()

In [29]:
collapsed_filtered_bowtie.apply(lambda x: sumofftargets(x[0], start=0, stop=20))

TypeError: ("sumofftargets() got an unexpected keyword argument 'stop'", 'occurred at index hash')

This method seems radically faster.  Wonder how long it would take to apply to the entirety of bowtie_results?

In [93]:
bowtie_results['locations'] = bowtie_results['mismatches'].apply(lambda x: mmpos.findall(x))
bowtie_results.groupby('hash').apply(lambda x: sumofftargets(x['locations'].values, start=0, end=20)).head()

hash
-9222657694777189612    11.633329
-9218736219115679820     5.990047
-9216575950229189898     6.011358
-9209457168488907513    31.261999
-9201242568838524401     6.357073
dtype: float64

Damn... down from just over 1 hour to 5 minutes.  Probably don't need to optimize this any further, but I would like to see if parallelizing with dask helps when the list is much larger:

In [11]:
import dask.dataframe as dd
from dask.distributed import Client
from dask.multiprocessing import get

In [None]:
client = Client()

In [12]:
bowtie_results_dd = dd.from_pandas(bowtie_results, npartitions=8)

In [19]:
bowtie_results_dd['locations'] = bowtie_results_dd['mismatches'].apply(lambda x: mmpos.findall(x), meta=pd.Series(dtype='object'))

In [20]:
bowtie_results_dd.head()

Unnamed: 0,hash,mismatches,locations
0,-2956615638747643762,,[]
1,-2956615638747643762,6:T>G,[6]
2,-2956615638747643762,"0:C>T,5:T>C","[0, 5]"
3,-2956615638747643762,"9:A>T,18:G>A","[9, 18]"
4,-2956615638747643762,"9:T>A,18:C>T","[9, 18]"


In [21]:
bowtie_results_dd.groupby('hash')\
    .apply(lambda x: sumofftargets(x['locations'].values, 
                                   start=0,
                                   end=20),
           meta=pd.Series(dtype='float64'))\
    .head()

KeyboardInterrupt: 

In [None]:
client.

In [None]:
print("stuff")