Using the targets file produced by 
~~~~
merrycrispr prep-sequences \
    --gtf ~/workspace//mc_human_files/Homo_sapiens.GRCh38.97.gtf \
    --fasta ~/workspace/mc_human_files/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
    --library_type knockout \
    --output ~/workspace/mc_human_files/pml.fa \
    -n PML
~~~~

and 

~~~~
merrycrispr create-library \
    -i mc_human_files/pml.fa \
    -p mc_human_files/pml.csv \
    -n SpCas9 \
    -r bowtie/GCA_000001405.15_GRCh38_no_alt_analysis_set \
    -c 8
    ~~~~

In [1]:
import pandas as pd
import numpy as np
import pyfaidx
import regex
import tqdm

spacers = "/Users/milessmith/workspace/mc_human_files/pml_targets.fa"
otrf = "/Users/milessmith/workspace/mc_human_files/bowtie_results.csv"
mmpos = regex.compile("[0-9]{1,}")

In [2]:
bowtie_results = pd.read_csv(
        otrf,
        header=None,
        names=[
            "hash",
            "strand",
            "refseq",
            "position",
            "seq",
            "readquality",
            "aligncount",
            "mismatches",
        ],
        usecols=['hash', 
                 'mismatches'],
        dtype={
            "hash": "int64",
            "mismatches": "str",
        },
        na_filter=False,
        skip_blank_lines=True,
        sep="\t",
        memory_map=True,
    )

In [3]:
spacers_fa = pyfaidx.Fasta(spacers)
spacers_dict = {k:v for k,v in zip(spacers_fa.keys(), 
                                   [spacers_fa[_][:].seq for _ in spacers_fa.keys()])}
spacers_df = pd.DataFrame.from_dict(data=spacers_dict, 
                                    orient='index', 
                                    columns=["spacer"])\
    .reset_index(drop=False)\
    .rename(index=str, 
            columns={"index":"hash"})
spacers_df = spacers_df.astype(dtype={"hash":"int64",
                                      "spacer":"str"})

In [4]:
bowtie_results = bowtie_results.fillna(0)

spacers_df["off_target_score"] = np.repeat(0, spacers_df.shape[0])
spacers_df["number_matching"] = np.repeat(0, spacers_df.shape[0])

spacers_df = spacers_df[spacers_df["hash"].isin(bowtie_results["hash"])]

In [5]:
results_count = bowtie_results.groupby('hash').agg('count').reset_index()
filtered_results = results_count[results_count['mismatches'] < 500]

In [8]:
print(f"before filtering: {bowtie_results.shape[0]}\n"
      f"after filtering: {bowtie_results[bowtie_results['hash'].isin(filtered_results['hash'])].shape[0]}")

before filtering: 46187665
after filtering: 346519


In [9]:
filtered_results.head()

Unnamed: 0,hash,mismatches
3,-9209457168488907513,275
9,-9171222395245968596,145
11,-9160792478378761553,186
21,-9133264647069694812,218
22,-9129542392350055743,150


In [10]:
from merrycrispr.off_target_scoring import sumofftargets
grouped_bowtie_results = bowtie_results.groupby('hash')
start = 0
end = 20
off_target_count_threshold = None

In [None]:
for i in spacers_df['hash'].unique():
    matching_locations = grouped_bowtie_results.get_group(i).reset_index(drop=True)

    # if the number of mismatches is above a threshold, remove the spacer
    # if there are more than one perfect matches
    if len(matching_locations[matching_locations["mismatches"] == 0].index) > 1:
        score = 0
    # if there is only one entry - no offtargets, assign a score of 0
    elif matching_locations.shape[0] == 1:
        score = 100
    # elif there are mismatch positions, get the positions, make a list
    # holding lists of those positions, and score
    else:
        mmpos = [
            mmpos_re.findall(str(j[1]["mismatches"]))
            for j in matching_locations.iterrows()
        ]
        score = sumofftargets(
            mmpos, start=start, end=end
        )
    results_count.loc[i, "off_target_score"] = score

Note the above run time: `1h 7m 11s`.  And that is just for *one* gene.

In [None]:
results_count.shape

In [None]:
part1 = results_count.iloc[:4980,]
part2 = results_count.iloc[4980:,]

In [None]:
part1.head()

In [None]:
part2.head()

In [None]:
off_target_scores = part2.drop(columns=['hash','mismatches']).reset_index().rename(index=str, columns={'index':'hash'})

In [None]:
scored_off_targets = bowtie_results.agg('count').reset_index().merge(off_target_scores)

In [None]:
scored_off_targets.sort_values(axis=0, 
                               by="off_target_score",
                               ascending=False).head()

In [None]:
scored_off_targets.sort_values(axis=0, 
                               by="off_target_score",
                               ascending=True).head()

In [None]:
bowtie_results.get_group(7331002894579019723)

In [None]:
bowtie_results.get_group(6469466269579083296)

In [None]:
mmpos = [
            mmpos_re.findall(str(j[1]["mismatches"]))
            for j in bowtie_results.get_group(7331002894579019723).iterrows()
        ]

In [None]:
mmpos

In [None]:
sumofftargets(mmpos, start=0, end=20)

In [7]:
filtered_bowtie = bowtie_results[bowtie_results['hash'].isin(filtered_results['hash'])]

In [19]:
from tqdm.autonotebook import tqdm
tqdm.pandas(desc="converting mismatches")
filtered_bowtie['locations'] = filtered_bowtie.progress_apply(lambda x: mmpos.findall(x['mismatches']), axis=1)
filtered_bowtie.head()

HBox(children=(IntProgress(value=0, description='converting mismatches', max=346519, style=ProgressStyle(descr…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,hash,mismatches,locations
524,-1245412300131519133,,[]
525,-1245412300131519133,9:A>C,[9]
526,-1245412300131519133,9:C>G,[9]
527,-1245412300131519133,"1:C>T,4:A>C","[1, 4]"
528,-1245412300131519133,"4:T>C,10:A>G","[4, 10]"


In [19]:
from tqdm.autonotebook import tqdm
tqdm.pandas(desc="converting mismatches")
filtered_bowtie['locations'] = filtered_bowtie.progress_apply(lambda x: mmpos.findall(x['mismatches']), axis=1)
filtered_bowtie.head()

HBox(children=(IntProgress(value=0, description='converting mismatches', max=346519, style=ProgressStyle(descr…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,hash,mismatches,locations
524,-1245412300131519133,,[]
525,-1245412300131519133,9:A>C,[9]
526,-1245412300131519133,9:C>G,[9]
527,-1245412300131519133,"1:C>T,4:A>C","[1, 4]"
528,-1245412300131519133,"4:T>C,10:A>G","[4, 10]"


In [9]:
filtered_bowtie["mismatches"].apply(mmpos.findall).head()

524         []
525        [9]
526        [9]
527     [1, 4]
528    [4, 10]
Name: mismatches, dtype: object

In [20]:
tqdm.pandas(desc="collapsing mismatches")
filtered_bowtie.groupby('hash').progress_apply(lambda x: x['locations'].values).head()

HBox(children=(IntProgress(value=0, description='collapsing mismatches', max=1203, style=ProgressStyle(descrip…

hash
-9209457168488907513    [[], [2, 12], [9, 18], [2, 14], [0, 2], [8, 18...
-9171222395245968596    [[], [18], [6, 14], [12, 18], [12, 18], [0, 15...
-9160792478378761553    [[], [4, 17], [15, 17], [4, 17], [3, 17], [4, ...
-9133264647069694812    [[], [5, 17], [5, 13], [4, 11], [4, 8], [5, 9]...
-9129542392350055743    [[], [14], [2, 9], [3, 4], [12, 16], [12, 16],...
dtype: object

In [15]:
filtered_bowtie.dtypes

hash           int64
mismatches    object
locations     object
dtype: object

In [16]:
mmpos.findall(filtered_bowtie.loc[527,"mismatches"])

['1', '4']

In [21]:
tqdm.pandas(desc="scoring mismatches")
filtered_bowtie.groupby('hash').progress_apply(lambda x: sumofftargets(x['locations'].values, start=0, end=20)).head()

HBox(children=(IntProgress(value=0, description='scoring mismatches', max=1203, style=ProgressStyle(descriptio…

hash
-9209457168488907513    31.261999
-9171222395245968596    55.416705
-9160792478378761553    41.274608
-9133264647069694812    34.989926
-9129542392350055743    39.480243
dtype: float64

Trying an alternative way of doing the same thing so I am also potentially use `swifter` or `dask`:

In [22]:
tqdm.pandas(desc="finding and collapsing mismatches")
collapsed_filtered_bowtie = filtered_bowtie.groupby('hash').progress_apply(lambda x: x['locations'].values).reset_index()

HBox(children=(IntProgress(value=0, description='finding and collapsing mismatches', max=1203, style=ProgressS…

In [23]:
collapsed_filtered_bowtie.head()

Unnamed: 0,hash,0
0,-9209457168488907513,"[[], [2, 12], [9, 18], [2, 14], [0, 2], [8, 18..."
1,-9171222395245968596,"[[], [18], [6, 14], [12, 18], [12, 18], [0, 15..."
2,-9160792478378761553,"[[], [4, 17], [15, 17], [4, 17], [3, 17], [4, ..."
3,-9133264647069694812,"[[], [5, 17], [5, 13], [4, 11], [4, 8], [5, 9]..."
4,-9129542392350055743,"[[], [14], [2, 9], [3, 4], [12, 16], [12, 16],..."


In [24]:
collapsed_filtered_bowtie = collapsed_filtered_bowtie.rename(index=str, columns={0:'locations'})

In [25]:
collapsed_filtered_bowtie.loc["0","locations"][10]

['9', '14']

In [27]:
tqdm.pandas(desc="scoring off-targets")
collapsed_filtered_bowtie.progress_apply(lambda x: sumofftargets(x['locations'], start=0, end=20), axis=1).head()

HBox(children=(IntProgress(value=0, description='scoring off-targets', max=1203, style=ProgressStyle(descripti…

0    31.261999
1    55.416705
2    41.274608
3    34.989926
4    39.480243
dtype: float64

This method seems radically faster.  Wonder how long it would take to apply to the entirety of bowtie_results?

In [29]:
tqdm.pandas(desc="converting mismatches")
bowtie_results['locations'] = bowtie_results['mismatches'].progress_apply(lambda x: mmpos.findall(x))
tqdm.pandas(desc="collapsing mismatches")
collapsed_bowtie = bowtie_results\
    .groupby('hash')\
    .progress_apply(lambda x: x['locations'].values)\
    .reset_index()\
    .rename(index=str,
            columns={0:'locations'})
tqdm.pandas(desc="scoring mismatches")
collapsed_bowtie.progress_apply(lambda x: sumofftargets(x['locations'], 
                                               start=0,
                                               end=20),
                       axis=1)\
    .head()

HBox(children=(IntProgress(value=0, description='converting mismatches', max=46187665, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='collapsing mismatches', max=4979, style=ProgressStyle(descrip…

HBox(children=(IntProgress(value=0, description='scoring mismatches', max=4979, style=ProgressStyle(descriptio…

0    11.633329
1     5.990047
2     6.011358
3    31.261999
4     6.357073
dtype: float64

Damn... down from just over 1 hour to ~5 minutes.  Probably don't need to optimize this any further, but I would like to see if parallelizing with dask helps when the list is much larger.  Using the `swifter` package is probably the easiest route before I start trying anything in `dask`:

In [30]:
import swifter

In [31]:
bowtie_results['locations'] = bowtie_results['mismatches'].swifter.apply(lambda x: mmpos.findall(x))
collapsed_bowtie = bowtie_results\
    .groupby('hash')\
    .apply(lambda x: x['locations'].values)\
    .reset_index()\
    .rename(index=str,
            columns={0:'locations'})
collapsed_bowtie\
    .swifter\
    .apply(lambda x: sumofftargets(x['locations'], 
                                   start=0,
                                   end=20),
           axis=1)\
    .head()

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=46187665, style=ProgressStyle(description_…

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=4979, style=ProgressStyle(description_widt…

0    11.633329
1     5.990047
2     6.011358
3    31.261999
4     6.357073
dtype: float64

Hmmm... swifter actually increases the run time.  From what I can see, I probably need to vectorize some of the scoring functions, but I will leave that for some post-finishing optimization

In [42]:
spacers_df = spacers_df.drop(columns=['off_target_score',"number_matching"])
spacers_df.head()

Unnamed: 0,hash,spacer
0,-2956615638747643762,TGAGCCGGCACCTCCCCTT
1,-1245412300131519133,CTCCCCTTTCGGACAGCTC
2,-7107283595214827167,TCCCCTTTCGGACAGCTCA
3,-1818764040153492721,GCTCAAGGGACTCAGCCAA
4,-5470846543948195797,ACCGAGAATCGAAACTAAG


In [39]:
tqdm.pandas(desc="scoring off-targets")
collapsed_filtered_bowtie["off_target_score"] = collapsed_filtered_bowtie\
    .progress_apply(lambda x: sumofftargets(x['locations'], start=0, end=20), axis=1)

HBox(children=(IntProgress(value=0, description='scoring off-targets', max=1203, style=ProgressStyle(descripti…

In [40]:
collapsed_filtered_bowtie.head()

Unnamed: 0,hash,locations,off_target_score
0,-9209457168488907513,"[[], [2, 12], [9, 18], [2, 14], [0, 2], [8, 18...",31.261999
1,-9171222395245968596,"[[], [18], [6, 14], [12, 18], [12, 18], [0, 15...",55.416705
2,-9160792478378761553,"[[], [4, 17], [15, 17], [4, 17], [3, 17], [4, ...",41.274608
3,-9133264647069694812,"[[], [5, 17], [5, 13], [4, 11], [4, 8], [5, 9]...",34.989926
4,-9129542392350055743,"[[], [14], [2, 9], [3, 4], [12, 16], [12, 16],...",39.480243


In [45]:
spacers_df = spacers_df.merge(collapsed_filtered_bowtie, on="hash")
spacers_df.head()

Unnamed: 0,hash,spacer,locations,off_target_score
0,-1245412300131519133,CTCCCCTTTCGGACAGCTC,"[[], [9], [9], [1, 4], [4, 10], [9, 18], [2, 7...",15.839646
1,-7107283595214827167,TCCCCTTTCGGACAGCTCA,"[[], [8], [6, 8], [11, 18], [8, 10], [0, 3], [...",21.880718
2,-5470846543948195797,ACCGAGAATCGAAACTAAG,"[[], [2, 16], [10, 15], [2, 18], [9, 14], [3, ...",36.057828
3,3044478046141746202,CCGAGAATCGAAACTAAGC,"[[], [6, 12], [1, 4, 8], [0, 8, 10], [1, 17, 1...",55.348827
4,-693125485272288290,CGAGAATCGAAACTAAGCT,"[[], [1, 7], [1, 7], [0, 1, 7], [0, 6, 14], [8...",40.496279


In [48]:
tqdm.pandas("counting off-targets")
spacers_df['off_targets'] = spacers_df.progress_apply(lambda x: len(x['locations'])-1, axis=1)
spacers_df = spacers_df.drop(columns=['locations'])
spacers_df.head()

HBox(children=(IntProgress(value=0, max=1203), HTML(value='')))

Unnamed: 0,hash,spacer,off_target_score,mismatches
0,-1245412300131519133,CTCCCCTTTCGGACAGCTC,15.839646,395
1,-7107283595214827167,TCCCCTTTCGGACAGCTCA,21.880718,329
2,-5470846543948195797,ACCGAGAATCGAAACTAAG,36.057828,170
3,3044478046141746202,CCGAGAATCGAAACTAAGC,55.348827,102
4,-693125485272288290,CGAGAATCGAAACTAAGCT,40.496279,141


In [51]:
spacers_df.to_pickle("spacers_df.pkl")