In [17]:
import joblib
import pandas as pd
import numpy as np

In [None]:
# I exported the data set in gz: in this format, it takes less space
# I did it in the following way:
# df = pd.read_csv("data/20201112_ccpp_drs_processed.csv", decimal=',', sep=';')
# df.evidencelevelreeval = df.evidencelevelreeval.fillna('-').astype(np.str)
# df.to_csv("data/my_20201112.gz", compression="gzip", index=False)

In [27]:
# For an example, i take just 10 first rows
df = pd.read_csv("../data/my_20201112.gz")
df.shape

(10, 349)

## Data Cleaning

### Move some variables
(it is more convenient this way)

In [28]:
def move_to_end(df, col_name):
    col = df[col_name]
    df.pop(col_name)
    idx = df.shape[1]
    df.insert(idx, col_name, col)

def move_to_begining(df, col_name):
    col = df[col_name]
    df.pop(col_name)
    df.insert(0, col_name, col)

In [29]:
move_to_end(df, "rnafold_37_dot_par")
move_to_end(df, "rnafold_75_dot_par")
move_to_end(df, "blastscore")
move_to_end(df, 'cas_prox_class')

### Construct cas_prox_subtype and cas_prox_type  target variables

In [30]:
df = df.rename(columns = {'cas_prox_class':'cas_prox_subtype'})
cas_prox_subtype = list(map(lambda s: s.replace('IV', 'IIII'), df.cas_prox_subtype))
cas_prox_subtype = list(map(lambda s: s.replace('V', 'IIIII'), cas_prox_subtype))
cas_prox_subtype = list(map(lambda s: s.replace('VI', 'IIIIII'), cas_prox_subtype))
df['cas_prox_type'] = list(map(lambda s: s.count('I'), cas_prox_subtype))

### Construct crispr/not-crispr target variable

In [31]:
df = df.astype({"blastscore": float})
df = df.assign(crispr=np.repeat(-1, df.shape[0]))
## all el4 are good
df.crispr[df.evidencelevel == 4] = 1
## all el1 are notgood
df.crispr[df.evidencelevel == 1] = 0
## all el1 that have blast score higher than 40 are reassigned to el4
df.crispr[np.logical_and(df.evidencelevel == 1, df.blastscore > 40)] = 1
## we exclude from the 0 dataset all DR than have blast < 40 but >10
df.crispr[np.logical_and.reduce((df.evidencelevel==1, df.blastscore<40, df.blastscore>10))] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


### Removal of repetitive ids and canonical k-mers construction
1. In the data set, for each DR a reverse complement (RC) was generated.
2. To construct canonical k-mers, a DR and its RC is used (AAA_new = AAA_dr + AAA_rc). Then, the sample size will be n/2.

In [32]:
def remove_repetitive_id(df):
    """
    Keep those with a minimal rnafold 37 energy
    """
    unique_ids = np.unique(df.id)
    unique_rows = many_unique_ids(unique_ids)
    return pd.concat(unique_rows, axis=1).T

def _for_one_unique_id(id_):
    repetitive_instances = df.loc[df.id == id_]
    repetitive_ids = repetitive_instances.index
    min_energy_id = repetitive_ids[df.rnafold_37_energy[repetitive_ids].argmin()]
    final_example = df.loc[min_energy_id, :]
    final_example.iloc[11:331] = repetitive_instances.iloc[:, 11:331].sum(axis=0)
    return final_example

many_unique_ids = np.vectorize(_for_one_unique_id)

In [33]:
df2 = remove_repetitive_id(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Remove reverse complement mers
Since canonical mers are used, there is no need in RC mers

In [34]:
def reverse_complement(seq):
    rc = seq[::-1]
    rc = rc.replace('A', '*')
    rc = rc.replace('T', 'A')
    rc = rc.replace('*', 'T')
    rc = rc.replace('C', '*')
    rc = rc.replace('G', 'C')
    rc = rc.replace('*', 'G')
    return rc

In [35]:
def find_sufficient_mers(mers):
    res = []
    for mer in mers:
        rc = reverse_complement(mer)
        if rc == mer:
            res.append(mer)
        else:
            if rc not in res:
                res.append(mer)
    return res

In [36]:
to_keep = find_sufficient_mers(df.columns[11:75]) + find_sufficient_mers(df.columns[75:331])
to_delete = pd.Index(np.setdiff1d(df.columns[11:331], to_keep))
df2 = df2.drop(to_delete, axis='columns')
df2.shape

(10, 199)

### Features

In [41]:
# 3 mers
df2.columns[11:43]

Index(['AAA', 'AAT', 'AAG', 'AAC', 'ATA', 'ATG', 'ATC', 'AGA', 'AGT', 'AGG',
       'AGC', 'ACA', 'ACG', 'ACC', 'TAA', 'TAG', 'TAC', 'TTG', 'TTC', 'TGA',
       'TGG', 'TGC', 'TCG', 'TCC', 'GAG', 'GAC', 'GTG', 'GGG', 'GGC', 'GCG',
       'CAG', 'CGG'],
      dtype='object')

In [42]:
# 4 mers
df2.columns[43:179]

Index(['AAAA', 'AAAT', 'AAAG', 'AAAC', 'AATA', 'AATT', 'AATG', 'AATC', 'AAGA',
       'AAGT',
       ...
       'CAAG', 'CATG', 'CAGG', 'CACG', 'CTAG', 'CTGG', 'CTCG', 'CGGG', 'CGCG',
       'CCGG'],
      dtype='object', length=136)

In [43]:
# rnafold
df2.columns[179:193]

Index(['rnafold_37_energy', 'rnafold_37_frequence', 'rnafold_37_diversity',
       'rnafold_37_stem', 'rnafold_37_stem_diff', 'rnafold_37_loop',
       'rnafold_37_stem_loop_nbr', 'rnafold_75_energy', 'rnafold_75_frequence',
       'rnafold_75_diversity', 'rnafold_75_stem', 'rnafold_75_stem_diff',
       'rnafold_75_loop', 'rnafold_75_stem_loop_nbr'],
      dtype='object')

## Prediction

In [39]:
fourmers = joblib.load('models/crispr/4mers-model.pkl')

In [45]:
y_pred = fourmers.predict(df2.iloc[:, 43:179])

In [49]:
# okay, here, it doesn't work because i took 10 examples without their rc counterparts
from sklearn.metrics import accuracy_score
accuracy_score(df2.crispr, y_pred)

ValueError: Classification metrics can't handle a mix of unknown and binary targets