# Introduction

SML= single mutant library (mixture of single-mutants randomized each at one position only)<br>
SRL= semi-random library (7 randomized sites)


 - Y E Q H K L P S S W P F    (original peptide aka K5)
 - X X **Q** X **K L** X X X **W P** X (semi-random mutant library)
 - X X **Q** X **K** X X X X **W P** X (single mutant library)

The reactive glutamine is defined as position 0

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Show floats in pandas output with two decimals, right-aligned in a 40-character field.
pd.set_option("display.float_format", '{:40.2f}'.format)

# Data loading

### SML

In [None]:
# Read the SML count tables; the second column of each file is used as the
# index, and only the count column of each table is kept.
before = pd.read_table("count_SML_Before_AA_Seq.txt.gz", index_col=1)[["num_before"]]
after = pd.read_table("count_SML_After_AA_Seq.txt.gz", index_col=1)[["num_after"]]

# The outer join keeps index entries present in only one of the two tables;
# the count that is missing for such an entry is set to 0.
SML = before.join(after, how="outer").fillna(0)

# Re-key the table on a column named "seq". A column called "index" (the name
# reset_index gives to an unnamed index) is renamed to "seq" first.
SML = SML.reset_index().rename(columns={"index": "seq"}).set_index("seq")
SML.head()

### SRL

In [None]:
# Read the SRL count tables; the second column of each file is used as the
# index, and only the count column of each table is kept.
before = pd.read_table("count_SRL_Before_AA_Seq.txt.gz", index_col=1)[["num_before"]]
after = pd.read_table("count_SRL_After_AA_Seq.txt.gz", index_col=1)[["num_after"]]

# The outer join keeps index entries present in only one of the two tables;
# the count that is missing for such an entry is set to 0.
SRL = before.join(after, how="outer").fillna(0)

# Re-key the table on a column named "seq". A column called "index" (the name
# reset_index gives to an unnamed index) is renamed to "seq" first.
SRL = SRL.reset_index().rename(columns={"index": "seq"}).set_index("seq")
SRL.head()

<br>

# Count normalization

In [None]:
def normalize(df):
    """Convert the count columns of ``df`` to fractions and add their ratio.

    ``num_before`` and ``num_after`` are each divided by their own column
    sum. A column ``ER`` is then set to ``num_after / num_before`` computed
    from the rescaled values.

    The frame is modified in place, and the same object is returned.
    """
    for column in ("num_before", "num_after"):
        # Each column is scaled by its own total.
        df[column] = df[column] / df[column].sum()

    df["ER"] = df["num_after"] / df["num_before"]
    return df

In [None]:
# Rescale both libraries to fractions and attach the ER column
# (normalize edits each frame in place and hands the same frame back).
SML, SRL = normalize(SML), normalize(SRL)

<br>

# Peptide filtering

We will only keep peptides that match all of the following conditions:
 - They start with a methionine
 - They do not contain a stop codon
 - They contain a glutamine at pos 0
 - They have at least one count in both the before and after condition (otherwise no enrichment can be calculated)

In [None]:
# Move the index back into an ordinary column so the filter helpers below
# can address it as df["seq"].
SML = SML.reset_index(drop=False)
SRL = SRL.reset_index(drop=False)

In [None]:
def removeStopcodons(df):
    """Return the rows of ``df`` whose ``seq`` value does not contain ``*``.

    The asterisk is matched literally (``regex=False``). Only rows for which
    the containment test evaluates to ``False`` are kept.
    """
    has_stop = df["seq"].str.contains("*", regex=False)
    return df[has_stop.eq(False)]

def removeWithoutStartcodons(df):
    """Return the rows of ``df`` whose ``seq`` value begins with ``M``."""
    starts_with_m = df["seq"].str.startswith('M')
    return df[starts_with_m]

def filterByPattern(df, pattern):
    """Return the rows of ``df`` whose ``seq`` value matches ``pattern``.

    ``pattern`` is handed to ``Series.str.contains`` with its default
    settings, so it is searched for anywhere in the string unless the
    pattern itself is anchored.
    """
    matches = df["seq"].str.contains(pattern)
    return df[matches]

def filterNonZero(df):
    """Return the rows of ``df`` where both count columns are above zero.

    A row is kept only if ``num_before > 0`` and ``num_after > 0``.
    """
    seen_before = df["num_before"].gt(0)
    seen_after = df["num_after"].gt(0)
    return df[seen_before & seen_after]

In [None]:
# Run the SML table through each filter in turn and record how many rows
# remain after every stage (the first entry is the unfiltered size).
tmp = SML
counts = [tmp.shape[0]]

for _filter in (
    removeStopcodons,
    removeWithoutStartcodons,
    lambda frame: filterByPattern(frame, "^...Q........"),
    filterNonZero,
):
    tmp = _filter(tmp)
    counts.append(tmp.shape[0])

SML_filtered = tmp

print(counts)
SML_counts = counts

In [None]:
# Run the SRL table through each filter in turn and record how many rows
# remain after every stage (the first entry is the unfiltered size).
tmp = SRL
counts = [tmp.shape[0]]

for _filter in (
    removeStopcodons,
    removeWithoutStartcodons,
    lambda frame: filterByPattern(frame, "^...Q........"),
    filterNonZero,
):
    tmp = _filter(tmp)
    counts.append(tmp.shape[0])

SRL_filtered = tmp

print(counts)
SRL_counts = counts

In [None]:
# Write both filtered tables as tab-separated files. The frame index is
# written too, since to_csv is left at its default index setting.
SML_filtered.to_csv(path_or_buf="SML_normalized_filtered.tsv.gz", sep="\t")
SRL_filtered.to_csv(path_or_buf="SRL_normalized_filtered.tsv.gz", sep="\t")