In [13]:
import numpy as np, pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

import os, time
import urllib.request 

from biopandas.pdb import PandasPdb
import blosum as bl

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

import Levenshtein
from Levenshtein import distance as levenshtein_distance

In [14]:
base = "VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK"

testCsvPath = "../novozymes-enzyme-stability-prediction-kaggle-2022/data/test.csv"
trainCsvPath = "../novozymes-enzyme-stability-prediction-kaggle-2022/data/train.csv"
pdbPath = "../novozymes-enzyme-stability-prediction-kaggle-2022/sample_data/wildtype_structure_prediction_af2.pdb"

testDF = pd.read_csv(testCsvPath)
trainDF = pd.read_csv(trainCsvPath)
pdbDF =  PandasPdb().read_pdb(pdbPath)

In [15]:
trainDF.values[:,1:]

array([['AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVGMIKDAGDDPDVTHGAEIQAFVRFASEDRLEGGEGVGVVTKPGLGVPVGEPAINPVPRRMIWEAVREVTERPLAVTIAIPGGEELAKKTLNPRLGILGGLSVLGTTGVVKPYSTSAFRMSVVQAVGVARANGLLEIAATTGGKSERFAQRLLPHLPEMAFIEMGDFVGDVLRAARKVGVEVVRVVGMIGKISKMADGKTMTHAAGGEVNLSLLLSLLKEAGASPKALKEAEGAATARRFLEIALEEGLELFFVNLVRLAQEKLQAYIGERPFVSVALTDFDEGRCLAAWPDREVYR',
        7.0, 'doi.org/10.1038/s41592-020-0801-4', 75.7],
       ['AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSRLQAQRRAQRVAWEDGDENVGQTVIPAQEEEGIEKPAEVHPTGKIGAKKLRKLEEKQARKAQREAEEAEREERKRLESQREAEWKKEEERLRLKEEQKEEEERKAQEEQARREHEEYLKLKEAFVVEEEGVSETMTEEQSHSFLTEFINYIKKSKVVLLEDLAFQMGLRTQDAINRIQDLLTEGTLTGVIDDRGKFIYITPEELAAVANFIRQRGRVSITELAQASNSLISWGQDLPAQAS',
        7.0, 'doi.org/10.1038/s41592-020-0801-4', 50.5],
       ['AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYATLGVAKNANGKDIKKAYYQLAKKYHPDTNKEDPDAGRKFQEVSEAYEVLSDEQKRREYDTYGQTAENIGRQGGGFPGGGAGGFGPEGFSQSWQFRSSIDPEELFRKIFGEGNFRTNSFDDFADSKFGFGQAQEMVMDLTFAQAARGVNKDVNVNVVDQCPKCAGTKCEPGTKPGRCQYCN

In [16]:
len(base)

221

In [17]:
len_list = [len(trainDF.values[i,1]) for i in range(len(trainDF))]

In [18]:
trainDF.insert(2, "sequence_len", len_list)

In [24]:
df1 = trainDF[trainDF.sequence_len == 221]
df2 = trainDF[trainDF.sequence_len == 220]
df3 = trainDF[trainDF.sequence_len == 222]

singleDF = pd.concat([df1,df2,df3])

In [29]:
def build_change_list(group_df):
    
    list_output = []
    group_size = len(group_df)
    group_values = group_df.values
    
    col = ['sequence_len','pH','data_source','tm']

    for i in range(group_size):
        data1 = group_values[i]
        line1 = data1[1] # protein sequence
        values1  = data1[2:]
        for j in range(group_size):
            data2 = group_values[j]
            line2 = data2[1]
            values2  = data2[2:]
            if i!=j:
                edits = Levenshtein.editops(line1, line2)
                if len(edits)==1:
                    list_output.append(tuple([line1,line2])+edits[0]+tuple(line1[edits[0][1]])+tuple(line2[edits[0][1]])+ tuple(values1) + tuple(values2))
                else:
                    list_output.append(tuple([line1,line2])+('replace', 0, 0, 'A', 'A') + tuple(values1) + tuple(values2))

    changes = pd.DataFrame(list_output,columns=['seq1','seq2','operation','position1','position2','change1','change2']+[c+'1' for c in col] + [c+'2' for c in col])
    changes.change2 = np.where(changes.operation=='delete','',changes.change2)
    
    return changes

In [30]:
changes = build_change_list(singleDF)

In [32]:
df_clean = changes[(changes.data_source1 == changes.data_source2)&(changes.pH1 == changes.pH2)]
df_clean = df_clean[(df_clean.pH1>=6) & (df_clean.pH1<=8)]
df_clean = df_clean[df_clean.position1 != 0]

df_clean['target'] = df_clean['tm2'] - df_clean['tm1'] 

print(len(df_clean))
display(pd.crosstab(df_clean.change1,df_clean.change2).style.background_gradient(axis=None, cmap="YlGnBu"))

20


ImportError: Missing optional dependency 'Jinja2'. DataFrame.style requires jinja2. Use pip or conda to install Jinja2.

In [26]:
singleDF.head

<bound method NDFrame.head of        seq_id                                   protein_sequence  \
529       529  ASLSQASSEGTTTCKAHDVCLLGPRPLPPSPPVRVSLYYESLCGAC...   
1423     1423  ETLAARGAKVIGTATSENGAQAISDYLGANGKGLMLNVTDPASIES...   
1485     1485  FFIYLLRRQIRTVIQYQTVRYDILPLSPLSRNRLAQVKRKILVLDL...   
2190     2190  KEEVPDNPPNEIYATAQQKLQDGNWRQAITQLEALDNRYPFGPYSQ...   
2284     2284  KLGIISPAYFFLWPEAFLYRFQIWRPFTATFYFPVGPGTGFLYLVN...   
...       ...                                                ...   
27296   27296  MTKHIALLGGTFDPIHIGHLRMAIELRLAGFDEVRLIPNNVPPHRE...   
27352   27352  MTKQHANWSPYDNNGGTCVAIAGSDYCVIAADTRMSTGYSILSRDY...   
27703   27703  MTSEVIEDEKQFYSKAKTYWKQIPPTVDGMLGGYGHISNIDLNSSR...   
27784   27784  MTSSATSPTNGVDKNKNEEMVATPANCPYQLFNQEVVWNGKWIQTR...   
30203   30203  RFQSSVRTPASEPSAEKGVDEWLEAINELREEFSAKEYLPETSLAP...   

       sequence_len   pH                        data_source    tm  
529             221  7.0  doi.org/10.1038/s41592-020-0801-4  53.8  
1423            2