In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [2]:
# Load the training table from `train.csv` (path is relative to the working directory).
data = pd.read_csv('train.csv')

In [3]:
# Bare last expression: renders the loaded frame as the cell's output.
data

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5
...,...,...,...,...,...
31385,31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,doi.org/10.1038/s41592-020-0801-4,51.8
31386,31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,doi.org/10.1038/s41592-020-0801-4,37.2
31387,31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,doi.org/10.1038/s41592-020-0801-4,64.6
31388,31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.7


In [4]:
# Hold out 15% of the rows for evaluation. The split is shuffled, so the seed
# is pinned to keep the partition (and every metric computed from it)
# reproducible across kernel restarts.
train_data, test_data = train_test_split(data, test_size=0.15, random_state=42)

In [5]:
# Alphabet of the 20 single-letter residue codes, in the order that determines
# their integer ids.
codes = list('ACDEFGHIKLMNPQRSTVWY')

def create_dict(codes):
    """
    Map every symbol in `codes` to its 1-based position.

    Ids start at 1, leaving 0 unused by this mapping. If a symbol occurs more
    than once, the id of its last occurrence is kept.
    """
    return {symbol: position for position, symbol in enumerate(codes, start=1)}

# Module-level residue -> integer lookup, built from the `codes` alphabet.
char_dict = create_dict(codes)

# Print the mapping and its size as a quick sanity check.
print(char_dict)
print("Dict Length:", len(char_dict))

{'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20}
Dict Length: 20


In [6]:
def integer_encoding(data):
    """
    Convert every protein sequence in `data` into an array of integer codes.

    - Reads the 'protein_sequence' column of `data` and iterates each entry
      character by character.
    - Each character is looked up in the module-level `char_dict`; characters
      missing from it are encoded as 0.
    - Returns a list with one NumPy array per row, in row order. A progress
      bar is shown while the rows are processed.
    """
    sequences = data['protein_sequence'].values
    return [
        np.array([char_dict.get(residue, 0) for residue in sequence])
        for sequence in tqdm(sequences)
    ]

In [7]:
# Integer-encode the sequences of both splits; each result is a list holding
# one integer array per row.
train_encode = integer_encoding(train_data) 
test_encode = integer_encoding(test_data) 

  0%|          | 0/26681 [00:00<?, ?it/s]

  0%|          | 0/4709 [00:00<?, ?it/s]

In [41]:
def pad_sequence(seq, l=100):
    """
    Return `seq` as a fixed-length float vector of length `l`.

    Inputs shorter than `l` are right-padded with zeros; longer inputs are
    truncated to their first `l` entries. The input array is not modified.
    """
    n_keep = min(seq.shape[0], l)
    padded = np.zeros(l)
    # Copy the part that fits; the remainder of `padded` stays zero.
    padded[:n_keep] = seq[:n_keep]
    return padded

In [42]:
# Bring every encoded sequence to the default fixed length, replacing the list
# entries in place (training split first, then the held-out split).
for encoded_split in (train_encode, test_encode):
    for idx in tqdm(range(len(encoded_split))):
        encoded_split[idx] = pad_sequence(encoded_split[idx])

  0%|          | 0/26681 [00:00<?, ?it/s]

  0%|          | 0/4709 [00:00<?, ?it/s]

In [43]:
# Stack the per-sequence vectors into feature matrices (one row per sequence).
X_train = np.array(train_encode)
X_test = np.array(test_encode)

# Regression targets: the `tm` column of each split.
y_train = train_data.tm.values
y_test = test_data.tm.values

In [44]:
from sklearn.preprocessing import OneHotEncoder

In [45]:
# One-hot encode each padded sequence position as its own categorical feature.
# handle_unknown='ignore' encodes a value that was not seen at a position during
# fit as an all-zero group instead of raising in transform(), which the default
# ('error') would do for unseen residues in held-out or submission rows.
enc = OneHotEncoder(handle_unknown='ignore')

In [46]:
# Learn the per-position categories from the training matrix only, then reuse
# the fitted encoder on the held-out split so both share the same columns.
X_train_one_hot = enc.fit_transform(X_train)
X_test_one_hot = enc.transform(X_test)

In [47]:
import xgboost as xgb

In [48]:
# Gradient-boosted regressor with library defaults; only n_jobs is overridden.
model = xgb.XGBRegressor(n_jobs=4)

In [49]:
# Train on the one-hot training features against the `tm` targets.
model.fit(X_train_one_hot, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [50]:
# Predict on both splits so in-sample and held-out quality can be compared.
y_train_pred = model.predict(X_train_one_hot)
y_test_pred = model.predict(X_test_one_hot)

In [51]:
from sklearn.metrics import mean_squared_error

In [52]:
# Mean squared error on the training split (in-sample).
mean_squared_error(y_train, y_train_pred)

72.06480867272745

In [53]:
# Mean squared error on the held-out split.
mean_squared_error(y_test, y_test_pred)

140.62757217804133

In [54]:
import scipy.stats

In [55]:
# Spearman rank correlation between true and predicted `tm` on the training split.
scipy.stats.spearmanr(y_train, y_train_pred)

SpearmanrResult(correlation=0.6799966646998601, pvalue=0.0)

In [56]:
# Spearman rank correlation between true and predicted `tm` on the held-out split.
scipy.stats.spearmanr(y_test, y_test_pred)

SpearmanrResult(correlation=0.3790811944358931, pvalue=8.286629821090953e-161)

In [60]:
# Load the rows to predict for the submission from `test.csv` (relative path).
subm_data = pd.read_csv('test.csv')

In [61]:
# Bare last expression: renders the submission frame as the cell's output.
subm_data

Unnamed: 0,seq_id,protein_sequence,pH,data_source
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes


In [72]:
# Integer-encode the submission sequences with the same function used for training.
X_subm = integer_encoding(subm_data)

  0%|          | 0/2413 [00:00<?, ?it/s]

In [73]:
# Pad or truncate every submission sequence to the default fixed length,
# replacing the list entries in place.
for idx, encoded in enumerate(tqdm(X_subm)):
    X_subm[idx] = pad_sequence(encoded)

  0%|          | 0/2413 [00:00<?, ?it/s]

In [74]:
# Convert the list of padded vectors into a single NumPy array (rebinds X_subm).
X_subm = np.array(X_subm)

In [75]:
# Reuse the already-fitted encoder (transform only, no refit) on the submission rows.
X_subm_one_hot = enc.transform(X_subm)

In [89]:
# Predicted `tm` for every submission row.
preds = model.predict(X_subm_one_hot)

In [90]:
# Start from an empty frame; its columns are assigned in the next cell.
res = pd.DataFrame()

In [91]:
# Pair each submission id with its prediction; both are in `subm_data` row order.
res['seq_id'] = subm_data.seq_id.values
res['tm'] = preds

In [92]:
# Bare last expression: renders the result frame as the cell's output.
res

Unnamed: 0,seq_id,tm
0,31390,17.654289
1,31391,46.098308
2,31392,38.296085
3,31393,2.343492
4,31394,46.014577
...,...,...
2408,33798,41.800804
2409,33799,11.734397
2410,33800,29.877657
2411,33801,13.027312


In [93]:
# Write the result frame to `submission.csv`; index=False omits the row index column.
res.to_csv('submission.csv', index=False)