In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import spearmanr
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV

In [2]:
df_train = pd.read_csv("train.csv")
df_updated = pd.read_csv("train_updates.csv")
df_test =  pd.read_csv("test.csv")
for seq_id in df_updated.seq_id:
    df_train = df_train.drop(index = seq_id)

In [3]:
df_train.describe().T.style.background_gradient("YlOrRd")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seq_id,28956.0,15744.916529,9251.179714,0.0,7526.75,15587.5,23902.25,31389.0
pH,28670.0,6.872918,0.79255,1.99,7.0,7.0,7.0,11.0
tm,28956.0,51.360399,12.060858,25.1,43.6,48.8,54.6,130.0


In [4]:
lengths = []
for i in range (len(df_train)):
    lengths.append(len(df_train["protein_sequence"].iloc[i]))
df_train["length"] = lengths

In [5]:
test_lengths = []
for i in range (len(df_test)):
    test_lengths.append(len(df_test["protein_sequence"].iloc[i]))
df_test["length"] = test_lengths

In [6]:
amino_acids= ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
for letter in amino_acids:
    df_train[letter] = df_train.protein_sequence.str.count(letter)
    df_test[letter] = df_test.protein_sequence.str.count(letter)

In [7]:
df_train = df_train[df_train["tm"] > 60]

from sklearn import preprocessing

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Iterate over column names
for column in df_train:
    if df_train[column].dtype.kind == 'O':
        df_train[column]= label_encoder.fit_transform(df_train[column])
        df_test[column]= label_encoder.fit_transform(df_test[column]) 

In [8]:
# fitting our model
X = df_train.drop(columns=["protein_sequence","tm"])
y = df_train["tm"]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [10]:
X_train

Unnamed: 0,seq_id,pH,data_source,length,A,C,D,E,F,G,...,M,N,P,Q,R,S,T,V,W,Y
29266,29266,7.0,176,202,13,0,11,7,9,16,...,4,15,7,2,17,15,10,13,1,17
19496,19496,8.0,70,455,45,5,15,32,21,46,...,13,13,24,25,19,25,16,30,4,14
19783,19783,7.0,176,278,19,3,15,8,15,20,...,8,25,7,5,11,27,11,10,0,14
489,489,7.0,176,715,70,13,34,43,35,46,...,11,25,49,32,45,56,33,48,14,18
14884,14884,7.0,176,305,24,3,15,24,12,30,...,8,13,16,6,18,10,19,23,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22559,22559,7.0,176,180,9,0,13,12,5,11,...,5,8,8,5,14,16,9,15,1,5
412,412,7.0,176,445,32,3,27,44,13,32,...,10,20,22,15,22,19,21,29,4,17
11244,11244,7.0,176,298,16,1,15,25,10,27,...,2,15,12,5,17,11,17,36,1,16
21163,21163,7.0,176,141,15,0,11,8,4,16,...,4,0,6,6,12,5,4,13,1,2


In [11]:
y_train

29266    67.1
19496    68.0
19783    68.8
489      61.6
14884    69.1
         ... 
22559    83.3
412      73.0
11244    74.9
21163    87.8
1564     62.4
Name: tm, Length: 3928, dtype: float64

# Model 1: XGBRegressor

In [18]:
model = XGBRegressor().fit(X_train,y_train)
prediction1 = model.predict(X_test)
corr, p = spearmanr(y_test,prediction1)
print("Spearman Correlation: ",corr)

Spearman Correlation:  0.7675805207184684


# Model 2: XGBRegressor with tuning

In [13]:
model = XGBRegressor()
params = {"n_estimators":[100,200,300],"max_depth":[3,5,7],"learning_rate":[0.1,0.05,0.001]}
tuned_model = GridSearchCV(model,param_grid=params,cv=5,verbose=2,n_jobs=-1).fit(X_train,y_train)
tuned_model.best_params_

Fitting 5 folds for each of 27 candidates, totalling 135 fits


{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300}

In [19]:
model2 = XGBRegressor(n_estimators=300,learning_rate=0.05,max_depth=7).fit(X_train,y_train)
prediction2 = model2.predict(X_test)
corr, p = spearmanr(y_test,prediction2)
print("Spearman Correlation with Tuned Parameters: ",corr)

Spearman Correlation with Tuned Parameters:  0.7936095253908795


# Model 3: Previous XGBRegressor with RandomizeSearchCV

In [20]:
previous_model = XGBRegressor(subsample=0.6,
                         n_estimators=500,
                         max_depth=6,
                         learning_rate=0.01,
                         colsample_bytree=0.7999999999999999,
                         colsample_bylevel=0.5,
                         seed=20).fit(X_train, y_train)
prediction3 = previous_model.predict(X_test)
corr, p = spearmanr(y_test,prediction3)
print("Spearman Correlation: ",corr)

Spearman Correlation:  0.7901612391445052


# Generating Final Predictions

In [16]:
df_test = df_test.drop(columns=["protein_sequence"])
submission = pd.read_csv("sample_submission.csv")

In [21]:
submission_prediction1 = model.predict(df_test)
submission_prediction2 = model2.predict(df_test)
submission_prediction3 =  previous_model.predict(df_test)

In [22]:
submission["tm"] = submission_prediction1
submission.to_csv('prediction1.csv', index = False)

In [23]:
submission["tm"] = submission_prediction2
submission.to_csv('prediction2.csv', index = False)

In [24]:
submission["tm"] = submission_prediction3
submission.to_csv('prediction3.csv', index = False)

[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=300; total time=   1.8s
[CV] END ...learning_rate=0.1, max_depth=7, n_estimators=200; total time=   2.8s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=200; total time=   1.3s
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=100; total time=   1.1s
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=300; total time=   3.7s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=100; total time=   0.8s
[CV] END .learning_rate=0.001, max_depth=3, n_estimators=300; total time=   2.1s
[CV] END .learning_rate=0.001, max_depth=5, n_estimators=200; total time=   2.1s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.6s
[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=100; total time=   1.0s
[CV] END ...learning_rate=0.1, max_depth=5, n_estimators=300; total time=   2.9s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=200; total time=   1.3s
[CV] END ..learning_rate=0.0