In [None]:
import pandas as pd

# Read input_bcell.csv (path is relative to the working directory) into a DataFrame.
bcell = pd.read_csv('input_bcell.csv')
# Bare last expression: the notebook renders the frame as the cell output.
bcell

In [None]:
# Read input_sars.csv (path is relative to the working directory) into a DataFrame.
sars = pd.read_csv('input_sars.csv')
# Bare last expression: the notebook renders the frame as the cell output.
sars

In [None]:
# Stack the two frames row-wise into one modelling table. The original row
# labels are discarded and replaced by a fresh consecutive index.
bs = pd.concat(objs=[bcell, sars], axis=0, ignore_index=True)
bs

In [None]:
# Inspect the combined data: column dtypes and non-null counts per column.
# (DataFrame.info() does not print summary statistics.)
bs.info()

In [None]:
# Object-typed columns are treated as categorical features.
categorical_features = list(bs.select_dtypes(include='object').columns)
print(len(categorical_features), categorical_features)

# int64/float64 columns are treated as continuous features; the label column
# 'target' is numeric too, so it is taken out of the list explicitly.
continuous_features = list(bs.select_dtypes(include=['int64', 'float64']).columns)
continuous_features.remove('target')
print(len(continuous_features), continuous_features)

In [None]:
#Data Preprocessing
# Derive three numeric features from the raw sequence columns, then drop the
# identifier, raw-sequence and position columns:
#   protein_seq_length - number of characters in protein_seq
#   peptide_length     - number of characters in peptide_seq
#   peptide_region     - peptide_length / protein_seq_length
# The lengths are computed with the vectorised .str accessor instead of a
# row-wise DataFrame.apply; missing sequences yield NaN lengths rather than
# raising.
# NOTE: `bs` is rebound to the reduced frame, so this cell cannot be re-run
# without first re-running the cell that builds `bs`.
bs = (
    bs
    .assign(
        protein_seq_length=bs['protein_seq'].str.len(),
        peptide_length=bs['peptide_seq'].str.len(),
    )
    .assign(peptide_region=lambda df: df['peptide_length'] / df['protein_seq_length'])
    .drop(columns=['parent_protein_id', 'protein_seq', 'peptide_seq', 'start_position', 'end_position'])
)
bs

In [None]:
# Re-inspect column dtypes and non-null counts after preprocessing.
bs.info()

In [None]:
#Build the model
#Experiment with different models
try:
  from pycaret.classification import *
except ModuleNotFoundError:
  !pip install pycaret
  from pycaret.classification import *

In [None]:
#Split data to train and test sets
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set. random_state is fixed so that the
# split (and therefore every score computed on `test`) is identical on each
# Restart & Run All.
train, test = train_test_split(bs, test_size=0.2, random_state=42)

print(train.shape)
print(test.shape)

In [None]:
#Setup experiment
# Initialise the PyCaret experiment on the training rows only, with 'target'
# as the label and feature normalisation enabled. session_id seeds PyCaret's
# internal randomness so that model comparison and tuning are reproducible.
experiment = setup(data=train, target='target', normalize=True, session_id=42)

In [None]:
# Train and cross-validate the available classifiers within the experiment
# configured by setup(); the return value is shown as the cell output.
compare_models()

In [None]:
from sklearn.metrics import roc_auc_score

# Empty table that collects one row per evaluated model variant:
# classifier id, variant name and the AUC measured on the held-out test set.
results = pd.DataFrame(columns=['Classifier', 'ModelType', 'AUC'])

In [None]:
best_models = ['rf', 'et', 'lightgbm', 'catboost', 'xgboost']


def _holdout_auc(estimator):
  """Score `estimator` on the held-out `test` frame and return its ROC AUC.

  Rows of the prediction frame that contain missing values are dropped before
  scoring. If the score cannot be computed (for example the label column is
  missing or only one class is present), the reason is printed and NaN is
  returned, so the AUC column stays numeric and sortable.

  NOTE(review): the score is computed from the predicted-label column, not
  from predicted probabilities - confirm this is the intended metric.
  """
  scored = predict_model(estimator, test).dropna()
  # PyCaret 2.x names the prediction column 'Label'; 3.x uses 'prediction_label'.
  label_col = 'Label' if 'Label' in scored.columns else 'prediction_label'
  try:
    return roc_auc_score(scored['target'], scored[label_col])
  except Exception as exc:
    # Best effort: one failed evaluation must not abort the whole comparison.
    print('AUC could not be computed:', exc)
    return float('nan')


# Start from an empty table so that re-running this cell does not append
# duplicate rows to results left over from a previous run.
results = pd.DataFrame(columns=['Classifier', 'ModelType', 'AUC'])

for m in best_models:
  print('MODEL: ', m)

  print('Creating model...')
  model = create_model(m)
  results.loc[len(results)] = [m, 'Model', _holdout_auc(model)]

  print('Tuning model...')
  model_tuned = tune_model(model)
  results.loc[len(results)] = [m, 'Tuned Model', _holdout_auc(model_tuned)]

  print('Ensembling model...')
  model_ensembled = ensemble_model(model)
  results.loc[len(results)] = [m, 'Ensembled Model', _holdout_auc(model_ensembled)]

  print()

# Rows were appended to an object-typed column; make AUC a proper float column.
results['AUC'] = results['AUC'].astype(float)

In [None]:
#Check model with Best AUC
# AUC may hold non-numeric placeholders (e.g. 'NA') for evaluations that
# failed; sorting a column that mixes strings and floats raises TypeError, so
# coerce such entries to NaN first. NaN rows sort to the bottom.
# The frame is the last expression so the notebook renders it as a table.
results.assign(AUC=pd.to_numeric(results['AUC'], errors='coerce')).sort_values(by='AUC', ascending=False)

In [None]:
#Create the best model
#Tuned Random forest
# NOTE: 'rf' is hard-coded here; it is not read from the `results` table.
rf = create_model('rf')

In [None]:
# Hyperparameter-tune the random forest created above; later cells evaluate,
# save and reuse `rf_tuned`.
rf_tuned = tune_model(rf)

In [None]:
#Evaluate the model
#ROC AUC
# The plot type is spelled out instead of relying on plot_model's default.
plot_model(rf_tuned, plot='auc')

In [None]:
#Confusion matrix
plot_model(rf_tuned, plot='confusion_matrix')

In [None]:
#Discrimination threshold
plot_model(rf_tuned, plot='threshold')

In [None]:
#Class prediction error
plot_model(rf_tuned, plot='error')

In [None]:
#Classification Report
plot_model(rf_tuned, plot='class_report')

In [None]:
#Feature importance
plot_model(rf_tuned, plot='feature')

In [None]:
#save the model
# Mount Google Drive (Colab only) so the model is written to persistent storage.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

# Destination on the mounted drive, built from the model's base name.
model_save_name = 'covid-19_predictor'
path = "/gdrive/MyDrive/Colab_Notebooks/COVID-19_Prediction/{}".format(model_save_name)

save_model(model=rf_tuned, model_name=path)

In [None]:
#Make predictions
# Read input_covid.csv (path is relative to the working directory): the rows
# to be scored by the saved model.
covid = pd.read_csv('input_covid.csv')
covid

In [None]:
#Preprocess the data (similar to training data)
# Same feature derivation as for the training frame:
#   protein_seq_length - number of characters in protein_seq
#   peptide_length     - number of characters in peptide_seq
#   peptide_region     - peptide_length / protein_seq_length
# followed by dropping the identifier, raw-sequence and position columns.
# Lengths use the vectorised .str accessor instead of a row-wise apply;
# missing sequences yield NaN lengths rather than raising.
# NOTE: `covid` is rebound to the reduced frame, so this cell cannot be re-run
# without first re-reading the CSV.
covid = (
    covid
    .assign(
        protein_seq_length=covid['protein_seq'].str.len(),
        peptide_length=covid['peptide_seq'].str.len(),
    )
    .assign(peptide_region=lambda df: df['peptide_length'] / df['protein_seq_length'])
    .drop(columns=['parent_protein_id', 'protein_seq', 'peptide_seq', 'start_position', 'end_position'])
)

covid

In [None]:
#Load the model
# Imported in this cell as well, so loading also works in a fresh session in
# which the cell that saves the model has not been run.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

# Must match the location used when the model was saved.
model_save_name = 'covid-19_predictor'
path = F"/gdrive/MyDrive/Colab_Notebooks/COVID-19_Prediction/{model_save_name}"

model = load_model(path)

In [None]:
#Predict using the model
# Score the preprocessed COVID rows with the loaded model; the returned frame
# is displayed as the cell output.
pred_covid = predict_model(model, data=covid)

pred_covid