# XGBoost predictor on peptide descriptors

In this notebook, we demonstrate how `peptidy` can be used to predict antimicrobial peptides (AMPs) with XGBoost. `peptidy` extracts descriptors from the amino acid sequences, and an XGBoost classifier is then trained on those descriptors.


In [None]:
import pandas as pd
from peptidy.descriptors import compute_descriptors

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import xgboost as xgb

### Load a dataframe with peptides

In [None]:
# Read the peptide table; later cells use its 'sequence' and 'active' columns.
subsample_AMP = pd.read_csv('subsample_AMP.csv')

# y is the 'active' column (the classification target);
# X is every remaining column of the table.
y = subsample_AMP['active']
X = subsample_AMP.drop(columns='active')

### Encode and split the data 

In [None]:
# Encode the data using peptidy: one compute_descriptors result per sequence
descriptor_records = [
    compute_descriptors(sequence) for sequence in subsample_AMP['sequence']
]
# One row per peptide, sharing X's index
X_encoded = pd.DataFrame(descriptor_records, index=X.index)

# Hold out 20% of the peptides for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

### Define the model

In [None]:
def GB_model(n_estimators=10,
             max_depth=10,
             gamma=0.1,
             reg_alpha=0.5,
             min_child_weight=5,
             colsample_bytree=0.1,
             learning_rate=0.1,
             subsample=0.9,
             reg_lambda=0.5,
             objective='binary:logistic',
             eval_metric='logloss'):
    """Build an unfitted ``xgb.XGBClassifier`` from the given hyperparameters.

    ``n_estimators`` and ``max_depth`` are converted with ``int()`` before
    being handed over; every other argument is forwarded unchanged under the
    keyword of the same name. See the XGBoost documentation for the meaning
    of each hyperparameter.

    Returns:
        The newly constructed classifier; ``fit`` has not been called on it.
    """
    hyperparameters = {
        'n_estimators': int(n_estimators),
        'max_depth': int(max_depth),
        'gamma': gamma,
        'reg_alpha': reg_alpha,
        'min_child_weight': min_child_weight,
        'learning_rate': learning_rate,
        'subsample': subsample,
        'reg_lambda': reg_lambda,
        'objective': objective,
        'colsample_bytree': colsample_bytree,
        'eval_metric': eval_metric,
    }
    return xgb.XGBClassifier(**hyperparameters)

### Fit the model to the data

In [None]:
# Hyperparameters for this run, gathered in one place so they are easy to tweak
hyperparameters = dict(
    max_depth=10,
    n_estimators=7,
    gamma=0.5,
    learning_rate=0.001,
    subsample=0.8,
    reg_alpha=0.8,
    reg_lambda=1,
    min_child_weight=2,
    objective='binary:logistic',
    eval_metric='logloss',
    colsample_bytree=0.9,
)
model = GB_model(**hyperparameters)

# Both the training and the test split are passed to fit as evaluation sets
evaluation = [(X_train, y_train), (X_test, y_test)]

fitted_model = model.fit(
    X_train,
    y_train,
    eval_set=evaluation,
    verbose=0,
)

### Evaluate the model

In [None]:
# Predict on the held-out split and score the predictions against the true labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)