# XGBoost

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
import logomaker as lm
import matplotlib.pyplot as plt

In [32]:
#Load data
folder = ''

# Constructing the input 
data_off = pd.read_csv(folder + 'off_sequences.txt', header=0)
data_on = pd.read_csv(folder + 'on_sequences.txt', header=0)

data = pd.concat((data_off, data_on))

original_df = pd.DataFrame(data)

original_df['Sequences'] = original_df['Sequences'].apply(list)

expanded_df = pd.DataFrame(original_df['Sequences'].tolist(), index=original_df.index)
result_df = pd.concat([original_df, expanded_df], axis=1)
result_df = result_df.drop('Sequences', axis=1)

# Negative Charge D, E: 0, 1 
# Positive Charge R, H, K: 2,3,4
# Polar Uncharged: S, T, C, P, N, Q: 5,6,7,8,9,10
# Nonpolar uncharged: G , A, V, L , M, I: 11, 12, 13, 14,15, 16
# Aromatic: F, Y, W: 17, 18, 19


amino_acid_mapping = {'D': 0, 'E': 1, 'R': 2, 'H': 3, 'K': 4,
                      'S': 5, 'T': 6, 'C': 7, 'P': 8, 'N': 9, 'Q': 10,
                      'G': 11, 'A': 12, 'V': 13, 'L': 14, 'M': 15, 'I': 16,
                      'F': 17, 'Y': 18, 'W': 19, '-': -1, 'X': -1}

# Apply mapping to each element in the DataFrame
result_df_mapped = result_df.applymap(lambda x: amino_acid_mapping[x])
result_df_encoded = pd.get_dummies(result_df_mapped , prefix='', prefix_sep='')

In [33]:
# Constructing the labels
labels_off = pd.DataFrame({'label': 0}, index=range(len(data_off)))
labels_on = pd.DataFrame({'label': 1}, index=range(len(data_off), len(data_off)+ len(data_on)) )
labels = pd.concat((labels_off, labels_on))

num_classes = 2

# Shuffling the data
indices = np.arange(len(data))
np.random.shuffle(indices)
data = data.iloc[indices]
labels = labels.iloc[indices]

In [34]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(result_df_encoded, labels, train_size = 0.8 ,test_size=0.2, shuffle = True)
chars = X_train.select_dtypes(exclude=np.number).columns.tolist()

# Train the XGBoost model with the default parameters
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

num_zeros, num_ones = count_classes(labels)
params = {'objective': 'binary:logistic', 'eval_metric': 'error'} 
bst = xgb.train(params, dtrain, num_boost_round = 100)

In [35]:
def grid_train_xgboost(X_train, y_train, X_test, y_test, max_depths = [2,3,4,5,10], min_child_weights = [0,1,2,5,10], early_stopping_rounds=10, num_boost_round = 100):    '''
    This funciton performs grid search through 
    parameters that are influencing the probability of overfitting
    It returns the values for AUC-ROC for the 
    best model and the values used for training
    '''
    best_params = None
    best_acc = 0
    num_zeros, num_ones = count_classes(y_train)
    for max_d in max_depths:
        for min_child in min_child_weights:
            params = {'objective': 'binary:logistic', 'eval_metric': 'error', 'max_depth': max_d, 'min_child_weight': min_child}
            dtrain = xgb.DMatrix(X_train, label=y_train)
            dtest = xgb.DMatrix(X_test, label=y_test)
            model = xgb.train(params, dtrain, evals=[(dtest, 'test')], 
                          early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
            preds = model.predict(dtest)
            preds = np.round(preds)
            preds = [int(pred) for pred in preds]
            acc = accuracy_score (y_test, preds)
            print('Accuracy with ', params, 'is ', "%.4f " % (acc))
            if acc > best_acc:
                best_acc = acc
                best_params = params

    print("Best ACC:", best_acc)
    print("Best params:", best_params)
    return best_params 

In [36]:
def cross_val_accuracy(params, num_boost_round, X, y, k=10):
    """
    Perform k-fold cross-validation for XGBoost model accuracy.

    Parameters:
    - params (dict): XGBoost model hyperparameters.
    - num_boost_round (int): Number of boosting rounds for XGBoost.
    - X (DataFrame): Features for the dataset.
    - y (Series): Target variable for the dataset.
    - k (int, optional): Number of folds for cross-validation. Default is 10.

    Returns:
    - float: Mean accuracy across all folds.
    """
    chunk_size = len(X) // k
    accuracies = []

    for i in range(k):
        start = i * chunk_size
        end = (i + 1) * chunk_size if i < k - 1 else len(X)

        X_val_fold = X.iloc[start:end]
        y_val_fold = y.iloc[start:end]

        X_train_fold = pd.concat([X.iloc[:start], X.iloc[end:]])
        y_train_fold = pd.concat([y.iloc[:start], y.iloc[end:]])

        dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
        dval = xgb.DMatrix(X_val_fold, label=y_val_fold)

        bst = xgb.train(params, dtrain, num_boost_round=num_boost_round)

        preds = bst.predict(dval)
        preds = np.round(preds)
        acc = accuracy_score(y_val_fold, preds)
        accuracies.append(acc)

    return np.mean(accuracies)

param_grid = {'objective': 'binary:logistic', 'eval_metric': 'error', 'max_depth': [2, 3, 4, 5, 10],
              'min_child_weight': [0, 1, 2, 5, 10]}

best_accuracy = 0.0
best_params = None

# Perform cross-validation
for max_depth in param_grid['max_depth']:
    for min_child_weight in param_grid['min_child_weight']:
        current_params = {'objective': 'binary:logistic', 'eval_metric': 'error', 'max_depth': max_depth,
                          'min_child_weight': min_child_weight}
        current_accuracy = cross_val_accuracy(current_params, num_boost_round=100, X=result_df_encoded, y=labels, k=10)

        print(f"Accuracy with {current_params} is {current_accuracy:.4f}")

        if current_accuracy > best_accuracy:
            best_accuracy = current_accuracy
            best_params = current_params

print(f"Best ACC: {best_accuracy:.4f}")
print(f"Best params: {best_params}")

Accuracy with {'objective': 'binary:logistic', 'eval_metric': 'error', 'max_depth': 2, 'min_child_weight': 0} is 0.6316
Accuracy with {'objective': 'binary:logistic', 'eval_metric': 'error', 'max_depth': 2, 'min_child_weight': 1} is 0.6497
Accuracy with {'objective': 'binary:logistic', 'eval_metric': 'error', 'max_depth': 2, 'min_child_weight': 2} is 0.6348
Accuracy with {'objective': 'binary:logistic', 'eval_metric': 'error', 'max_depth': 2, 'min_child_weight': 5} is 0.6107
Accuracy with {'objective': 'binary:logistic', 'eval_metric': 'error', 'max_depth': 2, 'min_child_weight': 10} is 0.6225
Accuracy with {'objective': 'binary:logistic', 'eval_metric': 'error', 'max_depth': 3, 'min_child_weight': 0} is 0.6439
Accuracy with {'objective': 'binary:logistic', 'eval_metric': 'error', 'max_depth': 3, 'min_child_weight': 1} is 0.6770
Accuracy with {'objective': 'binary:logistic', 'eval_metric': 'error', 'max_depth': 3, 'min_child_weight': 2} is 0.7043
Accuracy with {'objective': 'binary:log