In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from glob import glob
from sklearn.impute import SimpleImputer


import os
import graphviz

from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

## Prepare only the most feasible features (no scaling)

In [None]:
# Read one raw, tab-separated variant table
def load_data(file_path):
    """Read a tab-separated file into a DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the TSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table. Cells containing 'NA', 'null' or an empty string
        are treated as missing values.
    """
    read_options = {
        'sep': '\t',
        'na_values': ['NA', 'null', ''],
        'low_memory': False,
    }
    return pd.read_csv(file_path, **read_options)

# Preprocessing function
def preprocess_data(df):
    """Reduce a variant table to the model features and fill missing numerics.

    Keeps only the columns IMPACT, QUAL, DP, QD and MAX_AF, encodes IMPACT as
    an ordinal number (HIGH=0, MODERATE=1, LOW=2, MODIFIER=3) and replaces
    missing values in each of the four numeric columns with that column's
    median. No scaling is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain all five feature columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the five feature columns. ``df`` itself is
        not modified.

    Raises
    ------
    KeyError
        If any of the five required columns is absent from ``df``.

    Notes
    -----
    IMPACT values that are not keys of the mapping become NaN and are left
    as NaN. A numeric column without a single observed value has no median;
    it is reported and left as NaN instead of raising.
    """
    relevant_columns = ['IMPACT', 'QUAL', 'DP', 'QD', 'MAX_AF']
    # Subset first and copy only the five needed columns; the copy also keeps
    # the assignments below from touching (or warning about) the caller's frame.
    df = df[relevant_columns].copy()

    impact_mapping = {'HIGH': 0, 'MODERATE': 1, 'LOW': 2, 'MODIFIER': 3}
    # Plain column assignment replaces the column, so it takes the numeric
    # dtype of the mapped values rather than staying an object column.
    df['IMPACT'] = df['IMPACT'].map(impact_mapping)

    # One median-imputation path for every numeric feature.
    for col in ['DP', 'MAX_AF', 'QUAL', 'QD']:
        missing = df[col].isnull()
        if not missing.any():
            continue
        if missing.all():
            # No observed value means no median to impute with.
            print(f"All values missing in {col}. Leaving column as NaN.")
            continue
        print(f"NaN values found in {col}. Imputing with median.")
        df[col] = df[col].fillna(df[col].median())

    return df

def process_files_dynamically(base_directory):
    """Preprocess every '*_aggregated.tsv' file found directly in a directory.

    A file is assigned to a group by the first keyword its name contains,
    checked in the order 'positive', 'negative', 'validation'. Files matching
    no keyword are skipped. For each assigned file the output is written to
    ``<base_directory>/<group>_group/scaled/ML_prepped_<file name>``; the
    directories are created when missing.

    Parameters
    ----------
    base_directory : str
        Directory that is scanned (non-recursively) for input files.
    """
    # Order matters: the first matching keyword decides the group.
    keyword_to_subfolder = (
        ('positive', 'positive_group'),
        ('negative', 'negative_group'),
        ('validation', 'validation_group'),
    )

    for file_name in os.listdir(base_directory):
        if not file_name.endswith('_aggregated.tsv'):
            continue

        group_subfolder = next(
            (folder for keyword, folder in keyword_to_subfolder if keyword in file_name),
            None,
        )
        if group_subfolder is None:
            continue

        # A single makedirs call creates the group directory and its
        # 'scaled' child together.
        scaled_directory = os.path.join(base_directory, group_subfolder, 'scaled')
        os.makedirs(scaled_directory, exist_ok=True)

        process_and_save_file(
            os.path.join(base_directory, file_name),
            os.path.join(scaled_directory, 'ML_prepped_' + file_name),
        )

def process_and_save_file(input_file_path, output_file_path):
    """Load one raw table, preprocess it and write the result as a TSV.

    Parameters
    ----------
    input_file_path : str
        File handed to ``load_data``.
    output_file_path : str
        Destination of the preprocessed, tab-separated table (written
        without the index column).
    """
    prepared = preprocess_data(load_data(input_file_path))
    prepared.to_csv(output_file_path, sep='\t', index=False)
    print(f"Data processed and saved to {output_file_path}")

# Preprocess every '*_aggregated.tsv' file in the project's machine-learning
# directory.
# NOTE: this is an absolute, machine-specific path — adjust it when running
# the notebook elsewhere. prepare_datasets in a later cell reads the
# notebook-level name `base_directory` as well.
base_directory = '/mnt/sdb/markus-bsc-thesis-data/machine-learning'
process_files_dynamically(base_directory)

## Divide the data into training, testing, and validation sets (using the positive, negative, and validation groups), then shuffle

In [None]:
def load_data(base_dir, group_name):
    """Load and concatenate every preprocessed TSV of one group.

    Reads all ``*.tsv`` files in ``<base_dir>/<group_name>/scaled`` and stacks
    them into one frame with a fresh 0..n-1 index and a 'group' column set to
    ``group_name``.

    Parameters
    ----------
    base_dir : str
        Directory containing the group sub-directories.
    group_name : str
        Name of the group sub-directory; also used as the 'group' value.

    Returns
    -------
    pandas.DataFrame
        The stacked rows. When the directory holds no TSV files, an empty
        frame that still has a 'group' column is returned, so callers can
        filter on that column without a KeyError.
    """
    tsv_dir = os.path.join(base_dir, group_name, 'scaled')
    print(f"Loading data from {tsv_dir}")

    # glob returns matches in arbitrary order; sorting fixes the row order so
    # the seeded split and shuffle applied later give the same result on
    # every machine and run.
    files = sorted(glob(os.path.join(tsv_dir, "*.tsv")))
    if not files:
        return pd.DataFrame(columns=['group'])

    # ignore_index avoids repeated 0..k index labels from the individual files.
    data = pd.concat((pd.read_csv(file, sep='\t') for file in files), ignore_index=True)
    data['group'] = group_name
    return data

def load_all_groups(base_directory):
    """Load the positive, negative and validation groups.

    Parameters
    ----------
    base_directory : str
        Directory passed on to ``load_data`` for each group.

    Returns
    -------
    dict
        Maps 'positive_group', 'negative_group' and 'validation_group' (in
        that order) to the frame returned by ``load_data`` for that group.
    """
    return {
        group: load_data(base_directory, group)
        for group in ("positive_group", "negative_group", "validation_group")
    }

def _count_groups(frame):
    """Return (positive_rows, negative_rows) according to the 'group' column."""
    counts = frame['group'].value_counts()
    return int(counts.get('positive_group', 0)), int(counts.get('negative_group', 0))


def prepare_datasets(data_frames, output_dir=None, test_size=0.3, random_state=42):
    """Write the validation set and split positives/negatives into train/test.

    The validation group is written unchanged to
    ``<output_dir>/validation_set.csv``. The positive and negative groups are
    combined and split into training and testing sets. The split is
    stratified on the 'group' column so both sets keep the class proportions
    of the combined data; a plain random split can drift from them.

    Parameters
    ----------
    data_frames : dict
        Must contain the keys 'positive_group', 'negative_group' and
        'validation_group', each mapping to a DataFrame with a 'group' column.
    output_dir : str, optional
        Directory for ``validation_set.csv``. When omitted, the notebook-level
        ``base_directory`` is used, which is what earlier callers relied on.
    test_size : float, optional
        Fraction of the combined rows placed in the testing set (default 0.3).
    random_state : int, optional
        Seed for the split (default 42).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)``.

    Raises
    ------
    KeyError
        If a required key or the 'group' column is missing.
    ValueError
        Raised by ``train_test_split`` when there are too few rows to split.
    """
    if output_dir is None:
        # Backward compatible fallback to the notebook global.
        output_dir = base_directory

    validation_data = data_frames["validation_group"]  # Used as provided, never split
    validation_data.to_csv(f"{output_dir}/validation_set.csv", index=False)

    # Combine positive and negative data, excluding validation data
    combined_data = pd.concat(
        [data_frames["positive_group"], data_frames["negative_group"]],
        ignore_index=True,
    )
    total_positives, total_negatives = _count_groups(combined_data)
    print(f"Combined data count: {combined_data.shape[0]} rows (Positive: {total_positives}, Negative: {total_negatives})")

    # Stratification needs at least two rows in every class; with fewer, fall
    # back to the plain random split instead of failing.
    group_sizes = combined_data['group'].value_counts()
    can_stratify = len(group_sizes) > 0 and group_sizes.min() >= 2
    train_data, test_data = train_test_split(
        combined_data,
        test_size=test_size,
        random_state=random_state,
        stratify=combined_data['group'] if can_stratify else None,
    )

    # Sanity check: no rows lost or duplicated by the split
    total_train_test_rows = train_data.shape[0] + test_data.shape[0]
    print(f"Total rows in train + test: {total_train_test_rows} rows. Matches combined data: {total_train_test_rows == combined_data.shape[0]}")

    train_pos, train_neg = _count_groups(train_data)
    test_pos, test_neg = _count_groups(test_data)

    train_pos_pct = (train_pos / train_data.shape[0]) * 100
    train_neg_pct = (train_neg / train_data.shape[0]) * 100
    test_pos_pct = (test_pos / test_data.shape[0]) * 100
    test_neg_pct = (test_neg / test_data.shape[0]) * 100

    print(f"Distribution in training set — Positive: {train_pos_pct:.2f}%, Negative: {train_neg_pct:.2f}%")
    print(f"Distribution in testing set — Positive: {test_pos_pct:.2f}%, Negative: {test_neg_pct:.2f}%")

    print(f"Final Training set: {train_data.shape[0]} rows (Positive: {train_pos}, Negative: {train_neg})")
    print(f"Final Testing set: {test_data.shape[0]} rows (Positive: {test_pos}, Negative: {test_neg})")
    print(f"Validation set: {validation_data.shape[0]} rows")

    return train_data, test_data

def shuffle_and_save(data, file_path):
    """Write a row-shuffled copy of ``data`` to ``file_path``.

    All rows are kept and permuted with a fixed seed (random_state=42). The
    file is written with ``to_csv`` defaults apart from ``index=False``, so it
    is comma-separated, and the header row is the frame's own column names.
    ``data`` itself keeps its original row order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shuffle and save.
    file_path : str
        Destination file.
    """
    data.sample(frac=1, random_state=42).to_csv(
        file_path,
        header=data.columns,
        index=False,
    )


# Build the training/testing splits from the preprocessed group files and
# write each one, shuffled, as a CSV into the same directory.
# NOTE: absolute, machine-specific path — adjust when running elsewhere.
base_directory = '/mnt/sdb/markus-bsc-thesis-data/machine-learning'
data_frames = load_all_groups(base_directory)
# prepare_datasets also writes validation_set.csv as a side effect.
training_data, testing_data = prepare_datasets(data_frames)

shuffle_and_save(training_data, f"{base_directory}/training_set.csv")
shuffle_and_save(testing_data, f"{base_directory}/testing_set.csv")

## XGBoost

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score, log_loss, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# Locations of the three CSV splits read by this cell; these are the files
# written by the split-preparation cell above.
# NOTE: base_directory is an absolute, machine-specific path.
base_directory = '/mnt/sdb/markus-bsc-thesis-data/machine-learning'
training_file = f"{base_directory}/training_set.csv"
validation_file = f"{base_directory}/validation_set.csv"
testing_file = f"{base_directory}/testing_set.csv"

# Read one split CSV and turn its 'group' column into a numeric 'label'
def load_data(file_path):
    """Load a split CSV and replace the 'group' column with a 'label' column.

    Parameters
    ----------
    file_path : str
        Comma-separated file that contains a 'group' column.

    Returns
    -------
    pandas.DataFrame
        The file's columns without 'group', plus a trailing 'label' column:
        1 for 'positive_group', 0 for 'negative_group', 2 for
        'validation_group'. Any other group value yields NaN.
    """
    # 'validation_group' is mapped to 2; update as per actual data
    group_to_label = {'positive_group': 1, 'negative_group': 0, 'validation_group': 2}
    labelled = pd.read_csv(file_path)
    labelled['label'] = labelled['group'].map(group_to_label)
    # 'group' must not remain among the feature columns
    return labelled.drop(columns=['group'])

# Load the three splits; each frame gets a numeric 'label' column and loses
# its 'group' column (see load_data above).
training_data = load_data(training_file)
validation_data = load_data(validation_file)
testing_data = load_data(testing_file)

# Prepare XGBoost DMatrices: every column except 'label' is a feature.
dtrain = xgb.DMatrix(training_data.drop('label', axis=1), label=training_data['label'])
# NOTE(review): load_data maps 'validation_group' to label 2, so every label
# in dval is 2 while the objective below is 'binary:logistic'. Metrics
# computed against dval are therefore not meaningful — confirm the intent.
dval = xgb.DMatrix(validation_data.drop('label', axis=1), label=validation_data['label'])
dtest = xgb.DMatrix(testing_data.drop('label', axis=1), label=testing_data['label'])

# Define XGBoost model parameters
params = {
    'max_depth': 7,
    'objective': 'binary:logistic',
    'eta': 0.1,
    'eval_metric': 'logloss',
    'random_state': 42
}
num_rounds = 100

# Filled by xgb.train with the per-round eval_metric values of each dataset
# listed in `evals`.
evals_result = {}

# NOTE(review): these three lists are never appended to or read in this cell;
# they look like leftovers — confirm before removing.
train_error = []
val_error = []
test_error = []

# Train the model, tracking log loss on the training and the testing set.
model = xgb.train(params, dtrain, num_rounds, evals=[(dtrain, 'train'), (dtest, 'test')], evals_result=evals_result)

# Predictions and evaluation on the testing set.
predictions_proba = model.predict(dtest)
# Probabilities of at least 0.4 count as positive.
# NOTE(review): the validation-export cell further down uses 0.38 instead —
# confirm which threshold is intended.
predictions = [1 if p >= 0.4 else 0 for p in predictions_proba]
# Threshold-dependent metrics use `predictions`; log loss, ROC AUC and PR AUC
# use the raw probabilities.
accuracy = accuracy_score(testing_data['label'], predictions)
precision = precision_score(testing_data['label'], predictions)
recall = recall_score(testing_data['label'], predictions)
f1 = f1_score(testing_data['label'], predictions)
logloss = log_loss(testing_data['label'], predictions_proba)
roc_auc = roc_auc_score(testing_data['label'], predictions_proba)  # Area under the ROC curve
pr_auc = average_precision_score(testing_data['label'], predictions_proba)  # Average precision (PR AUC)

print(f"Test Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Log Loss: {logloss}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")

# Plot feature importance, then one decision tree on a separate large figure.
xgb.plot_importance(model)
fig, ax = plt.subplots(figsize=(20, 10), dpi=300)
xgb.plot_tree(model, num_trees=10, ax=ax)
plt.title('XGBoost Tree')
# savefig acts on pyplot's current figure — presumably the tree figure created
# just above, not the importance plot.
plt.savefig('xgb_tree_high_res.png', dpi=300)
plt.show()


In [None]:
# Hyper-parameters for the learning-curve run (same values as the evaluation
# run earlier in the notebook)
params = {
    'max_depth': 7,
    'objective': 'binary:logistic',
    'eta': 0.1,
    'eval_metric': 'logloss',
    'random_state': 42
}
num_rounds = 100

# Filled by xgb.train with one log-loss value per boosting round and dataset
evals_result = {}

# Track the held-out test set rather than dval: every row loaded from the
# validation file is labelled 2 (see the label mapping in load_data), which is
# not a valid target for 'binary:logistic', so a log loss measured against
# dval carries no information.
model = xgb.train(params, dtrain, num_rounds,
                  evals=[(dtrain, 'train'), (dtest, 'test')],
                  evals_result=evals_result)

# Plot learning curves
epochs = range(1, num_rounds + 1)
train_logloss = evals_result['train']['logloss']
test_logloss = evals_result['test']['logloss']

plt.figure(figsize=(12, 6))
plt.plot(epochs, train_logloss, label='Train Log Loss')
plt.plot(epochs, test_logloss, label='Test Log Loss')
plt.title('Training and Test Log Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Log Loss')
plt.legend()
# Save BEFORE show(): once the figure has been shown, a later savefig no
# longer has the drawn figure to write and produces an empty image.
# The file name is kept unchanged so existing references to it keep working.
plt.savefig('training_and_validation_log_loss_over_epochs.png', dpi=300)
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt  # Ensure plt is imported

# Confusion matrix of the thresholded test-set predictions
cm = confusion_matrix(y_true=testing_data['label'], y_pred=predictions)

# Draw the counts as an annotated heatmap on a fresh figure
plt.figure(figsize=(10, 7))
sns.heatmap(data=cm, annot=True, fmt="d", cmap='Blues')

# Title and axis labels in one call on the axes the heatmap was drawn on
plt.gca().set(
    title='Confusion Matrix',
    xlabel='Predicted Label',
    ylabel='True Label',
)

# Write the image first, then display it
plt.savefig('conf_mtx.png', dpi=300)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt  # Ensure plt is imported

# Precision/recall pairs over all probability thresholds on the test set.
# Note: this rebinds `precision` and `recall`, which held the scalar test
# metrics computed in the training cell.
precision, recall, thresholds = precision_recall_curve(testing_data['label'], predictions_proba)

# Precision-recall curve on a fresh figure
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.', label='Precision-Recall curve')
plt.gca().set(
    xlabel='Recall',
    ylabel='Precision',
    title='Precision-Recall Curve',
)
plt.legend()

# Write the image first, then display it
plt.savefig('precision_recall_curve.png', dpi=600)
plt.show()

# Single-number summary of the curve
average_precision = average_precision_score(testing_data['label'], predictions_proba)
print(f"Average Precision-Recall Score: {average_precision}")

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt  # Make sure plt is imported

# ROC curve points for the test-set probabilities and the area under them.
# Note: this rebinds `thresholds` and `roc_auc` from earlier cells.
fpr, tpr, thresholds = roc_curve(testing_data['label'], predictions_proba)
roc_auc = auc(fpr, tpr)

# ROC curve plus the diagonal of a no-skill classifier, on a fresh figure
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic',
)
plt.legend(loc="lower right")

# Write the image first, then display it
plt.savefig('roc.png', dpi=600)
plt.show()

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, average_precision_score

# Score the validation set with the already-trained `model` and export the
# rows predicted positive. Requires `model` and `validation_data` from the
# cells above.

# Only the features the model was trained on go into the DMatrix (no label)
original_features = ['IMPACT', 'QUAL', 'DP', 'QD', 'MAX_AF']
dval = xgb.DMatrix(validation_data[original_features])

# Probabilities, then hard labels: at least 0.38 counts as positive.
# NOTE(review): the test-set evaluation cell uses 0.4 — confirm which
# threshold is intended.
validation_proba = model.predict(dval)
validation_predictions = [int(p >= 0.38) for p in validation_proba]

# Both results are stored as new columns on validation_data itself
validation_data['predicted_proba'] = validation_proba
validation_data['predicted_label'] = validation_predictions

# Keep only the rows predicted positive
positive_predictions = validation_data[validation_data['predicted_label'].eq(1)]

# Write features plus both prediction columns to CSV
output_path = '/mnt/sdb/markus-bsc-thesis-data/machine-learning/positive_predictions.csv'
positive_predictions[[*original_features, 'predicted_proba', 'predicted_label']].to_csv(output_path, index=False)

print(f"Exported {len(positive_predictions)} positive predictions to CSV.")
