In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os



In [2]:
train_data = pd.read_csv('002_train_padel_fps.csv')
test_data = pd.read_csv('002_test_padel_fps.csv')

print('traindata shape: ', train_data.shape)
print('testdata shape: ', test_data.shape)
train_data.head(5)

traindata shape:  (9347, 1445)
testdata shape:  (2334, 1445)


Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ACTIVITY
0,0,0.0435,0.001892,74.9344,39.292309,0,0,32,19,13,...,38.842627,2.044349,14.641057,5.661056,6.431126,726.0,26.0,2.168,100.0,1
1,0,1.6888,2.852045,139.5712,75.406204,0,0,63,35,28,...,73.193694,2.091248,23.704766,2.586746,12.936973,3948.0,57.0,5.582,198.0,1
2,0,1.381,1.907161,100.6898,53.751446,0,0,47,25,22,...,49.88537,1.995415,20.501919,2.522207,17.979712,1794.0,35.0,2.103,120.0,0
3,0,2.5668,6.588462,117.7691,62.954204,0,0,55,27,28,...,54.636352,2.023569,12.038128,5.551697,6.486431,2074.0,41.0,3.09,138.0,1
4,0,1.7116,2.929575,107.6245,49.991516,0,0,37,25,12,...,51.678547,2.067142,16.921402,0.0,14.395067,1320.0,47.0,2.42,138.0,1


In [3]:
continuous_cols = train_data.select_dtypes(include='float64').columns

Q1 = train_data[continuous_cols].quantile(0.25)
Q3 = train_data[continuous_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

train_data[continuous_cols] = train_data[continuous_cols].clip(lower=lower_bound, upper=upper_bound, axis=1)
test_data[continuous_cols] = test_data[continuous_cols].clip(lower=lower_bound, upper=upper_bound, axis=1)

In [4]:
# from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import VarianceThreshold
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def create_preprocessing_pipeline():
    pipeline = Pipeline([
        ('variance_selector', VarianceThreshold(threshold=0.01)),
        ('imputer', SimpleImputer()),
        ('scaler', RobustScaler())
    ])
    return pipeline

In [5]:
preprocessor = create_preprocessing_pipeline()

X_train = train_data.drop('ACTIVITY', axis=1)
y_train = train_data['ACTIVITY']

X_test = test_data.drop('ACTIVITY', axis=1)
y_test = test_data['ACTIVITY']


preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [6]:
from catboost import CatBoostClassifier

# Initialize the CatBoost Classifier with minimal parameters
catboost_clf = CatBoostClassifier(
    iterations=100,  # Basic number of trees
    learning_rate=0.1,  # Basic learning rate
    depth=3,  # Depth of trees
    auto_class_weights='Balanced',  # Automatically handle class imbalance
    random_seed=0,
    verbose=False
)

# Fit the model to the training data
catboost_clf.fit(X_train, y_train)

# Predict on the test data
y_pred = catboost_clf.predict(X_test)


In [7]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix

# Existing scores
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))

# Additional Scores
# ROC AUC Score
print('ROC AUC:', roc_auc_score(y_test, y_pred))

# Balanced Accuracy
print('Balanced Accuracy:', balanced_accuracy_score(y_test, y_pred))

# Specificity Calculation
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)
print('Specificity:', specificity)


Accuracy: 0.7172236503856041
F1 Score: 0.68054211035818
Precision: 0.6316262353998203
Recall: 0.7376705141657922
ROC AUC: 0.7203920999503834
Balanced Accuracy: 0.7203920999503834
Specificity: 0.7031136857349747


In [17]:
'''
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier

param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [3, 4, 6],
    'l2_leaf_reg': [1, 3, 5]
}

grid_search = GridSearchCV(
    estimator=CatBoostClassifier(
        auto_class_weights='Balanced',
        random_seed=0,
        verbose=False  # Set verbose to False to reduce CatBoost output
    ),
    param_grid=param_grid,
    scoring='accuracy',  # Metric to evaluate the estimators
    cv=3,                # Number of cross-validation folds
    verbose=2,           # Verbose output level
    n_jobs=-1            # Number of jobs to run in parallel
)
grid_search.fit(X_train, y_train)
'''

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=  22.8s
[CV] END depth=3, iterations=100, l2_leaf_reg=3, learning_rate=0.1; total time=  21.9s
[CV] END depth=3, iterations=100, l2_leaf_reg=5, learning_rate=0.05; total time=  20.3s
[CV] END depth=3, iterations=200, l2_leaf_reg=1, learning_rate=0.05; total time=  34.0s
[CV] END depth=3, iterations=200, l2_leaf_reg=3, learning_rate=0.01; total time=  35.1s
[CV] END depth=3, iterations=200, l2_leaf_reg=5, learning_rate=0.01; total time=  31.4s
[CV] END depth=3, iterations=200, l2_leaf_reg=5, learning_rate=0.1; total time=  35.5s
[CV] END depth=3, iterations=300, l2_leaf_reg=1, learning_rate=0.05; total time=  46.6s
[CV] END depth=3, iterations=300, l2_leaf_reg=3, learning_rate=0.05; total time=  52.5s
[CV] END depth=3, iterations=300, l2_leaf_reg=5, learning_rate=0.05; total time=  56.2s
[CV] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=  32.6s
[CV] END depth=4, iterations=100, 

[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.1; total time=  22.0s
[CV] END depth=3, iterations=100, l2_leaf_reg=3, learning_rate=0.01; total time=  21.8s
[CV] END depth=3, iterations=100, l2_leaf_reg=5, learning_rate=0.05; total time=  21.6s
[CV] END depth=3, iterations=200, l2_leaf_reg=1, learning_rate=0.01; total time=  35.5s
[CV] END depth=3, iterations=200, l2_leaf_reg=3, learning_rate=0.05; total time=  33.4s
[CV] END depth=3, iterations=200, l2_leaf_reg=3, learning_rate=0.1; total time=  35.1s
[CV] END depth=3, iterations=300, l2_leaf_reg=1, learning_rate=0.01; total time=  48.3s
[CV] END depth=3, iterations=300, l2_leaf_reg=3, learning_rate=0.01; total time=  53.1s
[CV] END depth=3, iterations=300, l2_leaf_reg=3, learning_rate=0.1; total time=  57.3s
[CV] END depth=3, iterations=300, l2_leaf_reg=5, learning_rate=0.1; total time=  52.1s
[CV] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.1; total time=  36.0s
[CV] END depth=4, iterations=100, l2_

In [8]:
X_val = pd.read_csv('X_val.csv')
y_val = pd.read_csv('y_val.csv')

In [9]:
continuous_cols = X_val.select_dtypes(include='float64').columns

Q1 = X_val[continuous_cols].quantile(0.25)
Q3 = X_val[continuous_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

X_val[continuous_cols] = X_val[continuous_cols].clip(lower=lower_bound, upper=upper_bound, axis=1)

In [10]:
# from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import VarianceThreshold
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def create_preprocessing_pipeline():
    pipeline = Pipeline([
        ('variance_selector', VarianceThreshold(threshold=0.01)),
        ('imputer', SimpleImputer()),
        ('scaler', RobustScaler())
    ])
    return pipeline

In [11]:
preprocessor = create_preprocessing_pipeline()
preprocessor.fit(X_val)
X_val = preprocessor.transform(X_val)

In [12]:
from catboost import CatBoostClassifier

# Initialize the CatBoost Classifier with minimal parameters
catboost_clf = CatBoostClassifier(
    iterations=100,  # Basic number of trees
    learning_rate=0.1,  # Basic learning rate
    depth=3,  # Depth of trees
    auto_class_weights='Balanced',  # Automatically handle class imbalance
    random_seed=0,
    verbose=False
)

# Fit the model to the training data
catboost_clf.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7da6978eed40>

In [13]:
feature_names_train = [f'feature_{i}' for i in range(X_train.shape[1])]
feature_names_val = [f'feature_{i}' for i in range(X_val.shape[1])]

# Convert both X_train and X_val to DataFrames with their respective column names
X_train_df = pd.DataFrame(X_train, columns=feature_names_train)
X_val_df = pd.DataFrame(X_val, columns=feature_names_val)

# Align the columns: keep only the columns from X_train in X_val
common_columns = X_train_df.columns.intersection(X_val_df.columns)

# Now filter X_val to have only the common columns
X_val_aligned = X_val_df[common_columns]

In [15]:
y_pred = catboost_clf.predict(X_val_aligned)

In [16]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

print('accuracy: ', accuracy_score(y_val, y_pred))
print('f1_score: ', f1_score(y_val, y_pred))
print('precision_score: ', precision_score(y_val, y_pred))
print('recall_score: ', recall_score(y_val, y_pred))


accuracy:  0.5796460176991151
f1_score:  0.5026178010471204
precision_score:  0.5333333333333333
recall_score:  0.4752475247524752
