In [1]:
import pandas as pd


cox2_df = pd.read_csv("cox2.csv")


cox2_df.info(), cox2_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Columns: 257 entries, QikProp_.stars to cox2Class
dtypes: float64(212), int64(44), object(1)
memory usage: 927.7+ KB


(None,
    QikProp_.stars  QikProp_.amine  QikProp_.acid  QikProp_.rotor  \
 0               1               0              0               1   
 1               0               0              0               2   
 2               0               0              0               3   
 3               1               0              0               2   
 4               1               0              0               2   
 
    QikProp_.rctvFG  QikProp_CNS  QikProp_MW  QikProp_dipole  QikProp_SASA  \
 0                0            0     358.882           7.645       635.022   
 1                0           -1     359.870           7.959       622.548   
 2                0           -1     355.451           7.500       610.972   
 3                0           -1     393.423           9.342       651.852   
 4                0           -1     394.315           9.409       637.688   
 
    QikProp_FOSA  ...  moe2D_vsa_acc  moe2D_vsa_don  moe2D_vsa_hyd  \
 0       273.363  ...          32.02 

In [2]:
from sklearn.model_selection import train_test_split

# Zielvariable und Merkmale
X = cox2_df.drop(columns=['cox2Class'])     # Alle Features außer Ziel
y = cox2_df['cox2Class']                    # Zielvariable

# Aufteilen in Trainings- und Testset (75/25)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Größen zur Kontrolle
print("Trainingsdaten:", X_train.shape)
print("Testdaten:", X_test.shape)


Trainingsdaten: (346, 256)
Testdaten: (116, 256)


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Skalierung (optional)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest Klassifikator definieren
rf_clf = RandomForestClassifier(random_state=42)


In [4]:
from sklearn.model_selection import GridSearchCV

# Parameter-Gitter definieren
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2']  # 'auto' ist veraltet
}

# GridSearch mit 10-facher Kreuzvalidierung
grid_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1
)

# Modelltraining starten
grid_search.fit(X_train_scaled, y_train)

# Beste Parameter anzeigen
print("Beste Parameter:", grid_search.best_params_)


Beste Parameter: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 200}


In [5]:
from sklearn.model_selection import cross_val_score

# Bestes Modell extrahieren
best_model = grid_search.best_estimator_

# 10-fache Kreuzvalidierung auf Trainingsdaten
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=10, scoring='accuracy')

# Ergebnisse anzeigen
print("Accuracy Scores (10-fold CV):", cv_scores)
print("Durchschnittliche Accuracy:", cv_scores.mean())


Accuracy Scores (10-fold CV): [0.94285714 0.97142857 1.         0.94285714 0.97142857 0.94285714
 0.97058824 0.97058824 0.97058824 0.94117647]
Durchschnittliche Accuracy: 0.962436974789916


In [6]:
# Feature Importances ausgeben
importances = best_model.feature_importances_
feature_names = X.columns

# Wichtigkeiten sortiert anzeigen
for name, importance in sorted(zip(feature_names, importances), key=lambda x: -x[1])[:10]:
    print(f"{name}: {importance:.4f}")


IC50: 0.2936
QikProp_QPlogKhsa: 0.0140
moe2D_SMR_VSA2: 0.0130
QikProp_QPlogS: 0.0127
QikProp_accptHB: 0.0119
moe2D_logS: 0.0117
moe2D_bpol: 0.0108
QikProp_IP.eV.: 0.0108
moe2D_Q_VSA_POS: 0.0108
moe2D_PEOE_VSA_FPPOS: 0.0099


In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# Vorhersage auf Testdaten
y_pred = best_model.predict(X_test_scaled)

# Metriken berechnen
acc = accuracy_score(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# ROC AUC Score (nur bei binärer Klassifikation)
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)
y_pred_prob = best_model.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_test_bin, y_pred_prob)

# Ergebnisse anzeigen
print("Accuracy:", acc)
print("Cohen's Kappa:", kappa)
print("Confusion Matrix:\n", cm)
print("ROC AUC Score:", auc)


Accuracy: 1.0
Cohen's Kappa: 1.0
Confusion Matrix:
 [[23  0]
 [ 0 93]]
ROC AUC Score: 1.0
