In [2]:
# Bibliotheken laden
import pandas as pd

# CSV-Datei laden
df = pd.read_csv("BloodBrain.csv")

# Übersicht über die Daten
print(df.shape)            # Zeilen und Spalten
print(df.columns)          # Spaltennamen
print(df.dtypes)           # Datentypen
print(df.head())           # Erste 5 Zeilen


(208, 135)
Index(['tpsa', 'nbasic', 'negative', 'vsa_hyd', 'a_aro', 'weight',
       'peoe_vsa.0', 'peoe_vsa.1', 'peoe_vsa.2', 'peoe_vsa.3',
       ...
       'ctdh', 'ctaa', 'mchg', 'achg', 'rdta', 'n_sp2', 'n_sp3', 'o_sp2',
       'o_sp3', 'logBBB'],
      dtype='object', length=135)
tpsa        float64
nbasic        int64
negative      int64
vsa_hyd     float64
a_aro         int64
             ...   
n_sp2       float64
n_sp3       float64
o_sp2       float64
o_sp3       float64
logBBB      float64
Length: 135, dtype: object
        tpsa  nbasic  negative    vsa_hyd  a_aro   weight  peoe_vsa.0  \
0  12.030000       1         0  167.06700      0  156.293    76.94749   
1  49.330002       0         0   92.64243      6  151.165    38.24339   
2  50.529999       1         0  295.16700     15  366.485    58.05473   
3  37.389999       0         0  319.11220     15  382.552    62.23933   
4  37.389999       1         0  299.65800     12  326.464    74.80064   

   peoe_vsa.1  peoe_vsa.2  

In [3]:
from sklearn.model_selection import train_test_split

# Zielvariable definieren
y = df['logBBB']

# Alle anderen Spalten sind die Merkmale (Features)
X = df.drop(columns=['logBBB'])

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Ausgabe der Größen zur Kontrolle
print("Trainingsdaten:", X_train.shape)
print("Testdaten:", X_test.shape)


Trainingsdaten: (156, 134)
Testdaten: (52, 134)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Skalierung 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest Modell definieren
rf = RandomForestRegressor(random_state=42)


In [None]:
from sklearn.model_selection import GridSearchCV

#Hyperparameter-Gitter
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2']  
}

# GridSearchCV definieren
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=10,                 # 10-fache Kreuzvalidierung
    scoring='r2',
    n_jobs=-1              # Alle Kerne nutzen
)

# Suche starten (dauert etwas)
grid_search.fit(X_train_scaled, y_train)

# Bestes Modell ausgeben
print("Beste Parameter:", grid_search.best_params_)


Beste Parameter: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 200}


In [7]:
from sklearn.model_selection import cross_val_score

# Bestes Modell holen
best_model = grid_search.best_estimator_

# Kreuzvalidierung mit dem besten Modell (Trainingdaten)
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=10, scoring='r2')

# Ergebnisse anzeigen
print("R² Scores aus 10-facher Kreuzvalidierung:", cv_scores)
print("Durchschnittlicher R² Score:", cv_scores.mean())


R² Scores aus 10-facher Kreuzvalidierung: [0.58012335 0.27991372 0.6863204  0.63616341 0.57545527 0.7122683
 0.6507631  0.60175096 0.58704458 0.49337574]
Durchschnittlicher R² Score: 0.5803178828213141


In [8]:
# Feature Importances
importances = best_model.feature_importances_
feature_names = X.columns

# Wichtigkeit sortiert ausgeben
for name, importance in sorted(zip(feature_names, importances), key=lambda x: -x[1]):
    print(f"{name}: {importance:.4f}")


fnsa3: 0.0435
tpsa: 0.0380
prx: 0.0350
tcnp: 0.0296
most_positive_charge: 0.0282
polar_area: 0.0277
rpcg: 0.0259
tpsa.1: 0.0219
psa_npsa: 0.0209
pnsa3: 0.0204
logp.o.w.: 0.0201
clogp: 0.0188
scaa3: 0.0180
dpsa3: 0.0177
vsa_acc: 0.0171
fpsa3: 0.0162
tcsa: 0.0159
scaa1: 0.0131
chdh2: 0.0128
scdh3: 0.0128
smr_vsa0: 0.0127
tcpa: 0.0124
vsa_other: 0.0124
mlogp: 0.0112
nocount: 0.0109
scaa2: 0.0103
slogp_vsa0: 0.0102
scdh1: 0.0098
adistm: 0.0093
peoe_vsa.3: 0.0093
saaa3: 0.0091
chdh3: 0.0089
mw: 0.0087
sadh1: 0.0085
ppsa3: 0.0085
pnsa1: 0.0084
n_sp3: 0.0079
wnsa2: 0.0078
n_sp2: 0.0077
o_sp2: 0.0077
saaa1: 0.0075
saaa2: 0.0073
most_negative_charge: 0.0073
peoe_vsa.1.1: 0.0073
pnsa2: 0.0072
peoe_vsa.3.1: 0.0067
nonpolar_area: 0.0066
weight: 0.0066
andrewbind: 0.0065
wpsa3: 0.0065
vsa_hyd: 0.0065
rncg: 0.0062
smr_vsa6: 0.0061
sadh2: 0.0061
slogp_vsa7: 0.0061
ovality: 0.0060
slogp_vsa1: 0.0060
smr_vsa5: 0.0059
adistd: 0.0058
wpcs: 0.0057
peoe_vsa.6.1: 0.0056
slogp_vsa9: 0.0056
wnsa1: 0.0056
lumo

In [9]:
from sklearn.metrics import mean_squared_error, r2_score

# Vorhersagen auf den Testdaten
y_pred = best_model.predict(X_test_scaled)

# Modellbewertung
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Ergebnisse ausgeben
print("Mean Squared Error (MSE) auf Testdaten:", mse)
print("R²-Score auf Testdaten:", r2)



Mean Squared Error (MSE) auf Testdaten: 0.2541175855288462
R²-Score auf Testdaten: 0.3921436468747588
