In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np

# Location of the descriptor table; replace with the path to your own CSV file.
file_path = '/Users/pipi/Documents/研究工作资料/数据/54-20des.csv'

# Load the table, taking the first CSV column as the row index.
df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)

# Show the loaded frame as the cell output.
df


 Select the specified descriptors

In [None]:
import matplotlib.pyplot as plt
# Descriptor columns kept for the analysis. The trailing space in each name is
# part of the column label looked up in `df` and must not be stripped.
selected_descriptors = [
    'Dipole y ',
    'HOMO eigenvalue ',
    'E-state keys (sums): S_dssC ',
    'E-state keys (sums): S_sCH3 ',
    'Heat of formation ',
]

# Restrict the table to those columns (all rows retained).
df_selected = df.loc[:, selected_descriptors]

In [None]:
# Standardize the selected descriptors column by column with StandardScaler.
scaler = StandardScaler()

# fit_transform returns a bare NumPy array, so both the column names AND the
# row index are re-attached here; without `index=` the rows would be relabeled
# 0..n-1 and could no longer be matched back to the rows of `df`.
df_selected_standardized = pd.DataFrame(
    scaler.fit_transform(df_selected),
    index=df_selected.index,
    columns=df_selected.columns,
)



 Calculate the correlation matrix for the selected descriptors

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# Pairwise correlations between the standardized descriptors.
corr_matrix_selected = df_selected_standardized.corr()

# Set diagonal values to 1.
# The fill is done on an explicit writable copy: writing through
# `DataFrame.values` fails with "assignment destination is read-only" when
# pandas Copy-on-Write is active, and is not guaranteed to reach the frame
# otherwise.
corr_values = corr_matrix_selected.to_numpy(copy=True)
np.fill_diagonal(corr_values, 1)
corr_matrix_selected = pd.DataFrame(
    corr_values,
    index=corr_matrix_selected.index,
    columns=corr_matrix_selected.columns,
)

# Heatmap mask: True marks a hidden cell. np.tri(..., k=-1) is True strictly
# below the diagonal, so its negation hides the diagonal and the upper
# triangle, leaving each descriptor pair drawn once.
mask = ~np.tri(corr_matrix_selected.shape[0], k=-1, dtype=bool)

# Display the correlation matrix
plt.figure(figsize=(10, 8))
heatmap_selected = sns.heatmap(corr_matrix_selected, annot=True, cmap='coolwarm', fmt=".2f",
                               linewidths=.5, mask=mask, vmin=-1, vmax=1,
                               cbar_kws={"ticks": [-1, -0.5, 0, 0.5, 1]})
plt.title('Correlation Matrix of Selected Descriptors')
plt.show()



Calculate the variance inflation factor (VIF) for the selected descriptors

In [None]:

# One VIF per descriptor column, computed on the standardized design matrix.
standardized_matrix = df_selected_standardized.values
n_descriptors = df_selected_standardized.shape[1]
vif_per_column = [
    variance_inflation_factor(standardized_matrix, column_position)
    for column_position in range(n_descriptors)
]

# Two-column summary table: descriptor name and its VIF.
vif_values_selected = pd.DataFrame(
    {"Feature": df_selected.columns, "VIF": vif_per_column}
)

# Output VIF results
print("VIF values for selected descriptors:")
print(vif_values_selected)

Implement Kennard-Stone algorithm to split the data into training and testing sets

In [None]:

import numpy as np
from sklearn.cluster import KMeans

def kennard_stone_split(X, ratio=0.8):
    """Split samples into train/test sets with the Kennard-Stone algorithm.

    The training set is seeded with the two samples that are farthest apart
    (Euclidean distance). Samples are then added one at a time, each time
    choosing the sample whose distance to its nearest already-selected
    neighbor is largest, until ``int(n_samples * ratio)`` samples are selected.
    All remaining samples form the test set.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. A 1-D input is treated as a single feature.
    ratio : float, default 0.8
        Fraction of samples assigned to the training set; must be in [0, 1].

    Returns
    -------
    train_indices : numpy.ndarray of int
        Positional indices of the training samples, in ascending order.
    test_indices : numpy.ndarray of int
        Positional indices of the test samples, in ascending order.

    Raises
    ------
    ValueError
        If ``ratio`` is outside [0, 1].
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")

    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    n_samples = X.shape[0]
    n_train = int(n_samples * ratio)
    all_indices = np.arange(n_samples)
    no_indices = np.array([], dtype=int)

    # Degenerate splits need no distance computation.
    if n_train <= 0:
        return no_indices, all_indices
    if n_train >= n_samples:
        return all_indices, no_indices

    # Full pairwise Euclidean distance matrix (n_samples x n_samples).
    diffs = X[:, None, :] - X[None, :, :]
    distances = np.sqrt(np.einsum('ijk,ijk->ij', diffs, diffs))

    # Seed with the most distant pair of samples.
    first, second = np.unravel_index(np.argmax(distances), distances.shape)
    first, second = int(first), int(second)
    if first == second:
        # All samples coincide; any second, distinct sample will do.
        second = 1 if first == 0 else 0
    selected = [first, second][:n_train]

    available = np.ones(n_samples, dtype=bool)
    available[selected] = False

    # Distance from every sample to its nearest already-selected sample.
    nearest_selected = distances[:, selected].min(axis=1)

    while len(selected) < n_train:
        # Max-min criterion; already-selected samples are excluded via -inf.
        candidate_scores = np.where(available, nearest_selected, -np.inf)
        chosen = int(np.argmax(candidate_scores))
        selected.append(chosen)
        available[chosen] = False
        nearest_selected = np.minimum(nearest_selected, distances[:, chosen])

    train_indices = np.sort(np.array(selected, dtype=int))
    test_indices = np.setdiff1d(all_indices, train_indices)
    return train_indices, test_indices

# Ask the splitter for positional train/test indices, passing ratio=0.8.
train_indices, test_indices = kennard_stone_split(
    df_selected_standardized.to_numpy(), ratio=0.8
)

# Materialize the two subsets by positional row lookup.
X_train = df_selected_standardized.iloc[train_indices]
X_test = df_selected_standardized.iloc[test_indices]

# Show the held-out rows as the cell output.
X_test

Split Dataset from RM: draw one test compound at random from each activity class interval

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Re-read the table from `file_path` so the split starts from the data as stored on disk.
df = pd.read_csv(file_path, index_col=0)

activity_column = 'Activity'
df_sorted = df.sort_values(by=activity_column, ascending=False)

# Define the number of class intervals (in this case, 8)
num_class_intervals = 8

# Assign each compound to a quantile-based activity class interval (0..7).
# The assignment aligns on the index, so the sort order of df_sorted does not
# affect which label each row of df receives.
df['Class Interval'] = pd.qcut(df_sorted[activity_column], q=num_class_intervals, labels=False)

# Randomly select one compound from each class interval for the test set.
# The samples are collected first and concatenated once, instead of growing a
# DataFrame inside the loop.
test_samples = [
    df[df['Class Interval'] == interval].sample(n=1, random_state=42)
    for interval in range(num_class_intervals)
]
test_set = pd.concat(test_samples)

# The training set is everything that was NOT selected for testing.
# This must be computed once, after all test compounds are known: rebuilding it
# inside the loop from the full `df` removes only the last interval's sample
# and leaves the other test compounds in the training set.
train_set = df.drop(test_set.index)

# Drop the 'Class Interval' column from the final sets
train_set = train_set.drop('Class Interval', axis=1)
test_set = test_set.drop('Class Interval', axis=1)

# Display the training and test sets
print("Training Set:")
print(train_set)

print("\nTest Set:")
print(test_set)
