## Importing necessary libraries

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [79]:
train_data = pd.read_csv('../data/preprocessed/002_train_padel_fps.csv')
test_data = pd.read_csv('../data/preprocessed/002_test_padel_fps.csv')

print('traindata shape: ', train_data.shape)
print('testdata shape: ', test_data.shape)
train_data.head(5)

traindata shape:  (9347, 1445)
testdata shape:  (2334, 1445)


Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ACTIVITY
0,0,0.04,0.0,74.93,39.29,0,0,32,19,13,...,38.84,2.04,14.64,5.66,6.43,726.0,26.0,2.17,100.0,1
1,0,1.69,2.85,139.57,75.41,0,0,63,35,28,...,73.19,2.09,23.7,2.59,12.94,3948.0,57.0,5.58,198.0,1
2,0,1.38,1.91,100.69,53.75,0,0,47,25,22,...,49.89,2.0,20.5,2.52,17.98,1794.0,35.0,2.1,120.0,0
3,0,2.57,6.59,117.77,62.95,0,0,55,27,28,...,54.64,2.02,12.04,5.55,6.49,2074.0,41.0,3.09,138.0,1
4,0,1.71,2.93,107.62,49.99,0,0,37,25,12,...,51.68,2.07,16.92,0.0,14.4,1320.0,47.0,2.42,138.0,1


## pipeline

In [80]:
continuous_cols = train_data.select_dtypes(include='float64').columns

Q1 = train_data[continuous_cols].quantile(0.25)
Q3 = train_data[continuous_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

train_data[continuous_cols] = train_data[continuous_cols].clip(lower=lower_bound, upper=upper_bound, axis=1)
test_data[continuous_cols] = test_data[continuous_cols].clip(lower=lower_bound, upper=upper_bound, axis=1)

In [81]:
# from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import VarianceThreshold
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def create_preprocessing_pipeline():
    pipeline = Pipeline([
        ('variance_selector', VarianceThreshold(threshold=0.01)),
        ('imputer', SimpleImputer()),
        ('scaler', RobustScaler())
    ])
    return pipeline

In [82]:
preprocessor = create_preprocessing_pipeline()

X_train = train_data.drop('ACTIVITY', axis=1)
y_train = train_data['ACTIVITY']

X_test = test_data.drop('ACTIVITY', axis=1)
y_test = test_data['ACTIVITY']


preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [83]:
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)

## model building

In [84]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0, class_weight='balanced')
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict(X_test)

In [85]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

print('accuracy: ', accuracy_score(y_test, y_pred))
print('f1_score: ', f1_score(y_test, y_pred))
print('precision_score: ', precision_score(y_test, y_pred))
print('recall_score: ', recall_score(y_test, y_pred))


accuracy:  0.7540702656383891
f1_score:  0.6943556975505857
precision_score:  0.7048648648648649
recall_score:  0.6841552990556139
