In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [23]:
class DatasetTransformer(BaseEstimator, TransformerMixin):
    """Apply a different preprocessing step to each group of DataFrame columns.

    Parameters
    ----------
    transform_to_features : dict
        Maps a transform name to the list of column names it handles.
        Recognised names are ``'StandardScaler'`` (columns are standardised),
        ``'OneHotEncoder'`` (columns are one-hot encoded) and ``'pass'``
        (columns are copied through unchanged).

    Attributes
    ----------
    ohe : OneHotEncoder
        Encoder fitted on the ``'OneHotEncoder'`` columns.
    scaler : StandardScaler
        Scaler fitted on the ``'StandardScaler'`` columns.
    """

    # Every key of ``transform_to_features`` must be one of these names.
    _SUPPORTED_TRANSFORMS = ('StandardScaler', 'OneHotEncoder', 'pass')

    def __init__(self, transform_to_features):
        self.transform_to_features = transform_to_features
        self.ohe = OneHotEncoder()
        self.scaler = StandardScaler()

    def _check_transform_names(self):
        """Raise ValueError if the mapping contains an unrecognised transform name."""
        unknown = [name for name in self.transform_to_features
                   if name not in self._SUPPORTED_TRANSFORMS]
        if unknown:
            # A misspelled key would otherwise drop its columns without any warning.
            raise ValueError(
                'Unknown transform name(s) %r; expected one of %r'
                % (unknown, self._SUPPORTED_TRANSFORMS)
            )

    def fit(self, X, y=None):
        """Fit the scaler and the encoder on their respective columns of ``X``.

        Parameters
        ----------
        X : DataFrame-like indexable by a list of column names.
        y : ignored; accepted for pipeline compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``transform_to_features`` contains an unrecognised transform name.
        """
        self._check_transform_names()
        for transform, columns in self.transform_to_features.items():
            if len(columns) == 0:
                # Nothing to fit; the underlying estimators reject 0-column input.
                continue
            if transform == 'StandardScaler':
                self.scaler.fit(X[columns])
            elif transform == 'OneHotEncoder':
                self.ohe.fit(X[columns])
            # 'pass' columns need no fitting.
        return self

    def transform(self, X, y=None):
        """Transform each column group and stack the results side by side.

        Output columns follow the iteration order of ``transform_to_features``.

        Parameters
        ----------
        X : DataFrame-like indexable by a list of column names.
        y : ignored; accepted for pipeline compatibility.

        Returns
        -------
        numpy.ndarray
            Horizontally stacked transformed features; has zero columns when
            every group is empty.

        Raises
        ------
        ValueError
            If ``transform_to_features`` contains an unrecognised transform name.
        """
        self._check_transform_names()
        transformed_features = []
        for transform, columns in self.transform_to_features.items():
            if len(columns) == 0:
                # An empty group contributes no output columns.
                continue
            if transform == 'StandardScaler':
                transformed_features.append(self.scaler.transform(X[columns]))
            elif transform == 'OneHotEncoder':
                # The encoder returns a sparse matrix; densify it for hstack.
                transformed_features.append(self.ohe.transform(X[columns]).toarray())
            elif transform == 'pass':
                transformed_features.append(X[columns])
        if not transformed_features:
            # np.hstack refuses an empty sequence, so build the empty result directly.
            return np.empty((len(X), 0))
        return np.hstack(transformed_features)

In [24]:
# Raw dataset location, relative to the notebook's working directory.
raw_csv_path = '../data/raw/heart_cleveland_upload.csv'
data = pd.read_csv(raw_csv_path)

In [25]:
# Split off the label; from here on `data` holds only the feature columns.
target = data['condition']
data = data.drop(columns=['condition'])

# Count distinct values per column once and reuse the result for every group.
n_unique = data.nunique()

# Group columns by cardinality:
#   exactly 2 distinct values -> binary
#   3 to 9 distinct values    -> categorical
#   10 or more                -> numerical
# The numerical bound is inclusive so that a column with exactly 10 distinct
# values is not left out of every group. Columns with a single distinct value
# belong to no group.
binary_feats = list(n_unique[n_unique == 2].index)
categorical_feats = list(n_unique[(n_unique > 2) & (n_unique < 10)].index)
numerical_feats = list(n_unique[n_unique >= 10].index)

In [26]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Map each preprocessing step to the columns it should handle.
d = {'StandardScaler': numerical_feats, 'OneHotEncoder': categorical_feats, 'pass': binary_feats}

# Build the transformer from the mapping; without this assignment the name
# `transform` is undefined when the notebook runs on a fresh kernel.
transform = DatasetTransformer(d)

# Fit on the training split only, then transform that same split.
transform.fit(X_train)
X_train_transformed = transform.transform(X_train)

In [28]:
# Transform the held-out split with the already-fitted transformer (no refit here).
X_test_transformed = transform.transform(X_test)

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
# Baseline: logistic regression with default settings, trained on the
# transformed training features.
logreg = LogisticRegression()
logreg.fit(X_train_transformed, y_train)
y_pred = logreg.predict(X_test_transformed)

# Accuracy on the held-out split: matching predictions divided by sample count.
n_correct = (y_pred == y_test).sum()
n_correct / len(y_test)

0.8

In [16]:
from catboost import CatBoostClassifier

In [72]:
# CatBoost classifier constructed with no arguments, i.e. library defaults.
clf_boost = CatBoostClassifier()

In [73]:
# Train the boosted model quietly (verbose=False) and predict the held-out split.
clf_boost.fit(X_train_transformed, y_train, verbose=False)
y_pred = clf_boost.predict(X_test_transformed)

# Accuracy on the held-out split: matching predictions divided by sample count.
n_correct = (y_pred == y_test).sum()
n_correct / len(y_test)

0.7833333333333333