In [377]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import pandas as pd
import numpy as np

In [378]:

def evaluate_model(model, X, y):
# Predict on test set
    y_pred = model.predict(X)

    # Calculate Accuracy

    acc = accuracy_score(y, y_pred)

    # Calculate F1 Score (binary classification)
    f1 = f1_score(y, y_pred)
    
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    try:
        y_pred = model.predict_proba(X)[:, 1]
        auc = roc_auc_score(y, y_pred)
        print(f"AUC {auc:.4f}")
    except AttributeError:
        print("It doesn't return probabilty")

In [379]:
link = "C:\\Users\\bogus\\PycharmProjects\\Classification Projects\\GiveLife\\data\\transfusion.data"
df = pd.read_csv(link)
df["New Column"] = (df["Time (months)"] - df["Recency (months)"]) / df["Frequency (times)"]
df["New Column2"] = df["Frequency (times)"] / df["Time (months)"]
df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007,New Column,New Column2
0,2,50,12500,98,1,1.92,0.510204
1,0,13,3250,28,1,2.153846,0.464286
2,1,16,4000,35,1,2.125,0.457143
3,2,20,5000,45,1,2.15,0.444444
4,1,24,6000,77,0,3.166667,0.311688


In [380]:
x_vars = ["Recency (months)", "Frequency (times)", "Time (months)", "New Column", "New Column2"]
y_var = "whether he/she donated blood in March 2007"
X = df[x_vars]
y = df[y_var]

In [381]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [382]:
pipe1 = Pipeline(
    [
        ("classification", LogisticRegression())
    ]
)

In [383]:
pipe1.fit(X_train, y_train)

In [384]:
evaluate_model(pipe1, X_train, y_train)
evaluate_model(pipe1, X_test, y_test)

Accuracy: 0.7772
F1 Score: 0.1830
AUC 0.7538
Accuracy: 0.7540
F1 Score: 0.1786
AUC 0.7610


In [407]:
pipe2 = Pipeline(
    [
        ("sqrt", FunctionTransformer(lambda X: np.hstack([X, 
                                                          # np.sqrt(X), 
                                                          np.log1p(X), 
                                                          # np.pow(X, 2)
                                                          ]))),
        # ("PCA", PCA(8)),
        # ('select', SelectKBest(k=6)),
        # ("polynomial", PolynomialFeatures(degree=2, include_bias=False)),
        # ("scaler", MinMaxScaler()),
        ("classification", LogisticRegression(max_iter=1000))
    ]
)

In [408]:
pipe2.fit(X_train, y_train)

In [409]:
evaluate_model(pipe2, X_train, y_train)
evaluate_model(pipe2, X_test, y_test)

Accuracy: 0.8075
F1 Score: 0.4066
AUC 0.7651
Accuracy: 0.7540
F1 Score: 0.2333
AUC 0.7612


In [388]:
pipe3 = Pipeline(
    [
        ("polynomial", KBinsDiscretizer()),
        ("classification", LogisticRegression())
    ]
)

In [389]:
pipe3.fit(X_train, y_train)



In [390]:
evaluate_model(pipe3, X_train, y_train)
evaluate_model(pipe3, X_test, y_test)

Accuracy: 0.8021
F1 Score: 0.4188
AUC 0.7710
Accuracy: 0.7701
F1 Score: 0.3175
AUC 0.7530


In [443]:
pipe4 = Pipeline(
    [
        ("sqrt", FunctionTransformer(lambda X: np.hstack([X, 
                                                          # np.sqrt(X), 
                                                          np.log1p(X), 
                                                          # np.pow(X, 2)
                                                          ]))),
        ("classification", RandomForestClassifier(n_estimators=1000, criterion="entropy", max_depth=3, max_features="log2"))
    ]
)

In [444]:
pipe4.fit(X_train, y_train)

In [445]:
evaluate_model(pipe4, X_train, y_train)
evaluate_model(pipe4, X_test, y_test)

Accuracy: 0.8146
F1 Score: 0.4951
AUC 0.8264
Accuracy: 0.7914
F1 Score: 0.4658
AUC 0.7556


In [394]:
pipe5 = Pipeline(
    [
        ("sqrt", FunctionTransformer(lambda X: np.hstack([X, np.sqrt(X), np.log1p(X), np.pow(X, 2)]))),
        ("classification", AdaBoostClassifier())
    ]
)

In [395]:
pipe5.fit(X_train, y_train)

In [396]:
evaluate_model(pipe5, X_train, y_train)
evaluate_model(pipe5, X_test, y_test)

Accuracy: 0.8128
F1 Score: 0.5116
AUC 0.7963
Accuracy: 0.7914
F1 Score: 0.4800
AUC 0.7357


In [397]:
pipe6 = Pipeline(
    [
        ("sqrt", FunctionTransformer(lambda X: np.hstack([X, np.sqrt(X), np.log1p(X), np.pow(X, 2)]))),
        ("scaler", MinMaxScaler()),
        # ("scaler", StandardScaler()),
        ("classification", SVC(kernel="rbf"))
    ]
)

In [398]:
pipe6.fit(X_train, y_train)

In [399]:
evaluate_model(pipe6, X_train, y_train)
evaluate_model(pipe6, X_test, y_test)

Accuracy: 0.7932
F1 Score: 0.2750
It doesn't return probabilty
Accuracy: 0.7380
F1 Score: 0.1091
It doesn't return probabilty


In [400]:
max_clip = np.array([30, 30, 100])
preprocessing = Pipeline(
    [
        ("sqrt", FunctionTransformer(lambda X: np.hstack([X, np.sqrt(X), np.log1p(X), np.pow(X, 2), X[..., :1] - X[..., -1:]]))),
        # ("original", "passthrough"),
        # ("scaler", MinMaxScaler((-1, 1)))
    ]
)