In [122]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import KFold, cross_val_score
from tune_sklearn import TuneGridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import pandas as pd


In [123]:

X, y = fetch_openml(
    "titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [124]:
y.head()


0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: category
Categories (2, object): ['0', '1']

In [125]:
numeric_features = ["age", "fare"]
numeric_transformer = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
categorical_features = ["embarked", "pclass"]

selected_features = numeric_features + categorical_features
print(selected_features)

X=X[selected_features]
X.head()

['age', 'fare', 'embarked', 'pclass']


Unnamed: 0,age,fare,embarked,pclass
0,29.0,211.3375,S,1
1,0.9167,151.55,S,1
2,2.0,151.55,S,1
3,30.0,151.55,S,1
4,25.0,151.55,S,1


In [126]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
m = len(X_train.index)

In [127]:
# Transform the data

preprocessor = ColumnTransformer(
    transformers= [
        ("num", numeric_transformer, numeric_features),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
    ],
    verbose_feature_names_out=True,
)



In [139]:
select_features = SelectKBest(k=3)
steps = [
    ('preprocessor', preprocessor),
    ('select_features', select_features),
    ('logistic_regressor_sgd', SGDClassifier(loss='log_loss',  learning_rate='constant', eta0=1e-4, n_jobs=-1, shuffle=True))
]
logistic_reg_pipeline = Pipeline(steps)
logistic_reg_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'select_features', 'logistic_regressor_sgd', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__simpleimputer', 'preprocessor__num__standardscaler', 'preprocessor__num__simpleimputer__add_indicator', 'preprocessor__num__simpleimputer__copy', 'preprocessor__num__simpleimputer__fill_value', 'preprocessor__num__simpleimputer__keep_empty_features', 'preprocessor__num__simpleimputer__missing_values', 'preprocessor__num__simpleimputer__strategy', 'preprocessor__num__simpleimputer__verbose', 'preprocessor__num__standardscaler__copy', 'preprocessor__num__standardscaler__with_mean', 'preprocessor__num__standardscaler__with_std

In [140]:
param_grid = {
    'logistic_regressor_sgd__eta0': [1e-4, 1e-3, 1e-2],
    'logistic_regressor_sgd__max_iter': [int(np.ceil(1e6 / m))]
}
randsearch_auc = RandomizedSearchCV(estimator=logistic_reg_pipeline, param_distributions=param_grid, n_iter=5, scoring='roc_auc', cv=5, verbose=2)


In [142]:
for train_df in tqdm(pd.read_csv("DATA/creditcard.csv", chunksize=chunksize, iterator=True)):
    X = train_df
    Y = train_df['Class']
    randsearch_auc.partial_fit(X_train, y_train, classes=[0,1])
    
randsearch_auc.fit(X_train, y_train)



In [143]:
randsearch_auc.cv_results_



{'mean_fit_time': array([0.01125438, 0.01076853, 0.01036565]),
 'std_fit_time': array([0.00041194, 0.00034422, 0.00011   ]),
 'mean_score_time': array([0.00501354, 0.00504141, 0.00494459]),
 'std_score_time': array([1.32287857e-04, 1.52852150e-04, 8.08919104e-05]),
 'param_logistic_regressor_sgd__max_iter': masked_array(data=[956, 956, 956],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_logistic_regressor_sgd__eta0': masked_array(data=[0.0001, 0.001, 0.01],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'logistic_regressor_sgd__max_iter': 956,
   'logistic_regressor_sgd__eta0': 0.0001},
  {'logistic_regressor_sgd__max_iter': 956,
   'logistic_regressor_sgd__eta0': 0.001},
  {'logistic_regressor_sgd__max_iter': 956,
   'logistic_regressor_sgd__eta0': 0.01}],
 'split0_test_score': array([0.59347997, 0.59347997, 0.59347997]),
 'split1_test_score': array([0.6948154 , 0.696386

In [144]:
y_pred = randsearch_auc.predict(X_test)
pred_df = pd.DataFrame({'y': y_test,'y_pred': y_pred})
randsearch_auc.best_score_
#gini = 2*roc_auc_score(y_test, y_pred)-1


0.679242184672271

In [135]:
test_score = logistic_reg_pipeline.score(X_test, y_test)
print(test_score)

0.6221374045801527
