In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression

data = pd.read_csv("sample.csv")

In [28]:
X = data.drop(columns="label")
y = data["label"]

In [31]:
train_X,test_X, train_y, test_y = train_test_split(X,y, test_size = .30)

In [88]:
def get_pipeline(features):
    '''
    func:This function build the pipeline for all the input features and for classifier
    input:dict:mean_encoding_dict-dictionary which contains mean encoding for categorical features
    output:Pipeline:full_pipeline_model - fully built pipeline with classifer
    '''
    categorical_transformer = Pipeline(steps=[
    ('hash_encoder', ce.hashing.HashingEncoder(n_components=1024)),
    ('scaler', StandardScaler())])
    
    preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, features)])
    
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(penalty='l2', C = 1e-6, solver = 'liblinear'))])
    return pipe

In [89]:
pipeline = get_pipeline(list(X.columns))

In [90]:
pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('hash_encoder',
                                                                   HashingEncoder(max_process=6,
                                                                                  n_components=1024)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['a', 'b', 'c', 'd', 'e'])])),
                ('classifier',
                 LogisticRegression(C=1e-06, solver='liblinear'))])

In [91]:
pipeline.fit(train_X,train_y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('hash_encoder',
                                                                   HashingEncoder(max_process=6,
                                                                                  n_components=1024)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['a', 'b', 'c', 'd', 'e'])])),
                ('classifier',
                 LogisticRegression(C=1e-06, solver='liblinear'))])

In [92]:
tst = test_X.head(10)

In [93]:
pipeline.predict_proba(tst)[:,1]

array([0.50002663, 0.49996946, 0.49999301, 0.49996946, 0.49999301,
       0.49999301, 0.50001265, 0.49996946, 0.49999301, 0.49996946])

In [94]:
tst = test_X.head(6)

In [95]:
pipeline.predict_proba(tst)[:,1]

array([0.50002663, 0.49996946, 0.49999301, 0.49996946, 0.49999301,
       0.49999301])

In [96]:
tst = test_X.head(5)

In [None]:
#this will never finish completing becauuse number of samples is <= number of columns we have trained
pipeline.predict_proba(tst)[:,1]