In [None]:
# FunctionTransformer -->
# dataset ==> apply a custom operation ----> by wrapping a function

# FunctionTransformer ---> create the object ---> pass a function (pre-defined or custom) as its argument
# logic ---> the function's logic is then applied to the dataset

In [1]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

In [2]:
# Build the transformer first: FunctionTransformer wraps NumPy's log1p
# so it can be used through the transformer interface.
log_transform = FunctionTransformer(np.log1p)

# Small 2x2 sample dataset
x = np.array([[1, 2], [3, 4]])
print(x)

# transform() is called directly here, with no prior fit() call
x_transformed = log_transform.transform(x)

# Show the log1p-transformed values
print(x_transformed)

[[1 2]
 [3 4]]
[[0.69314718 1.09861229]
 [1.38629436 1.60943791]]


In [3]:
# Sample 2x2 input matrix
X = np.array([[1, 2], [3, 4]])
print(X)

# Custom feature-engineering function
def squ(X):
    """Return ``X`` with its element-wise squares appended via ``np.hstack``."""
    squared = X ** 2
    return np.hstack((X, squared))

# Wrap the custom squ function in a FunctionTransformer
custom_transformer = FunctionTransformer(func=squ)

# Run the input through the transformer (no fit() call precedes this)
X_transformed = custom_transformer.transform(X)

# Display the original columns followed by their squares
print(X_transformed)

[[1 2]
 [3 4]]
[[ 1  2  1  4]
 [ 3  4  9 16]]


In [4]:
# Sample 2x2 input matrix
x = np.array([[1, 2], [3, 4]])

# Custom scaling function
def my_scaling(x):
    """Divide ``x`` by ``np.max(x)``."""
    peak = np.max(x)
    return x / peak

# Wrap the custom my_scaling function in a FunctionTransformer
custom_transformer = FunctionTransformer(func=my_scaling)

# Run the input data through the transformer (no fit() call precedes this)
x_transformed = custom_transformer.transform(x)

# Display the scaled values
print(x_transformed)

[[0.25 0.5 ]
 [0.75 1.  ]]


In [5]:
# Pipeline --> a container that holds the steps of a process and runs them sequentially

# ColumnTransformer --> the output of step 1 is NOT the input of step 2
# Pipeline ---> the output of step 1 becomes the input of step 2
# data ---> x, y ---> categorical data[] ---> encoding ---> normal distribution, numerical data[] ---> data

In [6]:
import numpy as np 
import pandas as pd

In [7]:
# Load the toy COVID dataset; the relative path is resolved against the current working directory
covid_csv = "covid_toy - covid_toy.csv"
df = pd.read_csv(covid_csv)

In [8]:
# Preview the first two rows of df
df.head(2)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [14]:
# Features: every column of df except the 'has_covid' target
x = df.drop(columns=['has_covid'])
# Target: the 'has_covid' column
y = df['has_covid']

# Hold out 20% of the rows for testing; random_state=42 is passed for the split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Column groups that receive different preprocessing
# NOTE(review): the previewed data also has a 'cough' column that is in neither
# list — confirm that leaving it out of preprocessing is intended.
numeric_features = ['age', 'fever']
categorical_features = ['gender', 'city']

In [18]:
# create the transformers
# numeric columns: fill missing values with the mean strategy, then apply StandardScaler
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# categorical columns: fill missing values with the most frequent value, then one-hot encode
# (handle_unknown='ignore' is set so categories not seen during fit do not raise at predict time)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# combine the transformers: each one is given its own list of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# create the full pipeline: preprocessing followed by the classifier
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# train the model
clf.fit(x_train, y_train)

# evaluate the model on the held-out test split
y_pred = clf.predict(x_test)

# The original cell stopped at the predictions and never computed a metric;
# score() on the pipeline reports the classifier's mean accuracy on the test split.
accuracy = clf.score(x_test, y_test)
print(f"Test accuracy: {accuracy:.3f}")

In [None]:
fr