- Pipelines chains together multiple steps so that the output of each step is used
as input to the next step.
- Pipelines makes it easy to apply the same preprocessing to train and test!

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder


In [227]:
train=pd.read_csv("C:\\datascience End to End Projects\\End-to-End-Heart-Disease-Application-\\artifacts\\data_ingestion\\train.csv")
test=pd.read_csv("C:\\datascience End to End Projects\\End-to-End-Heart-Disease-Application-\\artifacts\\data_ingestion\\test.csv")

In [228]:
train.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,50,F,ASY,160,0,1,Normal,110,N,0.0,Flat,1
1,34,M,ATA,150,214,0,ST,168,N,0.0,Up,0
2,61,M,ASY,141,292,0,ST,115,Y,1.7,Flat,1
3,57,M,ASY,156,173,0,LVH,119,Y,3.0,Down,1
4,63,M,ASY,185,0,0,Normal,98,Y,0.0,Up,1


In [229]:
test.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,52,M,NAP,128,0,0,ST,180,N,3.0,Up,1
1,43,M,ASY,115,0,0,Normal,145,Y,2.0,Flat,1
2,71,M,ASY,130,221,0,ST,115,Y,0.0,Flat,1
3,37,M,ASY,130,315,0,Normal,158,N,0.0,Up,0
4,53,M,ASY,126,0,0,Normal,106,N,0.0,Flat,1


In [230]:
train['Oldpeak'].unique()

array([ 0. ,  1.7,  3. ,  1. ,  0.5,  2. ,  1.1,  1.6,  1.5,  0.6,  0.3,
        1.9,  0.4,  0.8,  2.5,  1.3,  2.4,  3.4,  1.4,  1.8,  3.5,  1.2,
        2.2,  2.8, -0.8,  0.2,  2.6,  3.1, -0.7, -0.9,  3.6,  0.1,  4. ,
       -0.5,  0.9,  0.7, -1. ,  6.2,  4.2,  2.1,  5. ,  3.8, -2. , -1.1,
        3.7,  5.6,  2.9, -0.1,  3.2, -2.6, -1.5,  2.3])

In [231]:
print(train.shape)
print(test.shape)

(734, 12)
(184, 12)


In [232]:
x_train=train.drop(columns=['RestingBP','RestingECG','HeartDisease'])
y_train=train['HeartDisease']
x_test=test.drop(columns=['RestingBP','RestingECG','HeartDisease'])
y_test=test['HeartDisease']

In [233]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(734, 9)
(734,)
(184, 9)
(184,)


In [234]:
x_train_index = x_train.columns.tolist()  # Get the column names as a list
y_train_index = y_train.name  # Get the name of the target column

print("Index values of x_train:", x_train_index)
print("Index value of y_train:", y_train_index)

Index values of x_train: ['Age', 'Sex', 'ChestPainType', 'Cholesterol', 'FastingBS', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']
Index value of y_train: HeartDisease


In [235]:
trf1 = ColumnTransformer(
    [('ordinal_encode', OrdinalEncoder(), [1, 2, 6, 8])],  # Indices of categorical columns
    remainder='passthrough'
)

In [236]:
train.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,50,F,ASY,160,0,1,Normal,110,N,0.0,Flat,1
1,34,M,ATA,150,214,0,ST,168,N,0.0,Up,0
2,61,M,ASY,141,292,0,ST,115,Y,1.7,Flat,1
3,57,M,ASY,156,173,0,LVH,119,Y,3.0,Down,1
4,63,M,ASY,185,0,0,Normal,98,Y,0.0,Up,1


### sparse=False:

By default, Scikit-learn's LabelEncoder returns a sparse matrix when transforming categorical variables into numerical ones. A sparse matrix is a data structure that only stores non-zero elements, which is memory-efficient when dealing with large datasets with many zero values.
However, sometimes you might prefer to have a dense array instead of a sparse matrix. A dense array stores all elements, regardless of whether they are zero or non-zero. This can be more intuitive to work with and might be necessary for certain operations that don't support sparse matrices.
When you set sparse=False, you're instructing the LabelEncoder to return a dense array instead of a sparse matrix, ensuring that all elements are stored explicitly.
remainder='passthrough':

### ColumnTransformer
allows you to specify how to handle columns that are not explicitly transformed. This is useful when you have a dataset with a mix of columns that require different preprocessing steps.
When remainder='passthrough', it means that any columns not specified in the transformer tuples should be passed through unchanged to the output. In other words, these columns will be included in the transformed output dataset without any modifications.
This is particularly helpful when you want to apply different transformations to different subsets of columns but still keep all columns in the final dataset, maintaining their original values.

In [237]:
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

trf2 = ColumnTransformer([
    ('minmax_scale', minmax_scaler, [7]),
    ('standard_scale', standard_scaler, [0,3,5])],remainder='passthrough')


print(trf2)

ColumnTransformer(remainder='passthrough',
                  transformers=[('minmax_scale', MinMaxScaler(), [7]),
                                ('standard_scale', StandardScaler(),
                                 [0, 3, 5])])


In [238]:
trf3=LogisticRegression(random_state=0,C=10,penalty='l2',max_iter=2000)
print(trf3)

LogisticRegression(C=10, max_iter=2000, random_state=0)


### Create Pipeline

In [239]:
pipe=Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3)
])  # this is the pipeline object which consist of individual pipelines for labelencoding and feature scalling and model 
 # ('trf1'=name, trf1=object of pipeline for label_encoding)

### Pipeline Vs make_pipeline
### Pipeline requires naming of steps or names of pipeline along with object of pipeline, make_pipeline does not require names of the pipeline , it requires only object of the pipelines
(Same applies to ColumnTransformer vs make_column_transformer)

In [240]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(trf1, trf2, trf3)
print(pipe)

Pipeline(steps=[('columntransformer-1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ordinal_encode',
                                                  OrdinalEncoder(),
                                                  [1, 2, 6, 8])])),
                ('columntransformer-2',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('minmax_scale',
                                                  MinMaxScaler(), [7]),
                                                 ('standard_scale',
                                                  StandardScaler(),
                                                  [0, 3, 5])])),
                ('logisticregression',
                 LogisticRegression(C=10, max_iter=2000, random_state=0))])


In [241]:
pipe.fit(x_train,y_train)

In [242]:
y_pred=pipe.predict(x_test)

In [243]:
y_pred

array([0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0], dtype=int64)

In [244]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8695652173913043

In [245]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
x = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

scores = cross_val_score(pipe,x_train, y_train, cv=x, scoring='roc_auc').mean()
print(scores)


0.9111735275226128


In [246]:
print(y_train.head())

0    1
1    0
2    1
3    1
4    1
Name: HeartDisease, dtype: int64


In [247]:
x_train.shape

(734, 9)

In [248]:
y_train.shape

(734,)

In [249]:
import numpy as np
import pickle 
pickle.dump(pipe,open('pipe.pkl','wb'))

In [250]:
pipe=pickle.load(open('pipe.pkl','rb'))

In [251]:
user_data_passed=np.array([52,'M','NAP',0,1,180,'N',0.0,'Up'],dtype=object).reshape(1,9)

In [252]:
pipe.predict(user_data_passed)



array([0], dtype=int64)

In [189]:
train.head(15)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,50,F,ASY,160,0,1,Normal,110,N,0.0,Flat,1
1,34,M,ATA,150,214,0,ST,168,N,0.0,Up,0
2,61,M,ASY,141,292,0,ST,115,Y,1.7,Flat,1
3,57,M,ASY,156,173,0,LVH,119,Y,3.0,Down,1
4,63,M,ASY,185,0,0,Normal,98,Y,0.0,Up,1
5,58,M,NAP,140,211,1,LVH,165,N,0.0,Up,0
6,59,M,NAP,180,213,0,Normal,100,N,0.0,Up,0
7,45,M,ASY,130,219,0,ST,130,Y,1.0,Flat,1
8,57,F,ASY,128,303,0,LVH,159,N,0.0,Up,0
9,55,M,ASY,140,229,0,Normal,110,Y,0.5,Flat,0


In [253]:
A=[6,4,7,10,11]
B=[2,4,6,8,10]
c=[x for x in A if x in B]
print(c)

[6, 4, 10]


In [55]:
data=pd.read_csv("C:\\Users\\mahen\\Downloads\\BankChurners.csv")

In [56]:
data.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [57]:
import pandas as pd

# Original column names
old_names = ['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level',
             'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count',
             'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
             'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct',
             'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

# New column names
new_names = ['Clientnum', 'Attrition', 'Age', 'Gender', 'Dependent_count', 'Education', 'Marital_Status', 'Income',
             'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive', 'Contacts_Count',
             'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
             'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

# Assuming df is your DataFrame containing the data
data.columns = new_names

# Now, df has column names changed according to new_names
print(data.columns)
data.head()

y=data.isnull().sum()
print(y)

columns_to_drop = ['Clientnum', 'Avg_Open_To_Buy']
data.drop(columns=columns_to_drop, inplace=True)

print(data.columns)
data.info()

y=data.isnull().sum()
print(y)



                


Index(['Clientnum', 'Attrition', 'Age', 'Gender', 'Dependent_count',
       'Education', 'Marital_Status', 'Income', 'Card_Category',
       'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive',
       'Contacts_Count', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],
      dtype='object')
Clientnum                   0
Attrition                   0
Age                         0
Gender                      0
Dependent_count             0
Education                   0
Marital_Status              0
Income                      0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive             0
Contacts_Count              0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct           

In [58]:
trf1 = ColumnTransformer(
    [('ordinal_encode', OrdinalEncoder(), [3,5,6,7,8])],  # Indices of categorical columns
    remainder='passthrough'
)



pipe=Pipeline([
            ('trf1',trf1)
        ])

pipe = make_pipeline(trf1) 
print(pipe)


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ordinal_encode',
                                                  OrdinalEncoder(),
                                                  [3, 5, 6, 7, 8])]))])


In [61]:
label_encoder = LabelEncoder()
#data['Attrition']= label_encoder.fit_transform(data['Attrition'])
data = pipe.fit_transform(data)
data = pd.DataFrame(data)
print(data)
data.info()

        0    1     2    3    4    5    6    7     8  9  10 11       12    13  \
0      0.0  1.0  19.0  1.0  3.0  3.0  1.0  2.0  26.0  5  1  3  12691.0   777   
1      0.0  1.0  23.0  0.0  2.0  5.0  2.0  4.0  31.0  6  1  2   8256.0   864   
2      0.0  1.0  25.0  1.0  2.0  3.0  1.0  3.0  23.0  4  1  0   3418.0     0   
3      0.0  1.0  14.0  0.0  3.0  4.0  3.0  4.0  21.0  3  4  1   3313.0  2517   
4      0.0  1.0  14.0  1.0  5.0  3.0  1.0  2.0   8.0  5  1  0   4716.0     0   
...    ...  ...   ...  ...  ...  ...  ...  ...   ... .. .. ..      ...   ...   
10122  0.0  1.0  24.0  1.0  2.0  2.0  2.0  1.0  27.0  3  2  3   4003.0  1851   
10123  0.0  0.0  15.0  1.0  6.0  2.0  0.0  1.0  12.0  4  2  3   4277.0  2186   
10124  0.0  0.0  18.0  0.0  3.0  1.0  1.0  4.0  23.0  5  3  4   5409.0     0   
10125  0.0  0.0   4.0  1.0  2.0  2.0  3.0  1.0  23.0  4  3  3   5281.0     0   
10126  3.0  0.0  17.0  0.0  2.0  2.0  1.0  4.0  12.0  6  2  4  10388.0  1961   

          14     15   16     17     18 

In [51]:
from sklearn.preprocessing import LabelEncoder

# Instantiate the LabelEncoder
label_encoder = LabelEncoder()

# Ensure data is a DataFrame or Series
# Assuming 'data' is a DataFrame
data1= label_encoder.fit_transform(data['Attrition'])

# Assuming 'pipe' is your pipeline object
data_transformed = pipe.fit_transform(data)

# Convert the transformed data back to DataFrame
data_transformed = pd.DataFrame(data_transformed)

print(data_transformed)
print(data_transformed.info())

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [38]:
old_names = ['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level',
                    'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count',
                    'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
                    'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct',
                    'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

        # New column names
new_names = ['Clientnum', 'Attrition', 'Age', 'Gender', 'Dependent_count', 'Education', 'Marital_Status', 'Income',
                    'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive', 'Contacts_Count',
                    'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
                    'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

        # Assuming df is your DataFrame containing the data
data.columns = new_names
data = pd.DataFrame(data)

columns_to_drop = ['Clientnum', 'Avg_Open_To_Buy']
data.drop(columns=columns_to_drop, inplace=True)

print(data.columns)
data.head()

y=data.isnull().sum()
print(y)
data.info()

logger.info("done with renaming the columns ")
train,test = train_test_split(data,test_size=0.33)

x_train=train.drop(columns=['Attrition'])
y_train=train['Attrition']
x_test=test.drop(columns=['Attrition'])
y_test=test['Attrition']


ValueError: Length mismatch: Expected axis has 19 elements, new values have 21 elements

In [11]:
print(train.shape)
print(test.shape)

(6785, 21)
(3342, 21)


In [30]:
old_names = ['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level',
                    'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count',
                    'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
                    'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct',
                    'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

        # New column names
new_names = ['Clientnum', 'Attrition', 'Age', 'Gender', 'Dependent_count', 'Education', 'Marital_Status', 'Income',
                    'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive', 'Contacts_Count',
                    'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
                    'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

        # Assuming df is your DataFrame containing the data
self.data.columns = new_names
self.data = pd.DataFrame(self.data)
print(self.data.columns)
self.data.head()

y=self.data.isnull().sum()
print(y)

NameError: name 'self' is not defined

In [37]:
import pandas as pd

In [46]:
data=pd.read_csv("C:\\Users\\mahen\\Downloads\\BankChurners.csv")

In [47]:
data.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CLIENTNUM                 10127 non-null  int64  
 1   Attrition_Flag            10127 non-null  object 
 2   Customer_Age              10127 non-null  int64  
 3   Gender                    10127 non-null  object 
 4   Dependent_count           10127 non-null  int64  
 5   Education_Level           10127 non-null  object 
 6   Marital_Status            10127 non-null  object 
 7   Income_Category           10127 non-null  object 
 8   Card_Category             10127 non-null  object 
 9   Months_on_book            10127 non-null  int64  
 10  Total_Relationship_Count  10127 non-null  int64  
 11  Months_Inactive_12_mon    10127 non-null  int64  
 12  Contacts_Count_12_mon     10127 non-null  int64  
 13  Credit_Limit              10127 non-null  float64
 14  Total_

In [49]:
# New column names
new_names = ['Clientnum', 'Attrition', 'Age', 'Gender', 'Dependent_count', 'Education', 'Marital_Status', 'Income',
                    'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive', 'Contacts_Count',
                    'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
                    'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

data.columns = new_names

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Clientnum                 10127 non-null  int64  
 1   Attrition                 10127 non-null  object 
 2   Age                       10127 non-null  int64  
 3   Gender                    10127 non-null  object 
 4   Dependent_count           10127 non-null  int64  
 5   Education                 10127 non-null  object 
 6   Marital_Status            10127 non-null  object 
 7   Income                    10127 non-null  object 
 8   Card_Category             10127 non-null  object 
 9   Months_on_book            10127 non-null  int64  
 10  Total_Relationship_Count  10127 non-null  int64  
 11  Months_Inactive           10127 non-null  int64  
 12  Contacts_Count            10127 non-null  int64  
 13  Credit_Limit              10127 non-null  float64
 14  Total_

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# Sample DataFrame (replace with your actual data)
data = pd.DataFrame({
    'Clientnum': [1, 2, 3],
    'Attrition': ['Yes', 'No', 'Yes'],
    'Age': [35, 42, 28],
    'Gender': ['Male', 'Female', 'Male'],
    # ... other columns ...
})

# Define the columns to be encoded (excluding 'Attrition')
categorical_columns = ['Gender', 'Education', 'Marital_Status', 'Income', 'Card_Category']

# Initialize the OrdinalEncoder
encoder = OrdinalEncoder()

# Fit the encoder on the categorical columns
encoder.fit(data[categorical_columns])

# Create a transformer that applies ordinal encoding to categorical columns
ordinal_transformer = ColumnTransformer(
    transformers=[('ordinal', encoder, categorical_columns)],
    remainder='passthrough'  # Pass through numerical columns unchanged
)

# Transform the data
transformed_data = ordinal_transformer.fit_transform(data)

# Now 'transformed_data' contains the encoded features
# You can save the 'ordinal_transformer' object for later use

# Example: Access the transformed DataFrame
transformed_df = pd.DataFrame(transformed_data, columns=categorical_columns + data.columns[len(categorical_columns):])
print(transformed_df.head())


In [8]:
# Assuming df is your DataFrame containing the data
categorical_columns = ['Attrition', 'Gender', 'Education', 'Marital_Status', 'Income', 'Card_Category']
#df=pd.read_csv("artifacts\\data_ingestion\\BankChurners.csv")
x=data.describe()
print(x)
# Print unique categories for each categorical column
for column in categorical_columns:
    unique_categories = data[column].unique()
    print(f"Unique categories in column '{column}':")
    print(unique_categories)
    print()


          Clientnum           Age  Dependent_count  Months_on_book  \
count  1.012700e+04  10127.000000     10127.000000    10127.000000   
mean   7.391776e+08     46.325960         2.346203       35.928409   
std    3.690378e+07      8.016814         1.298908        7.986416   
min    7.080821e+08     26.000000         0.000000       13.000000   
25%    7.130368e+08     41.000000         1.000000       31.000000   
50%    7.179264e+08     46.000000         2.000000       36.000000   
75%    7.731435e+08     52.000000         3.000000       40.000000   
max    8.283431e+08     73.000000         5.000000       56.000000   

       Total_Relationship_Count  Months_Inactive  Contacts_Count  \
count              10127.000000     10127.000000    10127.000000   
mean                   3.812580         2.341167        2.455317   
std                    1.554408         1.010622        1.106225   
min                    1.000000         0.000000        0.000000   
25%                    3.0000

In [None]:
Unique categories in column 'Gender':
['M' 'F']

Unique categories in column 'Education_Level':
['High School' 'Graduate' 'Uneducated' 'Unknown' 'College' 'Post-Graduate'
 'Doctorate']

Unique categories in column 'Marital_Status':
['Married' 'Single' 'Unknown' 'Divorced']

Unique categories in column 'Income_Category':
['$60K - $80K' 'Less than $40K' '$80K - $120K' '$40K - $60K' '$120K +'
 'Unknown']

Unique categories in column 'Card_Category':
['Blue' 'Gold' 'Silver' 'Platinum']


In [None]:
this is code for getting to know the unique categories for the object datatype
# Assuming df is your DataFrame containing the data
categorical_columns = ['Attrition', 'Gender', 'Education', 'Marital_Status', 'Income', 'Card_Category']
#df=pd.read_csv("artifacts\\data_ingestion\\BankChurners.csv")
x=data.describe()
print(x)
# Print unique categories for each categorical column
for column in categorical_columns:
    unique_categories = data[column].unique()
    print(f"Unique categories in column '{column}':")
    print(unique_categories)
    print()


output 
Unique categories in column 'Attrition':
['Existing Customer' 'Attrited Customer']

Unique categories in column 'Gender':
['M' 'F']

Unique categories in column 'Education':
['High School' 'Graduate' 'Uneducated' 'Unknown' 'College' 'Post-Graduate'
 'Doctorate']

Unique categories in column 'Marital_Status':
['Married' 'Single' 'Unknown' 'Divorced']

Unique categories in column 'Income':
['$60K - $80K' 'Less than $40K' '$80K - $120K' '$40K - $60K' '$120K +'
 'Unknown']

Unique categories in column 'Card_Category':
['Blue' 'Gold' 'Silver' 'Platinum']



In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import numpy as np

# Assuming 'data' is your DataFrame
categorical_columns = ['Gender', 'Education', 'Marital_Status', 'Income', 'Card_Category']
numerical_columns = data.select_dtypes(include=[np.number]).columns.tolist()

# Define the ordinal encoding mapping
ordinal_mapping = [{'col': 'Gender', 'mapping': {'M': 0, 'F': 1}},
                   {'col': 'Education', 'mapping': {'Unknown': 0, 'Uneducated': 1, 'High School': 2, 'College': 3, 'Graduate': 4, 'Post-Graduate': 5, 'Doctorate': 6}},
                   {'col': 'Marital_Status', 'mapping': {'Unknown': 0, 'Single': 1, 'Married': 2, 'Divorced': 3}},
                   {'col': 'Income', 'mapping': {'Unknown': 0, 'Less than $40K': 1, '$40K - $60K': 2, '$60K - $80K': 3, '$80K - $120K': 4, '$120K +': 5}},
                   {'col': 'Card_Category', 'mapping': {'Blue': 0, 'Silver': 1, 'Gold': 2, 'Platinum': 3}}]
# Create the column transformer
preprocessor = make_column_transformer(
    (OrdinalEncoder(mapping=ordinal_mapping), categorical_columns),
    remainder='passthrough'  # Numerical columns are passed through
)

# Apply the transformations to the features
X = data.drop('Attrition', axis=1)
X_transformed = preprocessor.fit_transform(X)

# Apply label encoding to the target column
le = LabelEncoder()
y = le.fit_transform(data['Attrition'])

# Now, 'X_transformed' is your preprocessed feature matrix and 'y' is your target vector


In [36]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import joblib

# Assuming 'data' is your DataFrame
categorical_columns = ['Attrition', 'Gender', 'Education', 'Marital_Status', 'Income', 'Card_Category']
numerical_columns = data.select_dtypes(include=[np.number]).columns.tolist()

# Create the column transformer
preprocessor = make_column_transformer(
    (OrdinalEncoder(), categorical_columns),
    remainder='passthrough'  # Numerical columns are passed through
)

# Apply the transformations to the features
X = data.drop('Attrition', axis=1)
X_transformed = preprocessor.fit_transform(X)

# Save the preprocessor object for later use on unseen data
joblib.dump(preprocessor, 'preprocessor.joblib')

# Now, 'X_transformed' is your preprocessed feature matrix


ValueError: A given column is not a column of the dataframe

In [86]:
data=pd.read_csv("C:\\Users\\mahen\\Downloads\\BankChurners.csv")
# New column names
new_names = ['Clientnum', 'Attrition', 'Age', 'Gender', 'Dependent_count', 'Education', 'Marital_Status', 'Income',
                    'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive', 'Contacts_Count',
                    'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
                    'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

data.columns = new_names

data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Clientnum                 10127 non-null  int64  
 1   Attrition                 10127 non-null  object 
 2   Age                       10127 non-null  int64  
 3   Gender                    10127 non-null  object 
 4   Dependent_count           10127 non-null  int64  
 5   Education                 10127 non-null  object 
 6   Marital_Status            10127 non-null  object 
 7   Income                    10127 non-null  object 
 8   Card_Category             10127 non-null  object 
 9   Months_on_book            10127 non-null  int64  
 10  Total_Relationship_Count  10127 non-null  int64  
 11  Months_Inactive           10127 non-null  int64  
 12  Contacts_Count            10127 non-null  int64  
 13  Credit_Limit              10127 non-null  float64
 14  Total_

Unnamed: 0,Clientnum,Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
count,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0
mean,739177600.0,46.32596,2.346203,35.928409,3.81258,2.341167,2.455317,8631.953698,1162.814061,7469.139637,0.759941,4404.086304,64.858695,0.712222,0.274894
std,36903780.0,8.016814,1.298908,7.986416,1.554408,1.010622,1.106225,9088.77665,814.987335,9090.685324,0.219207,3397.129254,23.47257,0.238086,0.275691
min,708082100.0,26.0,0.0,13.0,1.0,0.0,0.0,1438.3,0.0,3.0,0.0,510.0,10.0,0.0,0.0
25%,713036800.0,41.0,1.0,31.0,3.0,2.0,2.0,2555.0,359.0,1324.5,0.631,2155.5,45.0,0.582,0.023
50%,717926400.0,46.0,2.0,36.0,4.0,2.0,2.0,4549.0,1276.0,3474.0,0.736,3899.0,67.0,0.702,0.176
75%,773143500.0,52.0,3.0,40.0,5.0,3.0,3.0,11067.5,1784.0,9859.0,0.859,4741.0,81.0,0.818,0.503
max,828343100.0,73.0,5.0,56.0,6.0,6.0,6.0,34516.0,2517.0,34516.0,3.397,18484.0,139.0,3.714,0.999


In [None]:
x_train=train.drop(columns=['RestingBP','RestingECG','HeartDisease'])
y_train=train['HeartDisease']
x_test=test.drop(columns=['RestingBP','RestingECG','HeartDisease'])
y_test=test['HeartDisease']
pipe.fit(x_train,y_train)

x_train, y_test, x_train, y_test = train_test_split(X, Y, test_size=0.33

In [84]:
train,test=train_test_split(data,test_size=0.33)

train.shape
test.shape



le = LabelEncoder()
train['Attrition'] = le.fit_transform(train['Attrition'])

# Assuming 'data' is your DataFrame
categorical_columns = ['Gender', 'Education', 'Marital_Status', 'Income', 'Card_Category']
numerical_columns = train.select_dtypes(include=[np.number]).columns.tolist()


# Create the column transformer
preprocessor = make_column_transformer(
    (OrdinalEncoder(), categorical_columns),
    remainder='passthrough'  # Numerical columns are passed through
)


# Apply the transformations to the features
X_transformed = preprocessor.fit_transform(train)



le = LabelEncoder()
test['Attrition'] = le.fit_transform(test['Attrition'])

# Assuming 'data' is your DataFrame
categorical_columns = ['Gender', 'Education', 'Marital_Status', 'Income', 'Card_Category']
numerical_columns = test.select_dtypes(include=[np.number]).columns.tolist()


Y_transformed=preprocessor.transform(test)

# Save the preprocessor object for later use on unseen data
joblib.dump(preprocessor, 'preprocessor.joblib')



# Convert the transformed data back into a DataFrame
X_transformed_df = pd.DataFrame(X_transformed, columns=categorical_columns+numerical_columns)

# Convert numerical columns back to their original data types
for col in numerical_columns:
    X_transformed_df[col] = X_transformed_df[col].astype(train[col].dtype)

train[categorical_columns+numerical_columns] = X_transformed_df


# Replace the original columns in 'data' with the transformed columns
train[categorical_columns+numerical_columns] = X_transformed_df



train.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6785 entries, 5250 to 9135
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Clientnum                 4561 non-null   float64
 1   Attrition                 4561 non-null   float64
 2   Age                       4561 non-null   float64
 3   Gender                    4561 non-null   float64
 4   Dependent_count           4561 non-null   float64
 5   Education                 4561 non-null   float64
 6   Marital_Status            4561 non-null   float64
 7   Income                    4561 non-null   float64
 8   Card_Category             4561 non-null   float64
 9   Months_on_book            4561 non-null   float64
 10  Total_Relationship_Count  4561 non-null   float64
 11  Months_Inactive           4561 non-null   float64
 12  Contacts_Count            4561 non-null   float64
 13  Credit_Limit              4561 non-null   float64
 14  Total_Revo

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import numpy as np
import joblib




le = LabelEncoder()
data['Attrition'] = le.fit_transform(data['Attrition'])

# Assuming 'data' is your DataFrame
categorical_columns = ['Gender', 'Education', 'Marital_Status', 'Income', 'Card_Category']
numerical_columns = data.select_dtypes(include=[np.number]).columns.tolist()

# Apply label encoding to the target column


#X = data.drop('Attrition', axis=1)



# Create the column transformer
preprocessor = make_column_transformer(
    (OrdinalEncoder(), categorical_columns),
    remainder='passthrough'  # Numerical columns are passed through
)

print(data.info())

# Apply the transformations to the features
X_transformed = preprocessor.fit_transform(data)

# Save the preprocessor object for later use on unseen data
joblib.dump(preprocessor, 'preprocessor.joblib')

# Now, 'X_transformed' is your preprocessed feature matrix

# Apply the transformations to the features
#X_transformed = preprocessor.fit_transform(data)

# Convert the transformed data back into a DataFrame
X_transformed_df = pd.DataFrame(X_transformed, columns=categorical_columns+numerical_columns)

# Convert numerical columns back to their original data types
for col in numerical_columns:
    X_transformed_df[col] = X_transformed_df[col].astype(data[col].dtype)

data[categorical_columns+numerical_columns] = X_transformed_df


# Replace the original columns in 'data' with the transformed columns
data[categorical_columns+numerical_columns] = X_transformed_df

data.info()

# Now, 'data' is your preprocessed DataFrame




In [67]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import numpy as np
import joblib

le = LabelEncoder()
data['Attrition'] = le.fit_transform(data['Attrition'])

# Assuming 'data' is your DataFrame
categorical_columns = ['Gender', 'Education', 'Marital_Status', 'Income', 'Card_Category']
numerical_columns = data.select_dtypes(include=[np.number]).columns.tolist()

# Apply label encoding to the target column


#X = data.drop('Attrition', axis=1)



# Create the column transformer
preprocessor = make_column_transformer(
    (OrdinalEncoder(), categorical_columns),
    remainder='passthrough'  # Numerical columns are passed through
)

print(data.info())

# Apply the transformations to the features
X_transformed = preprocessor.fit_transform(data)

# Save the preprocessor object for later use on unseen data
joblib.dump(preprocessor, 'preprocessor.joblib')

# Now, 'X_transformed' is your preprocessed feature matrix

# Apply the transformations to the features
#X_transformed = preprocessor.fit_transform(data)

# Convert the transformed data back into a DataFrame
X_transformed_df = pd.DataFrame(X_transformed, columns=categorical_columns+numerical_columns)

# Convert numerical columns back to their original data types
for col in numerical_columns:
    X_transformed_df[col] = X_transformed_df[col].astype(data[col].dtype)

data[categorical_columns+numerical_columns] = X_transformed_df


# Replace the original columns in 'data' with the transformed columns
data[categorical_columns+numerical_columns] = X_transformed_df

data.info()

# Now, 'data' is your preprocessed DataFrame




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Clientnum                 10127 non-null  int64  
 1   Attrition                 10127 non-null  int32  
 2   Age                       10127 non-null  int64  
 3   Gender                    10127 non-null  object 
 4   Dependent_count           10127 non-null  int64  
 5   Education                 10127 non-null  object 
 6   Marital_Status            10127 non-null  object 
 7   Income                    10127 non-null  object 
 8   Card_Category             10127 non-null  object 
 9   Months_on_book            10127 non-null  int64  
 10  Total_Relationship_Count  10127 non-null  int64  
 11  Months_Inactive           10127 non-null  int64  
 12  Contacts_Count            10127 non-null  int64  
 13  Credit_Limit              10127 non-null  float64
 14  Total_

In [68]:
print(preprocessor)

ColumnTransformer(remainder='passthrough',
                  transformers=[('ordinalencoder', OrdinalEncoder(),
                                 ['Gender', 'Education', 'Marital_Status',
                                  'Income', 'Card_Category'])])


In [69]:
data.head()

Unnamed: 0,Clientnum,Attrition,Age,Gender,Dependent_count,Education,Marital_Status,Income,Card_Category,Months_on_book,...,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,1,45,1.0,3,3.0,1.0,2.0,0.0,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,1,49,0.0,5,2.0,2.0,4.0,0.0,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,1,51,1.0,3,2.0,1.0,3.0,0.0,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,1,40,0.0,4,3.0,3.0,4.0,0.0,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,1,40,1.0,3,5.0,1.0,2.0,0.0,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [70]:
# Import pandas
import pandas as pd

# Create a new DataFrame with the same structure as 'data'
new_data = pd.DataFrame({
    'Gender': ['M'],  # Replace with your actual values
    'Education': ['Graduate'],  # Replace with your actual values
    'Marital_Status': ['Married'],  # Replace with your actual values
    'Income': ['$40K - $60K'],  # Replace with your actual values
    'Card_Category': ['Silver']  # Replace with your actual values
})

# Ensure the numerical columns in 'new_data' are the same as in 'data'
for col in numerical_columns:
    new_data[col] = [0]  # Replace 0 with your actual values

# Load the preprocessor
preprocessor = joblib.load('preprocessor.joblib')

# Transform the new data using the preprocessor
new_X_transformed = preprocessor.transform(new_data)

# Now, 'new_X_transformed' is your transformed new data


In [71]:
print(new_X_transformed)

[[1. 2. 1. 1. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [72]:
# Get the ordinal encoder from the preprocessor
ordinal_encoder = preprocessor.named_transformers_['ordinalencoder']

# Get the categories
categories = ordinal_encoder.categories_

# Create a dictionary mapping column names to their categories
column_categories = dict(zip(categorical_columns, categories))

# Print the categories for each column
for column, categories in column_categories.items():
    print(f"{column}: {dict(enumerate(categories))}")


Gender: {0: 'F', 1: 'M'}
Education: {0: 'College', 1: 'Doctorate', 2: 'Graduate', 3: 'High School', 4: 'Post-Graduate', 5: 'Uneducated', 6: 'Unknown'}
Marital_Status: {0: 'Divorced', 1: 'Married', 2: 'Single', 3: 'Unknown'}
Income: {0: '$120K +', 1: '$40K - $60K', 2: '$60K - $80K', 3: '$80K - $120K', 4: 'Less than $40K', 5: 'Unknown'}
Card_Category: {0: 'Blue', 1: 'Gold', 2: 'Platinum', 3: 'Silver'}


In [25]:
import category_encoders as ce
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder
import numpy as np
import joblib
import pandas as pd

# Assuming 'data' is your DataFrame
categorical_columns = ['Gender', 'Education', 'Marital_Status', 'Income', 'Card_Category']
numerical_columns = data.select_dtypes(include=[np.number]).columns.tolist()

# Define the ordinal encoding mapping
ordinal_mapping = [{'col': 'Gender', 'mapping': {'M': 0, 'F': 1}},
                   {'col': 'Education', 'mapping': {'Unknown': 0, 'Uneducated': 1, 'High School': 2, 'College': 3, 'Graduate': 4, 'Post-Graduate': 5, 'Doctorate': 6}},
                   {'col': 'Marital_Status', 'mapping': {'Unknown': 0, 'Single': 1, 'Married': 2, 'Divorced': 3}},
                   {'col': 'Income', 'mapping': {'Unknown': 0, 'Less than $40K': 1, '$40K - $60K': 2, '$60K - $80K': 3, '$80K - $120K': 4, '$120K +': 5}},
                   {'col': 'Card_Category', 'mapping': {'Blue': 0, 'Silver': 1, 'Gold': 2, 'Platinum': 3}}]

# Apply label encoding to the target column
le = LabelEncoder()
data['Attrition'] = le.fit_transform(data['Attrition'])

# Create the column transformer
preprocessor = make_column_transformer(
    (ce.OrdinalEncoder(mapping=ordinal_mapping), categorical_columns),
    remainder='passthrough'  # Numerical columns are passed through
)

# Apply the transformations to the features
X_transformed = preprocessor.fit_transform(data)

# Save the preprocessor object for later use on unseen data
joblib.dump(preprocessor, 'preprocessor.joblib')

# Convert the transformed data back into a DataFrame
X_transformed_df = pd.DataFrame(X_transformed, columns=categorical_columns+numerical_columns)

# Replace the original columns in 'data' with the transformed columns
data[categorical_columns+numerical_columns] = X_transformed_df

# Now, 'data' is your preprocessed DataFrame


ModuleNotFoundError: No module named 'category_encoders'

In [None]:
rather than including this code do write the code in such a way that 1st perform the label encoding for the target feature and then after include the target feature processed data inside the orginal dataset and then perform the ordinal encoding on the rest of the object data types and make sure pass all the numerical columns as pass through ok now save that pipeline in joblib object

In [None]:
now create a column transformer in which include the ordinal encoding for the object datatypes based on there index values i want to save the preprocessor object of features which you will perform ordinal encoding so it will useful for me to load the this preprocessor object and perform transform on unseen data and makesure create columntransformer like which should able to perform ordinal encoding to object data type features and rest of the numerical features needs to be passed so give me the code

this is code for getting to know the unique categories for the object datatype
# Assuming df is your DataFrame containing the data
categorical_columns = ['Attrition', 'Gender', 'Education', 'Marital_Status', 'Income', 'Card_Category']
#df=pd.read_csv("artifacts\\data_ingestion\\BankChurners.csv")
x=data.describe()
print(x)
# Print unique categories for each categorical column
for column in categorical_columns:
    unique_categories = data[column].unique()
    print(f"Unique categories in column '{column}':")
    print(unique_categories)
    print()


output 
Unique categories in column 'Attrition':
['Existing Customer' 'Attrited Customer']

Unique categories in column 'Gender':
['M' 'F']

Unique categories in column 'Education':
['Unknown' 'Uneducated' 'High School' 'College' 'Graduate' 'Post-Graduate'
 'Doctorate']

Unique categories in column 'Marital_Status':
['Unknown' 'Single' 'Married' 'Divorced']

Unique categories in column 'Income':
['Unknown' 'Less than $40K' '$40K - $60K' '$60K - $80K' '$80K - $120K' '$120K +']

Unique categories in column 'Card_Category':
['Blue' 'Silver' 'Gold' 'Platinum']