# Pipeline with Feature Union

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the raw CSV, relative to the notebook's working directory.
file = r'../dataset/Existing Base.csv'

# Load the whole file into a DataFrame.
bd = pd.read_csv(file)

# Preview the first five rows.
bd.head(n=5)

Unnamed: 0,REF_NO,children,age_band,status,occupation,occupation_partner,home_status,family_income,self_employed,self_employed_partner,...,Investment Tax Saving Bond,Home Loan,Online Purchase Amount,Revenue Grid,gender,region,Investment in Commudity,Investment in Equity,Investment in Derivative,Portfolio Balance
0,1,Zero,51-55,Partner,Manual Worker,Secretarial/Admin,Own Home,"<17,500, >=15,000",No,No,...,19.99,0.0,0.0,1,Female,Wales,74.67,18.66,32.32,89.43
1,2,Zero,55-60,Single/Never Married,Retired,Retired,Own Home,"<27,500, >=25,000",No,No,...,0.0,0.0,0.0,2,Female,North West,20.19,0.0,4.33,22.78
2,3,Zero,26-30,Single/Never Married,Professional,Other,Own Home,"<30,000, >=27,500",Yes,No,...,0.0,3.49,0.0,2,Male,North,98.06,31.07,80.96,171.78
3,5,Zero,18-21,Single/Never Married,Professional,Manual Worker,Own Home,"<15,000, >=12,500",No,No,...,0.0,0.0,0.0,2,Female,West Midlands,4.1,14.15,17.57,-41.7
4,6,Zero,45-50,Partner,Business Manager,Unknown,Own Home,"<30,000, >=27,500",No,No,...,0.0,45.91,25.98,2,Female,Scotland,70.16,55.86,80.44,235.02


In [3]:
# Number of distinct values in each column.
# In the recorded output post_code and post_area have very many distinct values
# (10040 and 2039); the categorical pipeline further down excludes both.
bd.nunique()

REF_NO                             10155
children                               5
age_band                              13
status                                 5
occupation                             9
occupation_partner                     9
home_status                            5
family_income                         13
self_employed                          2
self_employed_partner                  2
year_last_moved                       95
TVarea                                14
post_code                          10040
post_area                           2039
Average Credit Card Transaction     1411
Balance Transfer                    2183
Term Deposit                        1419
Life Insurance                      3111
Medical Insurance                   1589
Average A/C Balance                 2223
Personal Loan                       1760
Investment in Mutual Fund           2470
Investment Tax Saving Bond           832
Home Loan                            884
Online Purchase 

In [4]:
# Class counts of 'Revenue Grid', the column used as the target later on.
# The recorded output shows an imbalance: 9069 rows of class 2 vs 1086 of class 1.
bd['Revenue Grid'].value_counts()

Revenue Grid
2    9069
1    1086
Name: count, dtype: int64

In [5]:
# Column dtypes; the selectors used in the pipeline pick columns by dtype
# ('object' for categoricals, 'int64'/'float64' for numerics).
bd.dtypes

REF_NO                               int64
children                            object
age_band                            object
status                              object
occupation                          object
occupation_partner                  object
home_status                         object
family_income                       object
self_employed                       object
self_employed_partner               object
year_last_moved                      int64
TVarea                              object
post_code                           object
post_area                           object
Average Credit Card Transaction    float64
Balance Transfer                   float64
Term Deposit                       float64
Life Insurance                     float64
Medical Insurance                  float64
Average A/C Balance                float64
Personal Loan                      float64
Investment in Mutual Fund          float64
Investment Tax Saving Bond         float64
Home Loan  

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
class VarTypeSelector(BaseEstimator, TransformerMixin):
    """Select DataFrame columns by dtype, minus an explicit ignore list.

    Parameters
    ----------
    vartype : str, type or list of these
        Passed to ``DataFrame.select_dtypes(include=...)``.
    ignore_var : label or list of labels, optional
        Columns to drop after the dtype selection. Labels that are not present
        in the selected frame are skipped instead of raising, so the same
        ignore list works even when a column is missing from the input (for
        example an identifier omitted from a scoring request) or has a
        different dtype. ``None`` (the default) drops nothing.
    """

    def __init__(self, vartype, ignore_var=None):
        # Store the parameters untouched so get_params()/clone() keep working.
        self.vartype = vartype
        self.ignore_var = ignore_var

    def fit(self, x, y=None):
        """Nothing is learned from the data; return ``self`` for pipeline use."""
        return self

    def transform(self, X):
        """Return a new DataFrame holding only the selected columns of ``X``."""
        selected = X.select_dtypes(include=self.vartype)
        if self.ignore_var is None:
            return selected
        # errors='ignore': an ignored column may legitimately be absent here.
        return selected.drop(columns=self.ignore_var, errors='ignore')

In [8]:
class get_dummies_PipeLineFriendly(BaseEstimator, TransformerMixin):
    """One-hot encode every column of a DataFrame with categories learned in ``fit``.

    For each column, ``fit`` keeps the categories whose count is strictly
    greater than ``freq_cutoff`` and then leaves out the last of them in
    ``value_counts()`` order (the least frequent one, since counts are sorted
    in descending order). Categories without a dummy -- the one left out, the
    infrequent ones, values unseen during ``fit`` and missing values -- are all
    encoded as all-zeros for that column.

    Parameters
    ----------
    freq_cutoff : int, default 0
        A category needs more than this many occurrences to get a dummy column.

    Attributes
    ----------
    var_cat_dict : dict
        Maps each fitted column name to the index of categories that receive a
        dummy column. Rebuilt from scratch on every call to ``fit``.
    """

    def __init__(self, freq_cutoff=0):
        self.freq_cutoff = freq_cutoff
        # Kept here for backward compatibility: transform() before fit()
        # returns an unchanged copy rather than raising AttributeError.
        self.var_cat_dict = {}

    def fit(self, x, y=None):
        """Learn, per column of ``x``, which categories get a dummy column."""
        # Build a fresh mapping so that refitting on a frame with different
        # columns does not leave stale entries that would break transform().
        learned = {}
        for col in x.columns:
            k = x[col].value_counts()
            learned[col] = k.index[k > self.freq_cutoff][:-1]
        self.var_cat_dict = learned
        return self

    def transform(self, x, y=None):
        """Return a copy of ``x`` with each fitted column replaced by 0/1 dummies.

        Dummy columns are named ``<column>_<category>`` and appended in the
        order the categories were learned; the source column is removed.
        """
        dummy_data = x.copy()
        for col, cats in self.var_cat_dict.items():
            for cat in cats:
                # f-string formatting also handles non-string categories or
                # column names, where '+' concatenation would raise TypeError.
                name = f"{col}_{cat}"
                dummy_data[name] = (dummy_data[col] == cat).astype(int)
            del dummy_data[col]
        return dummy_data

In [9]:
from sklearn.pipeline import Pipeline,FeatureUnion

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the rows as a test split; random_state fixes the shuffle.
# NOTE(review): the target is imbalanced (see the value_counts output above);
# consider passing stratify=bd['Revenue Grid'] to keep class proportions equal in both splits.
bd_train,bd_test=train_test_split(bd,test_size=0.2,random_state=2)

In [11]:
# Target vectors: the 'Revenue Grid' column of each split.
y_train = bd_train['Revenue Grid']
y_test = bd_test['Revenue Grid']

# Feature frames: every remaining column.
x_train = bd_train.drop(columns='Revenue Grid')
x_test = bd_test.drop(columns='Revenue Grid')

In [12]:
# Categorical branch: keep the object-dtype columns except the two postcode
# fields, then turn them into 0/1 dummy columns using a frequency cutoff of 100.
cat_pipe = Pipeline(steps=[
    ('cat_var', VarTypeSelector(vartype=['object'],
                                ignore_var=['post_code', 'post_area'])),
    ('dummies', get_dummies_PipeLineFriendly(freq_cutoff=100)),
])

In [13]:
# Display the (still unfitted) categorical sub-pipeline.
cat_pipe

In [14]:
# Full model: FeatureUnion runs the categorical branch and a numeric selector
# (int64/float64 columns without REF_NO) on the same input and places their
# outputs side by side; the combined matrix feeds a logistic regression.
# NOTE(review): the recorded fit output reports that the solver reached its
# iteration limit. The numeric columns are passed in unscaled, so consider
# scaling them or raising max_iter, as that message suggests.
pipe2 = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        ('cat_pipe', cat_pipe),
        ('num_var', VarTypeSelector(vartype=['int64', 'float64'],
                                    ignore_var=['REF_NO'])),
    ])),
    ('clf', LogisticRegression()),
])
pipe2

In [15]:
# Fit the transformers and the classifier on the training split.
# NOTE(review): the recorded output shows the solver stopping at its iteration
# limit, so the fitted coefficients may not be fully converged.
pipe2.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# Predicted class probabilities for every test row, one column per class.
# Columns follow the classifier's classes_ order (presumably [1, 2] for this target; confirm via pipe2.classes_).
pipe2.predict_proba(x_test)

array([[0.00344215, 0.99655785],
       [0.006387  , 0.993613  ],
       [0.00527296, 0.99472704],
       ...,
       [0.40345559, 0.59654441],
       [0.09556321, 0.90443679],
       [0.01493396, 0.98506604]])

## Save Python objects for later use

In [17]:
# from sklearn.joblib import joblib
import joblib

In [18]:
# Persist the fitted pipeline to a file in the current working directory.
# NOTE(review): the pipeline holds VarTypeSelector and get_dummies_PipeLineFriendly,
# which are defined in this notebook; loading the file from another process will
# presumably require those classes to be importable under the same module path (confirm before deploying).
joblib.dump(pipe2,'my_model_pipeline27Sep24.pkl')

['my_model_pipeline27Sep24.pkl']

In [19]:
# Reload the pipeline from disk to check that it survives the round trip.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only load files from a trusted source.
model = joblib.load("my_model_pipeline27Sep24.pkl")

In [20]:
# Display the reloaded pipeline.
model

In [21]:
# Show the test features; the recorded output is a truncated rendering of the frame.
x_test

Unnamed: 0,REF_NO,children,age_band,status,occupation,occupation_partner,home_status,family_income,self_employed,self_employed_partner,...,Investment in Mutual Fund,Investment Tax Saving Bond,Home Loan,Online Purchase Amount,gender,region,Investment in Commudity,Investment in Equity,Investment in Derivative,Portfolio Balance
184,204,1,41-45,Partner,Secretarial/Admin,Business Manager,Own Home,"<22,500, >=20,000",No,No,...,120.41,2.00,0.00,0.00,Female,West Midlands,48.57,37.13,53.37,159.22
1546,1762,3,45-50,Partner,Secretarial/Admin,Unknown,Own Home,">=35,000",No,Yes,...,47.98,2.00,19.95,0.00,Female,Scotland,102.65,31.31,76.30,197.45
9912,11242,2,36-40,Partner,Other,Business Manager,Own Home,"<20,000, >=17,500",No,No,...,0.00,0.00,1.24,0.00,Female,North West,0.01,15.53,15.33,-8.54
5078,5771,Zero,51-55,Divorced/Separated,Business Manager,Housewife,Own Home,"<30,000, >=27,500",No,No,...,28.96,0.00,0.00,0.00,Male,South East,3.60,9.83,12.82,-0.46
7138,8107,3,31-35,Partner,Professional,Professional,Own Home,">=35,000",No,No,...,69.93,0.00,8.98,0.00,Female,Scotland,0.00,17.56,16.07,31.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2766,3166,Zero,51-55,Partner,Professional,Professional,Own Home,">=35,000",Yes,No,...,118.82,2.48,0.00,78.86,Female,South East,47.65,65.33,83.40,181.01
8086,9193,Zero,26-30,Single/Never Married,Professional,Professional,Own Home,">=35,000",No,No,...,0.00,0.00,10.49,0.00,Female,West Midlands,0.00,1.75,0.00,17.04
4324,4925,Zero,31-35,Divorced/Separated,Manual Worker,Unknown,Own Home,"<30,000, >=27,500",No,No,...,41.98,2.99,0.00,68.46,Male,West Midlands,0.00,19.90,8.49,10.64
9097,10329,1,41-45,Partner,Business Manager,Secretarial/Admin,Own Home,">=35,000",No,No,...,144.63,15.99,4.49,15.99,Male,Scotland,85.44,41.92,75.56,227.72


In [22]:
# Score only the first test row with the reloaded model. The recorded result
# equals the first row of pipe2.predict_proba(x_test) shown earlier.
model.predict_proba(x_test[:1])

array([[0.00344215, 0.99655785]])

In [34]:
import json

In [39]:
# Round-trip the first test row through JSON (orient="records" gives one object
# per row) and print it as a plain Python dict of column name -> value.
print(json.loads(x_test[0:1].to_json(orient="records"))[0])

{'REF_NO': 204, 'children': '1', 'age_band': '41-45', 'status': 'Partner', 'occupation': 'Secretarial/Admin', 'occupation_partner': 'Business Manager', 'home_status': 'Own Home', 'family_income': '<22,500, >=20,000', 'self_employed': 'No', 'self_employed_partner': 'No', 'year_last_moved': 1984, 'TVarea': 'Central', 'post_code': 'CV10 0BT', 'post_area': 'CV10', 'Average Credit Card Transaction': 34.47, 'Balance Transfer': 49.99, 'Term Deposit': 60.97, 'Life Insurance': 89.95, 'Medical Insurance': 7.49, 'Average A/C Balance': 61.44, 'Personal Loan': 38.95, 'Investment in Mutual Fund': 120.41, 'Investment Tax Saving Bond': 2.0, 'Home Loan': 0.0, 'Online Purchase Amount': 0.0, 'gender': 'Female', 'region': 'West Midlands', 'Investment in Commudity': 48.57, 'Investment in Equity': 37.13, 'Investment in Derivative': 53.37, 'Portfolio Balance': 159.22}


In [40]:
# Display the test features again (the same frame shown earlier in the notebook).
x_test

Unnamed: 0,REF_NO,children,age_band,status,occupation,occupation_partner,home_status,family_income,self_employed,self_employed_partner,...,Investment in Mutual Fund,Investment Tax Saving Bond,Home Loan,Online Purchase Amount,gender,region,Investment in Commudity,Investment in Equity,Investment in Derivative,Portfolio Balance
184,204,1,41-45,Partner,Secretarial/Admin,Business Manager,Own Home,"<22,500, >=20,000",No,No,...,120.41,2.00,0.00,0.00,Female,West Midlands,48.57,37.13,53.37,159.22
1546,1762,3,45-50,Partner,Secretarial/Admin,Unknown,Own Home,">=35,000",No,Yes,...,47.98,2.00,19.95,0.00,Female,Scotland,102.65,31.31,76.30,197.45
9912,11242,2,36-40,Partner,Other,Business Manager,Own Home,"<20,000, >=17,500",No,No,...,0.00,0.00,1.24,0.00,Female,North West,0.01,15.53,15.33,-8.54
5078,5771,Zero,51-55,Divorced/Separated,Business Manager,Housewife,Own Home,"<30,000, >=27,500",No,No,...,28.96,0.00,0.00,0.00,Male,South East,3.60,9.83,12.82,-0.46
7138,8107,3,31-35,Partner,Professional,Professional,Own Home,">=35,000",No,No,...,69.93,0.00,8.98,0.00,Female,Scotland,0.00,17.56,16.07,31.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2766,3166,Zero,51-55,Partner,Professional,Professional,Own Home,">=35,000",Yes,No,...,118.82,2.48,0.00,78.86,Female,South East,47.65,65.33,83.40,181.01
8086,9193,Zero,26-30,Single/Never Married,Professional,Professional,Own Home,">=35,000",No,No,...,0.00,0.00,10.49,0.00,Female,West Midlands,0.00,1.75,0.00,17.04
4324,4925,Zero,31-35,Divorced/Separated,Manual Worker,Unknown,Own Home,"<30,000, >=27,500",No,No,...,41.98,2.99,0.00,68.46,Male,West Midlands,0.00,19.90,8.49,10.64
9097,10329,1,41-45,Partner,Business Manager,Secretarial/Admin,Own Home,">=35,000",No,No,...,144.63,15.99,4.49,15.99,Male,Scotland,85.44,41.92,75.56,227.72


In [41]:
# Display the full test split, which still includes the 'Revenue Grid' target column.
bd_test

Unnamed: 0,REF_NO,children,age_band,status,occupation,occupation_partner,home_status,family_income,self_employed,self_employed_partner,...,Investment Tax Saving Bond,Home Loan,Online Purchase Amount,Revenue Grid,gender,region,Investment in Commudity,Investment in Equity,Investment in Derivative,Portfolio Balance
184,204,1,41-45,Partner,Secretarial/Admin,Business Manager,Own Home,"<22,500, >=20,000",No,No,...,2.00,0.00,0.00,2,Female,West Midlands,48.57,37.13,53.37,159.22
1546,1762,3,45-50,Partner,Secretarial/Admin,Unknown,Own Home,">=35,000",No,Yes,...,2.00,19.95,0.00,2,Female,Scotland,102.65,31.31,76.30,197.45
9912,11242,2,36-40,Partner,Other,Business Manager,Own Home,"<20,000, >=17,500",No,No,...,0.00,1.24,0.00,2,Female,North West,0.01,15.53,15.33,-8.54
5078,5771,Zero,51-55,Divorced/Separated,Business Manager,Housewife,Own Home,"<30,000, >=27,500",No,No,...,0.00,0.00,0.00,2,Male,South East,3.60,9.83,12.82,-0.46
7138,8107,3,31-35,Partner,Professional,Professional,Own Home,">=35,000",No,No,...,0.00,8.98,0.00,2,Female,Scotland,0.00,17.56,16.07,31.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2766,3166,Zero,51-55,Partner,Professional,Professional,Own Home,">=35,000",Yes,No,...,2.48,0.00,78.86,2,Female,South East,47.65,65.33,83.40,181.01
8086,9193,Zero,26-30,Single/Never Married,Professional,Professional,Own Home,">=35,000",No,No,...,0.00,10.49,0.00,2,Female,West Midlands,0.00,1.75,0.00,17.04
4324,4925,Zero,31-35,Divorced/Separated,Manual Worker,Unknown,Own Home,"<30,000, >=27,500",No,No,...,2.99,0.00,68.46,1,Male,West Midlands,0.00,19.90,8.49,10.64
9097,10329,1,41-45,Partner,Business Manager,Secretarial/Admin,Own Home,">=35,000",No,No,...,15.99,4.49,15.99,2,Male,Scotland,85.44,41.92,75.56,227.72
