In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd 
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from consumerpipes import *


In [2]:
train_file=r'Consumer_Complaints_train.csv'

In [3]:
ld_train=pd.read_csv(train_file)

In [4]:
ld_train.shape

(478421, 18)

In [5]:
ld_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478421 entries, 0 to 478420
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Date received                 478421 non-null  object
 1   Product                       478421 non-null  object
 2   Sub-product                   339948 non-null  object
 3   Issue                         478421 non-null  object
 4   Sub-issue                     185796 non-null  object
 5   Consumer complaint narrative  75094 non-null   object
 6   Company public response       90392 non-null   object
 7   Company                       478421 non-null  object
 8   State                         474582 non-null  object
 9   ZIP code                      474573 non-null  object
 10  Tags                          67206 non-null   object
 11  Consumer consent provided?    135487 non-null  object
 12  Submitted via                 478421 non-null  object
 13 

In [6]:
ld_train.sample(2)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
379031,2012-09-25,Mortgage,FHA mortgage,"Loan servicing, payments, escrow account",,,,IBERIABANK,LA,70583,Older American,,Web,2012-09-25,Closed with explanation,Yes,No,159721
134207,2016-05-03,Credit reporting,,Incorrect information on credit report,Information is not mine,,,Equifax,NV,89110,,,Postal mail,2016-05-23,Closed with explanation,Yes,Yes,1908711


In [7]:
#drop Complint ID
# create dummies pipeline: Product, sub- product(nan),issue,sub issue,company,State,Company response to consumer,Timely response?
#Consumer disputed?
#nan+dummies:Consumer complaint narrative,Company public response,ZIP code, Tags,Consumer consent provided?,submitted via
#convert to numeric : all price vars
# convert to datetime: Date received,Date sent to compnay,cyclic feature for month[12],day of month[31], weekday[7] 
 

In [8]:
ld_train['Consumer complaint narrative'].value_counts(dropna=False)

NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [9]:
date_var=[i for i in ld_train.columns if 'Date' in i]

In [10]:
date_var

['Date received', 'Date sent to company']

In [11]:
cat_var=ld_train.select_dtypes(include='object').columns
cat_var=[_ for _ in cat_var if _ not in date_var]

In [12]:
type(cat_var)

list

In [13]:
cat_var.remove('Consumer complaint narrative')

In [14]:
text_var=['Consumer complaint narrative']

In [15]:
a=pd.to_datetime(ld_train['Date sent to company'],errors='coerce')
b=pd.to_datetime(ld_train['Date received'],errors='coerce')

In [16]:
a.dt.month

0          5
1          9
2          4
3          7
4         11
          ..
478416     7
478417     6
478418     9
478419     9
478420    10
Name: Date sent to company, Length: 478421, dtype: int64

In [17]:
a.dt.day

0         16
1         24
2          3
3         17
4         28
          ..
478416     1
478417    21
478418     9
478419    24
478420    30
Name: Date sent to company, Length: 478421, dtype: int64

In [18]:
a.dt.dayofweek

0         4
1         2
2         3
3         4
4         4
         ..
478416    2
478417    4
478418    1
478419    0
478420    3
Name: Date sent to company, Length: 478421, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

In [20]:
p_train,p_test=train_test_split(ld_train,test_size=0.2,random_state=42)

In [21]:
p_train.reset_index(drop=True,inplace=True)
p_test.reset_index(drop=True,inplace=True)

In [22]:
p1=pdPipeline([
    ('var_select',VarSelector(cat_var)),
    ('missing_trt',DataFrameImputer()),
    ('creating_dummies',creat_dummies(1000))
])


p2=pdPipeline([
    ('var_select',VarSelector(date_var)),
    ('convert_to_datetime',convert_to_datetime()),
    ('cyclic_feature',cyclic_features())
    
])
p3=pdPipeline([
    ('var_select',VarSelector(text_var)),
    ('clenaning_string',string_clean("XXXX","")),
    ('missing_trt',DataFrameImputer()),
    ('text_frequency',text_frequency()),
    #('toarray',ArrayTransformer())
    
])
    
data_pipe=FeatureUnion([
    ('p1',p1),
    ('p2',p2),
    ('p3',p3)
    
    
])

In [23]:
data_pipe.fit(p_train)

FeatureUnion(transformer_list=[('p1',
                                pdPipeline(steps=[('var_select',
                                                   VarSelector(feature_names=['Product',
                                                                              'Sub-product',
                                                                              'Issue',
                                                                              'Sub-issue',
                                                                              'Company '
                                                                              'public '
                                                                              'response',
                                                                              'Company',
                                                                              'State',
                                                                              'ZIP '
         

In [24]:
data_pipe.get_feature_names

<bound method FeatureUnion.get_feature_names of FeatureUnion(transformer_list=[('p1',
                                pdPipeline(steps=[('var_select',
                                                   VarSelector(feature_names=['Product',
                                                                              'Sub-product',
                                                                              'Issue',
                                                                              'Sub-issue',
                                                                              'Company '
                                                                              'public '
                                                                              'response',
                                                                              'Company',
                                                                              'State',
                                              

In [25]:
data_pipe.transform(p_train).shape

(382736, 240)

In [26]:
len(data_pipe.get_feature_names())

240

In [27]:
x_train=pd.DataFrame(data=data_pipe.transform(p_train),
                    columns=data_pipe.get_feature_names())

In [28]:
x_test=pd.DataFrame(data=data_pipe.transform(p_test),
                    columns=data_pipe.get_feature_names())

In [29]:
x_train.shape

(382736, 240)

In [30]:
x_test.shape

(95685, 240)

In [31]:
x_train.columns

Index(['p1__Product_Mortgage', 'p1__Product_Debt collection',
       'p1__Product_Credit reporting', 'p1__Product_Credit card',
       'p1__Product_Bank account or service', 'p1__Product_Consumer Loan',
       'p1__Product_Student loan', 'p1__Product_Money transfers',
       'p1__Product_Payday loan', 'p1__Sub-product_missing',
       ...
       'p2__Date received_month_cos', 'p2__Date received_month_day_sin',
       'p2__Date received_month_day_cos', 'p2__Date sent to company_week_sin',
       'p2__Date sent to company_week_cos',
       'p2__Date sent to company_month_sin',
       'p2__Date sent to company_month_cos',
       'p2__Date sent to company_month_day_sin',
       'p2__Date sent to company_month_day_cos',
       'p3__Consumer complaint narrative'],
      dtype='object', length=240)

In [32]:
x_test.columns.to_list()

['p1__Product_Mortgage',
 'p1__Product_Debt collection',
 'p1__Product_Credit reporting',
 'p1__Product_Credit card',
 'p1__Product_Bank account or service',
 'p1__Product_Consumer Loan',
 'p1__Product_Student loan',
 'p1__Product_Money transfers',
 'p1__Product_Payday loan',
 'p1__Sub-product_missing',
 'p1__Sub-product_Other mortgage',
 'p1__Sub-product_Conventional fixed mortgage',
 'p1__Sub-product_Checking account',
 'p1__Sub-product_Other (i.e. phone, health club, etc.)',
 'p1__Sub-product_I do not know',
 'p1__Sub-product_Credit card',
 'p1__Sub-product_Conventional adjustable mortgage (ARM)',
 'p1__Sub-product_FHA mortgage',
 'p1__Sub-product_Non-federal student loan',
 'p1__Sub-product_Medical',
 'p1__Sub-product_Vehicle loan',
 'p1__Sub-product_Other bank product/service',
 'p1__Sub-product_Payday loan',
 'p1__Sub-product_Home equity loan or line of credit',
 'p1__Sub-product_Installment loan',
 'p1__Sub-product_Savings account',
 'p1__Sub-product_VA mortgage',
 'p1__Sub-prod