In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
class NullHandler_cc(BaseEstimator, TransformerMixin):
    def __init__(self, num_fill='mean', cat_fill= 'most_frequent'):
        self.num_fill= num_fill
        self.cat_fill= cat_fill
        
        
    def fit(self,df,num_cols=['a2', 'a3', 'a8', 'a11', 'a14', 'a15']):
        if type(df) != pd.DataFrame:
            raise TypeError("Pandas DataFrame Expected")
        
        self.df= df
        self.num_cols= num_cols
        df.columns= list(map(str.lower, list(df.columns)))
        self.cat_cols= list( set(list(df.columns)) - set(self.num_cols))
        
        self.cat_fill_vals={}
        for col in self.cat_cols:
            self.cat_fill_vals[col]= df[col].value_counts().idxmax()
        
        df[self.num_cols]= df[self.num_cols].replace('?', np.NAN)
        df['a2']= df['a2'].astype('float64')
        df['a11']= df['a11'].astype('float64')
        df['a14']= df['a14'].astype('float64')
        df['a15']= df['a15'].astype('float64')
        self.num_fill_vals= dict(df.mean())
        
        return self
    
    def transform(self, df):
        df.replace('?', np.NAN, inplace= True)
        df['a2']= df['a2'].astype('float64')
        df['a11']= df['a11'].astype('float64')
        df['a14']= df['a14'].astype('float64')
        df['a15']= df['a15'].astype('float64')
        self.cat_fill_vals.update(self.num_fill_vals)
        fill_vals= self.cat_fill_vals
        print("fill_vals={}".format(fill_vals))
        df_ = df.fillna(value= fill_vals)
        
        return df_
        
        

In [9]:
class Cat_columns_handler_cc(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
        
        
    def fit(self, df, binary_cols=['a1', 'a9', 'a10', 'a12'], multivalcatcolumns= ['a4', 'a5', 'a6', 'a7', 'a13']):
        self.binary_cols= binary_cols
        self.multivalcatcolumns= multivalcatcolumns
        
        if type(df) != pd.DataFrame:
            raise TypeError("Pandas DataFrame Expected")
        
        self.binary_encoders= []
        
        for col in self.binary_cols:
            le_enc= LabelEncoder()
            le_enc.fit(df[col])
            self.binary_encoders.append(le_enc)
            del le_enc
            
        
        self.ohe= OneHotEncoder(sparse=False, handle_unknown= 'ignore')
        self.ohe.fit(df[self.multivalcatcolumns])
        
        return self
    
    def transform(self, df):
        if type(df) != pd.DataFrame:
            raise TypeError("Pandas DataFrame Expected")
        
        for col, le_enc in zip(self.binary_cols, self.binary_encoders):
            df.loc[:,col]= le_enc.transform(df[col])
        
        temp_df= pd.DataFrame(self.ohe.transform(df[self.multivalcatcolumns]))
        print(temp_df.shape)
        df.drop(self.multivalcatcolumns, axis=1 , inplace=True)
        print(df.shape)
        return pd.concat([df, temp_df], axis= 1)
    
    

In [15]:
 class Encoding_y(BaseEstimator, TransformerMixin):
        def __init__(self):
            pass
        
        def fit(self, y):
            self.le= LabelEncoder()
            self.le.fit(y)
            return self
        
        def transform(self, y):
            return self.le.transform(y)

In [4]:
def scaling_data(df):
    scaler= StandardScaler()
    return scaler.fit_transform(df)

In [4]:
X_train= pd.read_csv("/home/souvik/ML_projects/P1_CreditCard/data/X_train.csv")
y_train= pd.read_csv("/home/souvik/ML_projects/P1_CreditCard/data/y_train.csv")
X_train.head()

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15
0,b,20.17,8.17,u,g,aa,v,1.96,t,t,14,f,g,60,158
1,a,27.58,3.0,u,g,m,v,2.79,f,t,1,t,g,280,10
2,b,39.58,13.915,u,g,w,v,8.625,t,t,6,t,g,70,0
3,b,41.17,1.25,y,p,w,v,0.25,f,f,0,f,g,0,195
4,b,22.25,0.46,u,g,k,v,0.125,f,f,0,t,g,280,55


In [7]:
#print(X_train.isna().sum())

null_handler= NullHandler_cc()
null_handler.fit(X_train)
X_train_no_null= null_handler.transform(X_train)
print(X_train_no_null.isna().sum())

fill_vals={'a5': 'g', 'a1': 'b', 'a4': 'u', 'a6': 'c', 'a10': 'f', 'a7': 'v', 'a9': 't', 'a12': 'f', 'a13': 'g', 'a2': 32.23839483394834, 'a3': 4.865760869565217, 'a8': 2.34768115942029, 'a11': 2.572463768115942, 'a14': 178.070110701107, 'a15': 1104.5}
a1     0
a2     0
a3     0
a4     0
a5     0
a6     0
a7     0
a8     0
a9     0
a10    0
a11    0
a12    0
a13    0
a14    0
a15    0
dtype: int64


In [10]:
print(X_train_no_null.shape)
cat_cols_handler= Cat_columns_handler_cc()
cat_cols_handler.fit(X_train_no_null)
X_train_non_scaled= cat_cols_handler.transform(X_train_no_null)
print(X_train_non_scaled.shape)
#need to check why it has 46 colums, label encoders should be 0,1 
print(X_train_non_scaled.head())

(552, 15)
(552, 32)
(552, 10)
(552, 42)
   a1     a2      a3     a8  a9  a10   a11  a12    a14    a15  ...   22   23  \
0   1  20.17   8.170  1.960   1    1  14.0    0   60.0  158.0  ...  0.0  0.0   
1   0  27.58   3.000  2.790   0    1   1.0    1  280.0   10.0  ...  0.0  0.0   
2   1  39.58  13.915  8.625   1    1   6.0    1   70.0    0.0  ...  0.0  0.0   
3   1  41.17   1.250  0.250   0    0   0.0    0    0.0  195.0  ...  0.0  0.0   
4   1  22.25   0.460  0.125   0    0   0.0    1  280.0   55.0  ...  0.0  0.0   

    24   25   26   27   28   29   30   31  
0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  
1  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  
2  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  
3  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  
4  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  

[5 rows x 42 columns]


In [11]:
cat_cols_handler.ohe.categories_

[array(['l', 'u', 'y'], dtype=object),
 array(['g', 'gg', 'p'], dtype=object),
 array(['aa', 'c', 'cc', 'd', 'e', 'ff', 'i', 'j', 'k', 'm', 'q', 'r', 'w',
        'x'], dtype=object),
 array(['bb', 'dd', 'ff', 'h', 'j', 'n', 'o', 'v', 'z'], dtype=object),
 array(['g', 'p', 's'], dtype=object)]

In [16]:
enc_y= Encoding_y()
enc_y.fit(y_train)
y_train= enc_y.transform(y_train)

  y = column_or_1d(y, warn=True)


In [17]:
y_train[:5]

array([0, 1, 0, 1, 1])

In [None]:
X_train['a4'].value_counts()

In [None]:
X_train['a5'].value_counts()