In [37]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [9]:
class NullHandler_cc(BaseEstimator, TransformerMixin):
    """Impute missing values in a DataFrame, treating the string '?' as missing.

    Numeric columns are filled with a statistic learned in ``fit``; every
    other column is filled with its most frequent training value. Neither
    ``fit`` nor ``transform`` modifies the frame it is given: both work on a
    cleaned copy (lower-cased column names, '?' -> NaN, numeric columns cast
    to float64).

    Parameters
    ----------
    num_fill : {'mean', 'median'}, default='mean'
        Statistic used to fill the numeric columns.
    cat_fill : {'most_frequent'}, default='most_frequent'
        Strategy used to fill the non-numeric columns.

    Attributes set by ``fit``
    -------------------------
    num_cols : list of str
        Lower-cased names of the numeric columns.
    cat_cols : list of str
        All remaining columns, in frame order.
    num_fill_vals : dict
        Column name -> numeric fill value.
    cat_fill_vals : dict
        Column name -> most frequent value (columns with no observed value
        are omitted and therefore left unfilled).
    """

    def __init__(self, num_fill='mean', cat_fill= 'most_frequent'):
        self.num_fill= num_fill
        self.cat_fill= cat_fill

    @staticmethod
    def _clean_copy(df, num_cols):
        """Return a normalised copy of ``df``; the input is left untouched."""
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Pandas DataFrame Expected")

        clean = df.copy()
        clean.columns = [str(col).lower() for col in clean.columns]
        clean = clean.replace('?', np.nan)
        # Columns that contained '?' are read as strings; cast every numeric
        # column (not just a fixed subset) so statistics can be computed.
        for col in num_cols:
            clean[col] = clean[col].astype('float64')
        return clean

    def fit(self, df, num_cols=['a2', 'a3', 'a8', 'a11', 'a14', 'a15']):
        """Learn the per-column fill values from ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data. Not modified.
        num_cols : list of str
            Names of the numeric columns; all other columns are treated as
            categorical.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``df`` is not a DataFrame.
        ValueError
            If ``num_fill`` or ``cat_fill`` names an unsupported strategy.

        Notes
        -----
        scikit-learn passes ``y`` as the second positional argument of
        ``fit`` when a target is supplied to ``fit_transform``/``Pipeline.fit``;
        here that slot is ``num_cols``, so fit this transformer without ``y``.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Pandas DataFrame Expected")
        if self.num_fill not in ('mean', 'median'):
            raise ValueError(
                "num_fill must be 'mean' or 'median', got {!r}".format(self.num_fill))
        if self.cat_fill != 'most_frequent':
            raise ValueError(
                "cat_fill must be 'most_frequent', got {!r}".format(self.cat_fill))

        # Copy the argument so the (shared) default list is never aliased.
        self.num_cols = [str(col).lower() for col in num_cols]
        clean = self._clean_copy(df, self.num_cols)

        numeric = set(self.num_cols)
        self.cat_cols = [col for col in clean.columns if col not in numeric]

        # '?' has already been turned into NaN, so it can never be picked as
        # the most frequent category (value_counts skips NaN).
        self.cat_fill_vals = {}
        for col in self.cat_cols:
            counts = clean[col].value_counts()
            if not counts.empty:
                self.cat_fill_vals[col] = counts.idxmax()

        # Restrict the statistic to the numeric columns; a whole-frame mean
        # would also try to average the categorical (object) columns.
        numeric_frame = clean[self.num_cols]
        if self.num_fill == 'median':
            stats = numeric_frame.median()
        else:
            stats = numeric_frame.mean()
        self.num_fill_vals = stats.to_dict()

        return self

    def transform(self, df):
        """Return a cleaned copy of ``df`` with missing values filled.

        Parameters
        ----------
        df : pandas.DataFrame
            Data to impute. Not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df`` with lower-cased column names, numeric columns as
            float64 and '?'/NaN replaced by the values learned in ``fit``.

        Raises
        ------
        TypeError
            If ``df`` is not a DataFrame.
        """
        clean = self._clean_copy(df, self.num_cols)
        # Build the combined mapping locally so the fitted dictionaries stay
        # unchanged across calls; numeric fills win on a name clash.
        fill_vals = {**self.cat_fill_vals, **self.num_fill_vals}
        return clean.fillna(value=fill_vals)
        
        

In [15]:
class Cat_columns_handler_cc(BaseEstimator, TransformerMixin):
    """Encode categorical columns of a DataFrame as numbers.

    Binary columns are label-encoded in place (one ``LabelEncoder`` per
    column); multi-valued columns are replaced by one-hot indicator columns
    named ``<column>_<category>``. The input frame is never modified.

    Attributes set by ``fit``
    -------------------------
    binary_cols : list of str
    multivalcatcolumns : list of str
    binary_encoders : list of LabelEncoder
        One fitted encoder per entry of ``binary_cols``, in the same order.
    ohe : OneHotEncoder
        Fitted on ``multivalcatcolumns``; categories unseen during ``fit``
        are ignored at transform time (``handle_unknown='ignore'``).
    """

    def __init__(self):
        pass

    def fit(self, df, binary_cols=['a1', 'a9', 'a10', 'a12'], multivalcatcolumns= ['a4', 'a5', 'a6', 'a7', 'a13']):
        """Fit the label encoders and the one-hot encoder on ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data containing every listed column. Not modified.
        binary_cols : list of str
            Columns to label-encode.
        multivalcatcolumns : list of str
            Columns to one-hot encode.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``df`` is not a DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Pandas DataFrame Expected")

        # Copy so the shared default lists are never aliased by the instance.
        self.binary_cols = list(binary_cols)
        self.multivalcatcolumns = list(multivalcatcolumns)

        self.binary_encoders = [
            LabelEncoder().fit(df[col]) for col in self.binary_cols
        ]

        # The dense/sparse switch of OneHotEncoder is spelled differently
        # across scikit-learn releases, so keep the default output here and
        # densify in transform instead.
        self.ohe = OneHotEncoder(handle_unknown='ignore')
        self.ohe.fit(df[self.multivalcatcolumns])

        return self

    def transform(self, df):
        """Return an encoded copy of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Data to encode. Not modified, so the same frame can be
            transformed repeatedly.

        Returns
        -------
        pandas.DataFrame
            ``df`` without ``multivalcatcolumns``, with ``binary_cols``
            replaced by integer codes, followed by the one-hot columns. Row
            index and row order are those of ``df``.

        Raises
        ------
        TypeError
            If ``df`` is not a DataFrame.
        ValueError
            Raised by ``LabelEncoder`` when a binary column holds a value
            that was not seen during ``fit``.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Pandas DataFrame Expected")

        # drop() returns a new frame, so the caller's data stays intact.
        encoded_df = df.drop(columns=self.multivalcatcolumns)
        for col, encoder in zip(self.binary_cols, self.binary_encoders):
            # Plain column assignment replaces the column (and its dtype)
            # instead of writing integer codes into an object column.
            encoded_df[col] = encoder.transform(df[col])

        onehot_values = self.ohe.transform(df[self.multivalcatcolumns])
        if hasattr(onehot_values, 'toarray'):
            onehot_values = onehot_values.toarray()

        # String labels keep all column names of the result the same type.
        onehot_names = [
            "{}_{}".format(col, category)
            for col, categories in zip(self.multivalcatcolumns, self.ohe.categories_)
            for category in categories
        ]
        # Reuse df's index: concat(axis=1) aligns on the index, and a fresh
        # RangeIndex would misalign rows whenever df is not indexed 0..n-1.
        onehot_df = pd.DataFrame(onehot_values, index=df.index, columns=onehot_names)

        return pd.concat([encoded_df, onehot_df], axis=1)
    
    

In [11]:
 class Encoding_y(BaseEstimator, TransformerMixin):
     """Encode target labels as integers with a ``LabelEncoder``.

     Accepts a 1-D sequence/Series or a single-column DataFrame/array; a
     column vector of shape (n, 1) is flattened before it reaches the
     encoder. Inputs with more than one column are passed through unchanged
     and rejected by ``LabelEncoder`` itself.
     """

     def __init__(self):
         pass

     @staticmethod
     def _as_1d(y):
         """Return ``y`` as an array, flattening a single-column 2-D input."""
         labels = np.asarray(y)
         if labels.ndim == 2 and labels.shape[1] == 1:
             labels = labels.ravel()
         return labels

     def fit(self, y):
         """Learn the label -> integer mapping from ``y`` and return ``self``."""
         self.le= LabelEncoder()
         self.le.fit(self._as_1d(y))
         return self

     def transform(self, y):
         """Return the integer codes of ``y`` as a 1-D array.

         ``LabelEncoder`` raises ``ValueError`` for labels not seen in ``fit``.
         """
         return self.le.transform(self._as_1d(y))

In [12]:
# Load the training split: feature table and label table.
# NOTE(review): the preview shows the saved row index coming back as an
# "Unnamed: 0" column, which then travels through the pipelines as a
# feature — confirm that this is intended.
X_train, y_train = (
    pd.read_csv("/home/souvik/ML_projects/P1_CreditCard/data/X_train.csv"),
    pd.read_csv("/home/souvik/ML_projects/P1_CreditCard/data/y_train.csv"),
)
X_train.head()

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15
0,b,20.17,8.17,u,g,aa,v,1.96,t,t,14,f,g,60,158
1,a,27.58,3.0,u,g,m,v,2.79,f,t,1,t,g,280,10
2,b,39.58,13.915,u,g,w,v,8.625,t,t,6,t,g,70,0
3,b,41.17,1.25,y,p,w,v,0.25,f,f,0,f,g,0,195
4,b,22.25,0.46,u,g,k,v,0.125,f,f,0,t,g,280,55


In [13]:
# Learn the label -> integer mapping from the training labels. The labels
# are encoded later, in the cell that builds the feature pipelines.
enc_y = Encoding_y()
enc_y.fit(y_train)

  y = column_or_1d(y, warn=True)


Encoding_y()

In [14]:
def _preprocessing_steps():
    """Return fresh, unfitted imputation and categorical-encoding steps."""
    return [
        ('NullHandler', NullHandler_cc()),
        ('CatHandler', Cat_columns_handler_cc()),
    ]


# Both pipelines share the same preprocessing; only the second one
# standardises the encoded features.
non_scaled_pipeline = Pipeline(_preprocessing_steps())
scaled_pipeline = Pipeline(
    _preprocessing_steps() + [('Scaler', StandardScaler())]
)

# Fit each pipeline on the training features and keep the transformed
# matrices; encode the training labels with the already-fitted encoder.
X_train_not_scaled = non_scaled_pipeline.fit_transform(X_train)
X_train_scaled = scaled_pipeline.fit_transform(X_train)
y_train_enc = enc_y.transform(y_train)

(552, 32)
(552, 10)
(552, 32)
(552, 10)


  y = column_or_1d(y, warn=True)


In [49]:
# Tune an SVC with 4-fold cross-validated grid search on the scaled features.
# probability=True is required later so the model can take part in soft voting.
svc_mod = SVC(random_state=711, probability=True)

# `degree` must be a non-negative integer and is only used by the polynomial
# kernel, so it gets its own sub-grid: fractional degrees are invalid, and
# crossing degree with rbf/sigmoid would only refit identical models.
parameters = [
    {'kernel': ['rbf', 'sigmoid'], 'C': [10, 1, 0.1, 0.01, 0.001]},
    {'kernel': ['poly'], 'C': [10, 1, 0.1, 0.01, 0.001], 'degree': [1, 2]},
]

grid_svc = GridSearchCV(svc_mod, param_grid=parameters, cv=4)
grid_svc.fit(X_train_scaled, y_train_enc)
print(grid_svc.best_params_)
print(grid_svc.best_score_)
best_svc = grid_svc.best_estimator_

{'C': 0.1, 'degree': 1, 'kernel': 'poly'}
0.8623188405797102


In [50]:
# Tune a logistic-loss SGD classifier over elastic-net regularisation
# strength (alpha) and L1/L2 mix (l1_ratio) with 4-fold grid search.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version before upgrading.
sgd_clf = SGDClassifier(loss='log', random_state=711)
parameters = dict(
    penalty=['elasticnet'],
    alpha=[0.0001, 0.001, 0.01],
    l1_ratio=[0.0, 0.05, 0.1, 0.3],
)

grid_sgd_clf = GridSearchCV(sgd_clf, param_grid=parameters, cv=4)
grid_sgd_clf.fit(X_train_scaled, y_train_enc)
print(grid_sgd_clf.best_params_)
print(grid_sgd_clf.best_score_)
best_sgd_clf = grid_sgd_clf.best_estimator_

{'alpha': 0.01, 'l1_ratio': 0.3, 'penalty': 'elasticnet'}
0.8496376811594204


In [52]:
# Tune logistic regression over solver and inverse regularisation strength C
# with 4-fold grid search on the scaled training features.
log_reg = LogisticRegression(random_state=711)
parameters = dict(
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    C=[0.0001, 0.001, 0.01, 0.1, 0.5, 1],
)

grid_log_reg = GridSearchCV(log_reg, param_grid=parameters, cv=4)
grid_log_reg.fit(X_train_scaled, y_train_enc)
print(grid_log_reg.best_params_)
print(grid_log_reg.best_score_)
best_log_reg = grid_log_reg.best_estimator_



{'C': 0.01, 'solver': 'newton-cg'}
0.8496376811594202




In [53]:
# Combine the three tuned models into a soft-voting ensemble (averaged class
# probabilities), fit it on the full training set and report its 4-fold
# cross-validation accuracy on the same data.
current_models = [
    ('svc', best_svc),
    ('sgd_classifier', best_sgd_clf),
    ('logistic_reg', best_log_reg),
]
voting_clf = VotingClassifier(estimators=current_models, voting='soft')
voting_clf.fit(X_train_scaled, y_train_enc)
cross_val_score(voting_clf, X_train_scaled, y_train_enc, cv=4)

array([0.84782609, 0.84782609, 0.86956522, 0.86956522])

In [58]:
# Load the held-out split and push it through the pipeline and label encoder
# that were fitted on the training data (transform only, no refitting).
X_test, y_test = (
    pd.read_csv("/home/souvik/ML_projects/P1_CreditCard/data/X_test.csv"),
    pd.read_csv("/home/souvik/ML_projects/P1_CreditCard/data/y_test.csv"),
)

X_test_scaled = scaled_pipeline.transform(X_test)
y_test_enc = enc_y.transform(y_test)

# Test accuracy of each tuned model on its own ...
for model_name, fitted_model in current_models:
    y_pred = fitted_model.predict(X_test_scaled)
    test_accuracy = accuracy_score(y_test_enc, y_pred)
    print("Accuracy of {}= {}".format(model_name, test_accuracy))

# ... and of the soft-voting ensemble.
y_pred = voting_clf.predict(X_test_scaled)
print("Accuracy of voting classifier= {}".format(accuracy_score(y_test_enc, y_pred)))

(138, 32)
(138, 10)
Accuracy of svc= 0.8840579710144928
Accuracy of sgd_classifier= 0.8623188405797102
Accuracy of logistic_reg= 0.8840579710144928
Accuracy of voting classifier= 0.8768115942028986


  y = column_or_1d(y, warn=True)
