In [36]:
import pandas as pd
import numpy as np
import scipy.stats as st
from sklearn.metrics import accuracy_score

In [37]:
df = pd.read_csv('Creditcard_data.csv')

In [38]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [39]:
d = dict(df['Class'].value_counts())
d

{0: 763, 1: 9}

In [40]:
def under_sampling(df : pd.DataFrame,target : str)->pd.DataFrame:
    val_counts = dict(df[target].value_counts())
    min_vals = min(list(val_counts.values()))
    und_samp = pd.DataFrame(columns  = df.columns)
    for id in val_counts.keys():
        df1 = df[df[target] == id]
        df1 = df1.sample(frac = 1)
        df1 = df1.iloc[0:min_vals]
        und_samp = pd.concat([und_samp,df1])
    und_samp = und_samp.sample(frac = 1)
    return und_samp

In [41]:
und_df = under_sampling(df,'Class')

In [42]:
und_df['Class'].value_counts()

1    9
0    9
Name: Class, dtype: int64

In [43]:
def over_sampling(df : pd.DataFrame,target : str)->pd.DataFrame:
   
   val_counts = dict(df[target].value_counts())
   max_vals = max(list(val_counts.values()))
   
   ovr_samp = pd.DataFrame(columns  = df.columns)
   
   for id in val_counts.keys():
        
      df1 = df[df[target] == id]
        
      rem = max_vals % df1.shape[0]
      copies = max_vals // df1.shape[0]
      rem_rows = df1.sample(frac = 1).iloc[0:rem]
      
      if copies > 0:
         df1 = pd.concat([df1]*copies)
           
      if rem > 0:
         df1 = pd.concat([df1,rem_rows])
        
      ovr_samp = pd.concat([ovr_samp,df1])
        
      
   ovr_samp = ovr_samp.sample(frac = 1)
   return ovr_samp
     

In [44]:
ovr_df = over_sampling(df,'Class')

In [45]:
ovr_df['Class'].value_counts()

1    763
0    763
Name: Class, dtype: int64

In [46]:
def Simple_Random_Sampling(df,z,e,p = 0.5):
    z = st.norm.ppf(z)
    n = (z**2)*(p*(1-p))//e**2
    rand_sample = df.sample(int(n))
    return rand_sample
    

In [47]:
Simple_Random_Sampling(df,0.95,0.05)['Class'].value_counts()

0    266
1      4
Name: Class, dtype: int64

In [48]:
def Sratified_Sampling(df,tar,z,e,p = 0.5):
    s = len(dict(df[tar].value_counts()))
    z = st.norm.ppf(z)
    n = (z**2)*(p*(1-p))//((e/s)**2)
    rows = df.shape[0]
    strat_sam = df.groupby(tar, group_keys=False).apply(lambda x: x.sample(frac=n/rows))
    return strat_sam

In [49]:
Sratified_Sampling(ovr_df,'Class',0.95,0.05)['Class'].value_counts()

0    541
1    541
Name: Class, dtype: int64

In [50]:
def Cluster_Sampling(df,tar,z,e,c,p = 0.5):
    z = st.norm.ppf(z)
    rows = df.shape[0]
    n = ((z**2)*(p*(1-p))/((e)**2))/(rows - c)
    clust = df.sample(frac=n)
    return clust
    

In [51]:
Cluster_Sampling(ovr_df,'Class',0.95,0.05,300)['Class'].value_counts()

0    171
1    166
Name: Class, dtype: int64

In [52]:
def Systematic_sampling(df,k):
    indexes = np.arange(0, len(df), step=k)
    systematic_sample = df.iloc[indexes]
    return systematic_sample

In [53]:
Systematic_sampling(ovr_df,5)['Class'].value_counts()

0    155
1    151
Name: Class, dtype: int64

In [54]:
print(max(df['Amount']),min(df['Amount']))

3828.04 0.0


Convenience Sampling is a method of collecting data in which the investigator selects the items from the population that suits his convenience.

In [55]:
def Convenience_Sampling(df,par,amt):
    return df[df[par]<=amt]

Selecting rows where amount is less than 5 

In [56]:
Convenience_Sampling(ovr_df,'Amount',5)['Class'].value_counts()

1    679
0    177
Name: Class, dtype: int64

### Taking Samples from balanced data (Over Sampling) ###

In [57]:
sample1 = Simple_Random_Sampling(ovr_df,0.95,0.05)
sample2 = Systematic_sampling(ovr_df,5)
sample3 = Sratified_Sampling(ovr_df,'Class',0.95,0.05)
sample4 = Cluster_Sampling(ovr_df,'Class',0.95,0.05,300)
sample5 = Convenience_Sampling(ovr_df,'Amount',5)

In [58]:
samples = []
samples.append(sample1)
samples.append(sample2)
samples.append(sample3)
samples.append(sample4)
samples.append(sample5)

### Testing on whole dataset ###

In [59]:
x_test = np.array(df.iloc[:,:-1])
y_test = np.array(df.iloc[:,-1:].values).reshape(-1,).astype('int')

### Model 1  - Logistic Regression ###

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
logistic_acc = []
for sample in samples:
    classifier1 = make_pipeline(StandardScaler(), LogisticRegression())
    x_train = np.array(sample.iloc[:,:-1])
    y_train = np.array(sample.iloc[:,-1:].values).reshape(-1,).astype('int')
    classifier1.fit(x_train,y_train)
    y_pred = classifier1.predict(x_test)
    acc = accuracy_score(y_test,y_pred)
    logistic_acc.append(acc)
    print(acc)
    

0.8341968911917098
0.8626943005181347
0.8704663212435233
0.8367875647668394
0.8082901554404145


### Model 2 - SVM ###

In [61]:
from sklearn.svm import SVC  
svm_acc = []
for sample in samples:
    classifier2 = SVC(kernel='linear') 
    x_train = np.array(sample.iloc[:,:-1])
    y_train = np.array(sample.iloc[:,-1:].values).reshape(-1,).astype('int')
    classifier2.fit(x_train,y_train)
    y_pred = classifier2.predict(x_test)
    acc = accuracy_score(y_test,y_pred)
    svm_acc.append(acc)
    print(acc)

0.8549222797927462
0.8575129533678757
0.8795336787564767
0.8238341968911918
0.7448186528497409


### Model 3  - Random Forest Classifier ###

In [63]:
from sklearn.ensemble import RandomForestClassifier
knn_acc = []
for sample in samples:
    classifier3 = RandomForestClassifier() 
    x_train = np.array(sample.iloc[:,:-1])
    y_train = np.array(sample.iloc[:,-1:].values).reshape(-1,).astype('int')
    classifier3.fit(x_train,y_train)
    y_pred = classifier3.predict(x_test)
    acc = accuracy_score(y_test,y_pred)
    knn_acc.append(acc)
    print(acc)

0.9883419689119171
0.9974093264248705
1.0
0.9922279792746114
0.9987046632124352


### Model 4 - Decision Tree ###

In [65]:
from sklearn.tree import DecisionTreeClassifier
dtc_acc = []
for sample in samples:
    classifier4 = DecisionTreeClassifier()
    x_train = np.array(sample.iloc[:,:-1])
    y_train = np.array(sample.iloc[:,-1:].values).reshape(-1,).astype('int')
    classifier4.fit(x_train,y_train)
    y_pred = classifier4.predict(x_test)
    acc = accuracy_score(y_test,y_pred)
    dtc_acc.append(acc)
    print(acc)

0.9520725388601037
0.9676165803108808
0.9974093264248705
0.9676165803108808
0.9987046632124352


### Model 5 - SGD Classifier ###

In [66]:
from sklearn.linear_model import SGDClassifier
gnb_acc = []
for sample in samples:
    classifier5  = SGDClassifier()
    x_train = np.array(sample.iloc[:,:-1])
    y_train = np.array(sample.iloc[:,-1:].values).reshape(-1,).astype('int')
    classifier5.fit(x_train,y_train)
    y_pred = classifier5.predict(x_test)
    acc = accuracy_score(y_test,y_pred)
    gnb_acc.append(acc)
    print(acc)

0.3393782383419689
0.28367875647668395
0.24611398963730569
0.7292746113989638
0.35621761658031087


### COMPARISION ###

In [68]:
metrics = pd.DataFrame()
metrics['logistic_regression'] = logistic_acc
metrics['SVM (linear)'] = svm_acc
metrics['Random Forest Classifier'] = knn_acc
metrics['Decision_tree'] = dtc_acc
metrics['SGDClassifier'] = gnb_acc
metrics = metrics.rename(index = {0:'Simple_Random_Sampling',1:'Systematic_sampling',2:'Sratified_Sampling',3:'Cluster_Sampling',4:'Convenience_Sampling'}).T

In [None]:
metrics

Unnamed: 0,Simple_Random_Sampling,Systematic_sampling,Sratified_Sampling,Cluster_Sampling,Convenience_Sampling
logistic_regression,0.857513,0.854922,0.86658,0.835492,0.80829
SVM (linear),0.841969,0.870466,0.873057,0.831606,0.744819
KNN (n = 7),0.836788,0.831606,0.958549,0.86658,0.92487
Decision_tree,0.976684,0.933938,1.0,0.96114,0.998705
SGDClassifier,0.204663,0.020725,0.010363,0.046632,0.119171
