# Bank transaction classifier

## Data exploring

In [None]:
# import 'em all
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
#from scipy.stats import norm
#from sklearn.preprocessing import StandardScaler
#from scipy import stats
#import warnings
#warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the credit-card transactions into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../input/creditcard.csv")

In [None]:
# Peek at the first five rows to get a feel for the columns.
df.head(n=5)

All V variables are normalized, we need to decide how to manage Amount and Time.

I will start with the latter one. Time is counted from 0.0 and is measured in seconds.

In [None]:
# Largest Time value converted from seconds to hours.
df["Time"].max() / 3600

Almost 48 hours, or 2 whole days. First thing on my mind? Plot fraud vs normal transactions depending on second, minute, hour...

In [None]:
# Compare when fraudulent and normal transactions occur by folding Time
# (in seconds) onto one minute, one hour and one day.
f, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=False, figsize=(12, 12))

fraud_time = df.Time[df.Class == 1]
normal_time = df.Time[df.Class == 0]

# (axes, folding period in seconds, number of bins, panel title)
panels = (
    (ax1, 60, 60, 'Seconds'),
    (ax2, 3600, 60, 'Minutes'),
    (ax3, 3600 * 24, 24, 'Hours'),
)

for ax, period, n_bins, title in panels:
    # `density=True` replaces the removed `normed=True` argument; without
    # stacking, each class is normalised on its own, so the two classes
    # can be compared despite their very different sizes.
    ax.hist((fraud_time % period, normal_time % period),
            bins=n_bins, density=True, label=("Fraud", "Normal"))
    ax.set_title(title)
    # Label every panel (plt.xlabel/ylabel only touch the last axes) and
    # name the y axis for what is plotted: a density, not a raw count.
    ax.set_xlabel('Time (in Seconds)')
    ax.set_ylabel('Density')
    # The labels passed to hist() are only shown once a legend is drawn.
    ax.legend()

f.tight_layout()
plt.show()

And we got.. nothing. Plan B: Amount against Time. 

In [None]:
# Two stacked panels sharing the time axis: fraud on top, normal below.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(12, 8))

ax1.scatter(df.loc[df.Class == 1, 'Time'], df.loc[df.Class == 1, 'Amount'])
ax1.set_title('Fraud')

ax2.scatter(df.loc[df.Class == 0, 'Time'], df.loc[df.Class == 0, 'Amount'])
ax2.set_title('Normal')

# The bottom panel is the current axes, so the axis labels go there.
ax2.set_xlabel('Time (in Seconds)')
ax2.set_ylabel('Amount')
plt.show()

At this point we cannot tell anything specific about the data. Time seems to be useless, so we'll get rid of it. 

In [None]:
# Remove the Time column and show the first rows of the result.
df = df.drop('Time', axis='columns')
df.head()

All features but Amount have been previously transformed with PCA.

We can assume that they are all linearly independent. But... can we?

In [None]:
# Standardise Amount into a new normAmount column, then drop the raw
# Amount column so only the scaled version remains.
df['normAmount'] = StandardScaler().fit_transform(df[['Amount']].values)
df = df.drop('Amount', axis='columns')
df.head()

In [None]:
# Pairwise correlations between all remaining columns, drawn as a heatmap.
corrmat = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
# Colour scale is capped at 0.8; the trailing semicolon hides the cell output.
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax);

In [None]:
# The five largest correlations with Class (Class itself is included).
corrmat.nlargest(n=5, columns='Class')['Class']

Not a single suspicious correlation for now. Too suspicious to be left unexamined.


In [None]:
# Pairwise scatter plots of Class against a handful of selected features.
sns.set()
cols = ['Class', 'V11', 'V4', 'V2', 'V20', 'V7', 'normAmount']
# `height` is the current name of the per-facet size argument; the old
# `size` keyword is no longer accepted by seaborn.
sns.pairplot(df[cols], height=2.5)
plt.show();

It's hard to find anything useful here.

Next step: checking the missing data

In [None]:
# Total number of missing cells across the whole frame.
df.isnull().values.sum()

Finally a perfectly filled dataset!

In [None]:
# Bar chart of how many rows fall into each Class value.
# The column must be passed as `x=`: a positional string would be taken
# as the `data` argument by current seaborn versions.
sns.countplot(x="Class", data=df)

In [None]:
class Perceptron(object):
    """Perceptron classifier.

    Parameters
    ----------
    eta : float
        Learning rate (between 0.0 and 1.0).
    n_iter : int
        Passes over the training dataset.

    Attributes
    ----------
    w_ : 1d-array
        Weights after fitting; ``w_[0]`` is the bias term and ``w_[1:]``
        holds one weight per feature.
    errors_ : list
        Number of misclassifications (non-zero updates) in every epoch.

    Notes
    -----
    ``predict`` emits the labels ``1`` and ``-1``, so the targets passed to
    ``fit`` should use the same encoding. Any other target value (e.g. ``0``)
    can never equal a prediction and therefore causes a weight update on
    every pass.
    """

    def __init__(self, eta=0.01, n_iter=10):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        """Fit training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features. Lists, NumPy arrays and
            DataFrames are accepted.
        y : array-like, shape = [n_samples]
            Target values (expected to be ``1`` or ``-1``).

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If X and y do not contain the same number of samples.
        """
        # Coerce to arrays so that iteration walks over rows: iterating a
        # DataFrame yields column names, and plain lists have no `.shape`.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            # zip() would silently drop the surplus samples otherwise.
            raise ValueError(
                "X and y must contain the same number of samples "
                "(got %d and %d)" % (X.shape[0], y.shape[0])
            )

        self.w_ = np.zeros(1 + X.shape[1])
        self.errors_ = []

        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                # The update is zero when the sample is already classified
                # correctly, so only mistakes move the weights.
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
                errors += int(update != 0.0)
            self.errors_.append(errors)
        return self

    def net_input(self, X):
        """Calculate net input: weighted sum of the features plus the bias."""
        return np.dot(np.asarray(X), self.w_[1:]) + self.w_[0]

    def predict(self, X):
        """Return class label after unit step: 1 if net input >= 0, else -1."""
        return np.where(self.net_input(X) >= 0.0, 1, -1)