# Data cleaning 

### Reading filtered data set

In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE as smote
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
df = pd.read_csv ('../../data/creditcard_v2.csv')

In [8]:
df.sample (5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
224135,144204.0,1.954576,-0.481934,-0.495271,0.246692,-0.589961,-0.251121,-0.668394,0.105699,1.156155,...,0.238028,0.764191,0.081076,-0.41536,-0.152867,-0.237843,0.021939,-0.050101,19.12,0
49760,44334.0,-0.42294,1.026647,1.390851,0.100551,0.021996,-0.727213,0.58823,0.079556,-0.330504,...,-0.219542,-0.56427,0.066915,0.332704,-0.295766,0.083343,0.25783,0.097368,3.59,0
195312,131459.0,-1.868955,0.864992,1.273568,1.167206,-0.799064,0.544424,1.067554,0.096423,0.430538,...,-0.651156,-1.328265,-0.181685,-0.145213,0.324514,-0.837478,-0.285219,-0.190006,185.04,0
263926,161831.0,0.849057,-1.69014,-2.524388,1.869082,-0.081688,-1.177074,1.643105,-0.677455,-0.040077,...,0.523436,0.288451,-0.634369,0.096669,0.376153,-0.455414,-0.149658,0.041723,633.4,0
265066,162337.0,1.915875,-1.189403,1.015883,1.164328,-1.845727,0.806549,-1.780862,0.316776,1.182805,...,-0.21706,0.471933,0.185159,0.003131,-0.219307,-0.474606,0.169581,-0.008481,21.7,0


### Logic: Remove non-informative outliers using the 6 std dev
##### Non-informative we've defined as a domain of variables that does not contain examples of fraud. Variables outside the range will be removed, unless it contains fraud examples. In that case, the cut-off is moved to include the largest observed instance of fraud

In [9]:
df_fraud = df [df.Class == 1]

In [10]:
def calculate_bounds (v):
    iqr = v.quantile (0.75) - v.quantile (0.25)
    
    return [v.quantile (0.25) - (3 * iqr), v.quantile (0.75) + (3 * iqr)]

In [11]:
ranges = []

for i in range (1,29):
    bounds_data = calculate_bounds (df ['V' + str (i)])
    bounds_fraud = [min (df_fraud ['V' + str (i)]), max (df_fraud ['V' + str (i)])]
    
    lb = min (bounds_data [0], bounds_fraud [0])
    ub = max (bounds_data [1], bounds_fraud [1])
    ranges.append ([lb, ub])

In [12]:
df.shape

(283296, 31)

In [13]:
for i in range (1,29):
    df = df [df ['V'+ str (i)].apply (lambda x : ((x > ranges [i - 1] [0]) & (x < ranges [i - 1] [1])))]

In [14]:
df.shape

(279071, 31)

### Implementing smote over sampling

#### Synthetic Minority Oversampling Technique

### Splitting between variables and target

In [15]:
X =         df [['V1', 
                 'V2', 
                 'V3', 
                 'V4', 
                 'V5', 
                 'V6', 
                 'V7', 
                 'V8', 
                 'V9', 
                 'V10',
                 'V11', 
                 'V12', 
                 'V13', 
                 'V14', 
                 'V15', 
                 'V16', 
                 'V17', 
                 'V18', 
                 'V19', 
                 'V20',
                 'V21', 
                 'V22', 
                 'V23', 
                 'V24', 
                 'V25', 
                 'V26', 
                 'V27', 
                 'V28', 
                 'Amount']]

y = df.Class

In [16]:
X_train, X_test, y_train, y_test = train_test_split (X, y, 
                                                    stratify = y,
                                                    test_size = 0.25,
                                                    random_state = 0)

### Implement smote to upsample to 20% 

In [17]:
sm = smote (random_state = 42, sampling_strategy = 0.20)

In [18]:
X_train_res, y_train_res = sm.fit_resample (X_train, y_train)

In [19]:
y_train.value_counts ()

0    208960
1       343
Name: Class, dtype: int64

In [20]:
y_train_res.value_counts ()

0    208960
1     41792
Name: Class, dtype: int64

#### It worked!

In [21]:
X_train_res ['Class'] = y_train_res

In [23]:
X_train_res.to_csv ('../../data/creditcard_train.csv', index = False)

In [24]:
X_test ['Class'] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [25]:
X_test.to_csv ('../../data/creditcard_test.csv', index = False)

### Re-running SMOTE with no outliers removed

In [26]:
df = pd.read_csv ('../../data/creditcard_v2.csv')

In [3]:
X =         df [['V1', 
                 'V2', 
                 'V3', 
                 'V4', 
                 'V5', 
                 'V6', 
                 'V7', 
                 'V8', 
                 'V9', 
                 'V10',
                 'V11', 
                 'V12', 
                 'V13', 
                 'V14', 
                 'V15', 
                 'V16', 
                 'V17', 
                 'V18', 
                 'V19', 
                 'V20',
                 'V21', 
                 'V22', 
                 'V23', 
                 'V24', 
                 'V25', 
                 'V26', 
                 'V27', 
                 'V28', 
                 'Amount']]

y = df.Class

In [4]:
X_train, X_test, y_train, y_test = train_test_split (X, y, 
                                                    stratify = y,
                                                    test_size = 0.25,
                                                    random_state = 0)

In [11]:
sm = smote (random_state = 42, sampling_strategy = 0.20, k_neighbors = 300)

In [12]:
X_train_res, y_train_res = sm.fit_resample (X_train, y_train)

In [31]:
X_train_res ['Class'] = y_train_res
X_train_res.to_csv ('../../data/creditcard_withoutliers_train.csv', index = False)

X_test ['Class'] = y_test
X_test.to_csv ('../../data/creditcard_withoutliers_test.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [7]:
X_train_res.shape

(254523, 29)

In [8]:
X_train_res.drop_duplicates().shape

(249298, 29)