# Data cleaning 

### Reading filtered data set

In [31]:
import pandas as pd
import numpy as np

In [32]:
df = pd.read_csv ('../data/creditcard_v2.csv')

In [33]:
df.sample (5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
221510,143076.0,-0.026777,0.89833,-0.309383,-0.609974,0.93577,-1.265027,1.224954,-0.342768,-0.025301,...,0.248436,1.015054,-0.164446,0.057522,-0.7298,0.423567,0.204843,0.268343,11.5,0
123239,77116.0,-1.221978,-5.50942,-1.364834,-0.122499,-2.85097,-0.361547,0.997227,-0.479045,-2.087052,...,0.688025,-0.759149,-1.288401,0.599196,0.07195,-0.239421,-0.257958,0.24173,1408.75,0
234179,148389.0,0.044493,0.861139,-2.406577,-1.516339,2.905412,3.001557,-0.021396,1.222323,-0.485722,...,0.324431,0.783173,-0.01183,0.66096,-0.661763,0.532282,-0.026439,0.043499,1.5,0
209683,138112.0,1.939718,-0.601441,-0.840558,-0.001947,-0.118494,0.455587,-0.634316,0.215074,1.3916,...,-0.139837,-0.380148,0.131194,-1.515046,-0.247049,-0.835897,0.048583,-0.044679,50.0,0
156296,109545.0,-0.595777,0.883105,2.293802,-0.172475,0.175996,-0.17634,0.498141,-0.264382,1.632006,...,-0.302497,-0.460923,-0.319898,-0.280394,0.2476,-0.727202,-0.214285,-0.146006,2.12,0


### Logic: Remove non-informative outliers using the 6 std dev
##### Non-informative we've defined as a domain of variables that does not contain examples of fraud. Variables outside the range will be removed, unless it contains fraud examples. In that case, the cut-off is moved to include the largest observed instance of fraud

In [34]:
df_fraud = df [df.Class == 1]

In [35]:
def calculate_bounds (v):
    iqr = v.quantile (0.75) - v.quantile (0.25)
    
    return [v.quantile (0.25) - (3 * iqr), v.quantile (0.75) + (3 * iqr)]

In [36]:
ranges = []

for i in range (1,29):
    bounds_data = calculate_bounds (df ['V' + str (i)])
    bounds_fraud = [min (df_fraud ['V' + str (i)]), max (df_fraud ['V' + str (i)])]
    
    lb = min (bounds_data [0], bounds_fraud [0])
    ub = max (bounds_data [1], bounds_fraud [1])
    ranges.append ([lb, ub])

In [37]:
df.shape

(283296, 31)

In [38]:
for i in range (1,29):
    df = df [df ['V'+ str (i)].apply (lambda x : ((x > ranges [i - 1] [0]) & (x < ranges [i - 1] [1])))]

In [39]:
df.shape

(279071, 31)