# James Lloyd - Exploratory Data Analysis

#### Reading in credit fraud data
#### Data obtained from https://www.kaggle.com/mlg-ulb/creditcardfraud

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

#### Note: `%matplotlib inline` allows plots to be displayed inline in Jupyter notebooks

In [3]:
# Load the raw credit card transactions CSV into a DataFrame.
df = pd.read_csv('../data/raw/creditcard.csv')

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(284807, 31)

In [5]:
# Column labels of the loaded frame.
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

### It's better to use `.sample()` instead of `.head()`, since the first few rows may not be representative of the whole dataset

In [8]:
# Show five randomly chosen rows. The fixed random_state makes the same rows
# appear on every Restart & Run All, so the displayed output is reproducible.
df.sample(n=5, random_state=42)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
89143,62447.0,1.436075,-0.543677,-0.228048,-0.734305,-0.730617,-1.03688,-0.311605,-0.213513,-0.70649,...,0.112714,0.133838,-0.16504,-0.132113,0.670299,-0.111217,-0.032206,0.0012,29.5,0
52076,45261.0,-0.077515,-0.989908,-0.421795,1.701,-1.825194,0.259901,2.113534,-0.15124,-0.132655,...,0.546892,0.668844,1.573428,0.290347,-0.628187,-0.409705,-0.191307,-0.084005,561.1,0
238687,149778.0,2.026527,-1.106691,-0.200923,-0.339185,-1.184604,0.142397,-1.26909,0.130509,0.571318,...,-0.298369,-0.335476,0.320028,0.409669,-0.464038,-0.432784,0.068969,-0.011025,47.11,0
218444,141320.0,1.785215,-0.382678,-1.876997,0.398142,0.076803,-0.732037,0.075962,-0.062329,0.937907,...,-0.166706,-0.533536,0.057758,-0.548197,-0.141992,-0.090177,-0.033242,-0.015984,115.92,0
6697,8319.0,-0.424671,0.572668,1.206903,-1.856929,0.407276,-0.534332,0.680425,-0.195243,2.389246,...,-0.123844,0.206307,-0.394025,-0.465199,0.330115,-0.82353,0.144603,0.126277,7.6,0


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


#### Check the proportion of missing values in each column... it appears to be zero

In [12]:
# Summarise missing values per column: absolute count and percentage of rows.
# The null counts are computed once and reused, instead of rescanning the frame.
null_counts = df.isnull().sum()
total = null_counts.sort_values(ascending=False)
# Fraction of rows that are null in each column (every column has len(df) rows).
percent = (null_counts / len(df)).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])
# Vectorised conversion from fraction to percent (replaces the per-element lambda).
missing_data['Missing Percent'] = missing_data['Missing Percent'] * 100
missing_data

Unnamed: 0,Total,Missing Percent
Class,0,0.0
V14,0,0.0
V1,0,0.0
V2,0,0.0
V3,0,0.0
V4,0,0.0
V5,0,0.0
V6,0,0.0
V7,0,0.0
V8,0,0.0


In [13]:
import pandas_profiling

In [None]:
# Build the profiling report for the credit card fraud frame and save it as HTML.
# NOTE(review): profile_report is presumably made available on DataFrames by the
# pandas_profiling import in the previous cell — confirm against the installed version.
profile = df.profile_report(title='Pandas Profiling Report for Credit Card Fraud')
# Name the HTML file after this dataset so it is not mistaken for a report
# produced from a different notebook.
profile.to_file(output_file="creditcard_pandas_profiling.html")