In [1]:
# Data Leakage
    # Data leakage causes a model to look accurate, until it's not (i.e. until it is used on new, real-world data)
    # There are two types of data leakage: Leaky Predictors & Leaky Validation
    
# Leaky Predictors
    # Predictors containing data that won't be available at the time you actually make a prediction
    # For example:
        # Getting sick vs taking antibiotics
        # If you're trying to predict who got sick based on who took antibiotics
        # then this can become skewed because folks who get sick, often take antibiotics to get better
        # The problem here is that you don't know whether they took the antibiotics before or after getting sick
        # Therefore, the antibiotics value changes after you get sick, which indicates it leaks the outcome you are trying to predict
    # PREVENT
        # To prevent this type of leakage, exclude any variable that gets updated or created after the target value is realized
        # Look at columns statistically correlated to the target
        # If your model is overly accurate, be suspicious and check the predictors for leakage
        
# Leaky Validation
    # For example: fitting an imputer on the full dataset before calling train_test_split
    # This happens when you aren't careful about distinguishing training data from validation data

    # PREVENT
        # If your validation uses train_test_split, exclude validation data from fitting (easy when using pipelines)

# Let's use an example

In [2]:
import pandas as pd

# Load the credit card dataset; the CSV's 'yes'/'no' strings are parsed as booleans.
data = pd.read_csv(
    './datasets/credit_data.csv',
    true_values=['yes'],
    false_values=['no'],
)

# Preview the first rows to confirm the columns and the boolean conversion.
print(data.head())

   card  reports       age  income     share  expenditure  owner  selfemp  \
0  True        0  37.66667  4.5200  0.033270   124.983300   True    False   
1  True        0  33.25000  2.4200  0.005217     9.854167  False    False   
2  True        0  33.66667  4.5000  0.004156    15.000000   True    False   
3  True        0  30.50000  2.5400  0.065214   137.869200  False    False   
4  True        0  32.16667  9.7867  0.067051   546.503300   True    False   

   dependents  months  majorcards  active  
0           3      54           1      12  
1           3      34           1      13  
2           4      58           1       5  
3           0      25           1       7  
4           2      64           1       5  


In [4]:
# Check the dimensions of the data.
# `shape` is the last expression in the cell, so the notebook displays its value.

data.shape

# The displayed tuple is (rows, columns)

(1319, 12)

In [15]:
# Let's setup a Pipeline and fit the data
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Target: the boolean `card` column.
y = data.card
# Features: every column except the target (card).
X = data.drop(['card'], axis=1)

# Pin the forest's seed so the cross-validation score is reproducible across
# re-runs; without it every execution reports a slightly different accuracy,
# which makes the later leak / no-leak comparison noisy.
my_pipeline = make_pipeline(RandomForestClassifier(random_state=0))

# Score the pipeline with cross-validation; the result holds one accuracy
# value per fold.
cross_validation_score = cross_val_score(my_pipeline, X, y, scoring='accuracy')

# Report the mean accuracy over the folds.
print(cross_validation_score.mean())

0.9810450748947331




In [22]:
# It's rare you'll find a model with 98% accuracy so let's inspect the data
# Split the expenditure column by the boolean `card` column.
expenditures_cardholders = data.loc[data['card'], 'expenditure']
expenditures_noncardholders = data.loc[~data['card'], 'expenditure']

# Fraction of each group whose expenditure is exactly zero.
print(expenditures_cardholders.eq(0).mean())
print(expenditures_noncardholders.eq(0).mean())

0.020527859237536656
1.0


In [30]:
# The above means that everyone with NO card had no expenditures, while only 2% of those with a card had no expenditures
# This reveals a data leak, so we drop any variables that contribute to the leak and run our validation again

# Columns treated as leaky and removed from the feature set.
leaky_variables = ['expenditure', 'share', 'active', 'majorcards']
X2 = X.drop(columns=leaky_variables)

# Re-run the same cross-validation on the reduced feature set.
cross_validation_score = cross_val_score(
    estimator=my_pipeline,
    X=X2,
    y=y,
    scoring='accuracy',
)

print(cross_validation_score.mean())

0.8021277697245806




In [None]:
# This means that we can expect our model to be right about 80% of the time when used on new applicants
# This is more realistic than the 98% reported earlier, which would have proved inaccurate in practice due to the data leak