# Data Leakage and its Avoidance

If our workflow is:
1. Prepare the data set (eliminate redundancy, impute, scale, ...)
2. Split the data set
3. Evaluate the model
then information from the test set contaminates the preparation and fitting of the training data

To avoid this the workflow should be:
1. Split the data set
2. **Prepare using the training data set**
3. **Apply the same preparation to the Training and Test data sets**
4. Evaluate the model


In [69]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
import pandas as pd

In [70]:
df = pd.read_csv('wildfires.csv')
df.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,fire_name,fire_size,fire_size_class,stat_cause_descr,latitude,longitude,state,disc_clean_date,cont_clean_date,discovery_month,disc_date_final,cont_date_final,putout_time,disc_date_pre,disc_pre_year,disc_pre_month,wstation_usaf,dstation_m,wstation_wban,wstation_byear,wstation_eyear,Vegetation,fire_mag,weather_file,Temp_pre_30,Temp_pre_15,Temp_pre_7,Temp_cont,Wind_pre_30,Wind_pre_15,Wind_pre_7,Wind_cont,Hum_pre_30,Hum_pre_15,Hum_pre_7,Hum_cont,Prec_pre_30,Prec_pre_15,Prec_pre_7,Prec_cont,remoteness
0,0,0,,10.0,C,Missing/Undefined,18.105072,-66.753044,PR,2/11/2007,,Feb,,,,1/12/2007,2007,Jan,785140,58917.69716,11603,1945,2018,12,1.0,785140-11603-2007.gz,24.480974,24.716923,24.902597,24.527961,4.341807,3.492857,3.262092,3.250413,78.21659,76.79375,76.381579,78.72437,0.0,0.0,0.0,0.0,0.017923
1,1,1,,3.0,B,Arson,35.03833,-87.61,TN,12/11/2006,,Dec,,,,11/11/2006,2006,Nov,723235,32665.34327,13896,1978,2020,15,0.1,723235-13896-2006.gz,7.553433,7.01,0.343529,10.448298,2.709764,2.881707,1.976471,2.12232,70.84,65.858911,55.505882,81.682678,59.8,8.4,0.0,86.8,0.184355


In [71]:
#drop first two index columns
df = df.iloc[:, 2:]

In [72]:
#check for nulls
df.isna().sum()

fire_name           29454
fire_size               0
fire_size_class         0
stat_cause_descr        0
latitude                0
longitude               0
state                   0
disc_clean_date         0
cont_clean_date     27890
discovery_month         0
disc_date_final     26659
cont_date_final     29735
putout_time         27890
disc_date_pre           0
disc_pre_year           0
disc_pre_month          0
wstation_usaf           0
dstation_m              0
wstation_wban           0
wstation_byear          0
wstation_eyear          0
Vegetation              0
fire_mag                0
weather_file            0
Temp_pre_30             0
Temp_pre_15             0
Temp_pre_7              0
Temp_cont               0
Wind_pre_30             0
Wind_pre_15             0
Wind_pre_7              0
Wind_cont               0
Hum_pre_30              0
Hum_pre_15              0
Hum_pre_7               0
Hum_cont                0
Prec_pre_30             0
Prec_pre_15             0
Prec_pre_7  

In [73]:
#just rearranging columns so fire-size is first (y)
cols = list(df.columns)
cols = [cols[1]] + cols[0:1] + cols[2:]
df = df[cols]

In [74]:
df.columns

Index(['fire_size', 'fire_name', 'fire_size_class', 'stat_cause_descr',
       'latitude', 'longitude', 'state', 'disc_clean_date', 'cont_clean_date',
       'discovery_month', 'disc_date_final', 'cont_date_final', 'putout_time',
       'disc_date_pre', 'disc_pre_year', 'disc_pre_month', 'wstation_usaf',
       'dstation_m', 'wstation_wban', 'wstation_byear', 'wstation_eyear',
       'Vegetation', 'fire_mag', 'weather_file', 'Temp_pre_30', 'Temp_pre_15',
       'Temp_pre_7', 'Temp_cont', 'Wind_pre_30', 'Wind_pre_15', 'Wind_pre_7',
       'Wind_cont', 'Hum_pre_30', 'Hum_pre_15', 'Hum_pre_7', 'Hum_cont',
       'Prec_pre_30', 'Prec_pre_15', 'Prec_pre_7', 'Prec_cont', 'remoteness'],
      dtype='object')

In [75]:
df.shape

(55367, 41)

In [76]:
#encode stat_cause_descr
df["cause_code"] = df["stat_cause_descr"].astype('category').cat.codes

c = df["stat_cause_descr"].astype('category')
cause_dict = dict(enumerate(c.cat.categories))
print(cause_dict)

{0: 'Arson', 1: 'Campfire', 2: 'Children', 3: 'Debris Burning', 4: 'Equipment Use', 5: 'Fireworks', 6: 'Lightning', 7: 'Miscellaneous', 8: 'Missing/Undefined', 9: 'Powerline', 10: 'Railroad', 11: 'Smoking', 12: 'Structure'}


In [77]:
#encode state
df["state_code"] = df["state"].astype('category').cat.codes

s = df["state"].astype('category')
state_dict = dict(enumerate(s.cat.categories))
print(state_dict)


{0: 'AK', 1: 'AL', 2: 'AR', 3: 'AZ', 4: 'CA', 5: 'CO', 6: 'CT', 7: 'DE', 8: 'FL', 9: 'GA', 10: 'HI', 11: 'IA', 12: 'ID', 13: 'IL', 14: 'IN', 15: 'KS', 16: 'KY', 17: 'LA', 18: 'MA', 19: 'MD', 20: 'ME', 21: 'MI', 22: 'MN', 23: 'MO', 24: 'MS', 25: 'MT', 26: 'NC', 27: 'ND', 28: 'NE', 29: 'NH', 30: 'NJ', 31: 'NM', 32: 'NV', 33: 'NY', 34: 'OH', 35: 'OK', 36: 'OR', 37: 'PA', 38: 'PR', 39: 'RI', 40: 'SC', 41: 'SD', 42: 'TN', 43: 'TX', 44: 'UT', 45: 'VA', 46: 'VT', 47: 'WA', 48: 'WI', 49: 'WV', 50: 'WY'}


In [78]:
#encode fire size class
df["fire_size_class_code"] = df["fire_size_class"].astype('category').cat.codes

s = df["fire_size_class"].astype('category')
fire_size_class_dict = dict(enumerate(s.cat.categories))

print(fire_size_class_dict)

{0: 'B', 1: 'C', 2: 'D', 3: 'E', 4: 'F', 5: 'G'}


In [79]:
#encode discovery month
month_dict = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}

df.discovery_month = df.discovery_month.map(month_dict)

In [80]:
df.head(2)

Unnamed: 0,fire_size,fire_name,fire_size_class,stat_cause_descr,latitude,longitude,state,disc_clean_date,cont_clean_date,discovery_month,disc_date_final,cont_date_final,putout_time,disc_date_pre,disc_pre_year,disc_pre_month,wstation_usaf,dstation_m,wstation_wban,wstation_byear,wstation_eyear,Vegetation,fire_mag,weather_file,Temp_pre_30,Temp_pre_15,Temp_pre_7,Temp_cont,Wind_pre_30,Wind_pre_15,Wind_pre_7,Wind_cont,Hum_pre_30,Hum_pre_15,Hum_pre_7,Hum_cont,Prec_pre_30,Prec_pre_15,Prec_pre_7,Prec_cont,remoteness,cause_code,state_code,fire_size_class_code
0,10.0,,C,Missing/Undefined,18.105072,-66.753044,PR,2/11/2007,,2,,,,1/12/2007,2007,Jan,785140,58917.69716,11603,1945,2018,12,1.0,785140-11603-2007.gz,24.480974,24.716923,24.902597,24.527961,4.341807,3.492857,3.262092,3.250413,78.21659,76.79375,76.381579,78.72437,0.0,0.0,0.0,0.0,0.017923,8,38,1
1,3.0,,B,Arson,35.03833,-87.61,TN,12/11/2006,,12,,,,11/11/2006,2006,Nov,723235,32665.34327,13896,1978,2020,15,0.1,723235-13896-2006.gz,7.553433,7.01,0.343529,10.448298,2.709764,2.881707,1.976471,2.12232,70.84,65.858911,55.505882,81.682678,59.8,8.4,0.0,86.8,0.184355,0,42,0


In [81]:
#export clean df
df.to_csv('wildfires_clean.csv', index=False)

In [82]:
pd.set_option('display.max_columns', df.shape[1])
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55367 entries, 0 to 55366
Data columns (total 44 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fire_size             55367 non-null  float64
 1   fire_name             25913 non-null  object 
 2   fire_size_class       55367 non-null  object 
 3   stat_cause_descr      55367 non-null  object 
 4   latitude              55367 non-null  float64
 5   longitude             55367 non-null  float64
 6   state                 55367 non-null  object 
 7   disc_clean_date       55367 non-null  object 
 8   cont_clean_date       27477 non-null  object 
 9   discovery_month       55367 non-null  int64  
 10  disc_date_final       28708 non-null  object 
 11  cont_date_final       25632 non-null  object 
 12  putout_time           27477 non-null  object 
 13  disc_date_pre         55367 non-null  object 
 14  disc_pre_year         55367 non-null  int64  
 15  disc_pre_month     

Index(['fire_size', 'fire_name', 'fire_size_class', 'stat_cause_descr',
       'latitude', 'longitude', 'state', 'disc_clean_date', 'cont_clean_date',
       'discovery_month', 'disc_date_final', 'cont_date_final', 'putout_time',
       'disc_date_pre', 'disc_pre_year', 'disc_pre_month', 'wstation_usaf',
       'dstation_m', 'wstation_wban', 'wstation_byear', 'wstation_eyear',
       'Vegetation', 'fire_mag', 'weather_file', 'Temp_pre_30', 'Temp_pre_15',
       'Temp_pre_7', 'Temp_cont', 'Wind_pre_30', 'Wind_pre_15', 'Wind_pre_7',
       'Wind_cont', 'Hum_pre_30', 'Hum_pre_15', 'Hum_pre_7', 'Hum_cont',
       'Prec_pre_30', 'Prec_pre_15', 'Prec_pre_7', 'Prec_cont', 'remoteness'],
      dtype='object')

In [83]:
X = df[['cause_code', 'state_code', 'fire_size_class_code', 'discovery_month','Vegetation',
       'fire_mag', 'Temp_pre_30', 'Temp_pre_15', 'Temp_pre_7',
       'Temp_cont', 'Wind_pre_30', 'Wind_pre_15', 'Wind_pre_7', 'Wind_cont',
       'Hum_pre_30', 'Hum_pre_15', 'Hum_pre_7', 'Hum_cont', 'Prec_pre_30',
       'Prec_pre_15', 'Prec_pre_7', 'Prec_cont', 'remoteness']]

y = df[['fire_size']]


print(X.shape)
print(y.shape)

(55367, 23)
(55367, 1)


### Correct workflow avoiding data leakage

In [84]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55367 entries, 0 to 55366
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   cause_code            55367 non-null  int8   
 1   state_code            55367 non-null  int8   
 2   fire_size_class_code  55367 non-null  int8   
 3   discovery_month       55367 non-null  int64  
 4   Vegetation            55367 non-null  int64  
 5   fire_mag              55367 non-null  float64
 6   Temp_pre_30           55367 non-null  float64
 7   Temp_pre_15           55367 non-null  float64
 8   Temp_pre_7            55367 non-null  float64
 9   Temp_cont             55367 non-null  float64
 10  Wind_pre_30           55367 non-null  float64
 11  Wind_pre_15           55367 non-null  float64
 12  Wind_pre_7            55367 non-null  float64
 13  Wind_cont             55367 non-null  float64
 14  Hum_pre_30            55367 non-null  float64
 15  Hum_pre_15         

In [85]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55367 entries, 0 to 55366
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   fire_size  55367 non-null  float64
dtypes: float64(1)
memory usage: 432.7 KB


In [86]:
# split first
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

# scale the test dataset
X_test = scaler.transform(X_test)
model1 = LinearRegression()

model1.fit(X_train, y_train)

yhat = model1.predict(X_test)

print("Train R-squared: ", model1.score(X_train,y_train))
print('Train MSE: %.2f'% mean_squared_error(y_train, model1.predict(X_train)))
print("Testing R-squared: ", model1.score(X_test,y_test))
print('Test MSE: %.2f'% mean_squared_error(y_test, model1.predict(X_test)))

Train R-squared:  0.22271571945437274
Train MSE: 157582076.12
Testing R-squared:  0.21225088211596466
Test MSE: 208885962.91


### Use a Pipeline to avoid data leakage during k-fold CV

In [87]:
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=55367, n_features=21, 
                           n_informative=15, n_redundant=5, 
                           random_state=7)

pipe = Pipeline(steps = 
                [('scaler', MinMaxScaler()),
                 ('model', LinearRegression())])

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

scores = cross_val_score(pipe, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

print('Accuracy: %.3f (%.3f)' % (-scores.mean(), scores.std()))

Accuracy: 0.102 (0.002)
