# Data preparation

In this notebook we preprocess our data for ML algorithms and create new features.

## Load and reshape data

Import libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime  
from datetime import timedelta  
import gc
import pickle
from sklearn import preprocessing, metrics
import pandas as pd
import matplotlib.pyplot as plt
#from sagemaker import get_execution_role
#import boto3
import seaborn as sns

Helper functions

In [8]:
# this function is taken from https://www.kaggle.com/ragnar123/very-fst-model
def reduce_mem_usage(df, verbose=True):
    '''Reduce memory usage of dataframe by converting ints and floats 
    Args:
        df: dataframe
            
    Returns:
        dataframe with converted columns
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


Read data from S3.

In [None]:
#role = get_execution_role()
#bucket='sagemaker-eu-central-1-594657351600'
#prefix = 'kaggle-m5/00_data'

In [None]:
print('Reading files...')
#data_location = 's3://{}/{}/{}'.format(bucket, prefix, 'calendar.csv')
data_location = "../01_preprocessed_data/X.pkl"
X = pd.read_pickle(data_location)

## Lag features

In [None]:
lags = [x for x in range(28, 28+7)]
windows = [7, 28]

lag_cols = [f"lag_{lag}" for lag in lags ]
for lag, lag_col in zip(lags, lag_cols):
    print('Lag {}'.format(lag))
    X[lag_col] = X.groupby("id")["demand"].shift(lag)

X = reduce_mem_usage(X)

for window in windows:
    for lag,lag_col in zip(lags, lag_cols):
        print("Lag {}, window {}".format(lag, window))
        X[f"rmean_{lag}_{window}"] = X[["id", lag_col]].groupby("id")[lag_col].\
        transform(lambda x : x.rolling(window).mean())
X = reduce_mem_usage(X)
# For each id drop early rows for which lag cannot be calculated 
    
#X = X.dropna()


Categorical features

In [None]:
X.to_pickle("../01_preprocessed_data/X_lags.pkl")

In [3]:
#X = pd.read_pickle("../01_preprocessed_data/X_lags.pkl")

# From train, validation, and test datasets

Split X into X_train and X_val

In [4]:
first_day = pd.Timestamp(X['date'].values.min())
n_days = 1913
last_day_val =  first_day + timedelta(days = n_days-1)
n_val_days = 28# round(n_days*val_size)

In [5]:
print("n_val_days {}".format(n_val_days))

n_val_days 28


In [6]:
first_val_day = first_day + timedelta(days = n_days - n_val_days)
first_val_day

Timestamp('2016-03-28 00:00:00')

In [9]:
X_train = X[X['date'] < first_val_day]
y_train = X_train[['demand']]
X_val = X[(X['date'] >= first_val_day) & (X['date'] <= last_day_val)]
y_val = X_val[['demand']]
X_test = X[X['date'] > last_day_val]
X_test = reduce_mem_usage(X_test)
X_train = reduce_mem_usage(X_train)
X_val = reduce_mem_usage(X_val)
X = reduce_mem_usage(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Mem. usage decreased to 144.11 Mb (0.6% reduction)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

Mem. usage decreased to 7668.51 Mb (0.0% reduction)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

Mem. usage decreased to 144.92 Mb (0.0% reduction)
Mem. usage decreased to 7958.35 Mb (0.0% reduction)


In [11]:
print("X dates:")
print(X['date'].values.min())
print(X['date'].values.max())
print((X['date'].values.max() - X['date'].values.min())/np.timedelta64(1, 'D'))

print("X_train dates:")
print(X_train['date'].values.min())
print(X_train['date'].values.max())
print((X_train['date'].values.max() - X_train['date'].values.min())/np.timedelta64(1, 'D'))

print("X_val dates:")
print(X_val['date'].values.min())
print(X_val['date'].values.max())
print((X_val['date'].values.max() - X_val['date'].values.min())/np.timedelta64(1, 'D'))


print("X_test dates:")
print(X_test['date'].values.min())
print(X_test['date'].values.max())
print((X_test['date'].values.max() - X_test['date'].values.min())//np.timedelta64(1, 'D'))

X dates:
2011-01-29T00:00:00.000000000
2016-05-22T00:00:00.000000000
1940.0
X_train dates:
2011-01-29T00:00:00.000000000
2016-03-27T00:00:00.000000000
1884.0
X_val dates:
2016-03-28T00:00:00.000000000
2016-04-24T00:00:00.000000000
27.0
X_test dates:
2016-04-25T00:00:00.000000000
2016-05-22T00:00:00.000000000
27


In [16]:
del X

### Sales pattern

In this part we determine demand_type of the time series, as it was done at https://github.com/Mcompetitions/M5-methods

Determine average demand interval.

In [12]:
X_train['ADI'] = X_train.groupby(['id'])['demand'].transform(lambda x: len(x)/sum(x > 0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Compute square of the Coefficient of Variation (CV²).

In [13]:
X_train['CV2'] = X_train.groupby(['id'])['demand'].transform(lambda x: (np.std(x[x>0])/np.mean(x[x>0]))**2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Calculate demand_type.

In [14]:
X_train['demand_type'] = ((X_train['ADI'] <= 1.32) & (X_train['CV2'] < 0.5))*1 + \
    ((X_train['ADI'] > 1.32) & (X_train['CV2'] < 0.5))*10 + \
    ((X_train['ADI'] <= 1.32) & (X_train['CV2'] >= 0.5))*100 + \
    ((X_train['ADI'] > 1.32) & (X_train['CV2'] > 0.5))*1000 
X_train['demand_type'] = X_train['demand_type'].replace({1:'smooth', 10:'intermittent', 100:'erratic', 1000:'lumpy'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [17]:
X_val = X_val.merge(X_train[['id', 'ADI', 'CV2', 'demand_type']], on = ['id'], how = 'left')
X_test = X_test.merge(X_train[['id', 'ADI', 'CV2', 'demand_type']], on = ['id'], how = 'left')


MemoryError: Unable to allocate 9.42 GiB for an array with shape (1264878636,) and data type int64

In [None]:
X_train = X_train.drop(columns = ['demand'])
X_val = X_val.drop(columns = ['demand'])
X_test = X_test.drop(columns = ['demand'])


X_val.to_pickle("../01_preprocessed_data/X_val.pkl")
y_val.to_pickle("../01_preprocessed_data/y_val.pkl")
X_test.to_pickle("../01_preprocessed_data/X_test.pkl")
X_train.to_pickle("../01_preprocessed_data/X_train.pkl")
y_train.to_pickle("../01_preprocessed_data/y_train.pkl")
del X_test
del X_val, y_val
del X_train, y_train


In [None]:
X = X.drop(columns = ['wm_yr_wk'])
categorical_features = ['id', 'weekday', 'month' ,'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 
                        'event_name_1', 'event_type_1',  'event_name_2', 'event_type_2', 
                        'demand_type', 'cheaper_than_usual']
for c in categorical_features:
    col_type = X[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        X[c] = X[c].astype('category')


## Mean demand

Most ids have a very low mean demand. 

In [None]:
average_demand_per_id = X.groupby(['id'])['demand'].agg(['mean'])

The following items have the highest average demand

In [None]:
average_demand_per_id.sort_values(by='mean').tail()

In [None]:
average_demand_per_id_hist = sns.distplot(average_demand_per_id['mean'], kde=False)
average_demand_per_id_hist.set_title('Histogram of mean demand per id')
average_demand_per_id_hist.set_xlabel('mean demand')

### Sales pattern

In this part we determine demand_type of the time series, as it was done at https://github.com/Mcompetitions/M5-methods

Determine average demand interval.

In [None]:
X['ADI'] = X.groupby(['id'])['demand'].transform(lambda x: len(x)/sum(x > 0))

Compute square of the Coefficient of Variation (CV²).

In [None]:
X['CV2'] = X.groupby(['id'])['demand'].transform(lambda x: (np.std(x[x>0])/np.mean(x[x>0]))**2)

Calculate demand_type.

In [None]:
X['demand_type'] = ((X['ADI'] <= 1.32) & (X['CV2'] < 0.5))*1 + ((X['ADI'] > 1.32) & (X['CV2'] < 0.5))*10 + \
((X['ADI'] <= 1.32) & (X['CV2'] >= 0.5))*100  + ((X['ADI'] > 1.32) & (X['CV2'] > 0.5))*1000 
X['demand_type'] = X['demand_type'].replace({1:'smooth', 10:'intermittent', 100:'erratic', 1000:'lumpy'})

### Aggregations

Add average sales per week per item, per month per item and simply average sales per item. Add maximum items sold.

In [None]:
X['avg_weekday_demand'] = X.groupby(['id', 'weekday'])['demand'].transform('mean') 
X['avg_demand'] =  X.groupby(['id'])['demand'].transform('mean') 
X['max_demand'] =  X.groupby(['id'])['demand'].transform('max') 


Add quantiles

In [None]:
X['quantile025_week'] = X.groupby(['id', 'weekday'])['demand'].transform(lambda x: x.quantile(0.25))

Add maximum of weekday demand.

In [None]:
X['max_weekday_demand'] = X.groupby(['id', 'weekday'])['demand'].transform('max') 

Add average price per item

In [None]:
X['avg_price'] = X.groupby(['id'])['sell_price'].transform('mean') 
X['cheaper_than_usual'] = ( X['sell_price'] < X['avg_price'])

In [None]:
X.head()

### Test set

Reshape test test to long format.

In [None]:
test1_rows = [row for row in submission['id'] if 'validation' in row]
test2_rows = [row for row in submission['id'] if 'evaluation' in row]
test1 = submission[submission['id'].isin(test1_rows)]
test2 = submission[submission['id'].isin(test2_rows)]

In [None]:
test_dates = calendar.loc[1913:1940, 'date']
column_names = test_dates.dt.strftime('%Y-%m-%d').to_list()
column_names.insert(0,'id' )
test1.columns = column_names

In [None]:
test1 = pd.melt(test1, id_vars = ['id'], var_name = 'date')
test1 = test1.drop(columns = 'value')
test1['date'] =  pd.to_datetime(test1['date']) 
test1.head()

In [None]:
test1 = test1.merge(calendar[calendar_features], on = 'date', how = 'left')
test1.head()

In [None]:
X = reduce_mem_usage(X)
gc.collect()

In [None]:
X.columns

In [None]:
temp = X[['id',  'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'max_demand', 'avg_demand',
          'start_date', 'start_date_from_start', 'avg_price',  'ADI', 'CV2', 'demand_type']].drop_duplicates()
test1 = test1.merge(temp, on = ['id'], how = 'left')
test1['item_id'].unique()

In [None]:
test1 = test1.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')

In [None]:
test1['days_from_start'] = test1['date'] - first_day
test1['days_from_start'] = test1['days_from_start'].dt.days

In [None]:
temp = X[['id',  'weekday', 'avg_weekday_demand', 'quantile025_week', 'max_weekday_demand']].drop_duplicates()
test1 = test1.merge(temp, on = ['id', 'weekday'], how = 'left')
test1['item_id'].unique()

In [None]:
test1['cheaper_than_usual'] = ( test1['sell_price'] < test1['avg_price'])
test1['demand'] = 0
X.columns[~X.columns.isin(test1.columns)]

In [None]:
test1 = test1[X.columns]

In [None]:
last_X_day = X.date.values.max()

In [None]:
X = pd.concat([X, test1], axis = 0)

In [None]:
first_day

### Save by demand type

In [None]:
X[X['demand_type'] == 'smooth'].to_pickle("../01_preprocessed_data/X_smooth.pkl")

In [None]:
X[X['demand_type'] == 'erratic'].to_pickle("../01_preprocessed_data/X_erratic.pkl")

In [None]:
X[X['demand_type'] == 'lumpy'].to_pickle("../01_preprocessed_data/X_lumpy.pkl")

In [None]:
X[X['demand_type'] == 'intermittent'].to_pickle("../01_preprocessed_data/X_intermittent.pkl")

In [None]:
del X

In [None]:
del test1

### Lag features and categorical features

In [None]:
categorical_features = ['id', 'weekday', 'month' ,'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 
                        'event_name_1', 'event_type_1',  'event_name_2', 'event_type_2', 
                        'demand_type', 'cheaper_than_usual']

In [None]:
for i in ['smooth', 'erratic', 'lumpy', 'intermittent']:
    print(i)
    X = pd.read_pickle("../01_preprocessed_data/X_{}.pkl".format(i))
    if i == 'intermittent':
        lags = [x for x in range(28, 28+14)]
        windows = [7, 28]
    
    else:
        lags = [x for x in range(28, 28+30)]
        windows = [7, 14, 28]
    
        
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        print('Lag {}'.format(lag))
        X[lag_col] = X.groupby("id")["demand"].shift(lag)
    X = X[X.date > X.start_date + timedelta(days = 41)]
    X = reduce_mem_usage(X)
    lags = [28]
    
    for window in windows:
        for lag,lag_col in zip(lags, lag_cols):
            print("Lag {}, window {}".format(lag, window))
            X[f"rmean_{lag}_{window}"] = X[["id", lag_col]].groupby("id")[lag_col].\
            transform(lambda x : x.rolling(window).mean())
    X = reduce_mem_usage(X)
    # For each id drop early rows for which lag cannot be calculated 
    X = X.dropna()
    X = X.drop(columns = ['wm_yr_wk'])
    for c in categorical_features:
        col_type = X[c].dtype
        if col_type == 'object' or col_type.name == 'category':
            X[c] = X[c].astype('category')
    X.to_pickle("../01_preprocessed_data/X_{}.pkl".format(i))

### Categorical features

Display boxplots for demand per weekday for three ids with the highest average demand. Hiher demand over the weekend can be noticed for these ids.

In [None]:
#sns.boxplot(x="weekday", y="demand", data=X.loc[X['id'] == 'FOODS_3_090_CA_3_validation',  ['weekday', 'demand']])

In [None]:
#sns.boxplot(x="weekday", y="demand", data=X.loc[X['id'] == 'FOODS_3_586_TX_3_validation',  ['weekday', 'demand']])

In [None]:
#sns.boxplot(x="weekday", y="demand", data=X.loc[X['id'] == 'FOODS_3_586_TX_2_validation',  ['weekday', 'demand']])

Display boxplots for demand per event type for three ids with the highest average demand. 

In [None]:
#sns.boxplot(x="event_type_1", y="demand", data=X.loc[X['id'] == 'FOODS_3_586_TX_3_validation', 
#                                                     ['event_type_1', 'demand']])

During sporting events FOODS_3_586_TX_3_validation tends to have a slightly higher demand than for other event types.

In [None]:
#sns.boxplot(x="event_type_1", y="demand", data=X.loc[X['id'] == 'FOODS_3_586_TX_2_validation', 
#                                                     ['event_type_1', 'demand']])

In [None]:
#sns.boxplot(x="event_type_1", y="demand", data=X.loc[X['id'] == 'FOODS_3_090_CA_3_validation', 
#                                                     ['event_type_1', 'demand']])

Display boxplots for demand per demand type for three ids with the highest average demand. 

In [None]:
#X.to_pickle("../01_preprocessed_data/X.pkl")

# From train, validation, and test datasets

Split X into X_train and X_val

In [None]:
n_days = last_day-first_day
n_days = n_days.days
n_val_days = 28# round(n_days*val_size)

In [None]:
print("n_val_days {}".format(n_val_days))

In [None]:
first_val_day = first_day + timedelta(days = n_days - n_val_days + 1)
first_val_day

In [None]:
for i in ['smooth', 'erratic', 'lumpy', 'intermittent']:
    print(i)
    X = pd.read_pickle("../01_preprocessed_data/X_{}.pkl".format(i))
    X_train = X[X['date'] < first_val_day]
    y_train = X_train[['demand']]
    X_val = X[(X['date'] >= first_val_day) & (X['date'] <= last_X_day)]
    y_val = X_val[['demand']]
    X_test = X[X['date'] > last_X_day]
    X_test = reduce_mem_usage(X_test)
    X_train = reduce_mem_usage(X_train)
    X_val = reduce_mem_usage(X_val)
    X = reduce_mem_usage(X)
    X_train = X_train.drop(columns = ['demand'])
    X_val = X_val.drop(columns = ['demand'])
    X_test = X_test.drop(columns = ['demand'])
    X_val.to_pickle("../01_preprocessed_data/X_{}_val.pkl".format(i))
    y_val.to_pickle("../01_preprocessed_data/y_{}_val.pkl".format(i))
    del X_val, y_val
    X_test.to_pickle("../01_preprocessed_data/X_{}_test.pkl".format(i))
    del X_test
    X_train.to_pickle("../01_preprocessed_data/X_{}_train.pkl".format(i))
    y_train.to_pickle("../01_preprocessed_data/y_{}_train.pkl".format(i))
    del X_train, y_train

Do not include date, start_date and demand as features.

In [None]:
#not_features = ['date', 'start_date', 'demand']
#numerical_features = X.columns[~X.columns.isin(categorical_features + not_features)]
#new_features = X.columns[~X.columns.isin(not_features)]

In [None]:
#X_train = X_train.drop(columns = ['demand'])
#X_val = X_val.drop(columns = ['demand'])
#X_test = X_test.drop(columns = ['demand'])


Form test dataset

In [None]:
#X_test = reduce_mem_usage(X_test)
#X_train = reduce_mem_usage(X_train)
#X_val = reduce_mem_usage(X_val)
#X = reduce_mem_usage(X)

Save datasets

In [None]:
X_val[new_features].to_pickle("../01_preprocessed_data/X_val.pkl")
y_val.to_pickle("../01_preprocessed_data/y_val.pkl")
del X_val, y_val

In [None]:
X_test[new_features].to_pickle("../01_preprocessed_data/X_test.pkl")
del X_test

In [None]:
X_train[new_features].to_pickle("../01_preprocessed_data/X_train.pkl")
y_train.to_pickle("../01_preprocessed_data/y_train.pkl")
del X_train, y_train