## Load Required Libraries

In [2]:
# Import libraries, configure the S3 locations, and open the Athena connection.
import boto3
import sagemaker
import pandas as pd
import numpy as np
try:
    from pyathena import connect
except:
    !pip install --disable-pip-version-check -q PyAthena==2.1.0
    from pyathena import connect
try:
    import awswrangler as wr   
except:
    !pip install awswrangler
    import awswrangler as wr   
    
# SageMaker session and execution context.
sess = sagemaker.Session()
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

# The session's default bucket is looked up first, but the project bucket
# assigned right after it is the one every path below is built from.
bucket = sess.default_bucket()
bucket = 'ads508team7'

# S3 locations and Athena settings used by the rest of the notebook.
database_name  = 'sys'
raw_file_dir   = f"s3://{bucket}/raw_files"
s3_staging_dir = f"s3://{bucket}/athena/staging"
main_set_path  = f's3://{bucket}/main_set/pr_tr.csv'

# Athena connection; query results are staged under s3_staging_dir.
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)

In [3]:
# Read the raw orders table from S3 and preview its first rows.
orders = wr.s3.read_csv(f'{raw_file_dir}/orders/orders.csv')
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_phase_of_day
0,2539329,1,prior,1,2,8,,2
1,2398795,1,prior,2,3,7,15.0,2
2,473747,1,prior,3,3,12,21.0,4
3,2254736,1,prior,4,4,7,29.0,2
4,431534,1,prior,5,4,15,28.0,5


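## Convert order_hour_of_day to a coarser category
### Instead of 24 categories, there will be 8 three-hour phases (Early Birds, Early Morning, Late Morning, Early Afternoon, Late Afternoon, Early Night, Late Night, After Midnight)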

In [4]:
def convert_hour_to_part_of_day(hour):
    """Map an hour of the day to one of eight three-hour phase codes.

    Args:
        hour: Numeric hour; values in [0, 24) are mapped.

    Returns:
        The phase code (1-8) of the three-hour window containing ``hour``,
        or None when ``hour`` lies outside [0, 24). NaN also yields None
        because every comparison against NaN is false.
    """
    # (first hour inclusive, last hour exclusive, phase code)
    phases = (
        (3, 6, 1),     # 3AM to 6AM     Early Birds
        (6, 9, 2),     # 6AM to 9AM     Early Morning
        (9, 12, 3),    # 9AM to 12PM    Late Morning
        (12, 15, 4),   # 12PM to 3PM    Early Afternoon
        (15, 18, 5),   # 3PM to 6PM     Late Afternoon
        (18, 21, 6),   # 6PM to 9PM     Early Night
        (21, 24, 7),   # 9PM to 12AM    Late Night
        (0, 3, 8),     # 12AM to 3AM    After Midnight
    )
    for start, stop, code in phases:
        if start <= hour < stop:
            return code
    return None
    
# Derive the phase-of-day code for every order.
orders['order_phase_of_day'] = orders['order_hour_of_day'].apply(convert_hour_to_part_of_day)

# Fill hours that could not be mapped with the most frequent phase.
# mode() returns a Series; handing it to fillna aligns on the index, so only
# the row(s) labelled like the mode result would be filled. Use the scalar.
phase_mode = orders['order_phase_of_day'].mode()
if not phase_mode.empty:
    orders['order_phase_of_day'] = orders['order_phase_of_day'].fillna(phase_mode.iloc[0])

# A missing value marks a user's first order (no prior order): use 0 days.
# Assign the result back rather than relying on an in-place fill of a
# column selection.
orders['days_since_prior_order'] = orders['days_since_prior_order'].fillna(0)


save_to_path = f's3://{bucket}/ready/orders/orders.csv'
wr.s3.to_csv(orders, save_to_path, index = False)

{'paths': ['s3://ads508team7/ready/orders/orders.csv'],
 'partitions_values': {}}

In [5]:
# Read the raw aisles table from S3 and preview its first rows.
aisles = wr.s3.read_csv(f'{raw_file_dir}/aisles/aisles.csv')
aisles.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [6]:
# Aisle names treated as perishable goods.
perishable_cols = [
    'prepared soups salads', 'specialty cheeses', 'packaged meat',
    'bakery desserts', 'fresh pasta',
    'prepared meals', 'tofu meat alternatives', 'packaged seafood',
    'fresh herbs', 'packaged cheese',
    'fresh fruits', 'latino foods', 'refrigerated',
    'packaged produce', 'kosher foods', 'frozen meat seafood',
    'poultry counter', 'butter', 'ice cream ice',
    'frozen meals', 'seafood counter', 'frozen vegan vegetarian',
    'buns rolls', 'packaged poultry', 'fruit vegetable snacks',
    'preserved dips spreads', 'frozen breakfast', 'cream',
    'frozen breads doughs', 'cookies cakes', 'asian foods',
    'fresh dips tapenades', 'refrigerated pudding desserts',
    'indian foods', 'frozen pizza', 'fresh vegetables', 'milk',
    'eggs', 'breakfast bakery', 'lunch meat', 'juice nectars',
    'hot dogs bacon sausage', 'other creams cheeses', 'pickled goods olives',
    'bread', 'frozen juice', 'frozen produce',
    'frozen dessert', 'yogurt', 'meat counter',
    'packaged vegetables fruits', 'tortillas flat bread', 'frozen appetizers sides',
]

# 1 when the aisle name is in the perishable list, 0 otherwise.
aisles['perishable'] = aisles['aisle'].isin(perishable_cols).astype('int64')


save_to_path = f's3://{bucket}/ready/aisles/aisles.csv'
wr.s3.to_csv(aisles, save_to_path, index = False)

{'paths': ['s3://ads508team7/ready/aisles/aisles.csv'],
 'partitions_values': {}}

## Create a feature to reflect whether a product is organic

In [7]:
# Read the raw products table from S3 and preview its first rows.
products = wr.s3.read_csv(f'{raw_file_dir}/products/products.csv')
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [8]:
# Flag products whose name mentions "organic" (case-insensitive).
# Computed in one pass so the column never holds a mix of strings and ints;
# products with a missing name are flagged 0 instead of being left as NaN.
products['is_organic'] = (
    products['product_name']
    .str.lower()
    .str.contains('organic', regex=False, na=False)
    .astype('int64')
)

save_to_path = f's3://{bucket}/ready/products/products.csv'
# Write the products frame; the earlier version wrote `aisles` to this path.
wr.s3.to_csv(products, save_to_path, index = False)

products.head(100)

Unnamed: 0,product_id,product_name,aisle_id,department_id,is_organic
0,1,Chocolate Sandwich Cookies,61,19,0
1,2,All-Seasons Salt,104,13,0
2,3,Robust Golden Unsweetened Oolong Tea,94,7,0
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,0
4,5,Green Chile Anytime Sauce,5,13,0
...,...,...,...,...,...
95,96,Sprinklez Confetti Fun Organic Toppings,97,13,1
96,97,Organic Chamomile Lemon Tea,94,7,1
97,98,2% Yellow American Cheese,2,16,0
98,99,Local Living Butter Lettuce,83,4,0


In [9]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
  
def place_new_features(df, refit=False):
    """Attach order, product and aisle features and one-hot encode ``department``.

    Uses the notebook-level frames ``orders``, ``products`` and ``aisles`` and
    the shared ``enc`` encoder.

    Args:
        df: Frame with at least ``order_id``, ``product_id`` and ``department``
            columns.
        refit: When True, re-fit the shared encoder on ``df`` even if it has
            already been fitted. By default the encoder is fitted only on the
            first frame it sees, so every later frame gets the same ``dept_*``
            columns in the same order (unseen departments encode as all
            zeros via ``handle_unknown='ignore'``).

    Returns:
        A new frame with ``user_id``, ``days_since_prior_order``,
        ``order_phase_of_day``, ``is_organic``, ``perishable`` and the
        ``dept_<i>`` indicator columns added, and ``aisle_id`` and
        ``department`` removed.
    """
    orders_cols = ['user_id', 'order_id','days_since_prior_order','order_phase_of_day']
    df = df.join(orders[orders_cols].set_index('order_id'), on='order_id')

    products_cols = ['product_id','aisle_id','is_organic']
    df = df.join(products[products_cols].set_index('product_id'), on='product_id')

    aisles_cols = ['aisle_id','perishable']
    df = df.join(aisles[aisles_cols].set_index('aisle_id'), on='aisle_id')

    # Fit once and reuse: fitting on every dataset separately can give the
    # train/test/validate files different one-hot columns.
    if refit or not hasattr(enc, 'categories_'):
        enc.fit(df[['department']])
    encoded = enc.transform(df[['department']]).toarray()

    # Name the indicator columns by encoder position and reuse df's index so
    # the rows line up even when df does not carry a default RangeIndex.
    dept_cols = [f'dept_{i}' for i in range(encoded.shape[1])]
    enc_df = pd.DataFrame(encoded, index=df.index, columns=dept_cols)
    df = pd.concat([df, enc_df], axis=1)

    # Indicators and the day count are stored as 32-bit integers.
    cols_to_int = dept_cols + ['days_since_prior_order']
    df[cols_to_int] = df[cols_to_int].astype('int32')

    # The raw categorical and the join key are no longer needed.
    return df.drop(['aisle_id', 'department'], axis=1)

In [None]:
def add_new_features(df_info):
    """Read one dataset from S3, add the engineered features, and write it back.

    Args:
        df_info: Dict with the keys ``df_name`` (label printed with the result
            shape), ``current_path`` (CSV to read) and ``ready_path`` (CSV to
            write).
    """
    enriched = place_new_features(wr.s3.read_csv(df_info['current_path']))
    wr.s3.to_csv(enriched, df_info['ready_path'], index=False)
    print(df_info['df_name'], enriched.shape)
    
# One entry per dataset: a label, the CSV to read, and the CSV to write.
df_dict = [
    {
        'df_name': 'train',
        'current_path': f's3://{bucket}/balanced/balanced.csv',
        'ready_path': f's3://{bucket}/ready/train/train.csv',
    },
    {
        'df_name': 'test',
        'current_path': f's3://{bucket}/split/test/test.csv',
        'ready_path': f's3://{bucket}/ready/test/test.csv',
    },
    {
        'df_name': 'validate',
        'current_path': f's3://{bucket}/split/validate/validate.csv',
        'ready_path': f's3://{bucket}/ready/validate/validate.csv',
    },
]

# Add the engineered features to each dataset in turn.
for df_info in df_dict:
    add_new_features(df_info)

train (509880, 30)


Prepare data for JumpStart modeling

In [None]:
# Identifier columns: kept under their own names and left out of the
# training-only mapping.
non_features = ['user_id', 'order_id','product_id','add_to_cart_order']

# Column names of a prepared dataset, in file order.
columns = wr.s3.read_csv(f's3://{bucket}/ready/validate/validate.csv').columns.tolist()

# rename_list maps every column to its new name: identifiers are unchanged,
# `reordered` becomes Target, and every other column becomes Feature_<n>,
# numbered in column order.
# keep_for_training_cols is the same mapping without the identifier columns.
rename_list = {}
keep_for_training_cols = {}
feature_count = 0
for col in columns:
    if col in non_features:
        rename_list[col] = col
        continue
    if col == 'reordered':
        new_name = 'Target'
    else:
        feature_count += 1
        new_name = f'Feature_{feature_count}'
    rename_list[col] = new_name
    keep_for_training_cols[col] = new_name

class df_save():
    """Load a prepared CSV from S3, select and rename columns, and save the result.

    All of the work happens in the constructor. Every keyword argument is
    stored as an instance attribute; the constructor reads ``df_path``,
    ``rename_list``, ``bucket``, ``folder``, ``df_name`` and ``header``.
    """
    def __init__(self, **kwargs):
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)

        self.df_org = wr.s3.read_csv(self.df_path)
        # NOTE(review): indexing with the mapping presumably selects the
        # columns named by its keys - confirm for the pandas version in use.
        self.df = self.df_org[self.rename_list]
        self.df_save = self.df.rename(columns=self.rename_list, errors='raise')

        save_to_path = 's3://{}/{}/{}/data.csv'.format(self.bucket, self.folder, self.df_name)
        wr.s3.to_csv(self.df_save, save_to_path, index=False, header=self.header)
        print(self.df_name, self.folder, self.df_save.shape)

# Write two variants of every prepared dataset.
for df_info in df_dict:
    # All columns, renamed, with a header row.
    df_save(
        df_path=df_info['ready_path'],
        folder='AfterFeatureEngr',
        bucket=bucket,
        df_name=df_info['df_name'],
        rename_list=rename_list,
        header=True,
    )
    # Target and feature columns only, without a header row.
    df_save(
        df_path=df_info['ready_path'],
        folder='jump_start',
        bucket=bucket,
        df_name=df_info['df_name'],
        rename_list=keep_for_training_cols,
        header=False,
    )