# Setup Libraries and Environment

In [2]:
# Load necessary libraries and Account IDs
import boto3
import sagemaker
import pandas as pd

# A single boto3 session provides both the region name and the SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name="sagemaker", region_name=region)

# SageMaker session, the bucket it reports as its default, and the execution role.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Account ID of the calling identity, looked up through STS.
account_id = boto3.client("sts").get_caller_identity().get("Account")

# Name of the Athena database the queries in this notebook read from.
database_name = "insta_db"

from pyathena import connect

In [3]:
# Temporary S3 location used for Athena queries; it sits under the session's bucket.
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [4]:
# Open the Athena connection that every query cell below passes to pd.read_sql.
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region,
)

# Organize Data for Logistic Model

In [24]:
import pandas as pd

# Athena target: the database and the table to preview
database_name = "insta_db"
table_name = "order_products__train_parquet"

# SQL template; the two positional slots receive the database and table names
query_template = """
SELECT *
FROM {}.{} limit 20
"""
statement = query_template.format(database_name, table_name)

# Run the query over the open Athena connection and show the first rows
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1577369,20995,6,1
1,1577369,7113,7,0
2,1577369,5334,8,1
3,1577374,39475,1,0
4,1577374,22124,2,1


In [63]:
# Set Athena parameters
# table_name now names the table the aggregate actually reads (the prior-orders
# table); previously it pointed at the train table and was never substituted.
database_name = "insta_db"
table_name = "order_products_prior_parquet"

# SQL statement
# Aggregates order lines, then decorates each group with product, department and
# aisle names through left joins, and keeps the 10 rows with the highest
# proportion_reordered.
# NOTE(review): the groups are keyed on (product_id, order_id); every row in the
# recorded output has count = 1, so avg(reordered) looks like it is just the
# row's own flag. Confirm whether grouping by product_id alone (with the
# commented-out count filter) was intended.
# NOTE(review): ties on proportion_reordered are not broken by any further key,
# so which 10 rows come back may vary between runs.
statement = """
select t.*, p.product_name, d.department, a.aisle
from (
SELECT product_id 
, order_id
, count(*) as count
, avg(reordered) as proportion_reordered 
FROM {db}.{table}
group by product_id, order_id
) t
left join {db}.products_parquet p
on t.product_id = p.product_id
left join {db}.departments_parquet d
on p.department_id = d.department_id
left join {db}.aisles_parquet a
on p.aisle_id = a.aisle_id
--where count > 40
order by proportion_reordered DESC limit 10
""".format(
    db=database_name, table=table_name
)

# pandas is already imported in the setup cell; run the query and show the top rows
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,product_id,order_id,count,proportion_reordered,product_name,department,aisle
0,37250,1668699,1,1.0,Cut Green Beans,frozen,frozen produce
1,17630,1668697,1,1.0,YoKids Squeeze! Organic Strawberry Flavor Yogurt,dairy eggs,yogurt
2,18987,1668698,1,1.0,Hot Dog Buns,bakery,buns rolls
3,30384,1668699,1,1.0,Dark Caffe Verona Whole Bean Coffee,beverages,coffee
4,21472,1668699,1,1.0,Original Pepperoni Pizza,frozen,frozen pizza


In [86]:
# Set Athena parameters
database_name = "insta_db"
table_name = "orders_parquet"

# SQL statement
# orders   : orders of the two selected users plus `days`, a running total of
#            days_since_prior_order per user in order_number order (NULL -> 0).
# products : every prior order line with product, department and aisle names.
# Final select: for each (user, product) pair, `days` minus the previous `days`
# value in order_number order; coalesce turns the first occurrence (no previous
# row) into 0.
statement = """
with orders as(
select *
, sum(coalesce(days_since_prior_order,0)) over (partition by user_id order by order_number asc) as days
FROM {db}.{orders}
where user_id in( 191663, 198874 ) 
), products as (
select t.*, p.product_name, d.department, a.aisle
from (
SELECT *
FROM {db}.order_products_prior_parquet
) t
left join {db}.products_parquet p
on t.product_id = p.product_id
left join {db}.departments_parquet d
on p.department_id = d.department_id
left join {db}.aisles_parquet a
on p.aisle_id = a.aisle_id
)
select p.reordered
, p.product_name
, coalesce( days - lag(days) over (partition by o.user_id, p.product_id order by o.order_number) , 0) as days_since_previous_order
from products p
join orders o
on p.order_id = o.order_id
order by p.product_id,
o.order_number ASC
""".format(
    db=database_name, orders=table_name
)

# pandas is already imported in the setup cell; run the query and show the result
df = pd.read_sql(statement, conn)
df.head(150)

Unnamed: 0,reordered,product_name,days_since_previous_order
0,0,Organic Diced Tomatoes,0.0
1,0,Cuties Non Dairy Vanilla Frozen Dessert Sandwi...,0.0
2,0,Organic 2% Reduced Fat Milk,0.0
3,1,Organic 2% Reduced Fat Milk,30.0
4,1,Organic 2% Reduced Fat Milk,28.0
...,...,...,...
145,0,Fresh Cauliflower,0.0
146,0,Rich & Creamy Hershey's Dutch Cocoa Chocolate ...,0.0
147,0,Cuties Dairy Free Mint Chocolate Chip Frozen D...,0.0
148,0,Michigan Organic Kale,0.0


In [87]:
# Set Athena parameters
database_name = "insta_db"
table_name = "orders_parquet"

# SQL statement
# The database and table names are now substituted into the query; previously
# .format() was called on a string without placeholders and did nothing.
# NOTE(review): `limit` without an `order by` does not pin down which 30 rows
# come back.
statement = """
SELECT *
FROM {db}.{table} limit 30

""".format(
    db=database_name, table=table_name
)

# pandas is already imported in the setup cell.
df = pd.read_sql(statement, conn)
# The query returns at most 30 rows, so head(50) shows the whole frame.
df.head(50)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1400338,196422,prior,16,3,21,7.0
1,1959511,196422,prior,17,2,12,6.0
2,2827143,196422,prior,18,1,9,6.0
3,762489,196422,prior,19,1,6,14.0
4,1251610,196422,prior,20,6,8,5.0
5,2790326,196422,prior,21,5,13,6.0
6,2622429,196422,prior,22,2,12,4.0
7,498617,196422,prior,23,3,13,1.0
8,2975639,196422,prior,24,1,10,5.0
9,2923456,196422,prior,25,0,14,6.0


# Model Logistic Regression / Propensity Model

In [None]:
# Get Libraries
import pandas as pd
import numpy as np

# The package is spelled `statsmodels` (the previous `statesmodels` cannot be imported).
# NOTE: the setup cell bound `sm` to the SageMaker boto3 client; from this cell
# onward `sm` refers to statsmodels.api instead.
import statsmodels.api as sm
from scipy import stats


In [None]:
# Test Train Split Data 


In [None]:
# TODO: assign the training and test partitions here. The modelling cells below
# read the columns 'days', 'web' and 'RP' from both frames.
# Explicit placeholders keep the cell parseable; a bare `rp_train =` with no
# right-hand side is a SyntaxError.
rp_train = None
rp_test = None

In [None]:
# Train
# Predictors are the 'days' and 'web' columns of the training partition;
# add_constant adds the intercept column that Logit does not add on its own.
X = pd.DataFrame(rp_train[['days', 'web']])
X = sm.add_constant(X)
y = pd.DataFrame(rp_train[['RP']])  # Repeat Purchase
# Fit the logistic regression on the training partition and show its summary.
logreg01 = sm.Logit(y, X).fit()
logreg01.summary2()

In [None]:
# Test
# Build the design matrix from the test partition. The intercept column must be
# added to X_test itself; passing the training matrix X here would pair training
# predictors with the test target.
X_test = pd.DataFrame(rp_test[['days', 'web']])
X_test = sm.add_constant(X_test)
# NOTE: this rebinds `y`, replacing the training target defined in the Train cell.
y = pd.DataFrame(rp_test[['RP']])
# NOTE(review): this fits a second model on the test partition rather than
# scoring logreg01 on it; presumably the two summaries are meant to be compared.
# Confirm.
logreg01_test = sm.Logit(y, X_test).fit()
logreg01_test.summary2()