In [163]:
import numpy as np
import pandas as pd

import zipfile
import pandasql as ps

In [164]:
zf = zipfile.ZipFile('Data Scientist Ops Research Test 1.zip') 

orders = pd.read_csv(zf.open('Scientist Test/orders.csv'))
delivery = pd.read_csv(zf.open('Scientist Test/delivery_windows.csv'))
shopper = pd.read_csv(zf.open('Scientist Test/shopper_schedule.csv'))

### Understanding the Data

In [165]:
orders.head()

Unnamed: 0,order_id,shopper_id,delivery_window_id
0,892850508,5599879,29979371
1,904485576,3539999,30324692
2,916808463,5599879,30679690
3,916825164,3539999,30679691
4,915339858,4004300,30679684


In [166]:
orders.shape

(3430, 3)

In [167]:
orders.nunique()

order_id              3430
shopper_id             385
delivery_window_id    2089
dtype: int64

In [168]:
orders.isnull().sum()

order_id              0
shopper_id            0
delivery_window_id    0
dtype: int64

In [169]:
delivery.head()

Unnamed: 0,delivery_window_id,starts_at,market_name,neighborhood_name
0,30211212,2017-03-09T20:00:00Z,San Francisco,SoMa / Castro
1,29979371,2017-03-05T19:00:00Z,San Francisco,SoMa / Castro
2,29921412,2017-03-04T20:00:00Z,San Francisco,SoMa / Castro
3,29805494,2017-03-02T22:00:00Z,San Francisco,SoMa / Castro
4,30324692,2017-03-11T19:00:00Z,San Francisco,SoMa / Castro


In [170]:
delivery.shape

(8448, 4)

In [171]:
delivery.nunique()

delivery_window_id    8448
starts_at              768
market_name              3
neighborhood_name       11
dtype: int64

In [172]:
delivery.isnull().sum()

delivery_window_id    0
starts_at             0
market_name           0
neighborhood_name     0
dtype: int64

In [173]:
shopper.head()

Unnamed: 0,shopper_id,delivery_window_id,abandoned
0,3540919,30211212,False
1,3618452,30211212,False
2,3986107,30211212,False
3,4004300,30211212,False
4,4081488,30211212,False


All tables can be joined on delivery_window_id.

In [174]:
shopper.shape

(80494, 3)

In [175]:
shopper.nunique() #multiple records of ind shoppers

shopper_id             590
delivery_window_id    7849
abandoned                2
dtype: int64

In [176]:
shopper.isnull().sum()

shopper_id            0
delivery_window_id    0
abandoned             0
dtype: int64

In [177]:
shopper.abandoned.value_counts()

False    60450
True     20044
Name: abandoned, dtype: int64

# ETL

1. For every market and delivery window combination, **find the number of orders placed** 
and **number of unique shoppers who made themselves available to fulfill an order**.

In [178]:
part_one = ps.sqldf("select d.market_name, d.delivery_window_id, count(o.order_id) no_orders_placed, count(distinct s.shopper_id) no_unique_avail_shoppers from delivery d join orders o on d.delivery_window_id = o.delivery_window_id join shopper s on o.delivery_window_id = s.delivery_window_id group by 1,2")
part_one

Unnamed: 0,market_name,delivery_window_id,no_orders_placed,no_unique_avail_shoppers
0,New York NY,29678563,106,53
1,New York NY,29678564,98,49
2,New York NY,29678566,80,40
3,New York NY,29683435,8,8
4,New York NY,29683437,6,6
5,New York NY,29683509,40,40
6,New York NY,29683511,34,34
7,New York NY,29739874,44,44
8,New York NY,29739875,250,50
9,New York NY,29739876,159,53


2.	Now, add two columns to the previous table in (1): 

    a.	the number of unique shoppers who abandoned their scheduled delivery window (hint: use the abandoned field)
   
   b.	the number of unique shoppers who fulfilled an order in the delivery window

In order to complete this, will need to output 2 different tables:
1. delivery_window_id + shopper_id + abadoned = 1  (abandoned unique shoppers)
2. delivery_window_id + shopper_id + abadoned = 0 (fulfilled unique shoppers)
* Combine these two tables on delivery_window_id
* Get count of each unique shopper who was available but abandoned
* Get count of each unique shopper who was available but did not abandon (fulfilled)

In [179]:
abandoned = ps.sqldf("select d.delivery_window_id, count(distinct s.shopper_id) no_unique_abandoned_shoppers, s.abandoned from delivery d join orders o on d.delivery_window_id = o.delivery_window_id join shopper s on o.delivery_window_id = s.delivery_window_id where s.abandoned=1 group by 1,3")
abandoned

Unnamed: 0,delivery_window_id,no_unique_abandoned_shoppers,abandoned
0,29675515,4,1
1,29675516,5,1
2,29675517,5,1
3,29675518,3,1
4,29678563,17,1
5,29678564,15,1
6,29678566,11,1
7,29683435,1,1
8,29683437,2,1
9,29683509,13,1


In [180]:
fulfilled = ps.sqldf("select d.delivery_window_id, count(distinct s.shopper_id) no_unique_fulfilled_shoppers, s.abandoned from delivery d join orders o on d.delivery_window_id = o.delivery_window_id join shopper s on o.delivery_window_id = s.delivery_window_id where s.abandoned=0 group by 1,3")
fulfilled

Unnamed: 0,delivery_window_id,no_unique_fulfilled_shoppers,abandoned
0,29675515,5,0
1,29675516,2,0
2,29675517,2,0
3,29675518,3,0
4,29678563,36,0
5,29678564,34,0
6,29678566,29,0
7,29683435,7,0
8,29683437,4,0
9,29683509,27,0


In [181]:
#join three tables on delivery_window_id
final_etl  = ps.sqldf("select* from part_one po join abandoned a on po.delivery_window_id = a.delivery_window_id join fulfilled f on a.delivery_window_id=f.delivery_window_id order by 2")
final_etl

Unnamed: 0,market_name,delivery_window_id,no_orders_placed,no_unique_avail_shoppers,delivery_window_id.1,no_unique_abandoned_shoppers,abandoned,delivery_window_id.2,no_unique_fulfilled_shoppers,abandoned.1
0,Philadelphia,29675515,9,9,29675515,4,1,29675515,5,0
1,Philadelphia,29675516,14,7,29675516,5,1,29675516,2,0
2,Philadelphia,29675517,14,7,29675517,5,1,29675517,2,0
3,Philadelphia,29675518,6,6,29675518,3,1,29675518,3,0
4,New York NY,29678563,106,53,29678563,17,1,29678563,36,0
5,New York NY,29678564,98,49,29678564,15,1,29678564,34,0
6,New York NY,29678566,80,40,29678566,11,1,29678566,29,0
7,New York NY,29683435,8,8,29683435,1,1,29683435,7,0
8,New York NY,29683437,6,6,29683437,2,1,29683437,4,0
9,New York NY,29683509,40,40,29683509,13,1,29683509,27,0


In [182]:
final_etl = ps.sqldf("select market_name, delivery_window_id, no_orders_placed, no_unique_avail_shoppers, no_unique_abandoned_shoppers, no_unique_fulfilled_shoppers from final_etl")
final_etl

Unnamed: 0,market_name,delivery_window_id,no_orders_placed,no_unique_avail_shoppers,no_unique_abandoned_shoppers,no_unique_fulfilled_shoppers
0,Philadelphia,29675515,9,9,4,5
1,Philadelphia,29675516,14,7,5,2
2,Philadelphia,29675517,14,7,5,2
3,Philadelphia,29675518,6,6,3,3
4,New York NY,29678563,106,53,17,36
5,New York NY,29678564,98,49,15,34
6,New York NY,29678566,80,40,11,29
7,New York NY,29683435,8,8,1,7
8,New York NY,29683437,6,6,2,4
9,New York NY,29683509,40,40,13,27


In [183]:
final_etl.to_csv("etl.csv")