In [1]:
import pandas as pd

# Import Data

In [2]:
# Load the two weekly sales logs plus the customer and food lookup tables.
week1, week2, customers, foods = map(
    pd.read_csv,
    [
        'data/restaurant_week1_sales.csv',
        'data/restaurant_week2_sales.csv',
        'data/restaurant_customers.csv',
        'data/restaurant_foods.csv',
    ],
)

# Preview the first rows of each table; the extra '\n' argument spaces out the previews.
for preview_frame in (week1, week2, customers, foods):
    print(preview_frame.head(), '\n')

   Customer ID  Food ID
0          537        9
1           97        4
2          658        1
3          202        2
4          155        9 

   Customer ID  Food ID
0          688       10
1          813        7
2          495       10
3          189        5
4          267        3 

   ID First Name Last Name  Gender  Company                     Occupation
0   1     Joseph   Perkins    Male  Dynazzy  Community Outreach Specialist
1   2   Jennifer   Alvarez  Female     DabZ        Senior Quality Engineer
2   3      Roger     Black    Male  Tagfeed              Account Executive
3   4     Steven     Evans    Male     Fatz               Registered Nurse
4   5       Judy  Morrison  Female  Demivee                Legal Assistant 

   Food ID   Food Item  Price
0        1       Sushi   3.99
1        2     Burrito   9.99
2        3        Taco   2.99
3        4  Quesadilla   4.25
4        5       Pizza   2.49 



# Concat Method

In [3]:
# Stack the two weekly frames, which share the same columns, into one frame.
# ignore_index=True throws away each frame's row labels and builds a fresh index;
# leave it at its default (False) to keep the original labels.
sales = pd.concat([week1, week2], ignore_index=True)
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Customer ID  500 non-null    int64
 1   Food ID      500 non-null    int64
dtypes: int64(2)
memory usage: 7.9 KB


In [4]:
# Stack the weekly frames again, this time keeping each frame's own row labels.
# `keys` adds an outer index level that names the source frame of every row;
# supply exactly one key per frame being concatenated.
sales_org = pd.concat(
    [week1, week2],
    keys=['Week 1', 'Week 2'],
    ignore_index=False,
)
sales_org

Unnamed: 0,Unnamed: 1,Customer ID,Food ID
Week 1,0,537,9
Week 1,1,97,4
Week 1,2,658,1
Week 1,3,202,2
Week 1,4,155,9
...,...,...,...
Week 2,245,783,10
Week 2,246,556,10
Week 2,247,547,9
Week 2,248,252,9


In [5]:
# Select every row stored under the outer index label 'Week 1'.
# A single outer-level label is a plain string; no tuple is needed.
sales_org.loc['Week 1']

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
245,413,9
246,926,6
247,134,3
248,396,6


In [6]:
# extract a single record from a specified week
# .loc is label-based: the tuple is (outer key, row label carried over from the
# original frame), not an integer position
sales_org.loc[('Week 1', 20)]

Customer ID    549
Food ID          6
Name: (Week 1, 20), dtype: int64

# Inner Joins

In [32]:
# Inner join of the two weeks on customer: only customers present in both weeks remain.
# A customer with several orders in either week produces one row per W1/W2 order
# pairing (e.g. customer 155 shows up twice).
week1.merge(
    week2,
    on='Customer ID',
    how='inner',
    suffixes=[' - W1', ' - W2'],  # replaces the default '_x' / '_y'
).head()


Unnamed: 0,Customer ID,Food ID - W1,Food ID - W2
0,537,9,5
1,155,9,3
2,155,1,3
3,503,5,8
4,503,5,9


In [25]:
# Inner join on both columns: a row comes back only when the same customer
# ordered the same food in week 1 and in week 2.
# No suffixes are needed because every shared column is a join key.
week1.merge(week2, how='inner', on=['Customer ID', 'Food ID'])

Unnamed: 0,Customer ID,Food ID
0,304,3
1,540,3
2,937,10
3,233,3
4,21,4
5,21,4
6,922,1
7,578,5
8,578,5


# Outer Joins

In [35]:
# Full outer join on customer: keep every customer seen in either week.
# NaN in 'Food ID - W1' means no week 1 order; NaN in 'Food ID - W2' means no week 2 order.
outer_join = week1.merge(
    week2,
    on='Customer ID',
    how='outer',  # full outer join (merge itself defaults to 'inner')
    suffixes=[' - W1', ' - W2'],  # replaces the default '_x' / '_y'
    indicator=True,  # adds a '_merge' column naming the side(s) that supplied the row
)

outer_join.head()

Unnamed: 0,Customer ID,Food ID - W1,Food ID - W2,_merge
0,537,9.0,5.0,both
1,97,4.0,,left_only
2,658,1.0,,left_only
3,202,2.0,,left_only
4,155,9.0,3.0,both


In [36]:
# displaying _merge counts
# '_merge' is the indicator column added by indicator = True in the outer merge;
# this tallies how many rows are left_only, right_only, or both
outer_join['_merge'].value_counts()

right_only    197
left_only     195
both           62
Name: _merge, dtype: int64

In [38]:
# Keep only the rows that came from a single week (i.e. drop the 'both' matches).
# Edit the label list to keep a different subset.
merge_filter = outer_join['_merge'].isin(['left_only', 'right_only'])
outer_join.loc[merge_filter]

Unnamed: 0,Customer ID,Food ID - W1,Food ID - W2,_merge
1,97,4.0,,left_only
2,658,1.0,,left_only
3,202,2.0,,left_only
6,213,8.0,,left_only
7,600,1.0,,left_only
...,...,...,...,...
449,855,,4.0,right_only
450,559,,10.0,right_only
451,276,,4.0,right_only
452,556,,10.0,right_only


# Left Joins

In [48]:
# Left join: keep every week 1 order and attach the matching food name and price,
# then show the most expensive orders first.
# A right join is the mirror image (swap the two frames, or pass how='right').
(
    week1
    .merge(foods, how='left', on='Food ID')
    .sort_values('Price', ascending=False)
    .head()
)

Unnamed: 0,Customer ID,Food ID,Food Item,Price
156,250,7,Steak,24.99
149,673,7,Steak,24.99
107,418,7,Steak,24.99
80,433,7,Steak,24.99
110,816,7,Steak,24.99


# Left_On and Right_On Parameters

In [56]:
# The key columns are named differently on each side ('Customer ID' vs 'ID'),
# so each side's key is given explicitly with left_on / right_on.
# The right-hand 'ID' column is redundant next to 'Customer ID', so it is dropped.
(
    week2
    .merge(customers, how='left', left_on='Customer ID', right_on='ID')
    .drop(columns='ID')
    .head()
)

Unnamed: 0,Customer ID,Food ID,First Name,Last Name,Gender,Company,Occupation
0,688,10,Carl,Williamson,Male,Thoughtmix,Graphic Designer
1,813,7,Johnny,Walker,Male,Kayveo,Developer II
2,495,10,Deborah,Little,Female,Babbleblab,VP Accounting
3,189,5,Roger,Gordon,Male,Skilith,Operator
4,267,3,Matthew,Wood,Male,Agimba,Product Engineer


# Merging with Indexes

In [60]:
# Match the two weeks on their index labels instead of on a column.
# Column names shared by both frames receive the default '_x' / '_y' suffixes.
week1.merge(week2, left_index=True, right_index=True, how='left').head()

Unnamed: 0,Customer ID_x,Food ID_x,Customer ID_y,Food ID_y
0,537,9,688,10
1,97,4,813,7
2,658,1,495,10
3,202,2,189,5
4,155,9,267,3


In [62]:
# DataFrame.join matches on the index by default, so no key arguments are needed.
# NOTE(review): assumes the satisfaction file is row-aligned with week1 — confirm.
week1_sat = pd.read_csv('data/restaurant_week1_satisfaction.csv')

week1.join(week1_sat, how='left').head()

Unnamed: 0,Customer ID,Food ID,Satisfaction Rating
0,537,9,2
1,97,4,7
2,658,1,3
3,202,2,7
4,155,9,10
