## 10.1 Introducing the Data Sets

In [1]:
import pandas as pd

In [2]:
# Load the first Meetup groups CSV into a DataFrame, then preview its
# first five rows.
groups1 = pd.read_csv(
    "meetup/groups1.csv"
)
groups1.head(n = 5)

Unnamed: 0,group_id,name,category_id,city_id
0,6388,Alternative Health NYC,14,10001
1,6510,Alternative Energy Meetup,4,10001
2,8458,NYC Animal Rights,26,10001
3,8940,The New York City Anime Group,29,10001
4,10104,NYC Pit Bull Group,26,10001


In [3]:
# Load the second Meetup groups CSV into a DataFrame, then preview its
# first five rows.
groups2 = pd.read_csv(
    "meetup/groups2.csv"
)
groups2.head(n = 5)

Unnamed: 0,group_id,name,category_id,city_id
0,18879327,BachataMania,5,10001
1,18880221,Photoshoot Chicago - Photography and Modeling ...,27,60601
2,18880426,Chicago Adult Push / Kick Scooter Group Riding...,31,60601
3,18880495,Chicago International Soccer Club,32,60601
4,18880695,Impact.tech San Francisco Meetup,2,94101


In [4]:
# Load the category lookup table (category_id -> category_name) and
# preview its first five rows.
categories = pd.read_csv(
    "meetup/categories.csv"
)
categories.head(n = 5)

Unnamed: 0,category_id,category_name
0,1,Arts & Culture
1,3,Cars & Motorcycles
2,4,Community & Environment
3,5,Dancing
4,6,Education & Learning


In [5]:
# Preview the cities file as parsed with pandas' default type inference;
# the result is displayed only and not kept in a variable.
pd.read_csv(
    "meetup/cities.csv"
).head(n = 5)

Unnamed: 0,id,city,state,zip
0,7093,West New York,NJ,7093
1,10001,New York,NY,10001
2,13417,New York Mills,NY,13417
3,46312,East Chicago,IN,46312
4,56567,New York Mills,MN,56567


In [6]:
# Read the cities file again, this time forcing the "zip" column to the
# string dtype instead of letting pandas infer a type for it.
# NOTE(review): presumably this is to keep leading zeros in ZIP codes --
# confirm against the raw CSV.
cities = pd.read_csv(
    "meetup/cities.csv",
    dtype = {"zip": "string"},
)
cities.head(n = 5)

Unnamed: 0,id,city,state,zip
0,7093,West New York,NJ,7093
1,10001,New York,NY,10001
2,13417,New York Mills,NY,13417
3,46312,East Chicago,IN,46312
4,56567,New York Mills,MN,56567


## 10.2 Concatenating Data Sets

In [7]:
# Stack the two groups DataFrames on top of each other. Each one keeps
# its own index labels, so labels repeat across the two halves.
pd.concat(
    [groups1, groups2]
)

Unnamed: 0,group_id,name,category_id,city_id
0,6388,Alternative Health NYC,14,10001
1,6510,Alternative Energy Meetup,4,10001
2,8458,NYC Animal Rights,26,10001
3,8940,The New York City Anime Group,29,10001
4,10104,NYC Pit Bull Group,26,10001
...,...,...,...,...
8326,26377464,Shinect,34,94101
8327,26377698,The art of getting what you want [conference s...,14,94101
8328,26378067,Streeterville Running Group,9,60601
8329,26378128,Just Dance NYC,23,10001


In [8]:
# Number of rows in the first groups DataFrame.
groups1.shape[0]

7999

In [9]:
# Number of rows in the second groups DataFrame.
groups2.shape[0]

8331

In [10]:
# Combined row count of both groups DataFrames; the concatenated result
# should have exactly this many rows.
groups1.shape[0] + groups2.shape[0]

16330

In [11]:
# Stack the two groups DataFrames and discard their original index
# labels, so the result gets a fresh 0..n-1 index.
pd.concat(
    [groups1, groups2],
    ignore_index = True,
)

Unnamed: 0,group_id,name,category_id,city_id
0,6388,Alternative Health NYC,14,10001
1,6510,Alternative Energy Meetup,4,10001
2,8458,NYC Animal Rights,26,10001
3,8940,The New York City Anime Group,29,10001
4,10104,NYC Pit Bull Group,26,10001
...,...,...,...,...
16325,26377464,Shinect,34,94101
16326,26377698,The art of getting what you want [conference s...,14,94101
16327,26378067,Streeterville Running Group,9,60601
16328,26378128,Just Dance NYC,23,10001


In [12]:
# Stack the two groups DataFrames and tag each row with the key of the
# DataFrame it came from ("G1" or "G2"), producing a two-level index.
pd.concat(
    [groups1, groups2],
    keys = ["G1", "G2"],
)

Unnamed: 0,Unnamed: 1,group_id,name,category_id,city_id
G1,0,6388,Alternative Health NYC,14,10001
G1,1,6510,Alternative Energy Meetup,4,10001
G1,2,8458,NYC Animal Rights,26,10001
G1,3,8940,The New York City Anime Group,29,10001
G1,4,10104,NYC Pit Bull Group,26,10001
...,...,...,...,...,...
G2,8326,26377464,Shinect,34,94101
G2,8327,26377698,The art of getting what you want [conference s...,14,94101
G2,8328,26378067,Streeterville Running Group,9,60601
G2,8329,26378128,Just Dance NYC,23,10001


In [13]:
# Keep the combined groups (with a fresh sequential index) for the
# merge examples that follow.
groups = pd.concat(
    [groups1, groups2],
    ignore_index = True,
)

### 10.2.1 Missing Values in Combined DataFrames

In [14]:
# Football and baseball champions, indexed by year (2017 and 2018).
# Built column-by-column; the dict's key order fixes the column order.
sports_champions_A = pd.DataFrame(
    data = {
        "Football": ["New England Patriots", "Philadelphia Eagles"],
        "Baseball": ["Houston Astros", "Boston Red Sox"],
    },
    index = [2017, 2018],
)

sports_champions_A

Unnamed: 0,Football,Baseball
2017,New England Patriots,Houston Astros
2018,Philadelphia Eagles,Boston Red Sox


In [15]:
# Football and hockey champions, indexed by year (2019 and 2020).
# Shares only the "Football" column with sports_champions_A.
sports_champions_B = pd.DataFrame(
    data = {
        "Football": ["New England Patriots", "Kansas City Chiefs"],
        "Hockey": ["St. Louis Blues", "Tampa Bay Lightning"],
    },
    index = [2019, 2020],
)

In [16]:
# Stack A on top of B. Columns present in only one of them ("Baseball",
# "Hockey") are filled with missing values for the other's rows.
pd.concat(
    [sports_champions_A, sports_champions_B]
)

Unnamed: 0,Football,Baseball,Hockey
2017,New England Patriots,Houston Astros,
2018,Philadelphia Eagles,Boston Red Sox,
2019,New England Patriots,,St. Louis Blues
2020,Kansas City Chiefs,,Tampa Bay Lightning


In [17]:
# Hockey and basketball champions, indexed by the same years as
# sports_champions_A (2017 and 2018) but with no columns in common.
sports_champions_C = pd.DataFrame(
    data = {
        "Hockey": ["Pittsburgh Penguins", "Washington Capitals"],
        "Basketball": ["Golden State Warriors", "Golden State Warriors"],
    },
    index = [2017, 2018],
)


In [18]:
# Stack A on top of C along the row axis (the default). They share index
# labels but no columns, so the years repeat and every row is half empty.
pd.concat(
    [sports_champions_A, sports_champions_C]
)

Unnamed: 0,Football,Baseball,Hockey,Basketball
2017,New England Patriots,Houston Astros,,
2018,Philadelphia Eagles,Boston Red Sox,,
2017,,,Pittsburgh Penguins,Golden State Warriors
2018,,,Washington Capitals,Golden State Warriors


In [19]:
# Concatenate A and C side by side, aligning their rows on the shared
# index labels (2017, 2018) instead of stacking them vertically.
# `axis = 1` is an equivalent spelling of `axis = "columns"`; only one
# call is kept because a cell displays just its last expression, so a
# second, identical call would be computed and thrown away.
pd.concat(
    objs = [sports_champions_A, sports_champions_C],
    axis = "columns"
)

Unnamed: 0,Football,Baseball,Hockey,Basketball
2017,New England Patriots,Houston Astros,Pittsburgh Penguins,Golden State Warriors
2018,Philadelphia Eagles,Boston Red Sox,Washington Capitals,Golden State Warriors


## 10.3 Left Joins

In [20]:
# Reminder of the groups columns; category_id is the key for the join below.
groups.head(n = 3)

Unnamed: 0,group_id,name,category_id,city_id
0,6388,Alternative Health NYC,14,10001
1,6510,Alternative Energy Meetup,4,10001
2,8458,NYC Animal Rights,26,10001


In [21]:
# Reminder of the categories columns; category_id is shared with groups.
categories.head(n = 3)

Unnamed: 0,category_id,category_name
0,1,Arts & Culture
1,3,Cars & Motorcycles
2,4,Community & Environment


In [22]:
# Left join: keep every row of groups and attach the matching
# category_name. Groups whose category_id has no match in categories
# get a missing category_name.
pd.merge(
    left = groups,
    right = categories,
    how = "left",
    on = "category_id",
).head()

Unnamed: 0,group_id,name,category_id,city_id,category_name
0,6388,Alternative Health NYC,14,10001,Health & Wellbeing
1,6510,Alternative Energy Meetup,4,10001,Community & Environment
2,8458,NYC Animal Rights,26,10001,
3,8940,The New York City Anime Group,29,10001,Sci-Fi & Fantasy
4,10104,NYC Pit Bull Group,26,10001,


## 10.4 Inner Joins

In [23]:
# Left-hand side of the inner join below.
groups.head(n = 3)

Unnamed: 0,group_id,name,category_id,city_id
0,6388,Alternative Health NYC,14,10001
1,6510,Alternative Energy Meetup,4,10001
2,8458,NYC Animal Rights,26,10001


In [24]:
# Right-hand side of the inner join below.
categories.head(n = 3)

Unnamed: 0,category_id,category_name
0,1,Arts & Culture
1,3,Cars & Motorcycles
2,4,Community & Environment


In [25]:
# Inner join: keep only the groups whose category_id also appears in
# categories; rows without a match on either side are dropped.
pd.merge(
    left = groups,
    right = categories,
    how = "inner",
    on = "category_id",
)

Unnamed: 0,group_id,name,category_id,city_id,category_name
0,6388,Alternative Health NYC,14,10001,Health & Wellbeing
1,54126,Energy Healers NYC,14,10001,Health & Wellbeing
2,67776,Flourishing Life Meetup,14,10001,Health & Wellbeing
3,111855,Hypnosis & NLP NYC - Update Your Brain,14,10001,Health & Wellbeing
4,129277,The Live Food Chicago Community,14,60601,Health & Wellbeing
...,...,...,...,...,...
8032,25536270,New York Cuckold Relationships Meetup,17,10001,Lifestyle
8033,25795045,Pagans Paradise NYC - A Haven for Heathens,17,10001,Lifestyle
8034,25856573,Fuck Yeah Femme Productions,17,94101,Lifestyle
8035,26158102,Chicago Crossdresser Meetup,17,60601,Lifestyle


In [26]:
# All groups with category_id 14, for cross-checking the inner join result.
groups.loc[groups["category_id"] == 14]

Unnamed: 0,group_id,name,category_id,city_id
0,6388,Alternative Health NYC,14,10001
52,54126,Energy Healers NYC,14,10001
78,67776,Flourishing Life Meetup,14,10001
121,111855,Hypnosis & NLP NYC - Update Your Brain,14,10001
136,129277,The Live Food Chicago Community,14,60601
...,...,...,...,...
16174,26291539,The Transformation Project: Collaborative Life...,14,94101
16201,26299876,"Cognitive Empathy, How To Translate Enemy Imag...",14,10001
16248,26322976,Contemplative Practices Group,14,94101
16314,26366221,The art of getting what you want: hacking fear,14,94101


In [27]:
# The categories row(s) with category_id 14, i.e. the right-hand match
# for the groups selected above.
categories.loc[categories["category_id"] == 14]

Unnamed: 0,category_id,category_name
8,14,Health & Wellbeing


## 10.5 Outer Joins

In [28]:
# Left-hand side of the outer join below; city_id is its join key.
groups.head(n = 3)

Unnamed: 0,group_id,name,category_id,city_id
0,6388,Alternative Health NYC,14,10001
1,6510,Alternative Energy Meetup,4,10001
2,8458,NYC Animal Rights,26,10001


In [29]:
# Right-hand side of the outer join below; id is its join key.
cities.head(n = 3)

Unnamed: 0,id,city,state,zip
0,7093,West New York,NJ,7093
1,10001,New York,NY,10001
2,13417,New York Mills,NY,13417


In [30]:
# Outer join: keep every row from both DataFrames, matching
# groups.city_id against cities.id. The key columns have different
# names, so each side's key is named explicitly.
pd.merge(
    left = groups,
    right = cities,
    how = "outer",
    left_on = "city_id",
    right_on = "id",
)

Unnamed: 0,group_id,name,category_id,city_id,id,city,state,zip
0,6388.0,Alternative Health NYC,14.0,10001.0,10001,New York,NY,10001
1,6510.0,Alternative Energy Meetup,4.0,10001.0,10001,New York,NY,10001
2,8458.0,NYC Animal Rights,26.0,10001.0,10001,New York,NY,10001
3,8940.0,The New York City Anime Group,29.0,10001.0,10001,New York,NY,10001
4,10104.0,NYC Pit Bull Group,26.0,10001.0,10001,New York,NY,10001
...,...,...,...,...,...,...,...,...
16329,24303427.0,Midwest FPGA/AI/Machine Learning Meetup,34.0,60064.0,60064,North Chicago,IL,60064
16330,,,,,13417,New York Mills,NY,13417
16331,,,,,46312,East Chicago,IN,46312
16332,,,,,56567,New York Mills,MN,56567


In [31]:
# Same outer join, with an extra "_merge" column recording whether each
# row came from both DataFrames or from only one of them.
pd.merge(
    left = groups,
    right = cities,
    how = "outer",
    left_on = "city_id",
    right_on = "id",
    indicator = True,
)

Unnamed: 0,group_id,name,category_id,city_id,id,city,state,zip,_merge
0,6388.0,Alternative Health NYC,14.0,10001.0,10001,New York,NY,10001,both
1,6510.0,Alternative Energy Meetup,4.0,10001.0,10001,New York,NY,10001,both
2,8458.0,NYC Animal Rights,26.0,10001.0,10001,New York,NY,10001,both
3,8940.0,The New York City Anime Group,29.0,10001.0,10001,New York,NY,10001,both
4,10104.0,NYC Pit Bull Group,26.0,10001.0,10001,New York,NY,10001,both
...,...,...,...,...,...,...,...,...,...
16329,24303427.0,Midwest FPGA/AI/Machine Learning Meetup,34.0,60064.0,60064,North Chicago,IL,60064,both
16330,,,,,13417,New York Mills,NY,13417,right_only
16331,,,,,46312,East Chicago,IN,46312,right_only
16332,,,,,56567,New York Mills,MN,56567,right_only


In [32]:
# Keep the indicator-tagged outer join so it can be filtered.
outer_join = pd.merge(
    left = groups,
    right = cities,
    how = "outer",
    left_on = "city_id",
    right_on = "id",
    indicator = True,
)

# Boolean mask selecting the rows that exist only in cities, i.e. cities
# with no matching group.
in_right_only = outer_join["_merge"] == "right_only"

outer_join.loc[in_right_only].head()

Unnamed: 0,group_id,name,category_id,city_id,id,city,state,zip,_merge
16330,,,,,13417,New York Mills,NY,13417,right_only
16331,,,,,46312,East Chicago,IN,46312,right_only
16332,,,,,56567,New York Mills,MN,56567,right_only
16333,,,,,95712,Chicago Park,CA,95712,right_only


## 10.6 Merging on Index Labels

In [33]:
# cities before re-indexing: "id" is still an ordinary column.
cities.head(n = 3)

Unnamed: 0,id,city,state,zip
0,7093,West New York,NJ,7093
1,10001,New York,NY,10001
2,13417,New York Mills,NY,13417


In [34]:
# Move the "id" column into the index so the next merge can match
# against cities' index labels.
# The guard keeps this cell idempotent: once "id" is the index it is no
# longer a column, and an unguarded second run would raise a KeyError.
if "id" in cities.columns:
    cities = cities.set_index("id")

In [35]:
# cities after re-indexing: "id" now labels the rows.
cities.head(n = 3)

Unnamed: 0_level_0,city,state,zip
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7093,West New York,NJ,7093
10001,New York,NY,10001
13417,New York Mills,NY,13417


In [36]:
# Left-hand side of the index-based merge below; city_id is its join key.
groups.head(n = 3)

Unnamed: 0,group_id,name,category_id,city_id
0,6388,Alternative Health NYC,14,10001
1,6510,Alternative Energy Meetup,4,10001
2,8458,NYC Animal Rights,26,10001


In [37]:
# Left join matching the city_id column of groups against the index
# labels of cities (right_index = True) rather than one of its columns.
pd.merge(
    left = groups,
    right = cities,
    how = "left",
    left_on = "city_id",
    right_index = True,
)

Unnamed: 0,group_id,name,category_id,city_id,city,state,zip
0,6388,Alternative Health NYC,14,10001,New York,NY,10001
1,6510,Alternative Energy Meetup,4,10001,New York,NY,10001
2,8458,NYC Animal Rights,26,10001,New York,NY,10001
3,8940,The New York City Anime Group,29,10001,New York,NY,10001
4,10104,NYC Pit Bull Group,26,10001,New York,NY,10001
...,...,...,...,...,...,...,...
16325,26377464,Shinect,34,94101,San Francisco,CA,94101
16326,26377698,The art of getting what you want [conference s...,14,94101,San Francisco,CA,94101
16327,26378067,Streeterville Running Group,9,60601,Chicago,IL,60290
16328,26378128,Just Dance NYC,23,10001,New York,NY,10001


## 10.7 Coding Challenge

In [38]:
# Preview the week 1 sales file: one row per order, holding a customer
# ID and a food ID.
pd.read_csv(
    "restaurant/week_1_sales.csv"
).head(n = 5)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9


In [39]:
# Load both weekly sales files.
week1 = pd.read_csv(
    "restaurant/week_1_sales.csv"
)
week2 = pd.read_csv(
    "restaurant/week_2_sales.csv"
)

In [40]:
# Preview the customers file, using its "ID" column as the index.
pd.read_csv(
    "restaurant/customers.csv",
    index_col = "ID",
).head(n = 5)

Unnamed: 0_level_0,First Name,Last Name,Gender,Company,Occupation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Joseph,Perkins,Male,Dynazzy,Community Outreach Specialist
2,Jennifer,Alvarez,Female,DabZ,Senior Quality Engineer
3,Roger,Black,Male,Tagfeed,Account Executive
4,Steven,Evans,Male,Fatz,Registered Nurse
5,Judy,Morrison,Female,Demivee,Legal Assistant


In [41]:
# Load the customers, indexed by their "ID" column so that sales rows
# can later be matched against the index labels.
customers = pd.read_csv(
    "restaurant/customers.csv",
    index_col = "ID",
)

In [42]:
# Show the full foods menu, using its "Food ID" column as the index.
pd.read_csv(
    "restaurant/foods.csv",
    index_col = "Food ID",
)

Unnamed: 0_level_0,Food Item,Price
Food ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Sushi,3.99
2,Burrito,9.99
3,Taco,2.99
4,Quesadilla,4.25
5,Pizza,2.49
6,Pasta,13.99
7,Steak,24.99
8,Salad,11.25
9,Donut,0.99
10,Drink,1.75


In [43]:
# Load the foods menu, indexed by its "Food ID" column.
foods = pd.read_csv(
    "restaurant/foods.csv",
    index_col = "Food ID",
)

### 10.7.1 Problems

### 10.7.2 Solutions

In [44]:
# Stack both weeks of sales and tag each row with the week it came from,
# producing a two-level index ("Week 1"/"Week 2", original row label).
pd.concat(
    [week1, week2],
    keys = ["Week 1", "Week 2"],
)

Unnamed: 0,Unnamed: 1,Customer ID,Food ID
Week 1,0,537,9
Week 1,1,97,4
Week 1,2,658,1
Week 1,3,202,2
Week 1,4,155,9
...,...,...,...
Week 2,245,783,10
Week 2,246,556,10
Week 2,247,547,9
Week 2,248,252,9


In [45]:
# Customers who ordered in both weeks: inner join on "Customer ID".
# Both sides have a "Food ID" column, so pandas suffixes them _x / _y.
pd.merge(
    left = week1,
    right = week2,
    how = "inner",
    on = "Customer ID",
).head()

Unnamed: 0,Customer ID,Food ID_x,Food ID_y
0,537,9,5
1,155,9,3
2,155,1,3
3,503,5,8
4,503,5,9


In [46]:
# Same inner join, but a customer with several orders appears several
# times; keep only the first merged row for each "Customer ID".
pd.merge(
    left = week1,
    right = week2,
    how = "inner",
    on = "Customer ID",
).drop_duplicates(
    subset = ["Customer ID"]
).head()

Unnamed: 0,Customer ID,Food ID_x,Food ID_y
0,537,9,5
1,155,9,3
3,503,5,8
5,550,6,7
6,101,7,4


In [47]:
# Customers who ordered the same food in both weeks: inner join that
# requires both "Customer ID" and "Food ID" to match.
pd.merge(
    left = week1,
    right = week2,
    how = "inner",
    on = ["Customer ID", "Food ID"],
)

Unnamed: 0,Customer ID,Food ID
0,304,3
1,540,3
2,937,10
3,233,3
4,21,4
5,21,4
6,922,1
7,578,5
8,578,5


In [48]:
# Outer join of the two weeks on "Customer ID". The "_merge" column
# shows whether a customer ordered in both weeks or in only one of them.
pd.merge(
    left = week1,
    right = week2,
    how = "outer",
    on = "Customer ID",
    indicator = True,
).head()

Unnamed: 0,Customer ID,Food ID_x,Food ID_y,_merge
0,537,9.0,5.0,both
1,97,4.0,,left_only
2,658,1.0,,left_only
3,202,2.0,,left_only
4,155,9.0,3.0,both


In [49]:
# Attach customer details to each week 1 sale: left join matching the
# "Customer ID" column of week1 against the index labels of customers.
pd.merge(
    left = week1,
    right = customers,
    how = "left",
    left_on = "Customer ID",
    right_index = True,
).head()

Unnamed: 0,Customer ID,Food ID,First Name,Last Name,Gender,Company,Occupation
0,537,9,Cheryl,Carroll,Female,Zoombeat,Registered Nurse
1,97,4,Amanda,Watkins,Female,Ozu,Account Coordinator
2,658,1,Patrick,Webb,Male,Browsebug,Community Outreach Specialist
3,202,2,Louis,Campbell,Male,Rhynoodle,Account Representative III
4,155,9,Carolyn,Diaz,Female,Gigazoom,Database Administrator III


## 10.8 Summary