In [1]:
# Dependencies
import pandas as pd

In [2]:
# Import the csv file as pandas dataframe.
# postalCode is read explicitly as text: ZIP Codes are identifiers, not numbers,
# so they must never be parsed as integers (that would drop leading zeros and
# break the string-based cleansing applied to this column later on).
csv_file_one = "../00_input/datafiniti-fast-food-restaurants-across-america/Datafiniti_Fast_Food_Restaurants.csv"
restaurant_df = pd.read_csv(csv_file_one, dtype={"postalCode": str})
restaurant_df.head()

Unnamed: 0,id,dateAdded,dateUpdated,address,categories,city,country,keys,latitude,longitude,name,postalCode,province,sourceURLs,websites
0,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,American Restaurant and Fast Food Restaurant,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,SONIC Drive In,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...
1,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,Fast Food Restaurants,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,SONIC Drive In,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...
2,AVwcopQoByjofQCxgfVa,2016-03-29T05:06:36Z,2018-06-26T02:59:52Z,206 Wears Valley Rd,Fast Food Restaurant,Pigeon Forge,US,us/tn/pigeonforge/206wearsvalleyrd/-864103396,35.803788,-83.580553,Taco Bell,37863,TN,https://www.yellowpages.com/pigeon-forge-tn/mi...,"http://www.tacobell.com,https://locations.taco..."
3,AVweXN5RByjofQCxxilK,2017-01-03T07:46:11Z,2018-06-26T02:59:51Z,3652 Parkway,Fast Food,Pigeon Forge,US,us/tn/pigeonforge/3652parkway/93075755,35.782339,-83.551408,Arby's,37863,TN,http://www.yellowbook.com/profile/arbys_163389...,"http://www.arbys.com,https://locations.arbys.c..."
4,AWQ6MUvo3-Khe5l_j3SG,2018-06-26T02:59:43Z,2018-06-26T02:59:43Z,2118 Mt Zion Parkway,Fast Food Restaurant,Morrow,US,us/ga/morrow/2118mtzionparkway/1305117222,33.562738,-84.321143,Steak 'n Shake,30260,GA,https://foursquare.com/v/steak-n-shake/4bcf77a...,http://www.steaknshake.com/locations/23851-ste...


#### Check the dataframe before transformation 

In [3]:
# Inspect column dtypes and non-null counts before any transformation.
# There are 10,000 records in the dataframe
restaurant_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           10000 non-null  object 
 1   dateAdded    10000 non-null  object 
 2   dateUpdated  10000 non-null  object 
 3   address      10000 non-null  object 
 4   categories   10000 non-null  object 
 5   city         10000 non-null  object 
 6   country      10000 non-null  object 
 7   keys         10000 non-null  object 
 8   latitude     10000 non-null  float64
 9   longitude    10000 non-null  float64
 10  name         10000 non-null  object 
 11  postalCode   10000 non-null  object 
 12  province     10000 non-null  object 
 13  sourceURLs   10000 non-null  object 
 14  websites     10000 non-null  object 
dtypes: float64(2), object(13)
memory usage: 1.1+ MB


#### Drop duplicates in "keys" column so that restaurants will be unique

In [4]:
# Count the distinct values in "keys" to see how many rows are repeats.
# There are only 9343 unique restaurants out of 10,000 rows in the dataframe.
restaurant_df["keys"].nunique()

9343

In [5]:
# Keep the first row for each "keys" value and discard the later repeats
restaurant_df = restaurant_df.drop_duplicates(subset="keys", keep="first")

In [6]:
# Re-inspect the frame after de-duplication.
# The total count has been reduced from 10,000 to 9343.
restaurant_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9343 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           9343 non-null   object 
 1   dateAdded    9343 non-null   object 
 2   dateUpdated  9343 non-null   object 
 3   address      9343 non-null   object 
 4   categories   9343 non-null   object 
 5   city         9343 non-null   object 
 6   country      9343 non-null   object 
 7   keys         9343 non-null   object 
 8   latitude     9343 non-null   float64
 9   longitude    9343 non-null   float64
 10  name         9343 non-null   object 
 11  postalCode   9343 non-null   object 
 12  province     9343 non-null   object 
 13  sourceURLs   9343 non-null   object 
 14  websites     9343 non-null   object 
dtypes: float64(2), object(13)
memory usage: 1.1+ MB


#### Some ZIP Codes are in the xxxxx-xxxx (ZIP+4) format. They need to be truncated to the standard five-digit format, so the '-' and the digits after it are dropped.

In [7]:
# Cleanse ZIP Code: keep only the part before the first "-" (this drops the
# suffix of xxxxx-xxxx values) and trim surrounding whitespace.
# The vectorised .str accessor is used instead of apply(lambda ...) so that a
# missing postal code stays missing (NaN) instead of raising AttributeError.
restaurant_df["postalCode"] = (
    restaurant_df["postalCode"]
    .str.split("-", n=1)
    .str[0]
    .str.strip()
)

In [8]:
# Verify the cleansing: record the length of every ZIP Code in a helper column
# and print the longest and the shortest length (both should be 5)
zip_lengths = restaurant_df['postalCode'].apply(len)
restaurant_df['zip_check'] = zip_lengths
print(zip_lengths.max(), zip_lengths.min())

5 5


#### Data normalisation

If zip_code, city and state are kept in the same table as the restaurant address, there is a transitive dependency: city and state depend on the restaurant address, but also on zip_code. Hence, to achieve Third Normal Form, we'll put zip_code, city and state into a separate table.

In [9]:
# Pull the ZIP Code, city and state columns out into their own table
zip_columns = ['postalCode', 'city', 'province']
zip_code_df = restaurant_df[zip_columns]
zip_code_df.head()

Unnamed: 0,postalCode,city,province
0,70301,Thibodaux,LA
2,37863,Pigeon Forge,TN
3,37863,Pigeon Forge,TN
4,30260,Morrow,GA
5,48204,Detroit,MI


In [10]:
# Give the ZIP Code table friendlier column headings
zip_column_names = {
    "postalCode": "zip_code",
    "province": "state",
}
zip_code_df = zip_code_df.rename(columns=zip_column_names)

In [11]:
# Check renamed columns: the frame should now expose zip_code, city and state
zip_code_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9343 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   zip_code  9343 non-null   object
 1   city      9343 non-null   object
 2   state     9343 non-null   object
dtypes: object(3)
memory usage: 292.0+ KB


Identify if there are any duplicate ZIP Codes

In [12]:
# Number of distinct ZIP Codes; a value below the row count means the same
# ZIP Code appears on more than one row.
zip_code_df["zip_code"].nunique()

5426

In [13]:
# Keep one row per ZIP Code (its first occurrence) in the ZIP Code dataframe
zip_code_df = zip_code_df.drop_duplicates(subset='zip_code', keep='first')

In [14]:
# Preview the de-duplicated ZIP Code table
zip_code_df.head()

Unnamed: 0,zip_code,city,state
0,70301,Thibodaux,LA
2,37863,Pigeon Forge,TN
4,30260,Morrow,GA
5,48204,Detroit,MI
6,48235,Detroit,MI


In [15]:
# Export ZIP Code dataframe into CSV file.
# index=False leaves the dataframe index out of the file, so only the
# dataframe's columns are written.
zip_code_df.to_csv('zip_code.csv', index=False)

#### Copy the needed columns of the restaurant dataframe into a new dataframe, leaving the unnecessary columns behind

In [16]:
# Keep only the columns needed for the restaurant tables in a new dataframe
restaurant_columns = ["name", "address", "postalCode"]
restaurant_clean_df = restaurant_df[restaurant_columns]
restaurant_clean_df.head()

Unnamed: 0,name,address,postalCode
0,SONIC Drive In,800 N Canal Blvd,70301
2,Taco Bell,206 Wears Valley Rd,37863
3,Arby's,3652 Parkway,37863
4,Steak 'n Shake,2118 Mt Zion Parkway,30260
5,Wendy's,9768 Grand River Ave,48204


#### Rename the columns so that column names are consistent throughout the database

In [17]:
# Map the source column headings onto the names used across the database
restaurant_column_names = {
    "name": "restaurant_name",
    "address": "address",
    "postalCode": "zip_code",
}
restaurant_clean_df = restaurant_clean_df.rename(columns=restaurant_column_names)
restaurant_clean_df

Unnamed: 0,restaurant_name,address,zip_code
0,SONIC Drive In,800 N Canal Blvd,70301
2,Taco Bell,206 Wears Valley Rd,37863
3,Arby's,3652 Parkway,37863
4,Steak 'n Shake,2118 Mt Zion Parkway,30260
5,Wendy's,9768 Grand River Ave,48204
...,...,...,...
9995,Pizza Hut,3460 Robinhood Rd,27106
9996,Pizza Hut,3069 Kernersville Rd,27107
9997,Pizza Hut,838 S Main St,27284
9998,Pizza Hut,1702 Glendale Dr SW,27893


In [18]:
# Split address into street number and street name at the first space.
# `n` is passed by keyword and the pieces are picked with .str[0] / .str[1]
# instead of tuple-unpacking the .str accessor: iterating the accessor and a
# positional `n` are deprecated pandas usage and fail on current releases.
# An address without any space gets NaN as its street name.
address_parts = restaurant_clean_df['address'].str.split(' ', n=1)
restaurant_clean_df['street_address_no'] = address_parts.str[0]
restaurant_clean_df['street_address_name'] = address_parts.str[1]

  


In [19]:
restaurant_clean_df.head()

Unnamed: 0,restaurant_name,address,zip_code,street_address_no,street_address_name
0,SONIC Drive In,800 N Canal Blvd,70301,800,N Canal Blvd
2,Taco Bell,206 Wears Valley Rd,37863,206,Wears Valley Rd
3,Arby's,3652 Parkway,37863,3652,Parkway
4,Steak 'n Shake,2118 Mt Zion Parkway,30260,2118,Mt Zion Parkway
5,Wendy's,9768 Grand River Ave,48204,9768,Grand River Ave


In [20]:
# Assemble the restaurant address table from the cleaned columns
address_columns = ['restaurant_name', 'street_address_no', 'street_address_name', 'zip_code']
restaurant_address_df = restaurant_clean_df[address_columns]

In [21]:
# Replace the existing index with a fresh 0..n-1 index, discarding the old one
restaurant_address_df = restaurant_address_df.reset_index(drop=True)
restaurant_address_df.head()

Unnamed: 0,restaurant_name,street_address_no,street_address_name,zip_code
0,SONIC Drive In,800,N Canal Blvd,70301
1,Taco Bell,206,Wears Valley Rd,37863
2,Arby's,3652,Parkway,37863
3,Steak 'n Shake,2118,Mt Zion Parkway,30260
4,Wendy's,9768,Grand River Ave,48204


In [22]:
# Size of the address table as (rows, columns)
restaurant_address_df.shape

(9343, 4)

In [23]:
# Create restaurant address ID by labelling the dataframe index.
# NOTE(review): 'resturant_address_id' is misspelled ('restaurant'); it is left
# unchanged here because the label shows up in the frame's output — confirm
# whether anything downstream relies on this spelling before correcting it.
restaurant_address_df.index.name='resturant_address_id'

In [24]:
# Preview the address table together with its named index
restaurant_address_df.head()

Unnamed: 0_level_0,restaurant_name,street_address_no,street_address_name,zip_code
resturant_address_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,SONIC Drive In,800,N Canal Blvd,70301
1,Taco Bell,206,Wears Valley Rd,37863
2,Arby's,3652,Parkway,37863
3,Steak 'n Shake,2118,Mt Zion Parkway,30260
4,Wendy's,9768,Grand River Ave,48204


In [25]:
# Start the restaurant name lookup table from the name column alone
name_column = restaurant_clean_df["restaurant_name"]
restaurant_names_df = pd.DataFrame({"restaurant_name": name_column})
restaurant_names_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9343 entries, 0 to 9999
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   restaurant_name  9343 non-null   object
dtypes: object(1)
memory usage: 146.0+ KB


In [26]:
# Identify if there are duplicate restaurant names: a distinct-name count
# below the number of rows means names repeat.
restaurant_names_df["restaurant_name"].nunique()

571

In [27]:
# Keep unique names only (first occurrence of each).
# Limitation: spelling variations of the same restaurant name are treated as
# different names.
restaurant_names_df = restaurant_names_df.drop_duplicates(subset="restaurant_name", keep="first")

In [28]:
# Confirm how many restaurant names remain after removing duplicates
restaurant_names_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 571 entries, 0 to 9942
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   restaurant_name  571 non-null    object
dtypes: object(1)
memory usage: 8.9+ KB


In [29]:
# Renumber the rows 0..n-1, discarding the old index
restaurant_names_df = restaurant_names_df.reset_index(drop=True)

In [30]:
# Create restaurant ID: label the dataframe index as 'restaurant_id'
restaurant_names_df.index.name='restaurant_id'

In [31]:
# Copy the index values into a regular 'restaurant_id' column so the ID is
# available as data and not only as the index.
# NOTE(review): the index and this column now share the label 'restaurant_id';
# pandas may treat that label as ambiguous in label-based operations that
# reference 'restaurant_id' — confirm this duplication is intended.
restaurant_names_df['restaurant_id']=restaurant_names_df.index

In [32]:
# Preview the restaurant name lookup table
restaurant_names_df.head()

Unnamed: 0_level_0,restaurant_name,restaurant_id
restaurant_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,SONIC Drive In,0
1,Taco Bell,1
2,Arby's,2
3,Steak 'n Shake,3
4,Wendy's,4


In [42]:
# Attach restaurant_id to every address row by matching on restaurant_name.
# how='right' keeps every name present in restaurant_names_df and orders the
# result by that table's keys; despite the variable name, this is a right join,
# not an inner join.
# validate='many_to_one' makes pandas raise a MergeError if restaurant_names_df
# ever holds the same restaurant_name twice, which would otherwise silently
# duplicate address rows.
inner_join = pd.merge(
    restaurant_address_df,
    restaurant_names_df,
    on='restaurant_name',
    how='right',
    validate='many_to_one',
)
inner_join.head()

Unnamed: 0,restaurant_name,street_address_no,street_address_name,zip_code,restaurant_id
0,SONIC Drive In,800,N Canal Blvd,70301,0
1,SONIC Drive In,124,John R Rd,48083,0
2,SONIC Drive In,909,N Wood,75644,0
3,SONIC Drive In,97,Gateway Blvd,82901,0
4,SONIC Drive In,6557,S Staples St,78413,0


In [34]:
#restaurant_address_df_test = restaurant_address_df.merge(restaurant_names_df, on='restaurant_name', how="right")

In [35]:
#restaurant_address_df

In [36]:
#restaurant_address_df.astype('object').dtypes

In [44]:
# Preview the address table; pd.merge returned a new frame, so this one has no
# restaurant_id column
restaurant_address_df.head()

Unnamed: 0_level_0,restaurant_name,street_address_no,street_address_name,zip_code
resturant_address_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,SONIC Drive In,800,N Canal Blvd,70301
1,Taco Bell,206,Wears Valley Rd,37863
2,Arby's,3652,Parkway,37863
3,Steak 'n Shake,2118,Mt Zion Parkway,30260
4,Wendy's,9768,Grand River Ave,48204


In [38]:
#restaurant_address_df.head()

In [39]:
# Export restaurant data to CSV
#restaurant_clean_df.to_csv('restaurant_clean.csv', index=False)