In [14]:
#Import pandas
import pandas as pd

In [15]:
#import yelp csv to clean up
file_path='Resources/yelpdata.csv'
yelp_df=pd.read_csv(file_path)
yelp_df.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,category,results_total,avg_rating,avg_review_count
0,0,38.046407,-84.497039,beaches,1,4.5,2.0
1,1,38.046407,-84.497039,horsebackriding,20,3.6,10.6
2,2,38.046407,-84.497039,hiking,9,4.0,7.444444
3,3,38.046407,-84.497039,beaches,1,4.5,2.0
4,4,38.046407,-84.497039,horsebackriding,20,3.6,10.6


In [16]:
#get rid of the unnamed column
yelp_df = yelp_df.drop(yelp_df.columns[0], axis=1)
yelp_df.head()

Unnamed: 0,latitude,longitude,category,results_total,avg_rating,avg_review_count
0,38.046407,-84.497039,beaches,1,4.5,2.0
1,38.046407,-84.497039,horsebackriding,20,3.6,10.6
2,38.046407,-84.497039,hiking,9,4.0,7.444444
3,38.046407,-84.497039,beaches,1,4.5,2.0
4,38.046407,-84.497039,horsebackriding,20,3.6,10.6


In [17]:
#brought over note from aggregate workbook, standardizing categories across datasets
# Clean final DF Activities to be one of the following list: 
#'beaches','hanggliding','horsebackriding', 'hiking','hot_air_balloons',
#'paddleboarding','parasailing','sailing','snorkeling','ziplining', 
#'Spas','hotsprings','massage','tours'

In [18]:
#fix spelling of balloons now, will match in other dataframe
yelp_df['category'].replace('hot_air_baloons','hot_air_balloons',inplace=True)

In [19]:
#Should effectively drop all rows where the category column does not contain one of the above values. 
yelp_df=yelp_df[yelp_df['category'].str.contains("beaches|hanggliding|horsebackriding|hiking|hot_air_balloons|paddleboarding|parasailing|sailing|snorkeling|ziplining|Spas|hotsprings|massage|tours")== True]

In [20]:
#create a 'top rated' column based on the average rating and number of reviews 
yelp_df['top_rated'] = yelp_df['avg_rating'] * yelp_df['avg_review_count'] 
yelp_df.head()

Unnamed: 0,latitude,longitude,category,results_total,avg_rating,avg_review_count,top_rated
0,38.046407,-84.497039,beaches,1,4.5,2.0,9.0
1,38.046407,-84.497039,horsebackriding,20,3.6,10.6,38.16
2,38.046407,-84.497039,hiking,9,4.0,7.444444,29.777778
3,38.046407,-84.497039,beaches,1,4.5,2.0,9.0
4,38.046407,-84.497039,horsebackriding,20,3.6,10.6,38.16


In [21]:
#group by [lat,lon] pair and sort top rated value to descending
yelp_df = yelp_df.groupby(['latitude', 'longitude']).apply(lambda x: x.sort_values(by='top_rated', ascending=False))
yelp_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,latitude,longitude,category,results_total,avg_rating,avg_review_count,top_rated
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-45.032192,168.661,281,-45.032192,168.661,hotsprings,1,4.5,20.0,90.0
-45.032192,168.661,279,-45.032192,168.661,ziplining,20,4.95,9.7,48.015
-45.032192,168.661,287,-45.032192,168.661,tours,20,4.8,4.45,21.36
-45.032192,168.661,276,-45.032192,168.661,hanggliding,2,2.25,1.5,3.375
-45.032192,168.661,278,-45.032192,168.661,hiking,13,2.076923,1.230769,2.556213


In [22]:
#get rid of that index.
yelp_df=yelp_df.reset_index(drop=True)
yelp_df.head()

Unnamed: 0,latitude,longitude,category,results_total,avg_rating,avg_review_count,top_rated
0,-45.032192,168.661,hotsprings,1,4.5,20.0,90.0
1,-45.032192,168.661,ziplining,20,4.95,9.7,48.015
2,-45.032192,168.661,tours,20,4.8,4.45,21.36
3,-45.032192,168.661,hanggliding,2,2.25,1.5,3.375
4,-45.032192,168.661,hiking,13,2.076923,1.230769,2.556213


In [25]:
#group by again and drop all values below the top five for each [lat,lon] pair
yelp_df=yelp_df.groupby(['latitude','longitude']).head(5)
yelp_df.head(15)

Unnamed: 0,latitude,longitude,category,results_total,avg_rating,avg_review_count,top_rated
0,-45.032192,168.661,hotsprings,1,4.5,20.0,90.0
1,-45.032192,168.661,ziplining,20,4.95,9.7,48.015
2,-45.032192,168.661,tours,20,4.8,4.45,21.36
3,-45.032192,168.661,hanggliding,2,2.25,1.5,3.375
4,-45.032192,168.661,hiking,13,2.076923,1.230769,2.556213
7,-37.814218,144.963161,ziplining,20,5.0,7.15,35.75
8,-37.814218,144.963161,tours,20,4.85,3.2,15.52
9,-37.814218,144.963161,beaches,20,3.05,3.05,9.3025
10,-37.814218,144.963161,massage,20,5.0,1.5,7.5
11,-37.814218,144.963161,snorkeling,4,2.5,1.0,2.5


In [26]:
# group again to convert the category column into a list and collapse the [lat,lon] pairs into a single row
grouped_df = yelp_df.groupby(['latitude', 'longitude']).agg({'category': lambda x: list(x)})
grouped_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,category
latitude,longitude,Unnamed: 2_level_1
-45.032192,168.661,"[hotsprings, ziplining, tours, hanggliding, hi..."
-37.814218,144.963161,"[ziplining, tours, beaches, massage, snorkeling]"
-36.852095,174.76318,"[ziplining, beaches, tours, massage, hiking]"
-36.718805,142.220951,"[ziplining, tours, horsebackriding]"
-35.141813,150.391646,"[ziplining, tours, beaches, horsebackriding, h..."


In [27]:
#loop through the locations and expand the category list into separate columns
for i in range(5):
    grouped_df[f'act_{i+1}'] = grouped_df['category'].apply(lambda x: x[i] if len(x) > i else None)
grouped_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,category,act_1,act_2,act_3,act_4,act_5
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-45.032192,168.661,"[hotsprings, ziplining, tours, hanggliding, hi...",hotsprings,ziplining,tours,hanggliding,hiking
-37.814218,144.963161,"[ziplining, tours, beaches, massage, snorkeling]",ziplining,tours,beaches,massage,snorkeling
-36.852095,174.76318,"[ziplining, beaches, tours, massage, hiking]",ziplining,beaches,tours,massage,hiking
-36.718805,142.220951,"[ziplining, tours, horsebackriding]",ziplining,tours,horsebackriding,,
-35.141813,150.391646,"[ziplining, tours, beaches, horsebackriding, h...",ziplining,tours,beaches,horsebackriding,hot_air_balloons


In [31]:
#drop the original category column
grouped_df=grouped_df.drop(['category'],axis=1)
grouped_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,act_1,act_2,act_3,act_4,act_5
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-45.032192,168.661,hotsprings,ziplining,tours,hanggliding,hiking
-37.814218,144.963161,ziplining,tours,beaches,massage,snorkeling
-36.852095,174.76318,ziplining,beaches,tours,massage,hiking
-36.718805,142.220951,ziplining,tours,horsebackriding,,
-35.141813,150.391646,ziplining,tours,beaches,horsebackriding,hot_air_balloons


In [32]:
### export it  as final_activities.csv
grouped_df.to_csv('Resources/final_activies.csv',index=True)