In [1]:
import pandas as pd
import numpy as np

### Preprocessing the Business Dataset<br>
Not every record in this dataset corresponds to a restaurant, so we first drop the attribute columns that are not needed for "valid" (restaurant/food) businesses <br>
Businesses that are not tagged as "Restaurants" or "Food" are dropped, as are businesses with fewer than 100 reviews <br>

In [None]:
pruned_business_file='yelp_academic_dataset_business_pruned.csv'

# Attribute columns removed from every chunk before the rows are filtered.
prune=['attributes.HairSpecializesIn.coloring','attributes.HairSpecializesIn.straightperms','attributes.HairSpecializesIn.kids',
       'attributes.HairSpecializesIn.extensions','attributes.HairSpecializesIn.curly','attributes.HairSpecializesIn.africanamerican',
      'attributes.HairSpecializesIn.asian','attributes.HairSpecializesIn.perms','attributes.AcceptsInsurance','attributes.ByAppointmentOnly'
       ,'attributes.RestaurantsAttire','attributes.NoiseLevel','attributes.AgesAllowed','attributes.Alcohol','attributes.BYOBCorkage']

# Matches "Restaurants" or "Food" when it starts the string or follows whitespace
# and is followed by a comma or the end of the string. The alternation is a
# non-capturing group so that str.contains does not warn about match groups.
category_pattern=r'(?:\s|^)(?:Restaurants|Food)(?:,|$)'

# The first chunk (re)creates the output file and writes the header row; every
# later chunk is appended without one. Overwriting on the first chunk keeps the
# cell safe to re-run: appending unconditionally would duplicate the data.
header=True
for chunk in pd.read_csv('yelp_dataset/csv/yelp_academic_dataset_business.csv',chunksize=1000):
    data=chunk.drop(prune,axis=1)
    # na=False treats a missing category string as "no match".
    data=data[data['categories'].str.contains(category_pattern,na=False)]
    data=data[data['review_count']>=100] #preliminary check for review count
    data.to_csv(pruned_business_file,mode='w' if header else 'a',header=header,index=False)
    header=False

### Handling Missing Values
For boolean attributes, `True`/`False` are encoded as 1/0 and all NaN values are replaced with 0.5 <br>
Since the data is unavailable, we treat the business as equally likely to have or not have the corresponding attribute

In [None]:
# Reload the pruned business file and normalise its values, one substitution
# at a time and in this exact order. Each substitution is applied to the whole
# frame, i.e. to every column.
data=pd.read_csv(pruned_business_file)
substitutions=[
    (np.nan,0.5),    # missing value -> 0.5
    (True,1),        # boolean True -> 1
    (False,0),       # boolean False -> 0
    ('True',1),      # the string 'True' -> 1
    ('False',0),     # the string 'False' -> 0
    ('NaN',0.5),     # the string 'NaN' -> 0.5
]
for old_value,new_value in substitutions:
    data=data.replace(old_value,new_value)

The column "categories" is a comma-separated list of all the categories the business is tagged with <br>
We create an independent boolean (0/1) attribute for each category that appears more than 50 times in the dataset

In [None]:
# Split every business's comma-separated category string once; the result is
# reused for both the counting pass and the indicator columns.
category_lists = [row.split(", ") for row in data['categories']]

# cat_count maps each category to the number of times it appears overall.
cat_count = dict()
for cats in category_lists:
    for cat in cats:
        cat_count[cat] = cat_count.get(cat, 0) + 1

# Only categories that appear more than 50 times get their own column.
valid_cats = {cat: count for cat, count in cat_count.items() if count > 50}

# One 0/1 indicator column per retained category. Each column is assigned in a
# single statement: element-wise writes through data[cat].iloc[i] are chained
# assignment, which pandas may apply to a temporary copy rather than to data.
category_sets = [set(cats) for cats in category_lists]
for cat in valid_cats:
    data[cat] = [int(cat in cats) for cats in category_sets]

# Overwrite the pruned business file with the augmented frame.
data.to_csv(pruned_business_file,mode='w',header=True,index=False)

### Preprocessing the Review Dataset<br>
Valid businesses are those remaining in the business file post preprocessing<br>
Reviews of invalid businesses are dropped <br>

In [2]:
# NOTE(review): this reads the pruned business file from yelp_dataset/, while the
# business cell writes it to the working directory - confirm the file is moved
# there before this cell is run.
business_data=pd.read_csv("yelp_dataset/yelp_academic_dataset_business_pruned.csv")
valid_businesses=business_data.business_id.unique()

pruned_review_file='yelp_academic_dataset_review_pruned.csv'
# Columns removed from every review chunk.
prune=['text']

# The first chunk (re)creates the output file and writes the header row; every
# later chunk is appended without one, so re-running the cell does not duplicate
# reviews. The raw file is read through a relative path, like the other raw
# inputs, instead of a user-specific absolute path.
header=True
for chunk in pd.read_csv('yelp_dataset/csv/yelp_academic_dataset_review.csv',chunksize=1000):
    data=chunk.drop(prune,axis=1)
    # Keep only reviews whose business survived the business preprocessing.
    data=data.loc[data.business_id.isin(valid_businesses)]
    # The chunk's row index is written as the first CSV column (no index=False).
    data.to_csv(pruned_review_file,mode='w' if header else 'a',header=header)
    header=False

### Preprocessing the User Dataset<br>
Valid users are those with at least 100 reviews. The cleaned review data is used to extract valid users based on the count of reviews for each user<br>
Irrelevant attributes (for our purpose) are dropped

In [3]:
reviews=pd.read_csv("yelp_dataset/yelp_academic_dataset_review_pruned.csv")

# Per-user count of non-null values in every column; the result is indexed by
# user_id, and the stars column is used below as the review count.
count_users=reviews.groupby(by='user_id').count() #find true count of reviews for each user

# Take the user ids from the aggregated index itself so each id is guaranteed
# to sit on its own row, rather than relying on the key order of the groups dict.
groups=count_users.index
count_users['user_id']=groups

# Series of user ids having at least 100 reviews in the cleaned review data.
valid_users=count_users.loc[count_users.stars>=100,'user_id'] #extract users with review_count>=100

In [7]:
pruned_user_file='yelp_academic_dataset_user_pruned.csv'
data=pd.read_csv('yelp_dataset/csv/yelp_academic_dataset_user.csv')

# User attributes that are dropped before the file is written.
prune=['elite','useful','compliment_note','compliment_cool','compliment_funny','compliment_hot','compliment_photos',
       'compliment_list', 'compliment_writer', 'funny', 'cool','compliment_cute','compliment_profile', 'fans','compliment_plain',
       'friends', 'compliment_more']

data=data.drop(prune,axis=1)
# Keep only the users selected as valid from the cleaned review data.
data=data.loc[data.user_id.isin(valid_users)]
# Overwrite rather than append: the whole frame is written in one call, so
# appending would add a duplicate copy (and a second header row) on every re-run.
data.to_csv(pruned_user_file,mode='w',header=True,index=False)