In [1]:
import json
from collections import Counter
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm

import import_ipynb  # must come before data_acquisition, which is loaded from a notebook
import data_acquisition

importing Jupyter notebook from data_acquisition.ipynb


In [13]:
def get_yelp_data():
    """Load every Yelp table through ``data_acquisition`` and hand them back together.

    Returns:
        tuple: ``(ratings, business, checkin, user, tips)``, in that order.
    """
    # Loaders are listed in the order they are invoked and returned.
    loaders = (
        data_acquisition.get_ratings_data,
        data_acquisition.get_business_data,
        data_acquisition.get_checkin_data,
        data_acquisition.get_user_data,
        data_acquisition.get_tips_data,
    )
    return tuple(load() for load in loaders)

In [20]:
# Load all five Yelp tables once; the feature builders defined below take them as arguments.
ratings, business, checkin, user, tips = get_yelp_data()

100%|██████████| 6685900/6685900 [00:48<00:00, 138961.51it/s]
100%|██████████| 192609/192609 [00:04<00:00, 47598.57it/s]
100%|██████████| 161950/161950 [00:01<00:00, 126705.28it/s]
100%|██████████| 1637138/1637138 [00:23<00:00, 70984.12it/s]
100%|██████████| 1223094/1223094 [00:04<00:00, 249184.56it/s]


In [193]:
def _min_max_column(frame, column):
    """Return ``frame[column]`` min-max scaled, as a flat array in row order."""
    # A fresh scaler per column keeps the fits independent of each other.
    scaled = preprocessing.MinMaxScaler().fit_transform(frame[[column]])
    return scaled.ravel()


def add_user_features(user, ratings, tips):
    """Build the per-user feature table and pickle it to ``user_feature_set.pkl``.

    Args:
        user: DataFrame of users. Reads ``user_id``, ``review_count``, ``friends``,
            ``elite``, ``yelping_since`` and ``fans``; the columns at positions
            6..16 are divided by ``review_count``.
        ratings: DataFrame of reviews with ``user_id`` and ``date`` columns.
        tips: DataFrame of tips with ``user_id`` and ``compliment_count`` columns.

    Returns:
        A new DataFrame (the caller's ``user`` is not modified) with the
        normalised / derived features added and the raw source columns
        (``review_count``, ``friends``, ``fans``, ``yelping_since``, ``elite``,
        ``date``) removed.

    Side effects:
        Writes the result to ``user_feature_set.pkl`` in the working directory.
    """
    # For each user: date of the most recent review, and the summed
    # ``compliment_count`` over that user's tips.
    last_user_review = ratings.groupby(['user_id'], sort=False)['date'].max().reset_index()
    user_tips_count = tips.groupby(['user_id'], sort=False)['compliment_count'].sum().reset_index()

    # ``replace`` returns a new frame, so everything below works on a copy.
    user = user.replace('', np.nan)

    # Users with zero reviews get NaN (not +/-inf) for every per-review ratio.
    reviews = user['review_count'].where(user['review_count'] > 0)

    # 1. Normalised review count. The scaled values are assigned as a plain
    #    array so the result does not depend on the frame's index labels.
    user['review_count_norm'] = _min_max_column(user, 'review_count')

    # 2. Per-review ratios for the columns at positions 6..16.
    #    NOTE(review): presumably these are the compliment_* columns; the slice is
    #    positional, so confirm it against the column order data_acquisition produces.
    for col in user.columns[6:17]:
        user[col] = user[col] / reviews

    # 3. Friends: number of comma-separated entries; a missing value counts as 0.
    #    NOTE(review): a placeholder string such as 'None' would still count as 1
    #    friend here - confirm how the raw data encodes "no friends".
    user['friends'] = user['friends'].apply(
        lambda value: len(value.split(',')) if isinstance(value, str) else 0)
    user['friends_norm'] = _min_max_column(user, 'friends')

    # 4. Elite years: a count plus one indicator column per distinct year.
    #    Missing values become an empty list, so no placeholder 'nan' class
    #    (and no 'elite_nan' column) is ever created.
    elite_years = user['elite'].apply(
        lambda value: [] if pd.isna(value) else str(value).split(','))
    user['elite_cnt'] = elite_years.apply(len)

    mlb = MultiLabelBinarizer()
    elite = pd.DataFrame(mlb.fit_transform(elite_years),
                         columns=['elite_' + str(year) for year in mlb.classes_],
                         index=user.index)
    user = pd.concat([user, elite], axis=1)

    # 5. Lifetime in days: last review date minus the (midnight-normalised)
    #    join date. Unparseable or missing dates become NaT and yield NaN.
    user['yelping_since'] = pd.to_datetime(user['yelping_since']).dt.normalize()
    user = user.merge(last_user_review, on='user_id', how='left')
    user['date'] = pd.to_datetime(user['date'], errors='coerce', format='%Y-%m-%d %H:%M:%S')
    user['user_lifetime'] = (user['date'] - user['yelping_since']).dt.days

    # 6. Tip compliments per review. The merge resets the index, so the
    #    denominator is realigned through ``user['review_count']`` again.
    user = user.merge(user_tips_count, on='user_id', how='left')
    reviews = user['review_count'].where(user['review_count'] > 0)
    user['compliment_count'] = user['compliment_count'] / reviews

    # 7. Normalised fan count.
    user['fans_norm'] = _min_max_column(user, 'fans')

    # Keep only the engineered features.
    user = user.drop(columns=['review_count', 'friends', 'fans', 'yelping_since', 'elite', 'date'])

    user.to_pickle("user_feature_set.pkl")
    return user

In [11]:
def _weekly_open_hours(raw_hours):
    """Return the whole hours open per week described by one ``hours`` entry.

    The entry is read through its text form, e.g.
    ``{'Monday': '9:0-17:0', 'Tuesday': '22:0-2:0'}``. Entries that list no
    time spans (``None``, NaN, an empty mapping, an empty string) give 0.
    """
    # Flatten to alternating tokens: "{'Monday'", "'9:0-17:0'", "'Tuesday'", ...
    tokens = str(raw_hours).replace(', ', ' ').replace(': ', ' ').split()
    spans = tokens[1::2]
    if not spans:
        return 0

    spans[-1] = spans[-1][:-1]  # strip the closing brace from the last span
    total_hours = 0
    for span in spans:
        bounds = span[1:-1].split('-')  # drop the surrounding quotes
        delta = datetime.strptime(bounds[1], '%H:%M') - datetime.strptime(bounds[0], '%H:%M')
        # For a span that closes after midnight the difference is negative;
        # timedelta normalises that to days=-1 plus positive seconds, so
        # ``.seconds`` is already the wrapped-around duration.
        # NOTE(review): a span whose open and close times are equal contributes
        # 0 hours - confirm whether such entries mean "open 24 hours" upstream.
        total_hours += delta.seconds // 3600
    return total_hours


def add_item_features(business, checkin):
    """Build the per-business feature table and pickle it to ``business_feature_set.pkl``.

    Args:
        business: DataFrame with ``business_id``, ``categories`` and ``hours``
            columns. ``categories`` is assumed to hold a list of category
            names per row (TODO confirm against data_acquisition); missing
            values are treated as an empty list.
        checkin: DataFrame with ``business_id`` and ``date`` columns, where
            ``date`` is a ``', '``-separated string of
            ``%Y-%m-%d %H:%M:%S`` timestamps.

    Returns:
        A new DataFrame (neither input is modified) containing the original
        business columns minus ``categories`` / ``hours``, plus:
        ``total_hours``; one column per top-50 category holding 1 where the
        business has that category and NaN otherwise; and, from a left merge
        on ``business_id``, ``total_checkins`` and ``age_of_business`` (days
        between first and last check-in).

    Side effects:
        Writes the result to ``business_feature_set.pkl`` in the working directory.
    """
    # Work on copies so re-running the cell sees the original inputs.
    business = business.copy()
    checkin = checkin.copy()

    # Weekly opening hours, computed per row without relying on index labels.
    # Cast to float to keep the column dtype the row-wise version produced.
    business['total_hours'] = business['hours'].apply(_weekly_open_hours).astype(float)

    # Treat a missing category entry (None / NaN) as "no categories".
    category_lists = business['categories'].apply(
        lambda cats: [] if cats is None or isinstance(cats, float) else cats)

    category_counts = Counter()
    for cats in category_lists:
        category_counts.update(cats)
    category_names = [name for name, _ in category_counts.most_common(50)]

    # One indicator column per top category: 1 where present, NaN elsewhere.
    for category in category_names:
        has_category = category_lists.apply(lambda cats: category in cats)
        business[category] = has_category.astype(float).where(has_category)

    business = business.drop(columns=['categories', 'hours'])

    # Check-in features: how many check-ins, and the span between the first
    # and last one. ISO-style timestamps sort correctly as plain strings.
    stamps = checkin['date'].apply(lambda dates: str(dates).split(', '))
    checkin['total_checkins'] = stamps.apply(len)
    last_checkin = pd.to_datetime(stamps.apply(max), format='%Y-%m-%d %H:%M:%S')
    first_checkin = pd.to_datetime(stamps.apply(min), format='%Y-%m-%d %H:%M:%S')
    checkin['age_of_business'] = (last_checkin - first_checkin).dt.days
    checkin = checkin.drop(columns=['date'])

    business = business.merge(checkin, how="left", on="business_id")

    business.to_pickle("business_feature_set.pkl")
    return business

In [194]:
# function calls

# user = add_user_features(user, ratings, tips)
# business = add_item_features(business, checkin)