In [186]:
import json
import pickle
import random
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import Counter
from datetime import datetime, timedelta

import import_ipynb
import data_acquisition

from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer

In [17]:
def get_yelp_data():
    """Fetch the raw Yelp tables through the ``data_acquisition`` module.

    The five ``data_acquisition.get_*_data()`` loaders are invoked in the
    order ratings, business, checkin, user, tips.

    Returns:
        tuple: ``(ratings, business, checkin, user, tips)`` - whatever each
        loader returned, in that order.
    """
    # tuple elements are evaluated left to right, so the loaders run in order
    return (
        data_acquisition.get_ratings_data(),
        data_acquisition.get_business_data(),
        data_acquisition.get_checkin_data(),
        data_acquisition.get_user_data(),
        data_acquisition.get_tips_data(),
    )

In [64]:
def add_user_features(user, ratings, tips):
    """Build the per-user feature table for active users and pickle it.

    Args:
        user: DataFrame with at least ``user_id``, ``review_count``,
            ``friends``, ``elite``, ``yelping_since`` and ``fans`` columns.
            The columns at positions 7..16 are divided by ``review_count``.
        ratings: DataFrame with ``user_id`` and ``date`` columns.
        tips: DataFrame with ``user_id`` and ``compliment_count`` columns.

    Returns:
        DataFrame with one row per active user (fresh RangeIndex). The same
        frame is written to ``data/user_feature_set.pkl``.

    Side effects:
        Reads ``data/active_users_list.pkl`` and writes
        ``data/user_feature_set.pkl``. The input frames are not modified.
    """
    # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
    # presumably a DataFrame whose column 0 holds the active user ids - confirm
    with open("data/active_users_list.pkl", "rb") as active_users_file:
        active_users = pickle.load(active_users_file)
    active_users = active_users[0].values.tolist()

    # subsetting for active users; copy so the column writes below never
    # touch (or warn about) the caller's frame
    user = user[user.user_id.isin(active_users)].copy()

    # for each user, date of last review and summed compliment_count of their tips
    last_user_review = ratings.groupby(['user_id'], sort = False)['date'].max().reset_index()
    user_tips_count = tips.groupby(['user_id'], sort = False)['compliment_count'].sum().reset_index()

    user = user.replace('', np.nan)

    def min_max_normalize(frame, column):
        # Return a flat ndarray so the assignment is positional. Wrapping the
        # result in a DataFrame gives it a 0..n-1 index, which pandas would
        # align against the filtered index of `user` and misplace the values.
        return preprocessing.MinMaxScaler().fit_transform(frame[[column]]).ravel()

    #1. normalizing review attributes
    user['review_count_norm'] = min_max_normalize(user, 'review_count')

    #2. compliments per review count index
    # NOTE(review): positional slice - assumes columns 7..16 are the numeric
    # compliment counters in the incoming schema; confirm against data_acquisition
    for col in user.columns[7:17] :
        user[col] = user[col]/user['review_count']

    #3. total friends for each user normalized
    # empty friend lists were turned into NaN above, so count those as 0
    user['friends'] = user['friends'].apply(lambda x : len(x.split(',')) if isinstance(x, str) else 0)
    user['friends_norm'] = min_max_normalize(user, 'friends')

    #4. total elite years for each user
    # a missing value stringifies to 'nan' and is treated as "never elite"
    elite_years = user['elite'].apply(lambda x : str(x).split(','))
    user['elite_count'] = elite_years.apply(lambda years : 0 if years[0] == 'nan' else len(years))

    mlb = MultiLabelBinarizer()
    elite_flags = mlb.fit_transform(elite_years)
    elite = pd.DataFrame(elite_flags,
                         columns = ['elite_' + str(label) for label in mlb.classes_],
                         index = user.index)
    user = pd.concat([user, elite], axis = 1)
    # the 'nan' indicator only exists when some user has no elite years
    user = user.drop(columns = ['elite_nan'], errors = 'ignore')

    #5. days between joining (yelping_since, truncated to midnight) and last review
    user['yelping_since'] = (pd.to_datetime(user['yelping_since'])).dt.normalize()
    user = user.merge(last_user_review, on = 'user_id', how = 'left')
    # errors='ignore' is deprecated in pandas; a malformed date now fails here
    # instead of one line later at the subtraction
    user['date'] = pd.to_datetime(user['date'], format = '%Y-%m-%d %H:%M:%S')
    # users without any review get NaT here and therefore a NaN lifetime
    user['user_lifetime'] = (user['date'] - user['yelping_since']).dt.days

    #6. summed tip compliment_count per review for each user
    user = user.merge(user_tips_count, on = 'user_id', how = 'left')
    user['compliment_count'] = user['compliment_count']/user['review_count']

    #7. total fans of each user normalized
    user['fans_norm'] = min_max_normalize(user, 'fans')

    # removing user features not needed
    user = user.drop(columns = ['friends', 'fans', 'yelping_since', 'elite', 'date'])

    user.to_pickle("data/user_feature_set.pkl")
    return user

In [91]:
def add_item_features(business, checkin, city = 'Las Vegas', top_n_categories = 25):
    """Build the per-business feature table for open businesses in one city.

    Args:
        business: DataFrame with ``business_id``, ``is_open``, ``city``,
            ``categories`` (an iterable of category names per row) and
            ``hours`` columns.
        checkin: DataFrame with ``business_id`` and ``date`` columns, where
            ``date`` is a ``', '``-separated string of
            ``%Y-%m-%d %H:%M:%S`` timestamps.
        city: city to keep (defaults to the original 'Las Vegas').
        top_n_categories: number of most frequent categories that get a 0/1
            indicator column (defaults to the original 25).

    Returns:
        DataFrame with the remaining business columns plus ``total_hours``,
        one 0/1 column per top category, ``total_checkins`` and
        ``age_of_business``. Also written to ``data/business_feature_set.pkl``.

    Neither input frame is modified.
    """
    def weekly_open_hours(hours):
        # The stringified value looks like "{'Monday': '9:0-17:0', ...}";
        # after tokenising, every second token is a quoted 'open-close' span.
        tokens = str(hours).replace(', ', ' ').replace(': ', ' ').split()
        spans = tokens[1::2]
        if not spans:
            # None / NaN / empty string: no parsable opening hours
            return 0
        spans[-1] = spans[-1][:-1]  # strip the closing brace of the dict repr
        total = 0
        for span in spans:
            parts = span[1:-1].split('-')  # drop the surrounding quotes
            delta = datetime.strptime(parts[1], '%H:%M') - datetime.strptime(parts[0], '%H:%M')
            # timedelta.seconds is already wrapped into [0, 24h), which covers
            # spans that close after midnight.
            # NOTE(review): a span such as '0:0-0:0' therefore counts as 0 hours -
            # confirm whether the data uses it to mean "open 24 hours".
            total += delta.seconds // 3600
        return total

    # sampling for businesses in the requested city that are open;
    # reset_index returns a new frame, so the caller's frame is left untouched
    business = business[(business['is_open'] == 1) & (business['city'] == city)]
    business = business.reset_index(drop = True)

    # total hours open per week for each business
    business['total_hours'] = [weekly_open_hours(hours) for hours in business['hours']]

    # subset for the most common business categories by count
    category_counts = Counter(category
                              for category_list in business['categories']
                              for category in category_list)
    category_name = [category for category, count in category_counts.most_common(top_n_categories)]

    # one-hot encoding to indicate presence of each top category for each business
    # (absence is encoded as 0 rather than a missing value)
    for category in category_name:
        business[category] = [int(category in category_list)
                              for category_list in business['categories']]

    business = business.drop(columns = ['city', 'categories', 'hours', 'is_open'])

    # finding first, last and total checkins for each business
    checkin_dates = checkin['date'].apply(lambda x: str(x).split(', '))
    last_checkin = pd.to_datetime(checkin_dates.apply(max), format = '%Y-%m-%d %H:%M:%S')
    first_checkin = pd.to_datetime(checkin_dates.apply(min), format = '%Y-%m-%d %H:%M:%S')

    # computing age of business in terms of first and last checkin; built on a
    # new frame so the caller's checkin table keeps its 'date' column
    checkin_features = checkin.drop(columns = ['date']).assign(
        total_checkins = checkin_dates.apply(len),
        age_of_business = (last_checkin - first_checkin).dt.days,
    )

    business = business.merge(checkin_features, how = 'left', on = 'business_id')

    business.to_pickle("data/business_feature_set.pkl")
    return business

In [187]:
def add_features_to_ratings(ratings, user, business):
    """Join user and business features onto the ratings and keep active entities.

    Steps, in order: inner-merge ``user`` on ``user_id`` and ``business`` on
    ``business_id``; keep businesses with at least 5 ratings; then keep users
    with at least 5 of the remaining ratings; replace missing values with 0.

    Args:
        ratings: DataFrame with ``user_id`` and ``business_id`` columns.
        user: DataFrame keyed by ``user_id``.
        business: DataFrame keyed by ``business_id``.

    Returns:
        The filtered DataFrame, also written to ``data/ratings_feature_set.pkl``.
    """
    def keep_frequent(frame, key, minimum = 5):
        # rows whose `key` value occurs at least `minimum` times in `frame`
        occurrences = frame[key].value_counts()
        frequent = occurrences[occurrences >= minimum].index.tolist()
        return frame[frame[key].isin(frequent)]

    # imputing user and business attributes to ratings
    enriched = ratings.merge(user, on = 'user_id').merge(business, on = 'business_id')

    # businesses are filtered first; user counts are taken on what is left
    enriched = keep_frequent(enriched, 'business_id')
    enriched = keep_frequent(enriched, 'user_id')
    enriched = enriched.fillna(0)

    enriched.to_pickle("data/ratings_feature_set.pkl")
    return enriched

In [188]:
def train_test_split(size = 1, seed = None):
    """Split the pickled ratings into train and test sets (leave-latest-out).

    Loads ``data/ratings_feature_set.pkl``, keeps a random subset of users
    and holds out each kept user's most recent review (by ``date``) as the
    test row for that user.

    Args:
        size: fraction of the distinct users to keep; values outside
            ``[0, 1]`` are clamped. ``1`` keeps every user.
        seed: optional seed for a private random generator. When ``None``
            the module-level ``random`` state is used, so a prior
            ``random.seed(...)`` call still controls the draw.

    Returns:
        tuple: ``(ratings_train, ratings_test)``. ``ratings_test`` has one
        row per kept user, ordered by ``user_id`` with ``user_id`` as the
        first column; ``ratings_train`` holds the other reviews of those users.
    """
    # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
    with open("data/ratings_feature_set.pkl", "rb") as ratings_file:
        ratings = pickle.load(ratings_file)

    users_list = ratings['user_id'].dropna().unique().tolist()
    n_selected = max(0, min(int(size * len(users_list)), len(users_list)))

    # finding a subset of total users randomly; sample() draws WITHOUT
    # replacement, so exactly n_selected distinct users are kept
    # (choices() repeated users, which shrank the subset even for size = 1)
    rng = random if seed is None else random.Random(seed)
    users = rng.sample(users_list, n_selected)
    ratings = ratings[ratings.user_id.isin(users)]

    # hold out latest review for each user by review date; head(1) takes the
    # whole latest row, whereas groupby.first() takes the first non-null value
    # of each column separately and could blend several reviews
    latest_first = ratings.sort_values(by = ['date'], ascending = False)
    holdout_set = latest_first.groupby('user_id').head(1).sort_values(by = ['user_id'])

    # same layout as before: user_id leads the columns, fresh 0..n-1 index
    leading = ['user_id'] + [col for col in holdout_set.columns if col != 'user_id']
    holdout_set = holdout_set[leading].reset_index(drop = True)
    holdout_reviews = holdout_set['review_id'].values.tolist()

    # construct train set from all reviews and test set from holdout review
    ratings_train = ratings[~ratings.review_id.isin(holdout_reviews)]
    ratings_test = holdout_set

    return ratings_train, ratings_test

In [161]:
# function calls
# ratings, business, checkin, user, tips = get_yelp_data()

# user = add_user_features(user, ratings, tips)
# business = add_item_features(business, checkin)
# ratings = add_features_to_ratings(ratings, user, business)
# ratings_train, ratings_test = train_test_split(size = 1)