### Feature Engineering

In [2]:
import json
import pickle
import random
import itertools
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import Counter
from datetime import datetime, timedelta

import import_ipynb
import data_acquisition

from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer

importing Jupyter notebook from data_acquisition.ipynb


#### 1. Invoke Data Acquisition Functions

In [3]:
def get_yelp_data():
    """Fetch the raw Yelp tables through the ``data_acquisition`` module.

    Calls ``get_ratings_data``, ``get_business_data``, ``get_checkin_data``,
    ``get_user_data`` and ``get_tips_data`` in that order.

    Returns
    -------
    tuple
        ``(ratings, business, checkin, user, tips)`` exactly as returned by
        the corresponding ``data_acquisition`` functions.
    """
    # table names in the order callers unpack them
    table_names = ("ratings", "business", "checkin", "user", "tips")

    loaded = []
    for table in table_names:
        loader = getattr(data_acquisition, "get_" + table + "_data")
        loaded.append(loader())

    return tuple(loaded)

#### 2. Define User Features

In [4]:
def add_user_features(user, ratings, tips):
    """Build the per-user feature table for active users and persist it.

    Parameters
    ----------
    user : pandas.DataFrame
        User table. The code reads ``user_id``, ``review_count``, ``friends``,
        ``elite``, ``yelping_since``, ``fans`` and the eleven ``compliment_*``
        columns named in ``compliment_columns`` below. The frame passed in is
        not modified.
    ratings : pandas.DataFrame
        Review table with ``user_id`` and ``date`` columns.
    tips : pandas.DataFrame
        Tip table with ``user_id`` and ``compliment_count`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per active user with the added columns ``review_count_norm``,
        ``compliment_score``, ``friends_norm``, ``elite_count``,
        ``user_lifetime``, ``compliment_count`` and ``fans_norm``. Duplicates
        are dropped and missing values are filled with 0. The same frame is
        written to ``data/user_feature_set.pkl``.
    """
    # NOTE(review): unpickling executes arbitrary code; only load files
    # produced by this project's own pipeline.
    with open("data/active_users_list.pkl", "rb") as handle:
        active_users = pickle.load(handle)
    active_users = active_users[0].values.tolist()

    # subsetting for active users; copy + fresh 0..n-1 index so the caller's
    # frame is untouched and positional arrays line up with the rows
    user = user[user.user_id.isin(active_users)].copy()
    user.reset_index(drop = True, inplace = True)

    # for each user, date of last review and total tips provided
    last_user_review = ratings.groupby(['user_id'], sort = False)['date'].max().reset_index()
    user_tips_count = tips.groupby(['user_id'], sort = False)['compliment_count'].sum().reset_index()

    user = user.replace('', np.nan)

    # create a minimum and maximum processor object
    min_max_scaler = preprocessing.MinMaxScaler()

    def min_max(frame, column):
        # Return a flat array so the assignment is positional. Assigning the
        # scaler output wrapped in a new DataFrame aligns on index labels and
        # silently scrambles values whenever the index is not 0..n-1.
        scaled = min_max_scaler.fit(frame[[column]]).transform(frame[[column]])
        return np.asarray(scaled).ravel()

    #1. normalizing review attributes
    user['review_count_norm'] = min_max(user, 'review_count')

    #2. compliments per review count
    # The columns are addressed by name: a positional slice breaks as soon as
    # the column layout changes, and the previous ten-wide slice did not cover
    # all eleven compliment columns that are dropped below.
    compliment_columns = ['compliment_hot', 'compliment_more', 'compliment_profile', 'compliment_cute',
                          'compliment_list', 'compliment_note', 'compliment_plain', 'compliment_cool',
                          'compliment_funny', 'compliment_writer', 'compliment_photos']

    # a zero review count becomes NaN so the ratio ends up as 0 after the
    # final fillna instead of +/-inf
    reviews = user['review_count'].replace(0, np.nan)
    for col in compliment_columns:
        user[col] = user[col] / reviews

    # skipna=False keeps a NaN score when any component is NaN
    user['compliment_score'] = user[compliment_columns].sum(axis = 1, skipna = False)
    user = user.drop(columns = compliment_columns)

    #3. total friends for each user normalized
    # missing friend lists (NaN after the '' replacement above) count as 0
    user['friends'] = user['friends'].apply(lambda x : len(x.split(',')) if isinstance(x, str) else 0)
    user['friends_norm'] = min_max(user, 'friends')

    #4. total elite years for each user
    def elite_count(x):
        years = str(x).split(',')
        return 0 if years[0] == 'nan' else len(years)

    user['elite_count'] = user['elite'].apply(elite_count)

    #5. yelping since number of days
    user['yelping_since'] = (pd.to_datetime(user['yelping_since'])).dt.normalize()
    user = user.merge(last_user_review, on = 'user_id', how = 'left')
    # 'coerce' turns unparseable or missing dates into NaT so the subtraction
    # below always operates on datetimes
    user['date'] = pd.to_datetime(user['date'], errors = 'coerce', format = '%Y-%m-%d %H:%M:%S')
    user['user_lifetime'] = (user['date'] - user['yelping_since']).dt.days

    #6. tip count normalized for each user
    user = user.merge(user_tips_count, on = 'user_id', how = 'left')
    user['compliment_count'] = user['compliment_count'] / user['review_count'].replace(0, np.nan)

    #7. total fans of each user normalized
    user['fans_norm'] = min_max(user, 'fans')

    # removing user features not needed
    user = user.drop(columns = ['friends', 'fans', 'yelping_since', 'elite', 'date'])
    user = user.drop_duplicates()
    user = user.fillna(0)

    user.to_pickle("data/user_feature_set.pkl")
    return user

#### 3. Define Business Features

In [5]:
def add_item_features(business, checkin):
    """Build the feature table for open Las Vegas restaurants and persist it.

    Parameters
    ----------
    business : pandas.DataFrame
        Business table. The code reads ``business_id``, ``is_open``, ``city``,
        ``categories`` and ``hours``.
        # assumes ``categories`` holds a list of category names per row (a
        # comma-separated string is also accepted) — TODO confirm against
        # data_acquisition
        # assumes ``hours`` stringifies to "{'Day': 'H:M-H:M', ...}" — TODO confirm
    checkin : pandas.DataFrame
        Check-in table with ``business_id`` and a ``date`` column holding
        ``', '``-separated ``%Y-%m-%d %H:%M:%S`` timestamps. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per retained business, without ``city``, ``categories``,
        ``hours`` and ``is_open``, and with ``total_hours``, one 0/1 indicator
        column for each of the ten most common categories,
        ``total_checkins`` and ``age_of_business`` (days between first and
        last check-in). Missing values are filled with 0. The same frame is
        written to ``data/business_feature_set.pkl``.
    """
    def as_category_list(raw):
        # split a comma-separated string so categories are counted per name,
        # not per character; list-like values are used as they are
        if isinstance(raw, str):
            return [part.strip() for part in raw.split(',')]
        return list(raw)

    def weekly_open_hours(raw_hours):
        # flatten "{'Mon': 'a-b', 'Tue': 'c-d'}" into alternating day/span tokens
        tokens = ' '.join(' '.join(str(raw_hours).split(', ')).split(': ')).split()
        spans = tokens[1::2]
        if not spans:
            # nothing that looks like a day/span pair ('None', '', 'nan', ...)
            return 0
        spans[-1] = spans[-1][:-1]  # strip the closing brace from the last span

        total = 0
        for span in spans:
            opens, closes = span[1:-1].split('-')[:2]  # [1:-1] strips the quotes
            elapsed = datetime.strptime(closes, '%H:%M') - datetime.strptime(opens, '%H:%M')
            # timedelta.seconds is always within [0, 86400), so a span that
            # closes after midnight wraps around without extra handling
            # NOTE(review): identical opening and closing times contribute 0
            # hours — confirm whether that should mean "open 24 hours"
            total += elapsed.seconds // 3600
        return total

    # sampling for open Las Vegas businesses in the 'Restaurants' category;
    # copy so the new columns are not written into a view of the caller's frame
    keep = (
        (business['is_open'] == 1)
        & (business['city'] == 'Las Vegas')
        & business['categories'].apply(lambda raw: 'Restaurants' in str(raw))
    )
    business = business[keep].copy()
    business.reset_index(inplace = True, drop = True)

    # total hours open per week for each business
    business['total_hours'] = business['hours'].apply(weekly_open_hours).astype(float)

    # subset for 10 most common business categories by count
    category_lists = business['categories'].apply(as_category_list)
    category_counts = Counter()
    for categories in category_lists:
        category_counts.update(categories)
    category_name = [category for category, count in category_counts.most_common(10)]

    # one-hot encoding to indicate presence of top 10 category for each business
    for category in category_name:
        business[category] = [1.0 if category in categories else 0.0 for categories in category_lists]

    business = business.drop(columns = ['city', 'categories', 'hours', 'is_open'])

    # finding first, last and total checkins for each business
    # (ISO-formatted timestamps sort chronologically as plain strings)
    visits = checkin['date'].apply(lambda stamps: str(stamps).split(', '))
    first_checkin = pd.to_datetime(visits.apply(min), format = '%Y-%m-%d %H:%M:%S')
    last_checkin = pd.to_datetime(visits.apply(max), format = '%Y-%m-%d %H:%M:%S')

    # computing age of business in terms of first and last checkin;
    # built on a new frame so the caller's check-in table keeps its 'date' column
    checkin_features = checkin.drop(columns = ['date']).assign(
        total_checkins = visits.apply(len),
        age_of_business = (last_checkin - first_checkin).dt.days,
    )

    business = business.merge(checkin_features, how = 'left', on = 'business_id')
    business = business.drop_duplicates()
    business = business.fillna(0)

    business.to_pickle("data/business_feature_set.pkl")
    return business

#### 4. Map User & Business Features to Reviews

In [6]:
def add_features_to_ratings(ratings, user, business):
    """Attach user and business features to every rating and persist the result.

    The ratings are inner-joined with ``user`` on ``user_id`` and with
    ``business`` on ``business_id``. Businesses with fewer than 5 joined
    ratings are removed first, then users with fewer than 5 of the remaining
    ratings. Duplicates are dropped, missing values are filled with 0, and the
    frame is written to ``data/ratings_feature_set.pkl`` before being returned.
    """
    def keep_frequent(frame, key):
        # retain only rows whose `key` value occurs at least 5 times
        occurrences = frame[key].value_counts()
        frequent = occurrences.loc[occurrences >= 5].index.tolist()
        return frame[frame[key].isin(frequent)]

    # imputing user and business attributes to ratings
    enriched = ratings.merge(user, on = 'user_id').merge(business, on = 'business_id')

    # active businesses first, then active users among what is left
    enriched = keep_frequent(enriched, 'business_id')
    enriched = keep_frequent(enriched, 'user_id')

    enriched = enriched.drop_duplicates().fillna(0)

    enriched.to_pickle("data/ratings_feature_set.pkl")
    return enriched

#### 5. Split Reviews Data into Train, Validation, Test

In [7]:
def train_validation_test_split(years = 1, random_state = None):
    """Split the persisted ratings feature set into train, validation and test.

    Reads ``data/ratings_feature_set.pkl``, keeps the ``years`` most recent
    calendar years found in ``date``, and keeps only businesses and then users
    with at least 5 ratings in that window. For every user the most recent
    rating becomes the test row, two randomly chosen remaining ratings become
    validation rows, and everything else is training data.

    Parameters
    ----------
    years : int
        Number of most recent years (by the ``YYYY`` prefix of ``date``) to keep.
    random_state : int or None
        Seed for the validation draw. With ``None`` (default) NumPy's global
        random state is used, as before, so ``np.random.seed`` still applies.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ratings_train, ratings_validation, ratings_test)``. Each frame is
        also written to ``data/ratings_<split>_<years>_years.pkl``.
    """
    # NOTE(review): unpickling executes arbitrary code; only load files
    # produced by this project's own pipeline.
    with open("data/ratings_feature_set.pkl", "rb") as handle:
        ratings = pickle.load(handle)

    # subsetting ratings for specified number of years; the year is kept in a
    # separate Series so no helper column has to be added and dropped again
    review_year = ratings['date'].astype(str).str.split('-').str[0]
    years_list = sorted(set(review_year), reverse = True)[0:years]
    ratings = ratings[review_year.isin(years_list)]

    # subsetting active businesses based on >=5 ratings
    business_ratings_count = ratings['business_id'].value_counts()
    active_business = business_ratings_count.loc[business_ratings_count >= 5].index.tolist()
    ratings = ratings[ratings.business_id.isin(active_business)]

    # subsetting active users based on >=5 ratings
    user_ratings_count = ratings['user_id'].value_counts()
    active_users = user_ratings_count.loc[user_ratings_count >= 5].index.tolist()
    ratings = ratings[ratings.user_id.isin(active_users)]

    # hold out latest review for each user by review date; the stable sort
    # makes the pick deterministic when a user has several reviews on one date
    # NOTE(review): groupby().first() takes the first non-null value per
    # column, which equals the first row only while the data has no NaNs
    newest_first = ratings.sort_values(by = ['date'], ascending = False, kind = 'mergesort')
    holdout_set = newest_first.groupby('user_id').first().reset_index()
    holdout_reviews = holdout_set['review_id'].values.tolist()

    # construct train set from all reviews and test set from holdout review
    ratings_train = ratings[~ratings.review_id.isin(holdout_reviews)]
    ratings_test = holdout_set

    # select two ratings per user for validation set (without replacement);
    # index labels are drawn per user and the rows are looked up in one go
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    validation_labels = []
    for _, user_ratings in ratings_train.groupby('user_id'):
        validation_labels.extend(rng.choice(user_ratings.index, 2, False))
    ratings_validation = ratings_train.loc[validation_labels].reset_index(drop = True)

    # excluding validation set data from training set
    validation_reviews = ratings_validation['review_id'].values.tolist()
    ratings_train = ratings_train[~ratings_train.review_id.isin(validation_reviews)]

    # persisting pickle object for specified years
    ratings_train.to_pickle("data/ratings_train_"+str(years)+"_years.pkl")
    ratings_validation.to_pickle("data/ratings_validation_"+str(years)+"_years.pkl")
    ratings_test.to_pickle("data/ratings_test_"+str(years)+"_years.pkl")

    return ratings_train, ratings_validation, ratings_test

#### 6. Identify Possible Recommendation Options for Users

In [8]:
def user_recommendation_options(ratings):
    """List every (user, business) pair that has no rating yet.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Ratings with ``user_id`` and ``business_id`` columns. The frame is
        only read; its index is left as it is.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``business_id``: the cartesian product of the
        distinct users and businesses in ``ratings`` minus the pairs that
        already occur. Rows follow the order in which users, then businesses,
        first appear in ``ratings``, so repeated runs give the same frame.
        The result is also written to
        ``data/ratings_recommendation_list_full.pkl``.
    """
    # dict.fromkeys de-duplicates while preserving first-appearance order
    all_users = list(dict.fromkeys(ratings.user_id))
    all_business = list(dict.fromkeys(ratings.business_id))

    # pairs already rated, as a set for O(1) membership tests
    ratings_reviewed = set(zip(ratings.user_id, ratings.business_id))

    # stream the product instead of materialising it as a list and as a set
    candidate_pairs = [
        pair
        for pair in itertools.product(all_users, all_business)
        if pair not in ratings_reviewed
    ]
    ratings_recommend = pd.DataFrame(candidate_pairs, columns = ['user_id', 'business_id'])

    ratings_recommend.to_pickle("data/ratings_recommendation_list_full.pkl")

    return ratings_recommend

In [46]:
def subset_recommendations(ratings_train, ratings_recommend, n_users = 4000, n_businesses = 500, seed = None):
    """Restrict the recommendation candidates to a random sample of users and businesses.

    Parameters
    ----------
    ratings_train : pandas.DataFrame
        Training ratings; the distinct ``user_id`` and ``business_id`` values
        form the populations that are sampled from.
    ratings_recommend : pandas.DataFrame
        Candidate pairs with ``user_id`` and ``business_id`` columns.
    n_users, n_businesses : int
        Number of distinct users / businesses to keep. When a population is
        smaller than the requested size, all of it is kept.
    seed : int or None
        Seed for a private random generator. With ``None`` (default) the
        module-level ``random`` state is used.

    Returns
    -------
    pandas.DataFrame
        Rows of ``ratings_recommend`` whose user and business were both
        sampled. The frame is also written to
        ``data/ratings_recommendation_list.pkl``.
    """
    all_users = ratings_train['user_id'].dropna().unique().tolist()
    all_businesses = ratings_train['business_id'].dropna().unique().tolist()

    # randomly selecting n_users and n_businesses WITHOUT replacement:
    # random.choices draws with replacement and therefore returns duplicates,
    # which silently shrinks the number of distinct ids that are kept
    rng = random if seed is None else random.Random(seed)
    sample_users = rng.sample(all_users, min(n_users, len(all_users)))
    sample_businesses = rng.sample(all_businesses, min(n_businesses, len(all_businesses)))

    # generating smaller list for recommendation
    selected = (
        ratings_recommend.user_id.isin(sample_users)
        & ratings_recommend.business_id.isin(sample_businesses)
    )
    ratings_recommend = ratings_recommend[selected]

    ratings_recommend.to_pickle("data/ratings_recommendation_list.pkl")

    return ratings_recommend

In [47]:
# function calls: uncomment the lines below to run the full feature-engineering pipeline end to end
# ratings, business, checkin, user, tips = get_yelp_data()

# user = add_user_features(user, ratings, tips)
# business = add_item_features(business, checkin)
# ratings = add_features_to_ratings(ratings, user, business)
# ratings_train, ratings_validation, ratings_test = train_validation_test_split(5)

# ratings_recommend = user_recommendation_options(ratings_train)
# ratings_recommend_subset = subset_recommendations(ratings_train, ratings_recommend)