In [10]:
import json
import pandas as pd

from tqdm import tqdm
from datetime import datetime, timedelta

import import_ipynb
import data_acquisition

from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer

In [4]:
def get_data():
    ratings = data_acquisition.get_ratings_data()
    business = data_acquisition.get_business_data()
    checkin = data_acquisition.get_checkin_data()
    user = data_acquisition.get_user_data()
    tips = data_acquisition.get_tips_data()
    
    return ratings, business, checkin, user, tips

In [5]:
ratings, business, checkin, user, tips = get_data()

100%|██████████| 6685900/6685900 [00:46<00:00, 144030.59it/s]
100%|██████████| 192609/192609 [00:03<00:00, 63641.78it/s]
100%|██████████| 161950/161950 [00:01<00:00, 128928.77it/s]
100%|██████████| 1637138/1637138 [00:21<00:00, 75043.73it/s]
100%|██████████| 1223094/1223094 [00:04<00:00, 261114.83it/s]


In [6]:
def add_user_features(user, ratings, tips):
    last_user_review = ratings.groupby(['user_id'], sort=False)['date'].max().reset_index()
    user_tips_count = tips.groupby(['user_id'], sort=False)['compliment_count'].sum().reset_index()
    
    # Additional variables  
    user=user.replace('',np.nan)
    
    # Create a minimum and maximum processor object
    min_max_scaler = preprocessing.MinMaxScaler()    
    
    #1. Normalizing reviews    
    review_norm_mdl = min_max_scaler.fit(user[['review_count']])
    user['review_count_norm'] = pd.DataFrame(review_norm_mdl.transform(user[['review_count']])) 
    
    #2. Compliments per review_cnt index
    for col in user.columns[6:17] :
        user[col]=user[col]/user['review_count']
        
    #3. Friends
    user['friends'] = user['friends'].apply(lambda x : len(x.split(','))) 
    friends_norm_mdl = min_max_scaler.fit(user[['friends']])
    user['friends_norm'] = pd.DataFrame(friends_norm_mdl.transform(user[['friends']])) 
     
    #4. Elite Cnt Years
    def elite_cnt(x):
        y=str(x).split(',')
        if (y[0] == 'nan') :
            cnt = 0
        else:
            cnt = len(y)  
        return cnt, y
    
    user['elite_intm'] = user['elite'].apply(lambda x : elite_cnt(x))
    user['elite_cnt'] = [x[0] for x in user['elite_intm']]
    user['elite_year'] = [x[1] for x in user['elite_intm']]
    
    mlb = MultiLabelBinarizer()
    user = user.join(pd.DataFrame(mlb.fit_transform(user.pop('elite_year')),
                                  columns='elite_' + mlb.classes_, index=user.index))
    
    user.drop(['elite_nan'], axis=1, inplace=True)
    
    
    #5. Yelping since #days
    user['yelping_since'] = (pd.to_datetime(user['yelping_since'])).dt.normalize()
    user = user.merge(last_user_review,on='user_id',how='left')
    user['user_lifetime'] = user['date']-user['yelping_since']
    
    '''
    Need to 1st check how many users are there which have no user_lifetime value and what to do with it.
    '''
    #user.drop(user[user.user_lifetime <=0].reset_index, inplace=True)
    
    #6. Tip count
    user = user.merge(user_tips_count, on='user_id', how='left')
    user['compliment_count'] = user['compliment_count']/user['review_count']
    
    #7. Fans
    fans_norm_mdl = min_max_scaler.fit(user[['fans']])
    user['fans_norm'] = pd.DataFrame(fans_norm_mdl.transform(user[['fans']])) 
    
    # Subsetting the user features
    user.drop(['review_count','friends','fans','yelping_since','elite','elite_intm','date'], axis=1, inplace=True)

    return user

In [7]:
def add_item_features(business, checkin):
    all_categories = []
    business['hours'] = business['hours'].apply(lambda x: ' '.join(' '.join(str(x).split(', ')).split(': ')).split())
    
    for i in tqdm(range(len(business))):
        all_categories.extend(business.loc[i, 'categories'])
        
        if business.loc[i, 'hours'] == ['None'] or []:
            business.loc[i, 'total_hours'] = 0
        else:
            open_hours = business.loc[i, 'hours'][1::2]
            open_hours[-1] = open_hours[-1][:-1]
            open_hours = [x[1:len(x)-1].split('-') for x in open_hours]
            total_hours = 0

            for j in range(len(open_hours)):
                delta = datetime.strptime(open_hours[j][1], '%H:%M') - datetime.strptime(open_hours[j][0], '%H:%M')
                if delta <= timedelta(days = -1, hours = 23):
                    delta = delta - timedelta(days=-1)

                delta = delta.seconds//3600
                total_hours = total_hours + delta
            business.loc[i, 'total_hours'] = total_hours
    
    all_categories = list(set(all_categories))
    category_features = pd.DataFrame(index = range(0, len(business)), columns = all_categories)
    
    for i in tqdm(range(len(business))):
        for j in range(len(business.loc[i, 'categories'])):
            category_features.loc[i, business.loc[i, 'categories'][j]] = 1
    
    checkin['total_checkins'] = checkin['date'].apply(lambda x: len(str(x).split(', ')))
    checkin['first_checkin'] = checkin['date'].apply(lambda x: min(str(x).split(', ')))
    checkin['last_checkin'] = checkin['date'].apply(lambda x: max(str(x).split(', ')))
    
    for i in tqdm(range(len(checkin))):
        last_checkin = datetime.strptime(str(checkin.loc[i, 'last_checkin']), '%Y-%m-%d %H:%M:%S').date()
        first_checkin = datetime.strptime(str(checkin.loc[i, 'first_checkin']), '%Y-%m-%d %H:%M:%S').date()
        
        checkin.loc[i, 'age_of_business'] = (last_checkin - first_checkin).days
            
    # Pending - To join business and checkin
    return business

In [11]:
user = add_user_features(user, ratings, tips)

TypeError: ufunc subtract cannot use operands with types dtype('<U19') and dtype('<M8[ns]')