In [18]:
from collections import Set, OrderedDict
class OrderedSet(Set):
    """Set whose iteration order is the order in which elements were first seen.

    Elements are kept as the keys of an ``OrderedDict`` stored on ``self.d``,
    so repeated values in ``iterable`` collapse onto their first occurrence.
    Only ``__contains__``, ``__iter__`` and ``__len__`` are defined here; the
    comparison and set operators are inherited from the ``Set`` base class.
    """

    def __init__(self, iterable=()):
        # Only the keys carry information; every value is None.
        self.d = OrderedDict((element, None) for element in iterable)

    def __contains__(self, element):
        return element in self.d

    def __iter__(self):
        for element in self.d:
            yield element

    def __len__(self):
        return len(self.d)
    
# SQL template that counts, per model, the joined variant/listing rows whose
# listing is inventory (l.is_inventory = 1). It has three str.format
# placeholders, filled in by get_trending_popular_models:
#   {trending_models}  - parenthesised list of model ids for the IN filter
#   {is_trending}      - optional extra "AND ..." clause (or blank)
#   {popular_listings} - parenthesised list of listing ids for the IN filter
# Rows come back ordered by the count, largest first.
TRENDING_POPULAR_MODELS_QUERY = """SELECT v.model_id, COUNT(v.model_id) AS num
                                    FROM variants v
                                        JOIN listings l ON v.id = l.variant_id
                                        JOIN models m ON m.id = v.model_id
                                    WHERE l.is_inventory = 1
                                        AND v.model_id IN {trending_models}
                                        {is_trending}
                                        AND l.id IN {popular_listings}
                                    GROUP BY v.model_id
                                    ORDER BY num DESC"""

# Model ids substituted into {trending_models}. get_trending_popular_models
# also appends the ids from this list that the query did not return, in this
# order, to the end of its result.
TRENDING_MODELS = [217, 471, 209, 468, 182, 76, 242, 464, 132, 463, 306, 260, 438, 314, 379, 202, 248, 449, 301, 363,
                   365, 249, 70, 513]

# Distinct (city_id, listing_id) pairs from supersaver_listings, restricted to
# the rows created on the most recent created_at date present in that table.
SUPER_SAVER_LISTINGS_QUERY = """SELECT DISTINCT city_id, listing_id FROM supersaver_listings WHERE DATE(created_at) = (SELECT MAX(DATE(created_at)) FROM supersaver_listings)"""

# Normalise a frame before it is turned into a matrix: fresh index, NaN -> 0, int32 cells
def drop_fill(df):
    """Return a copy of ``df`` with a 0..n-1 index, NaNs set to 0 and all columns as int32.

    Despite the name, no rows (duplicate or otherwise) are dropped; only the
    old index is discarded.
    """
    reindexed = df.reset_index(drop=True)
    without_nans = reindexed.fillna(0)
    return without_nans.astype('int32')

In [19]:
import copy
import logging
import os
import pickle
import random
from collections import deque
from copy import deepcopy
from datetime import datetime

import pymysql
import numpy as np
import pandas as pd
import redis
import sentry_sdk
import urlparse
from ConfigParser import ConfigParser
from lightfm import LightFM
from scipy.sparse import coo_matrix, identity, hstack
from sentry_sdk.integrations.redis import RedisIntegration
from sklearn.preprocessing import MinMaxScaler

date_today = datetime.today().date()

In [20]:
# DB connection
# NOTE(review): the credentials used to be hard-coded here. The literals are
# kept only as fallbacks so existing runs behave exactly as before; set the
# TRUEBIL_DB_* environment variables, rotate the password, then delete them.
connection = pymysql.connect(
    passwd=os.environ.get("TRUEBIL_DB_PASSWORD", "newreadpassword"),
    db=os.environ.get("TRUEBIL_DB_NAME", "truebil"),
    host=os.environ.get("TRUEBIL_DB_HOST", "db.truebil.com"),
    user=os.environ.get("TRUEBIL_DB_USER", "truebil-read"))

# Buyer filters: distinct (buyer_id, filter) rows whose filter differs from ' ',
# indexed by buyer_id so that .loc[buyer_id] yields every filter of that buyer.
buyer_filters = pd.read_sql_query(
    sql="""SELECT DISTINCT buyer_id, filter FROM buyer_filters WHERE filter != ' ' ORDER BY created_at desc""",
    con=connection,
).set_index('buyer_id', drop=True)

unique_buyers = np.unique(buyer_filters.index)

# Schema of a per-buyer feature row: the id, then six price slabs, four body
# types and two fuel types.
user_features_columns = (
    ['buyer_id']
    + ['price_b_2', 'price_bw_2_3', 'price_bw_3_4', 'price_bw_4_5', 'price_bw_5_8', 'price_a_8']
    + ['Hatchback', 'Sedan', 'MPV', 'SUV']
    + ['Petrol', 'Diesel'])
price_slabs = [column for column in user_features_columns if column.startswith('price_')]
user_features = pd.DataFrame(columns=user_features_columns, dtype='int32')

# Lookup tables used while decoding filters: numeric ids from the filter string
# -> feature column names, and the [min, max] bounds of each price slab (one
# row per entry of price_slabs, in the same order).
body_types = {1: "Hatchback", 2: "Sedan", 3: "MPV", 4: "SUV"}
fuel_types = {1: "Petrol", 2: "Diesel"}
price_slabs_df = pd.DataFrame(
    {'min': [0, 200000, 300000, 400000, 500000, 800000],
     'max': [200000, 300000, 400000, 500000, 800000, float("inf")]},
    columns=['min', 'max'])

# Build one feature row per buyer from the union of all of that buyer's filters.
# Rows are collected in `all_users` and turned into a DataFrame once at the end;
# appending to the DataFrame inside the loop copied the whole frame on every
# iteration and left the column order up to DataFrame.append.
all_users = []
for buyer_id in unique_buyers:
    feature_dict = {key: 0 for key in user_features_columns}
    feature_dict["buyer_id"] = buyer_id

    # .loc returns a Series when the buyer has several filters and a bare
    # scalar when there is exactly one. Testing for Series (instead of
    # `type(...) is str`) also wraps unicode values, which would otherwise be
    # iterated character by character.
    buyer_filter = buyer_filters.loc[buyer_id, 'filter']
    if not isinstance(buyer_filter, pd.Series):
        buyer_filter = [buyer_filter]

    # Union of the values seen across all filters; a filter that does not
    # mention a feature contributes a 0 for it.
    feature_data = {'price_max': [], 'price_min': [], 'fuel': [], 'body_type': []}
    for filter_string in buyer_filter:
        parsed_filter = urlparse.parse_qs(filter_string)
        for feature, collected in feature_data.items():
            if feature in parsed_filter:
                # Only the first occurrence of the parameter is read; it holds
                # a comma separated list of numbers (possibly written as floats).
                collected.extend([int(float(value)) for value in parsed_filter[feature][0].split(',')])
            else:
                collected.append(0)

    for fuel_type in set(feature_data['fuel']):
        if fuel_type in fuel_types:
            feature_dict[fuel_types[fuel_type]] = 1

    for body_type in set(feature_data['body_type']):
        if body_type in body_types:
            feature_dict[body_types[body_type]] = 1

    # Set all price slabs between min and max price range
    price_max = max(feature_data['price_max'])
    price_slabs_df['price_check'] = 0
    if price_max != 0:
        price_min = min(feature_data['price_min'])  # price_min is always available when price_max is available
        max_price_slab_index = price_slabs_df[(price_slabs_df['min'] < price_max) &
                                              (price_slabs_df['max'] >= price_max)].index[0]
        min_price_slab_index = price_slabs_df[(price_slabs_df['min'] <= price_min) &
                                              (price_slabs_df['max'] > price_min)].index[0]
        range_indices = list(range(min_price_slab_index, int(max_price_slab_index) + 1))
        price_slabs_df.loc[range_indices, 'price_check'] = 1
        feature_dict.update(dict(zip(price_slabs, price_slabs_df['price_check'].tolist())))

    all_users.append(feature_dict)

# `columns=` pins the column order, which later code relies on positionally.
user_features = pd.DataFrame(all_users, columns=user_features_columns)


# Reset the index, replace NaN with 0 and cast every column to int32.
# NOTE(review): this repeats, word for word, the drop_fill defined near the
# top of the file; this later definition is the one in effect from here on.
def drop_fill(df):
    """Return ``df`` re-indexed 0..n-1 with NaNs filled by 0 and cast to int32 (no rows are dropped)."""
    cleaned = df.reset_index(drop=True)
    cleaned = cleaned.fillna(0)
    return cleaned.astype('int32')

# Data retrieval

# Item features: one row per listing with, in order, listing_id, city_id, nine
# price-band flags, four body-type flags, two fuel-type flags, status and
# is_inventory. Listings without a locality come from the second half of the
# UNION and get a fixed default row (city 1, status 2, not inventory).
item_features = pd.read_sql_query(
    "(select listings.id as listing_id, localities.city_id, ifnull(price, 0)<200000, ifnull(price, 0) between 100000 and 300000 , ifnull(price, 0) between 200000 and 400000 , ifnull(price, 0) between 300000 and 500000, ifnull(price, 0) between 400000 and 600000,ifnull(price, 0) between 500000 and 700000 ,ifnull(price, 0) between 600000 and 800000, ifnull(price, 0) between 700000 and 900000, ifnull(price, 0)>900000, ifnull(listings.body_type_id=1,0), ifnull(listings.body_type_id=2, 0),ifnull(listings.body_type_id=3, 0), ifnull(listings.body_type_id=4, 0), listings.fuel_type_id_primary=1, listings.fuel_type_id_primary=2, ifnull(cast(listings.status as unsigned), 2) as status, ifnull(cast(listings.is_inventory as unsigned), 0) as is_inventory from listings join  localities on listings.locality_id = localities.id) union (select listings.id, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 2, 0 from listings where locality_id is null) ", connection)

# Active listings (status 3 or 9) and their features, re-indexed from 0
active_listings = item_features.query("status in (3, 9)").reset_index(drop=True)

# Largest listing id seen in the features and largest buyer id in the database
max_listing_id = item_features['listing_id'].max()
max_buyer_id = pd.read_sql_query("(select max(id) as id from buyers)", connection)['id'][0]

def _read_dated_interactions(query):
    """Run ``query`` and return its rows with ``created_at`` replaced by an age in days.

    The query must return a ``created_at`` column. Rows containing NULLs and
    rows whose ``created_at`` equals '0000-00-00' are discarded. The returned
    frame has the remaining query columns plus ``days``, computed as
    ``date_today - created_at``.

    Uses the module-level ``connection`` and ``date_today``.
    """
    frame = pd.read_sql_query(query, connection).dropna()
    # .copy() makes the filtered rows an independent frame, so adding 'days'
    # below does not write into a slice of the query result.
    frame = frame[frame['created_at'] != '0000-00-00'].copy()
    frame['days'] = (date_today - frame.loc[:, 'created_at']).dt.days
    return frame.drop('created_at', axis=1)


# CRFs
crfs = _read_dated_interactions(
    "(SELECT buyer_id, listing_id,date(buyer_listings.created_at) as created_at FROM buyer_listings JOIN listings ON buyer_listings.listing_id = listings.id JOIN "
    "buyers ON buyer_listings.buyer_id = buyers.id)")

# Offers
offers = _read_dated_interactions(
    "(SELECT buyer_id, listing_id,date(buyer_listing_offers.created_at) as created_at FROM buyer_listing_offers JOIN listings ON "
    "buyer_listing_offers.listing_id = listings.id JOIN "
    "buyers ON buyer_listing_offers.buyer_id = buyers.id)")

# Is seen
seen = _read_dated_interactions(
    "(SELECT buyers.id AS buyer_id, user_seens.listing_id, date(user_seens.created_at) as created_at FROM "
    "user_seens JOIN users ON user_seens.user_id=users.id JOIN buyers ON users.mobile=buyers.mobile WHERE user_seens.seen_count>0)")

# Is Shortlisted
shortlisted = _read_dated_interactions(
    "(SELECT buyers.id AS buyer_id, user_shortlists.listing_id, date(user_shortlists.created_at) as created_at "
    "FROM user_shortlists JOIN users ON user_shortlists.user_id=users.id JOIN buyers ON users.mobile=buyers.mobile WHERE user_shortlists.is_shortlisted=1)")

# Invalid Numbers: ids of buyers whose mobile appears in invalid_numbers
invalid_numbers = pd.read_sql_query(
    "(SELECT buyers.id FROM buyers JOIN invalid_numbers ON buyers.mobile=invalid_numbers.mobile)", connection).astype(
    'int32')

# Data cleaning

# Give every listing id in 1..max_listing_id a row. Ids the query did not
# return get the same default feature values that the SQL uses for listings
# without a locality.
item_indices = np.arange(1, max_listing_id + 1)
item_ids = item_features['listing_id'].values
missing_listing_ids = np.setdiff1d(item_indices, item_ids)

item_features = item_features.append(
    pd.DataFrame([[listing_id, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 2, 0]
                  for listing_id in missing_listing_ids],
                 columns=item_features.columns),
    ignore_index=True)
item_features = drop_fill(item_features.sort_values('listing_id'))

# Same for buyers: ids in 1..max_buyer_id without any stored filter get an
# all-zero feature row.
user_indices = np.arange(1, max_buyer_id + 1)
user_ids = user_features['buyer_id'].values
missing_buyer_ids = np.setdiff1d(user_indices, user_ids)

user_features = user_features.append(
    pd.DataFrame([[buyer_id] + [0] * 12 for buyer_id in missing_buyer_ids],
                 columns=user_features.columns),
    ignore_index=True)
user_features = drop_fill(user_features.sort_values('buyer_id'))

# Item feature matrix for training: an identity block (one indicator column per
# listing) followed by the shared feature columns. listing_id, city_id, status
# and is_inventory (first two / last two columns) are left out.
item_features_for_model = coo_matrix(item_features.values[:, 2:-2])
item_features_csr = identity(max_listing_id).tocoo()
item_features_final = hstack([item_features_csr, item_features_for_model]).tocsr()

# User feature matrix for training: identity block plus every column after buyer_id.
user_features_for_model = coo_matrix(user_features.values[:, 1:])
user_features_csr = identity(max_buyer_id).tocoo()
user_features_final = hstack([user_features_csr, user_features_for_model]).tocsr()

def _recency_weight(days, fresh_weight, floor_weight):
    """Weight that decays linearly with age and never drops below ``floor_weight``.

    ``days`` is a Series of ages in days. The weight is ``fresh_weight`` at age
    0, reaches ``floor_weight`` at 365 days and stays there for anything older.
    """
    decayed = fresh_weight - (days / 365.0) * (fresh_weight - floor_weight)
    return decayed.clip(lower=floor_weight)


def _drop_unknown_buyers(frame):
    """Remove rows whose buyer id exceeds ``max_buyer_id`` or belongs to an invalid number."""
    unusable = (frame.buyer_id > max_buyer_id) | frame.buyer_id.isin(invalid_numbers['id'])
    return frame[~unusable]


def _weighted_pairs(frame):
    """Return an array of (buyer_id, listing_id, weight) rows, first row per pair only."""
    unique_pairs = frame.drop_duplicates(subset=['buyer_id', 'listing_id'])
    # Columns are selected by name so the array layout does not depend on the
    # column order of the incoming frame.
    return unique_pairs[['buyer_id', 'listing_id', 'weight']].values


# Interaction weights by type and recency. Views and shortlists fade to 0 over
# a year; CRFs and offers fade from 1 to 0.25 and then keep 0.25.
# Previously the 0.25 floor for CRFs/offers was only applied once the raw value
# fell below 0, so interactions between 365 and ~486 days old ended up with a
# smaller weight than older ones.
seen['weight'] = _recency_weight(seen['days'], 0.2, 0.0)
shortlisted['weight'] = _recency_weight(shortlisted['days'], 0.5, 0.0)
crfs['weight'] = _recency_weight(crfs['days'], 1.0, 0.25)
offers['weight'] = _recency_weight(offers['days'], 1.0, 0.25)

seen = _drop_unknown_buyers(seen)
shortlisted = _drop_unknown_buyers(shortlisted)
crfs = _drop_unknown_buyers(crfs)
offers = _drop_unknown_buyers(offers)

seen_array = _weighted_pairs(seen)
shortlisted_array = _weighted_pairs(shortlisted)
crfs_array = _weighted_pairs(crfs)
offers_array = _weighted_pairs(offers)

# Stack all interaction types and shift the 1-based ids to 0-based matrix indices.
sample_weights_array = np.concatenate((seen_array, shortlisted_array, crfs_array, offers_array))
sample_weights_array[:, 0:2] = sample_weights_array[:, 0:2] - 1
interactions_array = sample_weights_array.copy()
interactions_array[:, 2] = 1

# The stacked array is float because of the weight column; indices are cast
# back to integers explicitly before they are handed to scipy.
interaction_rows = sample_weights_array[:, 0].astype(np.int64)
interaction_cols = sample_weights_array[:, 1].astype(np.int64)

interactions_matrix = coo_matrix((interactions_array[:, 2], (interaction_rows, interaction_cols)),
                                 shape=(max_buyer_id, max_listing_id))
sample_weights_matrix = coo_matrix((sample_weights_array[:, 2], (interaction_rows, interaction_cols)),
                                   shape=(max_buyer_id, max_listing_id))

# Model training: BPR loss, 30 epochs, with per-interaction sample weights
model = LightFM(loss='bpr').fit(
    interactions_matrix,
    item_features=item_features_final,
    user_features=user_features_final,
    sample_weight=sample_weights_matrix,
    epochs=30)

# Publish the trained model and the feature matrices to the local Redis (db 1)
pool = redis.ConnectionPool(host="127.0.0.1", port=6379, db=1)
r = redis.Redis(connection_pool=pool)
for redis_key, artefact in (('model', model),
                            ('user_features', user_features_final),
                            ('item_features', item_features_final)):
    r.set(redis_key, pickle.dumps(artefact))
r.set('max_buyer_id', max_buyer_id)
r.set('training_interval', 65)

True

In [21]:
 def get_shuffled_inventory(inventory_listings):
    """Rotate the leading entries of ``inventory_listings`` in place and return it.

    The first ``max(5, 5% of the size)`` entries are rotated to the right by a
    pseudo-random amount between 1 and that count. The amount is derived from
    the current hour, so calls within the same hour rotate by the same amount
    for the same input size. Entries after the leading slice are untouched.

    :param inventory_listings: mutable sequence (numpy array or list); modified in place.
    :return: the same object that was passed in.
    """
    inventory_size = np.size(inventory_listings)
    shuffle_num = max(5, int(0.05 * inventory_size))

    # A private generator seeded with the hour produces the same number as
    # random.seed(hour) followed by random.randint(...), but leaves the
    # process-wide `random` state alone.
    rotate = random.Random(datetime.now().hour).randint(1, shuffle_num)

    leading = deque(inventory_listings[:shuffle_num])
    leading.rotate(rotate)
    inventory_listings[:shuffle_num] = list(leading)
    return inventory_listings

In [22]:
def get_super_saver_listings():
    """Reload the module-level ``SUPER_SAVER_DF`` from the database.

    Runs ``SUPER_SAVER_LISTINGS_QUERY`` on the shared ``connection`` and stores
    the resulting frame in the global. Nothing is returned.
    """
    global SUPER_SAVER_DF
    fetched = pd.read_sql_query(SUPER_SAVER_LISTINGS_QUERY, connection)
    SUPER_SAVER_DF = fetched

In [23]:
def set_super_saver_listings(listings, city_id):
    """Interleave a city's super-saver listings into a ranked list of listings.

    The result starts with the best-ranked super-saver listing, then repeats
    "three regular listings, one super-saver listing" until the regular
    listings run out; any super-saver listings left over are appended. Both
    groups keep the relative order they have in ``listings``. Only super-saver
    listings that occur in ``listings`` are used.

    When the city has no super-saver listing among ``listings`` the regular
    order is returned unchanged (this case used to raise IndexError).

    Reads the module-level ``SUPER_SAVER_DF`` (columns ``city_id``, ``listing_id``).

    :param listings: iterable of listing ids, best first.
    :param city_id: city whose super-saver listings should be promoted.
    :return: numpy array of listing ids in the new order.
    """
    recommended = list(listings)
    city_rows = SUPER_SAVER_DF[SUPER_SAVER_DF['city_id'] == city_id]
    # A set gives O(1) membership tests for the two passes below.
    super_saver_ids = set(city_rows['listing_id'].tolist())

    ordered_super_saver = [listing for listing in recommended if listing in super_saver_ids]

    # Regular listings: everything else, de-duplicated, original order kept.
    regular = []
    already_taken = set()
    for listing in recommended:
        if listing in super_saver_ids or listing in already_taken:
            continue
        already_taken.add(listing)
        regular.append(listing)

    final_listings = []
    s_index = 0
    if ordered_super_saver:
        final_listings.append(ordered_super_saver[0])
        s_index = 1

    for start in range(0, len(regular), 3):
        final_listings.extend(regular[start:start + 3])
        if s_index < len(ordered_super_saver):
            final_listings.append(ordered_super_saver[s_index])
            s_index += 1

    final_listings.extend(ordered_super_saver[s_index:])
    return np.array(final_listings)

In [24]:
def set_popular_listing_scores(sorted_popular_listings, item_features_bias_scores):
    """Map every listing to its bias score, min-max scaled into [0, 1].

    :param sorted_popular_listings: numpy array of listing ids; each id is used
        directly as a position in ``item_features_bias_scores``.
    :param item_features_bias_scores: numpy array of bias values.
    :return: dict of listing id (as it appears in ``sorted_popular_listings``)
        -> scaled score. The scaling is fitted on exactly the listings passed in.
    """
    bias_lookup = item_features_bias_scores.tolist()
    raw_scores = np.array([bias_lookup[listing] for listing in sorted_popular_listings.tolist()])

    # MinMaxScaler works on 2-D input: one column, one row per listing.
    score_column = raw_scores.reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=[0, 1]).fit(score_column)
    scaled_scores = scaler.transform(score_column).ravel()

    return dict(zip(sorted_popular_listings, scaled_scores))

In [25]:
 def get_trending_popular_models(popular_listings, is_trending=False):
    """Rank the models in ``TRENDING_MODELS`` by their inventory among ``popular_listings``.

    Fills ``TRENDING_POPULAR_MODELS_QUERY`` and runs it on the shared
    ``connection``. Models returned by the query come first, in query order
    (highest count first); the remaining ids of ``TRENDING_MODELS`` follow in
    their list order.

    :param popular_listings: iterable of listing ids to restrict the count to.
    :param is_trending: when anything other than ``False``, the query also
        requires ``l.price <= m.cutoff_price``.
    :return: list of model ids.
    """
    def _sql_id_list(ids):
        # Formatting a Python tuple into the query gives "('1',)" for a single
        # id and "()" for none, both of which are invalid SQL. Render the list
        # explicitly; int() also guarantees only numbers reach the query text.
        # An empty list becomes "(NULL)", which matches no row.
        rendered = ", ".join(str(int(value)) for value in ids)
        return "(%s)" % (rendered or "NULL")

    if is_trending is False:
        condition = " "
    else:
        condition = """AND l.price <= m.cutoff_price"""

    query = TRENDING_POPULAR_MODELS_QUERY.format(
        is_trending=condition,
        trending_models=_sql_id_list(TRENDING_MODELS),
        popular_listings=_sql_id_list(popular_listings))

    models_df = pd.read_sql_query(query, connection)
    inventory_wise_models = OrderedSet(models_df['model_id'].tolist())
    all_trending_models = OrderedSet(TRENDING_MODELS)
    no_inventory_models = list(all_trending_models - inventory_wise_models)
    final_models = models_df['model_id'].tolist() + no_inventory_models

    return final_models

In [26]:
def get_popular_recommendations():
    """Build, for every city with active listings, its popularity-ranked listings and models.

    Listings are ranked by the bias the trained model learned for their
    per-listing indicator feature. Within a city, inventory listings come
    first (with their leading entries rotated by ``get_shuffled_inventory``),
    followed by marketplace listings; super-saver listings are then
    interleaved by ``set_super_saver_listings``.

    Reads the module-level ``model``, ``max_listing_id`` and ``active_listings``
    and refreshes ``SUPER_SAVER_DF`` through ``get_super_saver_listings``.

    :return: dict of city_id -> dict with the keys ``popular_models``,
        ``trending_models``, ``popular_listings`` and
        ``popular_listing_scores_dict``.
    """
    # model.item_biases holds one bias per item *feature*, in the column order
    # of the item feature matrix used for training. In this file that matrix
    # is an identity block of max_listing_id columns followed by shared
    # feature columns, and matrix index j stands for listing id j + 1.
    # So: keep only the identity block and shift indices back to listing ids.
    # (Using the raw argsort indices as listing ids, as before, ranked each
    # listing by its neighbour's bias and let shared-feature entries leak in.)
    listing_biases = model.item_biases[:max_listing_id]
    ranked_listing_ids = listing_biases.argsort()[::-1] + 1
    # Lookup table addressed by listing id (position 0 is padding), which is
    # how set_popular_listing_scores indexes it.
    bias_by_listing_id = np.concatenate(([0.0], listing_biases))

    all_city_popular_listings = {}
    get_super_saver_listings()

    for city_id in np.unique(active_listings['city_id'].values):
        city_active_listings = active_listings.query("city_id == %s" % city_id)
        popular_listings = ranked_listing_ids[np.in1d(ranked_listing_ids,
                                                      city_active_listings['listing_id'].values)]

        city_active_inventory_listings = city_active_listings[
            city_active_listings["is_inventory"] == 1]['listing_id'].values
        is_inventory = np.in1d(popular_listings, city_active_inventory_listings)

        inventory_sorted_listings_unshuffled = popular_listings[is_inventory]
        # get_shuffled_inventory works in place, so hand it a copy.
        inventory_sorted_listings = get_shuffled_inventory(inventory_sorted_listings_unshuffled.copy())
        marketplace_sorted_listings = popular_listings[~is_inventory]

        sorted_popular_listings_unshuffled = np.append(inventory_sorted_listings_unshuffled,
                                                       marketplace_sorted_listings)
        sorted_popular_listings = np.append(inventory_sorted_listings, marketplace_sorted_listings)
        sorted_popular_listings = set_super_saver_listings(sorted_popular_listings, city_id)

        popular_listing_scores_dictionary = set_popular_listing_scores(sorted_popular_listings_unshuffled,
                                                                       bias_by_listing_id)

        trending_models = get_trending_popular_models(popular_listings=sorted_popular_listings_unshuffled,
                                                      is_trending=True)
        popular_models = get_trending_popular_models(popular_listings=sorted_popular_listings_unshuffled)

        all_city_popular_listings[city_id] = {'popular_models': popular_models,
                                              'trending_models': trending_models,
                                              'popular_listings': sorted_popular_listings,
                                              'popular_listing_scores_dict': popular_listing_scores_dictionary}

    return all_city_popular_listings


# These two statements used to sit, indented, after the function's `return`
# and therefore never ran.
popular_listings = get_popular_recommendations()
r.set('popular_listings', pickle.dumps(popular_listings))

In [27]:
def get_city_active_listings():
    """Split the active listings of every city into inventory and marketplace ids.

    Reads the module-level ``active_listings`` DataFrame (columns ``city_id``,
    ``listing_id``, ``is_inventory``).

    :return: dict of city_id -> ``{'inventory': ids, 'marketplace': ids}``,
        where each value is a numpy array of listing ids with
        ``is_inventory`` equal to 1 and 0 respectively.
    """
    all_city_active_listings = {}

    for city_id in np.unique(active_listings['city_id'].values):
        city_rows = active_listings.query("city_id == %s" % city_id)
        inventory_flag = city_rows["is_inventory"]
        all_city_active_listings[city_id] = {
            'inventory': city_rows[inventory_flag == 1]['listing_id'].values,
            'marketplace': city_rows[inventory_flag == 0]['listing_id'].values}

    return all_city_active_listings


# These statements used to sit, indented, after the function's `return`. There
# they never ran, and the assignment to `active_listings` inside the function
# body turned that name into a local, so the loop above failed with
# UnboundLocalError. The result is kept under its own name so that the
# `active_listings` DataFrame is not replaced by a dict; the Redis key is
# unchanged.
city_wise_active_listings = get_city_active_listings()
r.set('active_listings', pickle.dumps(city_wise_active_listings))