Hello everyone, this is a snapshot of what I did on the recommendation engine at DojoMojo/Innovation Department as a data science intern. I worked closely with our chief data scientist at my company on this and offered little help on algorithms. Note that this is not the final production code. This is only the draft that I worked on. The final production code is compiled with other algorithms created by the data scientist and I can not share it.


Our business is a marketing partnership platform and our customers are all brands that want to seek marketing partnership. That being said, they could be both sellers and buyers.



The recommendation system is a recommendation engine based on machine learning backed search space. It has two main logics: Content Based Filtering and Collaborative Based Filtering. The Content Based Filtering is based on the product similarity while Collaborative Based Filtering is based on user similarity.
I created algorithms on both.


The main search space for this collections of algorithms is KDTree as it provides quick search ability. Provided the large amount of data and limited resources we have our chief data scientist thought this was the best for now. 

### Collaborative Based Filtering: Based on Connection

This is an algorithms inspired by LinkedIn connection features. Given the nature of our business, our customers are all businesses and they could be both customers and products

In [9]:
from scipy.spatial import cKDTree
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import normalize

# Define the searching method
# m is the brand_id to search for
# n is # of neighbors we want to seek
# mode has 3: overlap; all; unique
def find_neighbors(m, n: int, mode:str): 
    if m != 0:
        #tags
        mlb = MultiLabelBinarizer()
        tag_data = mlb.fit_transform(space_population['tags'])
        tags = pd.DataFrame(tag_data)

        #numerical
        numerical_data_unscaled = df2[['acquired', 'provided', 
                                       'dedicated_list_size','twitter_followers','facebook_followers', 
                                       'instagram_followers', 'pinterest_followers', 
                                       'average_sign_ups_per_campaign']]
        numerical_data_unscaled = numerical_data_unscaled.fillna(numerical_data_unscaled.mean())
        numerical_data_scaled = normalize(numerical_data_unscaled)
        numerical = pd.DataFrame(numerical_data_scaled)
        #merge
        all_d = pd.concat((numerical, tags), axis = 1)
        all_data = np.array(all_d)

        #KDTree 
        numerical_space = cKDTree(numerical)
        tags_space = cKDTree(tags)
        all_space = cKDTree(all_d)

        #finding the overlaped
        idxs = []
        idxs_rep = []
        idxs_uniq = []
        index = space_population.brand_id.index.get_loc(m)
#       index = space_population['brand_id'][space_population['brand_id'] == m].index[0]
        
        #create a list of similar brand index and store it in different lists
        values, idxs_a = all_space.query(all_data[index], k = n)    
        for x in idxs_a:
            if x in idxs:
                if x not in idxs_rep:
                    idxs_rep.append(x)
                else:
                    continue
            idxs.append(x)

        values, idxs_t = tags_space.query(tag_data[index], k = n)
        for x in idxs_t:
            if x in idxs:
                if x not in idxs_rep:
                    idxs_rep.append(x)
                else:
                    continue
            idxs.append(x)

        values, idxs_d = numerical_space.query(numerical_data_scaled[index], k = n)
        for x in idxs_d:
            if x in idxs:
                if x not in idxs_rep:
                    idxs_rep.append(x)
                else:
                    continue
            idxs.append(x)

        for x in idxs:
            if x not in idxs_rep:
                idxs_uniq.append(x)
        
        #returnning brand_index
        if mode == 'overlap':
            return idxs_rep
        if mode == 'all':
            return idxs
        if mode == 'unique':
            return idxs_uniq
    else:
        return None

# Function for generating recommended list
def recommend(brand_id:int, mode:str, quantity:int, df:pd.DataFrame):
    lst = []
    recommend_lst = []
#     index = deal['brand_id'][deal['brand_id'] == brand_id].index
    index = deal.brand_id.index.get_loc(brand_id)
    #constructing full list by brand, 1st connection, 2nd connection
    
    a_list = find_neighbors(brand_id, n = quantity, mode = mode)
    for x in a_list:
        recommend_lst.append(x)

# delete as we consider about past deals
    b_list = find_neighbors(df.iloc[index, 1], n = quantity, mode = mode)
    try:
        for x in b_list:
            recommend_lst.append(x)
    except:
        recommend_lst.append('none for 2nd connection')
#     c_list = find_neighbors(df.iloc[ index, 2], n = quantity, mode = mode)
#     try:
#         for x in c_list:
#             recommend_lst.append(x)
#     except:
#         recommend_lst.append('none for 3nd connection')
    #find the unique value of recommended brand's index
    for x in recommend_lst:
        if x not in lst:
            lst.append(x)
    return lst[0: quantity]
    


### Check Seller

This is a optional filter to check on brands who only sells. It is created as we need a clearer list of brands who are sure to sell

In [62]:
def check_seller(A: list, B:pd.DataFrame ):
    count = 0
    C = []
    for x in A:
        if x not in B.values:
            print('dropped', x)
            count += 1
            continue
        else:
            C.append(x)
        
    print('dropped', count, 'elements as they have not been sellers')
    return C


buy_sell_info = pd.read_csv('buy_sell.csv')
info = buy_sell_info['sell']

check_seller(A, info)

        

dropped 20
dropped 11
dropped 30
dropped 36
dropped 51
dropped 71
dropped 33
dropped 4
dropped 23
dropped 9 elements as they are sellers


[52]

## Test

In [45]:
deal = pd.read_csv('deal.csv')
deal.head()

Unnamed: 0,brand_id,target
0,475,305
1,4646,305
2,2299,1014
3,680,52
4,2132,1516


In [51]:
deal.iloc[4,1]

1516

In [17]:
#brand connection space
brand_1st_connection = pd.read_csv('brand_1st_connection.csv')
brand_2nd_connection = pd.read_csv('brand_2nd_connection.csv')
brand_connection = pd.merge(brand_1st_connection, brand_2nd_connection, on = 'brand', how = 'outer')
brand_connection.columns = ['brand_id', '1st_conn', '2nd_conn']
brand_connection_1 = pd.merge(space_population[['brand_id']], brand_connection, on = 'brand_id', how = 'left')

df1 = pd.read_csv('cluster_user.csv')
df1.columns = ['brand_id','dedicated_list_size','twitter_followers','facebook_followers', 'instagram_followers', 'pinterest_followers', 'average_sign_ups_per_campaign']
df1.fillna(0,inplace = True)
space_population = pd.read_csv('space_population.csv')
df2 = pd.merge(df1, space_population[['brand_id', 'acquired','provided', 'tags']], on = 'brand_id', how = 'left')
df2.head()

Unnamed: 0,brand_id,dedicated_list_size,twitter_followers,facebook_followers,instagram_followers,pinterest_followers,average_sign_ups_per_campaign,acquired,provided,tags
0,2525,100,0,0,0,0,1210.0,,,
1,6182,100000,0,0,0,0,1264.0,8150.75,1306.25,"['Shopping', 'Apparel']"
2,7235,3000,0,0,0,0,16.0,702.5,16.0,"['People & Society', ""Women\\'s Health"", 'Heal..."
3,7762,60000,0,0,0,0,0.0,,,
4,7369,10,0,0,0,0,0.0,,,


In [60]:
A = recommend(11, quantity = 15, mode = 'overlap', df = deal)

### The Below was an analysis on how to imporve the recommendation at that time

A few points for improvement:
1. based on past deal instead of similar brands: change neighbor objects to its past seller        DONE
2. put in more metrics in finding the neighbors        Working

3. no sellers to sellers

4. feedback mechanism based on competitors: not sure how to do yet
possible solution: 

    a. competitor brand prediction 

    b. reinforcement learning on recommendation choices 

    c. give choice of competitors to user and drop that option from then on

5. no seller to seller: 
Thinking on the solutions:

    a. based on current profile, Set up a Boolean column  DONE

    b. for buyer/seller mixers, find metrics for probability or find cluster on sell and buy objects

    c. buyer/seller prediction

6. recent tag only
7. additional metrics: price/budget


Current problem: 
1. finding more metrics
    reason: limited in data: need to derive more metrics

2. feedback mechanism: not sure what to do yet

3. no seller to seller: should be able to solve as long as with solid logic

4. industry tag: currently most industry tags missing: working on google description NLP project/ Parsing Project



### Collaborative Based Filtering: email matching

Since email list is such an important feature to our clients, in this algorithm, I analyzed on brands' email lists, and calculated the similarity based on email matching rate.

In [2]:
from scipy.spatial import cKDTree
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import normalize
from sklearn.preprocessing import OneHotEncoder

In [3]:
email_data = pd.read_csv('email_data.csv')

In [97]:
email_data.head()

Unnamed: 0,id,industry,brand_id,email_id,gender,age,longitude,latitude
0,2174,Beauty & Care,1020,2246,0,66.0,-97.822,37.751
1,2174,Beauty & Care,2666,2246,0,66.0,-97.822,37.751
2,2174,Beauty & Care,2664,2246,0,66.0,-97.822,37.751
3,2174,Beauty & Care,2563,2246,0,66.0,-97.822,37.751
4,2174,Beauty & Care,2360,2246,0,66.0,-97.822,37.751


In [5]:
num_data = email_data[['age', 'longitude', 'latitude']]
industry_data = email_data[['industry']]
num_data['gender_encoded'] = email_data['gender'].apply(lambda x: 0 if x == 'male' else 1)
data = normalize(num_data)


# was doing encoder but the data size too huge to operate
# encode = OneHotEncoder()
# etag_data = encode.fit_transform(tag_data).toarray()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [41]:
# input parameters
brand_id = 2

n_num = 10
r_num = 20
index_lst = email_data[email_data['brand_id']==brand_id].index.tolist()

#### finding the neighbors

In [42]:
recommend_lst = []
# creating search space
search_space = cKDTree(data)
for index in index_lst:
    values, idxs_a = search_space.query(data[index], k = n_num)
    df = pd.DataFrame(email_data['brand_id'][idxs_a])
    count = df.groupby(email_data['brand_id']).count()
    count.columns = ['count']
    count.sort_values('count', ascending = False, inplace = True)
    r_lst = []
    lst = np.ndarray.tolist(count.index.values)
    if r_num <= len(lst):
        r_lst = lst[0:r_num]
    else: 
        r_lst = lst
        print('not enough data, returning', len(lst), 'recommends')
    recommend_lst += r_lst


# find the unique value
recommend_index = []


for x in recommend_lst:
    if x not in recommend_index:
        recommend_index.append(x)
    else:
        continue


not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 10 recommends
not enough data, returning 7 recommends
not enough data, 

In [28]:
sorted(recommend_index)

[2,
 15,
 22,
 29,
 49,
 100,
 183,
 228,
 234,
 348,
 359,
 362,
 430,
 481,
 509,
 515,
 535,
 648,
 890,
 940,
 1002,
 1048,
 1137,
 1142,
 1257,
 1275,
 1815,
 2436]

#### Filtering by industry

In [44]:
industry_data = email_data['industry'][index_lst]
industry_data = industry_data.values.tolist()
industry_opt = []
for x in industry_data:
    if x not in industry_opt:
        industry_opt.append(x)
        
# filtering out the ones of different industries
# this is not yet the final logic, the final one was done with a apriori analysis chart 
# on closely related industries. 
recommend_industry_filtered = []
for y in recommend_index:
    if email_data['industry'][y] in industry_opt:
        recommend_industry_filtered.append(y)
recommend_industry_filtered

[]

In [55]:
pd.DataFrame(email_data['industry'][recommend_lst]).industry.unique()

array(['Beauty & Care', 'Media', 'eCommerce', 'Retail'], dtype=object)

In [67]:
# analyzing on industries of the emails
email_data['industry'][recommend_lst].value_counts()

Media              29949
eCommerce          16645
Beauty & Care      16570
Retail              9094
Travel & Trans.     8780
Fashion             6228
Home & Garden       5734
Food & Bev.         3259
Education           2862
Entertainment        357
Daily Deals          291
Electronics          168
Health & Fit          63
Name: industry, dtype: int64

### Filter through brand identity(no sellers to sellers)

In [45]:
def check_seller(A: list):
    buy_sell_info = pd.read_csv('buy_sell.csv')
    B = buy_sell_info['sell']
    count = 0
    C = []
    for x in A:
        if x not in B.values:
            print('dropped', x)
            count += 1
            continue
        else:
            C.append(x)
        
    print('dropped', count, 'elements as they have not been sellers')
    return C

recommend_value = email_data['brand_id'][recommend_index]
A = email_data
check_seller(recommend_value)

dropped 2664
dropped 2666
dropped 2563
dropped 515
dropped 430
dropped 362
dropped 2456
dropped 1257
dropped 1137
dropped 1894
dropped 1002
dropped 2281
dropped 1894
dropped 2369
dropped 2281
dropped 1142
dropped 2369
dropped 1257
dropped 1976
dropped 1257
dropped 2281
dropped 1894
dropped 362
dropped 2281
dropped 1257
dropped 1976
dropped 362
dropped 1257
dropped 1137
dropped 2666
dropped 1988
dropped 1096
dropped 2369
dropped 2456
dropped 1002
dropped 1988
dropped 1002
dropped 1849
dropped 2666
dropped 39 elements as they have not been sellers


[2436, 481, 43, 2436, 1571, 481, 481, 2436, 2440, 43, 1571]