# Collaborative Filtering

Collaborative recommendation using Implicit ALS algorithm.   
Data source: Enriched url Data 

In [1]:
import re
import os
import math

import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit

# Preprocessing

## Importing Article Master for Article Lookup

In [2]:
article_master= pd.read_csv(r'./URL_to_Article/URL_ArticleID_cHash_mapping.csv', sep=';')
article_master['reduced_title'] = article_master.apply(lambda row: re.sub('[^A-Za-z0-9]+', '', row.title.lower()), axis = 1)


In [3]:
article_lookup_ID = article_master.loc[:,['article_id', 'title', 'reduced_title']]
article_lookup_ID = article_lookup_ID.drop_duplicates(['article_id', 'reduced_title'], keep='first')
article_lookup_ID = article_lookup_ID.drop(['reduced_title'], axis = 1)
pd.set_option('display.max_colwidth', 200)
article_lookup_ID.head()

Unnamed: 0,article_id,title
0,12,Bridging the industrial heat divide
1,20,EUSEW 2016 - energy efficiency awards
2,21,How much can energy management actually save?
4,22,All eyes on China’s 13th Five-Year Plan for energy
7,23,EEIP at Turkish Energy Efficiency Week


## Importing Interaction and processing 

In [4]:
raw_interaction = pd.read_csv(r'./Enriched_data/new__report-5670-20190711132352.csv')
raw_interaction = raw_interaction.loc[:,['IP Address', 'Entry Page', 'Page Visits', 'Visit Duration']]

In [5]:
# pd.set_option('max_colwidth', 100)
page_visit_url = raw_interaction["Page Visits"].str.split("-->", expand=True)
page_count = pd.DataFrame(page_visit_url.apply(lambda x: x.count(), axis=1))
page_count = page_count.rename(columns= {0: 'Page Count'})
page_visit_url = raw_interaction['Entry Page'].to_frame().join(page_visit_url)

In [6]:
flag = pd.DataFrame(np.zeros((len(page_visit_url), 1)))
flag = flag.rename(columns= {0: 'Flag'})

## Converting URLs to ArticleID

In [7]:
# Returns ArticleID from for the URL
# Returns 0 if 
#     1. URL starts with www2.
#     2. Article not found.

def urls_to_id(url):
    
    if url is None or (isinstance(url, float) and  math.isnan(url)):
        return 0
    
    #   Remove www. header from the URL
    if url[0:4] == 'www.':
        url = url[4:]
        
    #   If URL starts with www2. return 0.
    if url[0:5] == 'www2.':
        return 0
    
    url_tokens = url.split("/")
    
    #   If URL is not an article return 0
    if len(url_tokens) < 5:
        return 0
    if url_tokens[1]!= 'articles':
        return 0 
    
    
    #   If URL is an article return the article name
    token4 = url_tokens[3]
    token5 = url_tokens[4]
    
    #     If token4 not blank then it must be the article name or the cHash
    if token4 != '':
        
        #   Assuming that the 4th token is the cHash
        search_cHash_rows = article_master.loc[article_master['cHash'] == token4]
        
        #   if no match found in the cHash column
        #   Then token4 is the article name and so token5 is the cHash
        if len(search_cHash_rows) < 1:
            search_cHash_rows = article_master.loc[article_master['cHash'] == token4]
                
        #   Return ArticleID if exact match found with cHash
        if len(search_cHash_rows) == 1: 
            return int(search_cHash_rows['article_id'])
        
        #   Return the fist ArticleID if multiple articles found
        #   This is because cHash and articles have a one-to-one matching
        if len(search_cHash_rows) > 1: 
            return int(search_cHash_rows.loc[search_cHash_rows.index[0], 'article_id'])
        
        #   Return 0 by default if none of the cases match
        return 0
        
    
    #     If token4 blank then token5 must be the article name
    else:
        #   removing spaces and punctuations from the URL article name
        article_name = (re.sub('[A-Za-z0-9]+', '', token5)).lower()
        
        #   Pattern searching on title since the URL might not have the full article name
        #   This also takes care if the situation in which multiple articles have similar titles       
        search_article_name_rows = article_master[article_master['reduced_title'].str.contains(article_name)]
        
        #   Return ArticleID if match found with article_name
        if len(search_article_name_rows) == 1: 
            return int(search_article_name_rows['article_id'])
        
        #   If multiple rows found assume that all rows point to the same article
        #   return the article_id of the first row
        if len(search_article_name_rows) < 1: 
            return int(search_article_name_rows['article_id'][0])
        
        #   Return 0 by default if none of the cases match
        return 0
        

In [8]:
# Flag value notes if the row has any article pages
# If not then flag for that row is 0

cols = list(page_visit_url)
url_article_id_list = []
for i in range(len(page_visit_url)):
    url_article_id_list.append([])
    
    for j in list(page_visit_url):
        article_id = urls_to_id(page_visit_url[j][i])
        
        #  If the PageVisit1 ID is the same as EntryPage ID, then put PageVisit1 as 0
        if j == 0 and url_article_id_list[-1][-1] == article_id:            
            url_article_id_list[-1].append(0)
        
        else:
            url_article_id_list[-1].append(urls_to_id(page_visit_url[j][i]))
            
        if url_article_id_list[-1][-1] != 0:
            flag['Flag'][i] +=1 


# url_article_id_list
page_visit_id = pd.DataFrame(url_article_id_list, columns = cols)

## Creating final dataframe

In [9]:
srt = page_visit_id.iloc[:,0:len(cols)].apply(lambda x: ",".join(x.astype(str)), axis=1)
new_interaction = raw_interaction.loc[:, ~raw_interaction.columns.isin(['Entry Page','Page Visits'])].join(pd.DataFrame(srt))
new_interaction = new_interaction.rename(columns= {0: 'Pages'})
new_interaction = new_interaction.join(flag)
new_interaction = new_interaction.join(page_count)

In [10]:
new_interaction = new_interaction.dropna(subset=['Visit Duration'])
new_interaction = new_interaction[new_interaction['Visit Duration'] > 0]
new_interaction = new_interaction[new_interaction['Flag'] > 0]
new_interaction = new_interaction.reset_index(drop = True)


In [11]:
# cols = ['IPAddress', 'Page', 'Weight']
data_list = []
for i in range(len(new_interaction)):
    ip = (new_interaction.loc[i,['IP Address']]).item()
    duration = float(new_interaction.loc[i,['Visit Duration']])
    count = float(new_interaction.loc[i,['Page Count']])
    stay_count = duration / count
    
    pages = new_interaction.loc[i,['Pages']]
    for i in (pages.item()).split(','):
        if(i != '0'):
            data_list.append([ip, i, stay_count])
            
            
data = pd.DataFrame(data_list, columns = ['IPAddress', 'Page', 'Weight'])

In [12]:
data = data.groupby(['IPAddress', 'Page']).sum().reset_index()
data['Weight'] = (data['Weight'] - data['Weight'].min())/(data['Weight'].max() - data['Weight'].min())
data.head()

Unnamed: 0,IPAddress,Page,Weight
0,103.218.216.126,100113,0.379572
1,103.218.230.194,100135,0.083675
2,103.224.105.10,53,0.016155
3,103.25.120.134,100132,0.071493
4,103.73.96.150,100097,0.001579


In [13]:
data['user_id'] = data['IPAddress'].astype("category").cat.codes
data['page_id'] = data['Page'].astype("category").cat.codes

user_lookup = data[['user_id', 'IPAddress']].drop_duplicates()

page_lookup = data[['page_id', 'Page']].drop_duplicates()
page_lookup['Page'] = page_lookup['Page'].astype('int64')
page_lookup = pd.merge(page_lookup, article_lookup_ID, how='inner', left_on = 'Page', right_on = 'article_id')
page_lookup = page_lookup.drop(['Page'], axis = 1)



data = data.drop(['IPAddress', 'Page'], axis = 1)

In [14]:
data = data[['user_id', 'page_id', 'Weight']]
data.head()

Unnamed: 0,user_id,page_id,Weight
0,0,33,0.379572
1,1,53,0.083675
2,2,120,0.016155
3,3,50,0.071493
4,4,21,0.001579


# Model Creation and Fitting

In [15]:
# The implicit library expects data as a item-user matrix so we
# create two matricies, one for fitting the model (item-user) 
# and one for recommendations (user-item)

sparse_item_user = sparse.csr_matrix((data['Weight'].astype(float), (data['page_id'], data['user_id'])))
sparse_user_item = sparse.csr_matrix((data['Weight'].astype(float), (data['user_id'], data['page_id'])))

In [16]:
# Initialize the als model and fit it using the sparse item-user matrix
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

In [17]:
# Calculate the confidence by multiplying it by our alpha value.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')


In [18]:
#Fit the model
model.fit(data_conf)

100%|██████████████████████████████████████████████████████████████████████████████| 20.0/20 [00:00<00:00, 1862.89it/s]


# Recommendation

In [19]:
# page_lookup.head()
page_lookup.loc[page_lookup['title'] == 'Utilize all the available energy — Heat recovery']

Unnamed: 0,page_id,article_id,title
66,109,100211,Utilize all the available energy — Heat recovery


In [20]:
data.head()

Unnamed: 0,user_id,page_id,Weight
0,0,33,0.379572
1,1,53,0.083675
2,2,120,0.016155
3,3,50,0.071493
4,4,21,0.001579


In [21]:
#---------------------
# FIND SIMILAR ITEMS
#---------------------

# Find the 10 most similar articles to 'Utilize all the available energy — Heat recovery'
item_id = 66
n_similar = 10

# Use implicit to get similar items.
similar = model.similar_items(item_id, n_similar)

# Print the names of our most similar artists
for item in similar:
    idx, score = item
    print(page_lookup.title.loc[page_lookup['page_id'] == idx].iloc[0])

IoT - Equipped LED lighting systems enhance Energy Efficiency
Raising the priority for industrial energy efficiency
Roundtable on Financing Industrial Energy Efficiency, October 19th
Electric Vehicles & Heat Pumps: Electric motors play a crucial role in the energy transition
Ireland’s new Renewable Energy Support Scheme (RESS) design
EU research & innovation tout court
We accelerate the Energy Transition.
ICP – the way financial institutions can make fighting climate change a business case
Turkey: Industrial energy efficiency strategy
Europe must choose a green future


In [22]:
# Preview of the Users in the database
user_lookup.head()

Unnamed: 0,user_id,IPAddress
0,0,103.218.216.126
1,1,103.218.230.194
2,2,103.224.105.10
3,3,103.25.120.134
4,4,103.73.96.150


In [23]:
# Preview of the viewing history for user id = 7

pd.merge(page_lookup, data.loc[data['user_id'] == 7], on='page_id')

Unnamed: 0,page_id,article_id,title,user_id,Weight
0,111,100214,“Simply not enough”,7,0.000526
1,63,100146,You've got a friend,7,0.000526
2,81,100165,Message to the investor community: Be that ‘bridge over troubled water’,7,0.000526
3,83,100167,Risk minimization for energy efficiency projects,7,0.000526
4,103,100205,The Times Are A-Changin’,7,0.000528
5,112,100215,More than Ronaldo - How 5 innovative Portuguese companies drive energy efficiency,7,0.000526


In [24]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

# Create recommendations for user with id 7
user_id = 7

# Use the implicit recommender.
recommended = model.recommend(user_id, sparse_user_item)

articles = []
scores = []

# Get artist names from ids
for item in recommended:
    idx, score = item
#     articles.append(page_lookup.title.loc[data.page_id.loc[data['page_id'] == idx]].iloc[0])
    articles.append(page_lookup.title.loc[page_lookup['page_id'] == idx].iloc[0])
    scores.append(score)

In [25]:
# #------------------------------
# # CREATE USER RECOMMENDATIONS
# #------------------------------

# # Create recommendations for user with id 7
# user_id = 7

# # Use the implicit recommender.
# recommended = model.recommend(user_id, sparse_user_item)

# articles = []
# scores = []

# # Get artist names from ids
# for item in recommended:
#     idx, score = item
#     articles.append(page_lookup.title.loc[data.page_id.loc[data['page_id'] == idx]].iloc[0])
#     scores.append(score)

In [26]:
# Create a dataframe of artist names and scores
recommendations = pd.DataFrame({'articles': articles, 'score': scores})

recommendations

Unnamed: 0,articles,score
0,TOP 3 articles in Energy Efficiency fom the 2nd quarter of 2018,0.010687
1,Optimizing Electrical Motor Efficiency: Get More Bang for Your Buck,0.00573
2,Next steps in financing energy efficiency improvements in Europe’s industry and energy supply,0.005326
3,What’s next for industrial energy efficiency?,0.001502
4,Blockchain in the energy sector is moving forward - Key takeaways from the EventHorizon 2018,0.001125
5,Using Non-Energy Benefits to Build Better Business Cases,0.001103
6,De-risking energy efficiency projects: a market opportunity for project developers,0.000984
7,Evolution in energy efficiency financing,0.000933
8,Open now: The Energy Efficiency Barometer of Industry - a tool to enhance industrial energy productivity,0.000921
9,Blockchain in the energy sector: Institutional disruption?,0.000629
