In [542]:
import numpy as np
import pandas as pd

In [543]:
raw_data = pd.read_csv(
    "anonymous-msweb.data", 
    names=['attribute', 'ID', 'ignore', 'title', 'url'], 
    engine="python", 
    skiprows=7
)
raw_data.head(10)

Unnamed: 0,attribute,ID,ignore,title,url
0,A,1287,1,International AutoRoute,/autoroute
1,A,1288,1,library,/library
2,A,1289,1,Master Chef Product Information,/masterchef
3,A,1297,1,Central America,/centroam
4,A,1215,1,For Developers Only Info,/developer
5,A,1279,1,Multimedia Golf,/msgolf
6,A,1239,1,Microsoft Consulting,/msconsult
7,A,1282,1,home,/home
8,A,1251,1,Reference Support,/referencesupport
9,A,1121,1,Microsoft Magazine,/magazine


In [544]:
raw_data.shape

(131659, 5)

In [545]:
raw_data.groupby(by='attribute').count()

Unnamed: 0_level_0,ID,ignore,title,url
attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,294,294,294,294
C,32711,32711,0,0
V,98654,98654,0,0


In [550]:
data = raw_data[['attribute', 'ID', 'title', 'url']]
data.head()

Unnamed: 0,attribute,ID,title,url
0,A,1287,International AutoRoute,/autoroute
1,A,1288,library,/library
2,A,1289,Master Chef Product Information,/masterchef
3,A,1297,Central America,/centroam
4,A,1215,For Developers Only Info,/developer


In [551]:
attributes = raw_data[raw_data['attribute'] == 'A']
attributes = attributes[['ID', 'title', 'url']]
attributes.head(3)

Unnamed: 0,ID,title,url
0,1287,International AutoRoute,/autoroute
1,1288,library,/library
2,1289,Master Chef Product Information,/masterchef


In [555]:
websites = raw_data[raw_data['attribute'] == 'V']
users = raw_data[raw_data['attribute'] == 'C']
print(websites.head())

    attribute    ID  ignore title   url
295         V  1000       1  None  None
296         V  1001       1  None  None
297         V  1002       1  None  None
299         V  1001       1  None  None
300         V  1003       1  None  None


In [553]:
# Get the top visited websites
websites = websites[['ID', 'attribute']]
print(websites.shape)
websites.head()

(98654, 2)


Unnamed: 0,ID,attribute
295,1000,V
296,1001,V
297,1002,V
299,1001,V
300,1003,V


In [554]:
website_visits = websites.groupby(by='ID').count()
website_visits.columns=['visits']
website_visits.sort_values(by='visits', ascending=False).head()


Unnamed: 0_level_0,visits
ID,Unnamed: 1_level_1
1008,10836
1034,9383
1004,8463
1018,5330
1017,5108


In [253]:
website_visits = pd.merge(website_visits, attributes, left_index=True, right_on="ID")
website_visits.sort_values(by='visits', ascending=False).head()

Unnamed: 0,visits,ID,title,url
57,10836,1008,Free Downloads,/msdownload
283,9383,1034,Internet Explorer,/ie
17,8463,1004,Microsoft.com Search,/search
287,5330,1018,isapi,/isapi
212,5108,1017,Products,/products


In [336]:
top_websites = website_visits[website_visits['visits'] > 100]
print(top_websites.shape)
top_websites.head()

(104, 4)


Unnamed: 0,visits,ID,title,url
268,912,1000,regwiz,/regwiz
78,4451,1001,Support Desktop,/support
217,749,1002,End User Produced View,/athome
30,2968,1003,Knowledge Base,/kb
17,8463,1004,Microsoft.com Search,/search


In [254]:
# Make user/website matrix

In [255]:
raw_webvisits = raw_data[raw_data['attribute'] != 'A']
raw_webvisits.shape

(131365, 5)

In [256]:
raw_webvisits = raw_webvisits[['attribute', 'ID']]
raw_webvisits.head()

Unnamed: 0,attribute,ID
294,C,10001
295,V,1000
296,V,1001
297,V,1002
298,C,10002


In [257]:
webvisits = []
row = []
user = ''
#raw_webvisits.values
for line in raw_webvisits.values:
    if line[0] == 'C':
        user = line[1]
    elif line[0] == 'V':
        row = [user, line[1]]
        webvisits.append(row)
    else:
        print("ERROR")
        
webvisits = pd.DataFrame(webvisits, columns=['user', 'ID'])
webvisits.head()

Unnamed: 0,user,ID
0,10001,1000
1,10001,1001
2,10001,1002
3,10002,1001
4,10002,1003


In [258]:
webvisits.shape

(98654, 2)

In [365]:
attributes = raw_data[raw_data['attribute'] == 'A']
attributes = attributes[['ID', 'title', 'url']]
attributes = pd.merge(attributes, top_websites, on="ID")
print(attributes.columns.values)
attributes = attributes[['ID', 'visits', 'title_x', 'url_x']]
attributes.columns = ['ID', 'visits', 'title', 'url']
attributes.sort_values(by='ID').head(5)

['ID' 'title_x' 'url_x' 'visits' 'title_y' 'url_y']


Unnamed: 0,ID,visits,title,url
93,1000,912,regwiz,/regwiz
25,1001,4451,Support Desktop,/support
75,1002,749,End User Produced View,/athome
7,1003,2968,Knowledge Base,/kb
1,1004,8463,Microsoft.com Search,/search


In [366]:
data = pd.merge(webvisits, attributes, on="ID")
data.head(5)

Unnamed: 0,user,ID,visits,title,url
0,10001,1000,912,regwiz,/regwiz
1,10010,1000,912,regwiz,/regwiz
2,10039,1000,912,regwiz,/regwiz
3,10073,1000,912,regwiz,/regwiz
4,10087,1000,912,regwiz,/regwiz


In [367]:
data['count'] = 1
data.sort_values(by='user').head()

Unnamed: 0,user,ID,visits,title,url,count
0,10001,1000,912,regwiz,/regwiz,1
912,10001,1001,4451,Support Desktop,/support,1
5363,10001,1002,749,End User Produced View,/athome,1
913,10002,1001,4451,Support Desktop,/support,1
6112,10002,1003,2968,Knowledge Base,/kb,1


# Item based collaborative filtering

In [380]:
uservisits = data.pivot(index='user', columns='url', values='count')
uservisits = uservisits.fillna(value=0)
print(uservisits.shape)
uservisits.head(10)

(32301, 104)


url,/access,/accessdev,/activeplatform,/activex,/athome,/australia,/automap,/backoffice,/brasil,/canada,...,/visualc,/visualj,/vstudio,/win32dev,/windows,/windows95,/windowsce,/windowssupport,/word,/workshop
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [392]:
item = uservisits['/athome']
similarItems = uservisits.corrwith(item)

similarItems.sort_values(ascending=False).head(15)

url
/athome            1.000000
/support           0.076860
/windowssupport    0.067837
/moneyzone         0.056557
/windows           0.050309
/isapi             0.046777
/homeessentials    0.041815
/kb                0.041429
/products          0.038086
/supportnet        0.036217
/smallbiz          0.034222
/isp               0.034019
/publisher         0.033608
/windows95         0.032173
/catalog           0.031659
dtype: float64

# User based collaborative filtering

In [537]:
print(data.sort_values(by="user").head(10))
uservisits = data.pivot(index='url', columns='user', values='count')
uservisits = uservisits.fillna(value=0)
print(uservisits.shape)
uservisits.head(10)

        user    ID  visits                   title       url  count
0      10001  1000     912                  regwiz   /regwiz      1
912    10001  1001    4451         Support Desktop  /support      1
5363   10001  1002     749  End User Produced View   /athome      1
913    10002  1001    4451         Support Desktop  /support      1
6112   10002  1003    2968          Knowledge Base       /kb      1
9080   10003  1004    8463    Microsoft.com Search   /search      1
914    10003  1001    4451         Support Desktop  /support      1
6113   10003  1003    2968          Knowledge Base       /kb      1
17543  10005  1006     135                    misc     /misc      1
9081   10006  1004    8463    Microsoft.com Search   /search      1
(104, 32301)


user,10001,10002,10003,10005,10006,10007,10008,10009,10010,10011,...,42702,42703,42704,42705,42706,42707,42708,42709,42710,42711
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/access,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/accessdev,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/activeplatform,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/activex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/athome,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/australia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/automap,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/backoffice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/brasil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/canada,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [540]:
u = 10011
user = uservisits[u]
similarUsers = uservisits.corrwith(user, axis='index')
similarUsers = similarUsers.dropna()
similarUsers = similarUsers.sort_values(ascending=False)
similarUsers.head(10)

user
10011    1.00000
21000    0.74000
16662    0.70014
17563    0.70014
37130    0.70014
21724    0.70014
33490    0.70014
29893    0.70014
16000    0.70014
12808    0.70014
dtype: float64

In [541]:
similarUsers.drop(u, inplace=True)
similarUsers.head()

user
21000    0.74000
16662    0.70014
17563    0.70014
37130    0.70014
21724    0.70014
dtype: float64

In [502]:
df = pd.DataFrame(similarUsers)
s_user_id = df.iloc[0].name
s_user = uservisits[s_user_id]
s_user = pd.DataFrame(s_user)
s_user.columns=['visited']
s_user = s_user[s_user['visited'] == 1]
recommended_websites = pd.merge(s_user, top_websites, left_index=True, right_on="url")
recommended_websites = recommended_websites.sort_values(by='visits', ascending=False)
recommended_websites.head()

Unnamed: 0,visited,visits,ID,title,url
287,1.0,5330,1018,isapi,/isapi
212,1.0,5108,1017,Products,/products
78,1.0,4451,1001,Support Desktop,/support
138,1.0,287,1016,MS Excel,/excel


In [525]:
website_urls = recommended_websites['url']
visited_sites = pd.DataFrame(user[user == 1])
print('recommend:', list(website_urls))
print('visited:', list(visited_sites.index.values))

recommend: ['/isapi', '/products', '/support', '/excel']
visited: ['/excel', '/isapi', '/mspowerpoint', '/products']


In [531]:
# User without history
tw = top_websites.sort_values(by='visits', ascending=False).head()
list(tw['url'])

['/msdownload', '/ie', '/search', '/isapi', '/products']

# Test to get corrwith to work (item based)

In [372]:
a = pd.DataFrame([[1,0,0,1,1],[1,0,1,1,0],[1,1,1,1,0],[0,0,1,1,0],[1,0,0,0,1]])
a.columns = ['link_1', 'link_2', 'link_3', 'link_4', 'link_5']
a

Unnamed: 0,link_1,link_2,link_3,link_4,link_5
0,1,0,0,1,1
1,1,0,1,1,0
2,1,1,1,1,0
3,0,0,1,1,0
4,1,0,0,0,1


In [373]:
u = a['link_1']
u

0    1
1    1
2    1
3    0
4    1
Name: link_1, dtype: int64

In [378]:
correlation = a.corrwith(u, axis=0)
correlation.sort_values(ascending=False)


link_1    1.000000
link_5    0.408248
link_2    0.250000
link_4   -0.250000
link_3   -0.408248
dtype: float64

# Test to get corrwith to work (user based)

In [419]:
a = pd.DataFrame([[1,0,0,1,1],[1,0,1,1,0],[1,1,1,1,0],[0,0,1,1,0],[1,0,0,0,1]])
a = pd.DataFrame([[1,3,2,1,3],[1,3,2,1,3],[1,1,2,1,1],[3,2,1,1,0],[1,0,0,0,1]])
a.columns = ['link_1', 'link_2', 'link_3', 'link_4', 'link_5']
a.T

Unnamed: 0,0,1,2,3,4
link_1,1,1,1,3,1
link_2,3,3,1,2,0
link_3,2,2,2,1,0
link_4,1,1,1,1,0
link_5,3,3,1,0,1


In [425]:
user = a.T[0]
user

link_1    1
link_2    3
link_3    2
link_4    1
link_5    3
Name: 0, dtype: int64

In [429]:
correlation = a.T.corrwith(user, axis=0)
correlation

0    1.000000
1    1.000000
2    0.000000
3   -0.438529
4    0.000000
dtype: float64