<a href="https://colab.research.google.com/github/madhurapi/Recommender-System/blob/main/5_Collaborative_Filtering_with_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# #Dataset
# The data set contains information about users, gender, age, and which artists they have listened to on Last.FM.
# In our case we only use Germany’s data and transform the data into a frequency matrix.

In [None]:
# We will use this to complete 2 types of collaborative filtering:

# Item Based: which takes similarities between items’ consumption histories
# User Based: that considers similarities between user consumption histories and item similarities

In [None]:
# --- Import Libraries --- #
import pandas as pd
from scipy.spatial.distance import cosine

In [None]:
# --- Read Data --- #
# Load the Last.FM Germany listening matrix from a CSV file in the working directory.
# Judging by the head() output shown further down, the first column is `user` and the
# remaining columns are artist names that appear to hold 0/1 listening flags.
data = pd.read_csv('lastfm-matrix-germany.csv')

In [None]:
# Wrap the loaded table in a second DataFrame object named `df`. `data` is kept
# around because a later cell reads the user column from it.
df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the raw matrix (user column plus artist columns).
df.head()

Unnamed: 0,user,a perfect circle,abba,ac/dc,adam green,aerosmith,afi,air,alanis morissette,alexisonfire,...,timbaland,tom waits,tool,tori amos,travis,trivium,u2,underoath,volbeat,yann tiersen
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,42,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,51,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,62,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Preview the same rows restricted to the columns at positions 2 through 7.
df.head().iloc[:,2:8]

Unnamed: 0,abba,ac/dc,adam green,aerosmith,afi,air
0,0,0,0,0,0,0
1,0,0,1,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


# Item Based Collaborative Filtering

In [None]:
# --- Start Item Based Recommendations --- #
# Drop the first column (the `user` identifier) so that only artist columns remain.
# `df` is rebuilt from `data` rather than dropped in place, which keeps this cell
# safe to re-run: an in-place drop raises KeyError the second time because the
# column is already gone from `df`. `data` itself is left untouched.
df = data.drop(columns=data.columns[0])

In [None]:
# Preview the matrix again to confirm the `user` column has been removed.
df.head()

Unnamed: 0,a perfect circle,abba,ac/dc,adam green,aerosmith,afi,air,alanis morissette,alexisonfire,alicia keys,...,timbaland,tom waits,tool,tori amos,travis,trivium,u2,underoath,volbeat,yann tiersen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#user column drop
#285 columns remaining

In [None]:
# Before we calculate our similarities we need a place to store them. 
# We create a variable called df_ibs which is a Pandas Data Frame
# (… think of this as an excel table … but it’s vegan with super powers …)

In [None]:
# Create a placeholder dataframe listing item vs. item.
# Both axes are labelled with the columns of `df`; every cell starts out empty (NaN)
# until the similarity loop in a later cell fills it in.
df_ibs = pd.DataFrame(index=df.columns,columns=df.columns)

In [None]:
# In essence the cosine similarity takes the sum product of the first and second column,
# then divides that by the product of the square roots of the sum of squares of each column

In [None]:
# Inspect the placeholder; at this point every cell is still empty.
df_ibs

Unnamed: 0,a perfect circle,abba,ac/dc,adam green,aerosmith,afi,air,alanis morissette,alexisonfire,alicia keys,...,timbaland,tom waits,tool,tori amos,travis,trivium,u2,underoath,volbeat,yann tiersen
a perfect circle,,,,,,,,,,,...,,,,,,,,,,
abba,,,,,,,,,,,...,,,,,,,,,,
ac/dc,,,,,,,,,,,...,,,,,,,,,,
adam green,,,,,,,,,,,...,,,,,,,,,,
aerosmith,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
trivium,,,,,,,,,,,...,,,,,,,,,,
u2,,,,,,,,,,,...,,,,,,,,,,
underoath,,,,,,,,,,,...,,,,,,,,,,
volbeat,,,,,,,,,,,...,,,,,,,,,,


In [None]:


# Fill every cell of the item-item placeholder with a cosine similarity.
# scipy's `cosine` returns a distance, so the similarity is 1 minus that value.
item_count = len(df_ibs.columns)
for row_pos in range(item_count):
    # Column of `df` matching this row of the similarity matrix
    row_item = df.iloc[:, row_pos]
    for col_pos in range(item_count):
        col_item = df.iloc[:, col_pos]
        df_ibs.iloc[row_pos, col_pos] = 1 - cosine(row_item, col_item)

In [None]:
# With our similarity matrix filled out we can look for each item's “neighbours” by looping through ‘df_ibs’,
# sorting each column in descending order, and grabbing the names of the top 10 items (artists).

# Placeholder for each item's ten closest neighbours (columns 1..10, best first).
# Slot 1 is presumably the item itself, since an item is most similar to itself.
df_neighbours = pd.DataFrame(index=df_ibs.columns, columns=range(1, 11))

# For every item, rank its similarity column from highest to lowest and keep the
# labels of the first ten entries.
for pos in range(len(df_ibs.columns)):
    ranked = df_ibs.iloc[:, pos].sort_values(ascending=False)
    df_neighbours.iloc[pos, :10] = ranked[:10].index
 
# --- End Item Based Recommendations --- #

In [None]:
# Done! Show neighbour slots 2-4 for the first six items.
df_neighbours.head(6).iloc[:6,1:4]

Unnamed: 0,2,3,4
a perfect circle,tool,dredg,deftones
abba,madonna,robbie williams,elvis presley
ac/dc,red hot chili peppers,metallica,iron maiden
adam green,the libertines,the strokes,babyshambles
aerosmith,u2,led zeppelin,metallica
afi,funeral for a friend,rise against,fall out boy


In [None]:
# Show neighbour slots 2-8 for the first twenty items.
df_neighbours.head(20).iloc[:,1:8]

Unnamed: 0,2,3,4,5,6,7,8
a perfect circle,tool,dredg,deftones,porcupine tree,nine inch nails,incubus,system of a down
abba,madonna,robbie williams,elvis presley,michael jackson,queen,the beatles,kelly clarkson
ac/dc,red hot chili peppers,metallica,iron maiden,the offspring,black sabbath,die toten hosen,rammstein
adam green,the libertines,the strokes,babyshambles,radiohead,franz ferdinand,the kooks,foo fighters
aerosmith,u2,led zeppelin,metallica,ac/dc,lenny kravitz,the rolling stones,jack johnson
afi,funeral for a friend,rise against,fall out boy,anti-flag,sum 41,billy talent,lostprophets
air,massive attack,goldfrapp,morcheeba,thievery corporation,jamiroquai,nouvelle vague,coldplay
alanis morissette,tori amos,alicia keys,red hot chili peppers,kelly clarkson,dido,coldplay,pearl jam
alexisonfire,atreyu,underoath,funeral for a friend,silverstein,killswitch engage,rise against,caliban
alicia keys,beyonce,norah jones,maria mena,black eyed peas,lenny kravitz,amy winehouse,christina aguilera


In [None]:
# Sanity check: one row per item and ten neighbour slots per row.
df_neighbours.shape

(285, 10)

# User Based collaborative Filtering

In [None]:
# The process for creating a User Based recommendation system is as follows:

# Have an Item Based similarity matrix at your disposal (we do…wohoo!)
# Check which items the user has consumed
# For each item the user has consumed, get the top X neighbours
# Get the consumption record of the user for each neighbour.
# Calculate a similarity score using some formula
# Recommend the items with the highest score

In [None]:
# Lets begin.

# We first need a formula. We use the sum of the product of two vectors (lists, if you will) containing purchase history and item similarity figures. We then divide that figure by the sum of the similarities in the respective vector.
# The function looks like this

In [None]:
# --- Start User Based Recommendations --- #
 
# Helper function to get similarity scores
def getScore(history, similarities):
    """Return the similarity-weighted average of a user's consumption history.

    Args:
        history: the user's consumption values for a set of neighbouring items.
        similarities: similarity of each of those items to the candidate item;
            multiplied element-wise with ``history``.

    Returns:
        ``sum(history * similarities) / sum(similarities)``, or ``0.0`` when the
        similarities sum to zero, since there is then no evidence to score on
        and the division would otherwise fail or produce NaN.
    """
    total_similarity = sum(similarities)
    if total_similarity == 0:
        return 0.0
    return sum(history * similarities) / total_similarity

In [None]:
# The rest is a matter of applying this function to the data frames in the right way.
# We start by creating a variable to hold our similarity data.
# This is basically the same as our original data but with nothing filled in except the headers


# Create a place holder matrix for similarities, and fill in the user name column
# The frame has the same row index and the same columns as `df`.

df_sims = pd.DataFrame(index=df.index,columns=df.columns)

# NOTE(review): `df` had its first column dropped earlier, so position 0 of `df_sims`
# is an artist column rather than a dedicated user column. This assignment
# therefore stores the user ids from `data` in that artist's column, and the
# scoring loop (which starts at column 1) never scores that artist - confirm
# whether that is intended.
df_sims.iloc[:,:1] = data.iloc[:,:1]

In [None]:
# We now loop through the rows and columns filling in empty spaces with similarity scores.

# Note that we score items that the user has already consumed as 0, because there is no point recommending it again

In [None]:
# Loop through all rows, skip column 0 (it holds the user ids), and fill with scores.
# Cells are read and written with a single `.iloc[row, col]` call: chained indexing
# (`frame.iloc[row][col] = value`) assigns into a temporary object and does not
# update the frame when pandas Copy-on-Write is active.
for i in range(0, len(df_sims.index)):
    user = df_sims.index[i]
    for j in range(1, len(df_sims.columns)):
        product = df_sims.columns[j]

        if df.iloc[i, j] == 1:
            # Already consumed by this user: score 0 so it is not recommended again
            df_sims.iloc[i, j] = 0
        else:
            # Positions 1..9 of the ranked neighbours; position 0 is skipped
            # (presumably the item itself, which is most similar to itself).
            product_top_names = df_neighbours.loc[product].iloc[1:10]
            product_top_sims = df_ibs.loc[product].sort_values(ascending=False).iloc[1:10]
            # The user's consumption record for those neighbouring items
            user_purchases = df.loc[user, product_top_names]

            df_sims.iloc[i, j] = getScore(user_purchases, product_top_sims)

In [None]:
# We can now produce a matrix of User Based recommendations as follows:
# one row per user, holding the user id followed by six recommendation slots.
df_recommend = pd.DataFrame(
    index=df_sims.index,
    columns=['user'] + [str(slot) for slot in range(1, 7)],
)
# Column 0 of `df_sims` carries the user ids; copy it into the `user` column
df_recommend.iloc[:, 0] = df_sims.iloc[:, 0]

In [None]:
# Instead of having the matrix filled with similarity scores, however, it would be nice to see the artist names.
# This can be done with the following loop:

In [None]:
# Instead of top scores, we want to see the names of the six best-scoring items.
# Column 0 of `df_sims` holds the user id, so it is excluded by position before
# ranking. Ranking the whole row and discarding the first entry would only remove
# the user id when that id happens to sort above every score.
for i in range(0, len(df_sims.index)):
    ranked_scores = df_sims.iloc[i, 1:].sort_values(ascending=False)
    df_recommend.iloc[i, 1:] = ranked_scores.iloc[:6].index

In [None]:
# Print a sample: the first ten users with their top three recommendations
print(df_recommend.iloc[:10,:4])

  user                      1                      2                3
0    1         flogging molly               coldplay        aerosmith
1   33  red hot chili peppers          kings of leon        peter fox
2   42                 oomph!            lacuna coil        rammstein
3   51            the subways              the kooks  franz ferdinand
4   62           jack johnson                incubus       mando diao
5   75             hoobastank             papa roach           sum 41
6  130      alanis morissette  the smashing pumpkins        pearl jam
7  141           machine head        sonic syndicate          caliban
8  144                editors              nada surf      the strokes
9  150                placebo            the subways     eric clapton
