## User-Based Collaborative Filtering

### Import Libraries

In [151]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

### Inspect Data

In [152]:
# Load only the CSV column at position 1 and label it 'Plant Name'.
# With explicit `names` the file's first line is read as data row 0
# (presumably the original header line), so that row is removed afterwards.
df = (
    pd.read_csv('Raw_Dataset.csv', names=['Plant Name'], usecols=[1])
    .drop(index=0)
)
df.head()

Unnamed: 0,Plant Name
1,Lidah Mertua
2,Lili Paris;Monstrea;Lidah Mertua;Gelombang Cin...
3,Agglonema;Lili Paris;Alocasia;Monstrea;Lidah M...
4,Agglonema;Lili Paris;Alocasia;Monstrea;Gelomba...
5,Agglonema;Lili Paris;Alocasia


### Preprocess Data

**Add User Id**

In [153]:
# Tag each row with a sequential integer id (0, 1, 2, ...) in row order;
# every row of the raw data is treated as one user.
df['User_id'] = range(len(df))
df.head()

Unnamed: 0,Plant Name,User_id
1,Lidah Mertua,0
2,Lili Paris;Monstrea;Lidah Mertua;Gelombang Cin...,1
3,Agglonema;Lili Paris;Alocasia;Monstrea;Lidah M...,2
4,Agglonema;Lili Paris;Alocasia;Monstrea;Gelomba...,3
5,Agglonema;Lili Paris;Alocasia,4


**Split the plant names**

In [154]:
# Turn each ';'-separated 'Plant Name' string into one row per plant.
# User_id rides along as the index while the lists are exploded, then
# becomes a regular column again with a fresh 0..n-1 row index.
df = (
    df.set_index('User_id')['Plant Name']
    .str.split(';')
    .explode()
    .reset_index()
)
df.head()

Unnamed: 0,User_id,Plant Name
0,0,Lidah Mertua
1,1,Lili Paris
2,1,Monstrea
3,1,Lidah Mertua
4,1,Gelombang Cinta


**Add Favorite**

In [155]:
# Every (user, plant) row gets a constant Favorite value of 1; this column
# supplies the cell values of the user-plant matrix built next.
df = df.assign(Favorite=1)
df.head()

Unnamed: 0,User_id,Plant Name,Favorite
0,0,Lidah Mertua,1
1,1,Lili Paris,1
2,1,Monstrea,1
3,1,Lidah Mertua,1
4,1,Gelombang Cinta,1


**Create User-Plant Matrix**

In [156]:
# Wide user x plant table: one row per User_id, one column per plant name,
# cell = Favorite (aggregated with pivot_table's default mean). Pairs that
# never occur stay NaN; later cells rely on those NaNs.
df_matrix = pd.pivot_table(
    df,
    values='Favorite',
    index='User_id',
    columns='Plant Name',
)
df_matrix.head()

Plant Name,Agglonema,Alocasia,Gelombang Cinta,Lidah Mertua,Lili Paris,Monstrea,Pucuk Merah,Suplir
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,,,,1.0,,,,
1,,,1.0,1.0,1.0,1.0,1.0,
2,1.0,1.0,1.0,1.0,1.0,1.0,,1.0
3,1.0,1.0,1.0,,1.0,1.0,,
4,1.0,1.0,,,1.0,,,


### Find Similarities

In [157]:
# Missing favourites count as 0 for the similarity computation only;
# df_matrix itself keeps its NaNs.
user_plant_filled = df_matrix.fillna(0)

# Pairwise cosine similarity between the rows (users) of the filled matrix.
similarities = cosine_similarity(user_plant_filled)
similarities

array([[1.        , 0.4472136 , 0.37796447, ..., 0.        , 0.5       ,
        0.        ],
       [0.4472136 , 1.        , 0.6761234 , ..., 0.25819889, 0.89442719,
        0.4472136 ],
       [0.37796447, 0.6761234 , 1.        , ..., 0.65465367, 0.56694671,
        0.37796447],
       ...,
       [0.        , 0.25819889, 0.65465367, ..., 1.        , 0.28867513,
        0.        ],
       [0.5       , 0.89442719, 0.56694671, ..., 0.28867513, 1.        ,
        0.5       ],
       [0.        , 0.4472136 , 0.37796447, ..., 0.        , 0.5       ,
        1.        ]])

In [158]:
# Label both axes of the similarity array with the User_id index so that
# similarities can be looked up by user id.
user_ids = df_matrix.index
cosine_similarity_df = pd.DataFrame(data=similarities, index=user_ids, columns=user_ids)
cosine_similarity_df.head()

User_id,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.447214,0.377964,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.707107,0.0,0.57735,0.0,0.0,0.0,0.5,0.0
1,0.447214,1.0,0.676123,0.6,0.258199,0.258199,0.547723,0.516398,0.316228,0.447214,...,0.516398,0.516398,0.632456,0.0,0.516398,0.0,0.258199,0.258199,0.894427,0.447214
2,0.377964,0.676123,1.0,0.845154,0.654654,0.654654,0.92582,0.654654,0.267261,0.377964,...,0.436436,0.436436,0.534522,0.377964,0.654654,0.377964,0.654654,0.654654,0.566947,0.377964
3,0.0,0.6,0.845154,1.0,0.774597,0.774597,0.912871,0.774597,0.316228,0.0,...,0.516398,0.516398,0.316228,0.447214,0.516398,0.447214,0.774597,0.774597,0.447214,0.447214
4,0.0,0.258199,0.654654,0.774597,1.0,1.0,0.707107,0.333333,0.408248,0.0,...,0.333333,0.666667,0.0,0.57735,0.333333,0.57735,0.666667,0.666667,0.288675,0.57735


In [174]:

# Target user to generate recommendations for.
userId = 2

# Remove the target user's own row so they cannot be selected as their own
# nearest neighbour. The table is rebuilt from `similarities` on every run
# and dropped without `inplace=True`, so this cell is idempotent: re-running
# it does not raise KeyError, and changing `userId` does not leave the
# previously chosen user permanently missing from the table.
cosine_similarity_df = pd.DataFrame(
    similarities, index=df_matrix.index, columns=df_matrix.index
).drop(index=userId)

cosine_similarity_df.head()

User_id,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.447214,0.377964,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.707107,0.0,0.57735,0.0,0.0,0.0,0.5,0.0
1,0.447214,1.0,0.676123,0.6,0.258199,0.258199,0.547723,0.516398,0.316228,0.447214,...,0.516398,0.516398,0.632456,0.0,0.516398,0.0,0.258199,0.258199,0.894427,0.447214
3,0.0,0.6,0.845154,1.0,0.774597,0.774597,0.912871,0.774597,0.316228,0.0,...,0.516398,0.516398,0.316228,0.447214,0.516398,0.447214,0.774597,0.774597,0.447214,0.447214
4,0.0,0.258199,0.654654,0.774597,1.0,1.0,0.707107,0.333333,0.408248,0.0,...,0.333333,0.666667,0.0,0.57735,0.333333,0.57735,0.666667,0.666667,0.288675,0.57735
5,0.0,0.258199,0.654654,0.774597,1.0,1.0,0.707107,0.333333,0.408248,0.0,...,0.333333,0.666667,0.0,0.57735,0.333333,0.57735,0.666667,0.666667,0.288675,0.57735


In [175]:
# Maximum number of neighbours to keep.
n = 5

# Neighbours must have a similarity strictly greater than this value.
# (The spelling of the name is kept as-is for compatibility.)
treshold = 0.3

# Similarities to the target user, restricted to rows above the cut-off,
# then reduced to the n largest values.
above_cutoff = cosine_similarity_df[userId] > treshold
similar_user = cosine_similarity_df.loc[above_cutoff, userId].nlargest(n)

similar_user

User_id
19    0.935414
6     0.925820
18    0.925820
3     0.845154
20    0.771517
Name: 2, dtype: float64

### Narrow Down Item Pool

In [176]:
# Row of the target user, keeping only the plant columns that are not NaN,
# i.e. the plants this user has already marked as favourite.
is_target_user = df_matrix.index == userId
user_picked = df_matrix.loc[is_target_user].dropna(axis='columns', how='all')
user_picked

Plant Name,Agglonema,Alocasia,Gelombang Cinta,Lidah Mertua,Lili Paris,Monstrea,Suplir
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [177]:
# Rows of the selected similar users, keeping only plant columns that at
# least one of them has marked as favourite.
is_neighbour = df_matrix.index.isin(similar_user.index)
similar_user_plant = df_matrix.loc[is_neighbour].dropna(axis='columns', how='all')
similar_user_plant

Plant Name,Agglonema,Alocasia,Gelombang Cinta,Lidah Mertua,Lili Paris,Monstrea,Pucuk Merah,Suplir
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,1.0,1.0,1.0,,1.0,1.0,,
6,1.0,1.0,1.0,,1.0,1.0,,1.0
18,1.0,,1.0,1.0,1.0,1.0,,1.0
19,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
20,1.0,1.0,1.0,1.0,1.0,,1.0,


In [178]:
# Remove the plants the target user already has, leaving only candidates
# for recommendation. The frame is rebound rather than mutated with
# inplace=True, so the object displayed by the previous cell is not changed
# behind the reader's back. errors='ignore' skips plants the target user
# picked that do not appear among the similar users' columns.
similar_user_plant = similar_user_plant.drop(
    columns=user_picked.columns, errors='ignore'
)

similar_user_plant

Plant Name,Pucuk Merah
User_id,Unnamed: 1_level_1
3,
6,
18,
19,1.0
20,1.0


### Recommended Plant

In [179]:
# Score every candidate plant as the average, over the similar users who
# marked it as favourite, of (user similarity * that user's rating).

# Multiply each similar user's row of ratings by that user's similarity.
# axis=0 aligns on User_id, so the row order of the two objects need not
# match. NaN ratings (plant not picked by that user) stay NaN.
weighted_ratings = similar_user_plant.mul(similar_user, axis=0)

# Column-wise mean skips NaN, so each plant is averaged only over the users
# who actually rated it. A plant with no ratings at all yields NaN instead
# of raising ZeroDivisionError.
item_score = (
    weighted_ratings.mean(axis=0)
    .rename_axis('Plant')
    .reset_index(name='Score')
)

# Sort the plants by score, best first
ranked_item_score = item_score.sort_values(by='Score', ascending=False)
# Select top m plants
m = 10
ranked_item_score.head(m)

Unnamed: 0,Plant,Score
0,Pucuk Merah,0.853466
