## User-Based Collaborative Filtering

### Import Library

In [104]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

### Inspect Data

In [105]:
df = pd.read_csv('Raw_Dataset.csv',usecols=[1],names=['Plant Name']).drop(0,axis=0)
df.head()

Unnamed: 0,Plant Name
1,Lidah Mertua
2,Lili Paris;Monstrea;Lidah Mertua;Gelombang Cin...
3,Agglonema;Lili Paris;Alocasia;Monstrea;Lidah M...
4,Agglonema;Lili Paris;Alocasia;Monstrea;Gelomba...
5,Agglonema;Lili Paris;Alocasia


### Preprocess Data

**Add User Id**

In [106]:
df['User_id'] = [i for i in range(len(df))]
df.head()

Unnamed: 0,Plant Name,User_id
1,Lidah Mertua,0
2,Lili Paris;Monstrea;Lidah Mertua;Gelombang Cin...,1
3,Agglonema;Lili Paris;Alocasia;Monstrea;Lidah M...,2
4,Agglonema;Lili Paris;Alocasia;Monstrea;Gelomba...,3
5,Agglonema;Lili Paris;Alocasia,4


**Split the plants name**

In [107]:
df = df.set_index(['User_id']).apply(lambda x: x.str.split(';').explode()).reset_index()
df.head()

Unnamed: 0,User_id,Plant Name
0,0,Lidah Mertua
1,1,Lili Paris
2,1,Monstrea
3,1,Lidah Mertua
4,1,Gelombang Cinta


**Add Favorite**

In [108]:
df['Favorite'] = 1
df.head()

**Create User-Plant Matrix**

In [66]:
df_matrix = df.pivot_table(index='User_id',columns='Plant Name',values='Favorite')
df_matrix.head()

Plant Name,Agglonema,Alocasia,Gelombang Cinta,Lidah Mertua,Lili Paris,Monstrea,Pucuk Merah,Suplir
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,,,,1.0,,,,
1,,,1.0,1.0,1.0,1.0,1.0,
2,1.0,1.0,1.0,1.0,1.0,1.0,,1.0
3,1.0,1.0,1.0,,1.0,1.0,,
4,1.0,1.0,,,1.0,,,


In [67]:
df_matrix.fillna(0,inplace=True)

### Find Similarities

In [68]:
similarities = cosine_similarity(df_matrix.fillna(0))
similarities

array([[1.        , 0.4472136 , 0.37796447, ..., 0.        , 0.5       ,
        0.        ],
       [0.4472136 , 1.        , 0.6761234 , ..., 0.25819889, 0.89442719,
        0.4472136 ],
       [0.37796447, 0.6761234 , 1.        , ..., 0.65465367, 0.56694671,
        0.37796447],
       ...,
       [0.        , 0.25819889, 0.65465367, ..., 1.        , 0.28867513,
        0.        ],
       [0.5       , 0.89442719, 0.56694671, ..., 0.28867513, 1.        ,
        0.5       ],
       [0.        , 0.4472136 , 0.37796447, ..., 0.        , 0.5       ,
        1.        ]])

In [69]:
cosine_similarity_df = pd.DataFrame(similarities, index=df_matrix.index, columns=df_matrix.index)
cosine_similarity_df.head()

User_id,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.447214,0.377964,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.707107,0.0,0.57735,0.0,0.0,0.0,0.5,0.0
1,0.447214,1.0,0.676123,0.6,0.258199,0.258199,0.547723,0.516398,0.316228,0.447214,...,0.516398,0.516398,0.632456,0.0,0.516398,0.0,0.258199,0.258199,0.894427,0.447214
2,0.377964,0.676123,1.0,0.845154,0.654654,0.654654,0.92582,0.654654,0.267261,0.377964,...,0.436436,0.436436,0.534522,0.377964,0.654654,0.377964,0.654654,0.654654,0.566947,0.377964
3,0.0,0.6,0.845154,1.0,0.774597,0.774597,0.912871,0.774597,0.316228,0.0,...,0.516398,0.516398,0.316228,0.447214,0.516398,0.447214,0.774597,0.774597,0.447214,0.447214
4,0.0,0.258199,0.654654,0.774597,1.0,1.0,0.707107,0.333333,0.408248,0.0,...,0.333333,0.666667,0.0,0.57735,0.333333,0.57735,0.666667,0.666667,0.288675,0.57735


In [70]:

userId = 2

cosine_similarity_df.drop(index=userId, inplace=True)

cosine_similarity_df.head()

User_id,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.447214,0.377964,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.707107,0.0,0.57735,0.0,0.0,0.0,0.5,0.0
1,0.447214,1.0,0.676123,0.6,0.258199,0.258199,0.547723,0.516398,0.316228,0.447214,...,0.516398,0.516398,0.632456,0.0,0.516398,0.0,0.258199,0.258199,0.894427,0.447214
3,0.0,0.6,0.845154,1.0,0.774597,0.774597,0.912871,0.774597,0.316228,0.0,...,0.516398,0.516398,0.316228,0.447214,0.516398,0.447214,0.774597,0.774597,0.447214,0.447214
4,0.0,0.258199,0.654654,0.774597,1.0,1.0,0.707107,0.333333,0.408248,0.0,...,0.333333,0.666667,0.0,0.57735,0.333333,0.57735,0.666667,0.666667,0.288675,0.57735
5,0.0,0.258199,0.654654,0.774597,1.0,1.0,0.707107,0.333333,0.408248,0.0,...,0.333333,0.666667,0.0,0.57735,0.333333,0.57735,0.666667,0.666667,0.288675,0.57735


In [71]:
n = 5

treshold = 0.3

similar_user = cosine_similarity_df[cosine_similarity_df[userId] > treshold][userId].nlargest(n)

similar_user

User_id
19    0.935414
6     0.925820
18    0.925820
3     0.845154
20    0.771517
Name: 2, dtype: float64

### Narrow Down Item Pool

In [72]:
user_picked = df_matrix[df_matrix.index == userId].dropna(axis=1,how='all')
user_picked

Plant Name,Agglonema,Alocasia,Gelombang Cinta,Lidah Mertua,Lili Paris,Monstrea,Pucuk Merah,Suplir
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0


In [73]:
similar_user_plant = df_matrix[df_matrix.index.isin(similar_user.index)].dropna(axis=1,how='all')
similar_user_plant

Plant Name,Agglonema,Alocasia,Gelombang Cinta,Lidah Mertua,Lili Paris,Monstrea,Pucuk Merah,Suplir
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
6,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
18,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
19,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
20,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0


In [74]:
similar_user_plant.drop(user_picked.columns,axis=1,inplace=True,errors="ignore")

similar_user_plant

Plant Name
User_id
3
6
18
19
20


### Recommended Plant

In [75]:
item_score = {}


for i in similar_user_plant.columns:
  plant_rating = similar_user_plant[i]
  total = 0
  count = 0
  
  for u in similar_user.index:
    if pd.isna(plant_rating[u]) == False:

      score = similar_user[u] * plant_rating[u]
      
      total += score
      
      count +=1
  
  item_score[i] = total / count

item_score = pd.DataFrame(item_score.items(), columns=['Plant', 'Score'])
    

ranked_item_score = item_score.sort_values(by='Score', ascending=False)

m = 10
ranked_item_score.head(m)

Unnamed: 0,Plant,Score


### Predict New Input

In [109]:
def recommend_plant(user_favorites):
    df = pd.read_csv('Clean_Dataset.csv')
    new_id = max(df['User_id']) + 1
    favorite = 1

    #Create New Dataframe
    df_new = pd.DataFrame(
        {"User_id" : new_id,
        "Plant Name" : user_favorites,
        "Favorite" : favorite
    })

    #Concant new dataframe to old dataframe
    df = pd.concat([df,df_new],ignore_index=True)

    #Find Similarity between user using cosine similarity
    df_matrix = df.pivot_table(index='User_id',columns='Plant Name',values='Favorite')
    similarities = cosine_similarity(df_matrix.fillna(0))
    cosine_similarity_df = pd.DataFrame(similarities, index=df_matrix.index, columns=df_matrix.index)

    #Find the 5 most similar user
    cosine_similarity_df.drop(index=new_id, inplace=True)
    n = 5
    treshold = 0.3
    similar_user = cosine_similarity_df[cosine_similarity_df[new_id] > treshold][new_id].nlargest(n)

    #Narrow Down The user by removing all the plant that has been favorited by new user
    user_picked = df_matrix[df_matrix.index == new_id].dropna(axis=1,how='all')
    similar_user_plant = df_matrix[df_matrix.index.isin(similar_user.index)].dropna(axis=1,how='all')
    similar_user_plant.drop(user_picked.columns,axis=1,inplace=True,errors="ignore")


    #Recommend plant to new user
    item_score = {}
    for i in similar_user_plant.columns:
        plant_rating = similar_user_plant[i]
        total = 0
        count = 0
        
        for u in similar_user.index:
            if pd.isna(plant_rating[u]) == False:
                score = similar_user[u] * plant_rating[u]
                total += score
                count +=1
        
        item_score[i] = total / count

    item_score = pd.DataFrame(item_score.items(), columns=['Plant', 'Score'])
    ranked_item_score = item_score.sort_values(by='Score', ascending=False)

    #Save the appended dataframe
    df.to_csv('Raw_Dataset.csv')
    
    return ranked_item_score


In [110]:
new_recommend = recommend_plant(['Agglonema','Lili Paris','Alocasia'])

In [111]:
new_recommend

Unnamed: 0,Plant,Score
2,Monstrea,0.740852
0,Gelombang Cinta,0.729603
1,Lidah Mertua,0.707107
3,Pucuk Merah,0.707107
4,Suplir,0.707107
