In [2]:
# system
import sys, os
import pickle

# data wrangling
import pandas as pd
import numpy as np

# model
from recommenders.models.tfidf.tfidf_utils import TfidfRecommender

## Load Data

In [6]:
# The CSV is read from the `data/` folder that sits one level above the
# current working directory.
path = os.getcwd()
parent_dir = os.path.abspath(os.path.join(path, os.pardir))

# Load the tag export into a DataFrame.
csv_file = f'{parent_dir}/data/recommender_tags_less_brackets.csv'
data = pd.read_csv(csv_file)

## Pre-processing


In [4]:
# Tag columns in coalesce priority order: for each user the first column in
# this list that holds a non-missing value supplies the combined tags.
TAG_COLUMNS = [
    'communityTags',
    'interestTags',
    'livingTags',
    'needTags',
    'offerTags',
    'skillTags',
]

# Replace the empty-list marker '[]' with NaN so empty tag cells count as missing.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0; the frame
# is rebound instead of mutated with `inplace=True`.
data = data.replace('[]', np.nan)

# Coalesce the tag columns into a single series.
# NOTE(review): combine_first only fills gaps, so each user keeps the value of
# the FIRST non-missing tag column, not the union of all their tags -- confirm
# that this is the intended behaviour.
combined_tags = data[TAG_COLUMNS[0]]
for tag_column in TAG_COLUMNS[1:]:
    combined_tags = combined_tags.combine_first(data[tag_column])

# Create a new df with the user id and the combined tags. The series is renamed
# explicitly so the column name does not depend on which tag column came first.
combined_tags_df = pd.concat(
    [data['_id'], combined_tags.rename('combined_tags')],
    axis=1,
)

# Remove users that have no tags at all: they add nothing to the model and
# no imputation can be done to save them.
combined_tags_df_na_dropped = combined_tags_df.dropna().reset_index(drop=True)

## Train Model


In [None]:
# Instantiate the TF-IDF recommender, using '_id' as the id column and the
# 'scibert' tokenization method.
recommender = TfidfRecommender(
    id_col='_id',
    tokenization_method='scibert',
)

# Clean the tag text with the recommender's built-in cleaning step. The third
# argument is the column name that is later handed to tokenize_text as text_col.
# NOTE(review): an earlier note here mentioned running on a sample because
# TF-IDF was slow, but the full frame is passed -- confirm which is intended.
columns_to_clean = ['combined_tags']
cleaned_column = 'cleaned_combined_tags'
df_input = recommender.clean_dataframe(
    combined_tags_df_na_dropped,
    columns_to_clean,
    cleaned_column,
)

# Tokenize the cleaned text using the tokenization method chosen at instantiation.
tokenized = recommender.tokenize_text(df_input, text_col=cleaned_column)
tf, vectors_tokenized = tokenized

# Fit the recommender on the vectorizer and the tokenized text.
recommender.fit(tf, vectors_tokenized)

## Get Recommendations for Chosen User

In [None]:
# Number of most-similar users to keep per user. Defined as a variable because
# the result label at the bottom of this cell interpolates it; previously `k`
# only existed as a keyword argument, so the label raised NameError.
k = 10

# Build the top-k recommendation table.
# The recommender was fit on every row of df_input, so the same frame is passed
# here instead of a 500-row slice.
# NOTE(review): recommend_top_k_items appears to index the ids of the frame it
# is given by row position in the fitted matrix, so a shorter frame would
# misalign or fail -- confirm against the TfidfRecommender source.
top_k_recommendations = recommender.recommend_top_k_items(df_input, k=k)

# User whose most similar users we want to look up.
interested_on_user = '5ef5ee15df376c7656e0c1f0'

# Pull that user's recommendations out of the table generated above and keep
# only the ids of the recommended users.
similar_users = recommender.get_top_k_recommendations(df_input, interested_on_user)['rec__id']

# Display the result for that user (last expression of the cell).
{f"{k} Most similar Users" : similar_users.to_json(orient = 'records')}