In [None]:
# Packages

#pip install -U sentence-transformers
from termcolor import colored
import pandas as pd
import numpy as np
import string
import regex as re
import zipfile
import gc
from scipy.stats import boxcox
import sys 
from collections import Counter 
from tqdm import tqdm 
import pandas as pd 
import numpy as np 
import warnings, math
from termcolor import colored
import pickle
import string
from sklearn.metrics.pairwise import cosine_similarity

# for eval
from sklearn.model_selection import train_test_split
import random


# for EMB
from sentence_transformers import SentenceTransformer, util
import scipy
from sklearn import preprocessing 

In [None]:
# Input files.
# Raw strings: these Windows paths contain backslash sequences such as "\P", "\R", "\F"
# and "\p". They are not valid escapes, so a normal string literal only keeps them by
# accident and newer Python versions warn about it. The resulting values are unchanged.
# NOTE(review): these are absolute, machine-specific paths — consider a configurable data directory.
DF_PATH = r"D:\Papers\Paper 3 - Recommender Systems\Recommender-systems\Files\Oct_Forth_projectType.csv"
df = pd.read_csv(DF_PATH)

PROJECTS_DF_PATH = r"D:\Papers\Paper 3 - Recommender Systems\Recommender-systems\Files\projects.csv"
projects_df = pd.read_csv(PROJECTS_DF_PATH)

In [None]:
def _donations_by_donor(donations: pd.DataFrame) -> pd.DataFrame:
    """
    Total donation amount per (donor, project) pair, indexed by 'Donor ID'.

    The amount column is selected *before* summing so that unrelated (possibly
    non-numeric) columns are never aggregated.
    """
    return (
        donations
        .groupby(by=['Donor ID', 'Project ID'])[['Donation Amount']]
        .sum()
        .reset_index()
        .set_index('Donor ID')
    )


# Hold out 10% of the donation rows; the fixed random_state keeps the split reproducible.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

print('# donations in Train set: ', f"{len(df_train):,}")
print('# donations in Test set: ',  f"{len(df_test):,}")

print('# Donors in Train set: ', f"{len(df_train['Donor ID'].unique()):,}")
print('# Donors in Test set: ',  f"{len(df_test['Donor ID'].unique()):,}")

# Donors that appear on both sides of the split (computed once, reused below).
donors_in_both = df_train.loc[df_train['Donor ID'].isin(df_test['Donor ID']), 'Donor ID'].unique()

print('# Donors in both Train and Test sets - the ones we choose for evaluation: ',
      colored(f"{len(donors_in_both):,}", 'blue'))

# Restrict both sets to those donors.
df_train = df_train[df_train['Donor ID'].isin(donors_in_both)].reset_index(drop=True)
df_test = df_test[df_test['Donor ID'].isin(donors_in_both)].reset_index(drop=True)

# sum of donation in a grouped by donor id dataset
df_main_donor_index = _donations_by_donor(df[df['Donor ID'].isin(df_test['Donor ID'])])
df_train_donor_index = _donations_by_donor(df_train)
df_test_donor_index = _donations_by_donor(df_test)

In [None]:
# ==================== Embeddings ==================== #

# ----------------------- projects profiles------------------- #
def get_project_profile_emb(project_id: str, embeddings):
    """
    Look up the embedding row of a single project.

    Parameters
    ----------
    project_id : id searched for in the module-level ``projects_id`` list.
    embeddings : sliceable container; row ``i`` is taken to be the embedding
        of ``projects_id[i]``.

    Returns
    -------
    The one-row slice ``embeddings[i:i+1]`` (the row dimension is kept).

    Raises
    ------
    ValueError
        If ``project_id`` does not occur in ``projects_id``.
    """
    # position of the project inside the id list == its row in `embeddings`
    row = projects_id.index(project_id)
    return embeddings[row:row + 1]



def get_projects_profiles_emb(ids: pd.Series, embeddings):
    """
    Stack the embedding rows of several projects into one 2-D array.

    ``ids`` may be a Series of project ids or a single id: it is wrapped in a
    list and flattened with ``np.ravel`` before iterating, so both forms work.
    Row order of the result follows the order of ``ids``.
    """
    rows = []
    for project_id in np.ravel([ids]):
        # the helper returns a one-row slice; keep just that row
        rows.append(get_project_profile_emb(project_id, embeddings)[0])

    return np.vstack(rows)




# ----------------------- Donors profiles------------------- #

def build_donors_profile_emb(donor_id: str, df_train_donor_index: pd.DataFrame):
    """
    Build the preference vector of one donor from their training donations.

    The vector is the donation-amount-weighted sum of the embeddings of the
    projects the donor gave to, passed through ``preprocessing.normalize``.

    Parameters
    ----------
    donor_id : label looked up in the index of ``df_train_donor_index``.
    df_train_donor_index : frame indexed by donor id with 'Project ID' and
        'Donation Amount' columns.

    Returns
    -------
    Array of shape ``(1, dim)``.

    Notes
    -----
    Project embeddings are read from the module-level ``embeddings`` variable,
    which must exist before this function is called.
    The amounts are used exactly as stored in the frame.
    NOTE(review): an older comment called them "smoothed"; no smoothing is visible
    here, so it would have to happen upstream — confirm.
    """
    donor_rows = df_train_donor_index.loc[donor_id]

    # one embedding row per donated project
    project_vectors = get_projects_profiles_emb(donor_rows['Project ID'], embeddings)

    # donation amounts as a column vector so they broadcast across the embedding dimension
    weights = np.asarray(donor_rows['Donation Amount']).reshape(-1, 1)

    weighted_vectors = project_vectors * weights
    summed_profile = weighted_vectors.sum(axis=0).reshape(1, -1)

    return preprocessing.normalize(summed_profile)


def build_donors_profiles_emb(df_test_donor_index: pd.DataFrame, df_train_donor_index: pd.DataFrame, sample_size: int = None):
    """
    Build a profile for every donor found in the test index.

    Parameters
    ----------
    df_test_donor_index : frame whose index holds the donor ids to process.
    df_train_donor_index : frame handed to ``build_donors_profile_emb`` for
        each donor.
    sample_size : if given, only the first ``sample_size`` distinct donors are
        processed; ``None`` processes all of them.

    Returns
    -------
    dict mapping donor id -> profile returned by ``build_donors_profile_emb``.
    """
    selected_donors = df_test_donor_index.index.unique().tolist()[:sample_size]
    progress = tqdm(selected_donors, position=0, leave=True)

    return {
        donor_id: build_donors_profile_emb(donor_id, df_train_donor_index)
        for donor_id in progress
    }

    


In [None]:
# Run a full garbage-collection pass to release unreachable objects before the
# embeddings are computed in the next cell.
gc.collect()

In [None]:
# Keep only projects that occur in the donations frame. The index is reset so that the
# row position of a project is also its position in `projects_id` and in `embeddings`.
projects_df = projects_df[projects_df['Project ID'].isin(df['Project ID'])].reset_index(drop=True)

project_txt = projects_df.loc[:, 'project_txt']
projects_id = projects_df['Project ID'].tolist()


# ----------------- embeddings---------------------#
model = SentenceTransformer('paraphrase-distilroberta-base-v1') # , device= 'cuda'
# Pass a plain list of strings: encode() indexes its input by position, which for a
# pandas Series only behaves as intended while the index is exactly 0..n-1.
embeddings = model.encode(project_txt.tolist(), batch_size=256, show_progress_bar=True)

In [None]:
# One profile per distinct donor in the test index, built from that donor's rows in the
# training index. No sample_size is passed, so every donor is processed.
donor_profiles_emb = build_donors_profiles_emb(df_test_donor_index, df_train_donor_index)

In [None]:
# --------------------- Embeddings----------------- #

class ContentBasedRecommenderWithEmbeddings:
    """
    Recommending projects based on the embedding of the project description.

    Each donor is represented by a profile vector; projects are ranked by the
    cosine similarity between that vector and every project embedding.

    Besides its constructor arguments the class reads three module-level names
    at call time:

    * ``projects_id`` - project ids; position ``i`` corresponds to row ``i`` of
      ``embeddings``.
    * ``df_test_donor_index`` - frame indexed by donor id with a 'Project ID'
      column (the donor's test-set donations).
    * ``projects_df`` - project metadata joined onto the recommendations.
    """

    MODEL_NAME = 'Content-Based-Embeddings'

    def __init__(self, donor_profiles: dict, embeddings: np.ndarray):
        """
        Parameters
        ----------
        donor_profiles : mapping donor id -> 2-D profile array; its first row is
            compared against the project embeddings.
        embeddings : 2-D array with one row per project.
        """
        self.donor_profiles = donor_profiles
        self.embeddings = embeddings

    def get_model_name(self):
        """Return the display name of this recommender."""
        return self.MODEL_NAME

    def _get_similar_projects_to_donor_profile(self, donor_id, top_n=1000):
        """
        Rank projects by cosine similarity to the donor's profile.

        Returns a list of ``(project_id, similarity)`` tuples ordered from most
        to least similar, truncated to ``top_n`` entries (``None`` keeps all).
        """
        # cosine_similarity returns shape (1, n_projects); work on the single row.
        scores = cosine_similarity(self.donor_profiles[donor_id], self.embeddings)[0]

        # Sort the scores themselves in descending order *before* truncating.
        # (Reversing the 2-D argsort result only flips the row axis, which would
        # leave the scores ascending and make the cut keep the worst matches.)
        ranked_indices = np.argsort(-scores, kind='stable')[:top_n]

        return [(projects_id[i], scores[i]) for i in ranked_indices]

    def recommend_projects(self, donor_id: str, projects_to_ignore: list = None, top_n=10, df_test: pd.DataFrame = None):
        """
        Recommend the ``top_n`` most similar projects for a donor.

        Parameters
        ----------
        donor_id : donor to recommend for; must be a key of ``donor_profiles``
            and a label of ``df_test_donor_index``.
        projects_to_ignore : project ids to leave out of the result. Ids that
            the donor also donated to in the test index are *not* ignored, so
            they can still be ranked. ``None`` (default) means nothing is ignored.
        top_n : number of rows to return; ``None`` returns every ranked project.
        df_test : unused; accepted only so existing callers keep working.

        Returns
        -------
        (recommendations_df, projects_to_ignore)
            ``recommendations_df`` has the columns 'recommStrength',
            'Project ID', 'Project Title' and 'Project Need Statement', best
            match first. ``projects_to_ignore`` is the ignore collection that
            was actually applied: the argument itself when nothing had to be
            removed from it, otherwise a set without the test-set projects.

        Raises
        ------
        KeyError
            If ``donor_id`` has no profile or is absent from ``df_test_donor_index``.
        """
        # Fresh list per call: a shared default would be handed back to the caller.
        if projects_to_ignore is None:
            projects_to_ignore = []

        # .loc gives a scalar for a single donation and a Series for several.
        donated_projects_test = df_test_donor_index.loc[donor_id, 'Project ID']
        if isinstance(donated_projects_test, pd.Series):
            donated_projects_test = set(donated_projects_test)
        else:
            donated_projects_test = {donated_projects_test}

        # remove projects from ignore list if it is also in the test set
        wanted_in_test = [project_id for project_id in projects_to_ignore
                          if project_id in donated_projects_test]
        if wanted_in_test:
            projects_to_ignore = set(projects_to_ignore).difference(wanted_in_test)

        ignore_lookup = set(projects_to_ignore)

        # Ask for enough candidates that top_n are still left after filtering.
        n_candidates = None if top_n is None else top_n + len(ignore_lookup)
        similar_projects = self._get_similar_projects_to_donor_profile(donor_id, n_candidates)

        similar_projects_filtered = [x for x in similar_projects if x[0] not in ignore_lookup]

        recommendations_df = pd.DataFrame(similar_projects_filtered,
                                          columns=['Project ID', 'recommStrength']).head(top_n)

        recommendations_df = pd.merge(left=recommendations_df, right=projects_df, how='left', on='Project ID')[
            ['recommStrength', 'Project ID', 'Project Title', 'Project Need Statement']]

        return recommendations_df, projects_to_ignore

# NOTE: this rebinds `model`. From here on it is the recommender, no longer the
# SentenceTransformer that produced `embeddings`.
model = ContentBasedRecommenderWithEmbeddings(donor_profiles_emb, embeddings)

In [None]:
def get_projects_donated(donor_id: str, df_donor_indexed: pd.DataFrame) -> set:
    """
    Get the projects a donor has donated to in a specific frame.

    Parameters
    ----------
    donor_id : label looked up in the index of ``df_donor_indexed``.
    df_donor_indexed : frame indexed by donor id with a 'Project ID' column.

    Returns
    -------
    set of project ids; an empty set when the donor is not in the index
    (always a set, as the annotation promises).
    """
    try:
        donated_projects = df_donor_indexed.loc[donor_id, 'Project ID']
    except KeyError:
        return set()

    # .loc gives a Series for several rows and a bare value for a single row.
    if isinstance(donated_projects, pd.Series):
        return set(donated_projects)
    return {donated_projects}



In [None]:
# Top-N accuracy metrics
# Number of randomly sampled projects the donor never donated to that each held-out
# donation is ranked against during evaluation.
EVAL_RANDOM_SAMPLE_NON_INTERACTED_PROJECTS = 100


class ModelEvaluator:
    """
    Top-N recall evaluation of a recommender on held-out donations.

    For every project a donor donated to in the test index, the project is mixed
    with a sample of projects the donor never donated to; the model's ranking of
    that candidate set is then checked for the held-out project at the top 3, 5
    and 10 positions.

    The evaluated ``model`` must provide ``get_model_name()`` and
    ``recommend_projects(donor_id, projects_to_ignore=..., top_n=..., df_test=...)``
    returning ``(recommendations_df, projects_to_ignore)``.

    Module-level names read at call time: ``get_projects_donated``,
    ``EVAL_RANDOM_SAMPLE_NON_INTERACTED_PROJECTS`` and ``df_test`` (forwarded to
    the model unchanged).
    """

    # Cut-offs for which hits and recall are reported.
    _TOP_K = (3, 5, 10)

    def __init__(self, df_main_donor_index, projects_id, test_donor_index=None, train_donor_index=None):
        """
        Parameters
        ----------
        df_main_donor_index : donor-indexed frame of all donations (train + test),
            used to decide which projects a donor has *not* donated to.
        projects_id : list of all project ids (the negative-sampling pool).
        test_donor_index, train_donor_index : optional donor-indexed frames for
            the test and training donations. When omitted, the module-level
            ``df_test_donor_index`` / ``df_train_donor_index`` are used, which is
            what the original two-argument constructor did.
        """
        self.df_main_donor_index = df_main_donor_index
        self.df_test_donor_index = df_test_donor_index if test_donor_index is None else test_donor_index
        self.df_train_donor_index = df_train_donor_index if train_donor_index is None else train_donor_index
        self.projects_id = projects_id

    @staticmethod
    def _as_id_list(project_ids) -> list:
        """Turn the result of a ``.loc[donor, 'Project ID']`` lookup (bare value or Series) into a list."""
        if isinstance(project_ids, pd.Series):
            return list(project_ids.values)
        return [project_ids]

    def get_not_donated_projects_sample(self, donor_id: str, sample_size: int, seed=42) -> set:
        """
        Sample projects the donor has never donated to.

        input: donor_id, the number of projects to draw and a seed
        output: a set of ``sample_size`` not donated projects

        The draw is reproducible: it uses a private generator seeded from
        ``seed`` and ``donor_id``, so the same arguments always give the same
        sample while different donors get different samples. ``seed=None``
        requests a non-deterministic draw.

        Raises ValueError (from ``random.sample``) if fewer than ``sample_size``
        not-donated projects exist.
        """
        donated_projects = set(get_projects_donated(donor_id=donor_id, df_donor_indexed=self.df_main_donor_index))

        not_donated_projects = [x for x in self.projects_id if x not in donated_projects]

        # A dedicated generator keeps the global `random` state untouched.
        rng = random.Random() if seed is None else random.Random(f"{seed}:{donor_id}")

        return set(rng.sample(not_donated_projects, sample_size))

    def _verify_hit_top_n(self, project_id: str, recommended_projects: pd.Series, top_n) -> tuple:
        """
        input: one project id (a project our donor has donated to) and the ranked
               recommended project ids
        output: ``(hit, index)`` where ``index`` is the position of ``project_id``
                in the ranking (-1 if absent) and ``hit`` is 1 when that position
                is within the first ``top_n`` entries, else 0
        """
        index = next((i for i, c in enumerate(recommended_projects) if c == project_id), -1)
        hit = int(0 <= index < top_n)

        return hit, index

    def evaluate_model_for_donor(self, model, donor_id: str):
        """
        Evaluates the recommendations recommended to one donor.

        Returns a dict with the donor id, hit counts and recall at 3/5/10, the
        number of distinct test projects and the number of test projects that
        also occur in the donor's training donations.

        Raises KeyError if the donor is missing from the test or training index.
        """
        test_projects = self._as_id_list(self.df_test_donor_index.loc[donor_id, 'Project ID'])
        train_projects = self._as_id_list(self.df_train_donor_index.loc[donor_id, 'Project ID'])

        # what donor has donated to in the test set
        donated_projects_test = set(test_projects)
        donated_projects_count_test = len(donated_projects_test)

        # overlap: test projects the donor also donated to in the training set
        train_lookup = set(train_projects)
        overlap = sum(project_id in train_lookup for project_id in test_projects)

        recommendations_df, projects_to_ignore = model.recommend_projects(
            donor_id,
            projects_to_ignore=get_projects_donated(donor_id, df_donor_indexed=self.df_train_donor_index),
            top_n=None,
            df_test=df_test,
        )

        # if correct we have ranked all the projects except the ones that appear
        # only in the training set of this specific donor
        assert len(recommendations_df) == len(self.projects_id) - len(projects_to_ignore), (
            f"expected {len(self.projects_id) - len(projects_to_ignore)} ranked projects, "
            f"got {len(recommendations_df)}"
        )

        # The sample is deterministic per donor, so draw it once rather than per project.
        sample_size = EVAL_RANDOM_SAMPLE_NON_INTERACTED_PROJECTS
        not_donated_projects_sample = self.get_not_donated_projects_sample(
            donor_id, sample_size=sample_size, seed=42)

        ranked_ids = recommendations_df['Project ID']
        hits = dict.fromkeys(self._TOP_K, 0)

        for project_id in donated_projects_test:

            # add a donated project to the sample of projects this donor has not interacted with
            validation_projects = not_donated_projects_sample | {project_id}

            # if true means we have correctly identified the projects they have not interacted with
            assert len(not_donated_projects_sample) + 1 == len(validation_projects)

            # model ranking restricted to the candidate set, best first
            recommended_project_ids = ranked_ids[ranked_ids.isin(validation_projects)].values
            assert len(recommended_project_ids) == sample_size + 1, (
                f"expected {sample_size + 1} ranked candidates, got {len(recommended_project_ids)}"
            )

            for k in self._TOP_K:
                hit, _ = self._verify_hit_top_n(project_id, recommended_project_ids, k)
                hits[k] += hit

        # ---------------------------- Recall --------------------------#
        donor_metrics = {'donor_id': donor_id,
                         'hits@3_count': hits[3],
                         'hits@5_count': hits[5],
                         'hits@10_count': hits[10],
                         'donated_count': donated_projects_count_test,
                         'project_overlap_test_train_count': overlap,
                         'recall@3': hits[3] / float(donated_projects_count_test),
                         'recall@5': hits[5] / float(donated_projects_count_test),
                         'recall@10': hits[10] / float(donated_projects_count_test)}
        return donor_metrics

    def evaluate_model(self, model):
        """
        Aggregates the results of evaluate_model_for_donor over every donor in
        the test index.

        Returns ``(global_metrics, detailed_results_df)``: the overall recall at
        3/5/10 (total hits divided by total distinct test donations) and one row
        per donor sorted by 'donated_count', descending.

        Raises ValueError if the test index contains no donors.
        """
        metrics = []

        for idx, donor_id in enumerate(list(self.df_test_donor_index.index.unique().values)):

            metrics.append(self.evaluate_model_for_donor(model, donor_id))
            if idx % 500 == 0: print('%d donors processed' % idx)

        if not metrics:
            raise ValueError('no donors to evaluate: the test donor index is empty')

        # Build the frame once, after the loop, instead of on every iteration.
        detailed_results_df = pd.DataFrame(metrics).sort_values('donated_count', ascending=False).reset_index(drop=True)

        glob_num_donations = float(detailed_results_df['donated_count'].sum())

        global_recall_at_3 = detailed_results_df['hits@3_count'].sum() / glob_num_donations
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / glob_num_donations
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / glob_num_donations

        global_metrics = {'modelName': model.get_model_name(),
                          'recall@3': global_recall_at_3,
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}
        return global_metrics, detailed_results_df
    

# Evaluator instance: `df_main_donor_index` decides which projects count as "not donated"
# for a donor, and `projects_id` is the pool those projects are sampled from.
model_evaluator = ModelEvaluator(df_main_donor_index, projects_id)


In [None]:
# Evaluate the embedding-based recommender over every donor in the test index.
print(colored('Embedding- and content-based: ', 'green'))
global_metrics, detailed_results_df = model_evaluator.evaluate_model(model)


print('\nGlobal metrics:\n%s' % global_metrics)
# Keep a fixed subset/order of the per-donor columns for display
# ('project_overlap_test_train_count' is dropped here).
detailed_results_df = detailed_results_df[['donor_id', 'donated_count', "hits@3_count", 'hits@5_count', 'hits@10_count', 'recall@3','recall@5','recall@10']]
# Last expression of the cell: the first ten rows (evaluate_model sorts by 'donated_count', descending).
detailed_results_df.head(10)