In [47]:
pip install -U sentence-transformers



In [48]:
# Packages

#pip install -U sentence-transformers
from termcolor import colored
import pandas as pd
import numpy as np
import string
import regex as re
import zipfile
import gc
from scipy.stats import boxcox
import sys 
from collections import Counter 
from tqdm import tqdm 
import pandas as pd 
import numpy as np 
import warnings, math
from termcolor import colored
import pickle
import string
from sklearn.metrics.pairwise import cosine_similarity

# for eval
from sklearn.model_selection import train_test_split
import random


# for SVD
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
pd.set_option('display.float_format','{:.5f}'.format)

# for EMB
from sentence_transformers import SentenceTransformer, util
import scipy
from sklearn import preprocessing 

# for TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
from sklearn import preprocessing 

In [49]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
!unzip "/content/drive/MyDrive/Recommender Systems/Oct-2021/Forth_Oct.zip"

Archive:  /content/drive/MyDrive/Recommender Systems/Oct-2021/Forth_Oct.zip
replace Oct_Forth_projectType.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [51]:
DF_PATH = 'Oct_Forth_projectType.csv'
df = pd.read_csv(DF_PATH)

PROJECTS_DF_PATH = "/content/drive/MyDrive/Recommender Systems/Oct-2021/projects.csv"
projects_df = pd.read_csv(PROJECTS_DF_PATH)

In [52]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

print('# donations in Train set: ', f"{len(df_train):,}")
print('# donations in Test set: ',  f"{len(df_test):,}")

print('# Donors in Train set: ', f"{len(df_train['Donor ID'].unique()):,}")
print('# Donors in Test set: ',  f"{len(df_test['Donor ID'].unique()):,}")


print('# Donors in both Train and Test sets - the ones we choose for evaluation: ',
      colored(f"{len(df_train[df_train['Donor ID'].isin(df_test['Donor ID'].values.tolist())]['Donor ID'].unique()):,}", 'blue'))



df_train = df_train[df_train['Donor ID'].isin(df_test['Donor ID'].values.tolist())].reset_index(drop=True)
df_test = df_test[df_test['Donor ID'].isin(df_train['Donor ID'].values.tolist())].reset_index(drop=True)

# sum of donation in a grouped by donor id dataset
df_main_donor_index = df.groupby(by = ['Donor ID', 'Project ID']).sum()[['Donation Amount']].reset_index().set_index('Donor ID')
df_train_donor_index = df_train.groupby(by = ['Donor ID', 'Project ID']).sum()[['Donation Amount']].reset_index().set_index('Donor ID')
df_test_donor_index = df_test.groupby(by = ['Donor ID', 'Project ID']).sum()[['Donation Amount']].reset_index().set_index('Donor ID')

# donations in Train set:  43,691
# donations in Test set:  4,855
# Donors in Train set:  795
# Donors in Test set:  618
# Donors in both Train and Test sets - the ones we choose for evaluation:  [34m618[0m


In [53]:
projects_df = projects_df[projects_df['Project ID'].isin(df['Project ID'])].reset_index(drop=True)

project_txt = projects_df.loc[:, 'project_txt']
projects_id = projects_df['Project ID'].tolist()


In [54]:
def get_projects_donated(donor_id: str, df_donor_indexed: pd.DataFrame) -> set:

    """ 
    get the project one has donated to in a specific df
    """
    try:
        donated_projects = df_donor_indexed.loc[donor_id]['Project ID']

        return set(donated_projects if type(donated_projects) == pd.Series else [donated_projects])
        
    except KeyError:
        return []

## TFIDF

In [55]:

# TFIDF
vectorizer = TfidfVectorizer(max_df = 0.99,    
                             min_df = 0.001,
                             stop_words='english',
                             strip_accents='unicode',
                             analyzer='word')

tfidf_matrix = vectorizer.fit_transform(project_txt)

print(f"tfidf_matrix.shape = {tfidf_matrix.shape}")

tfidf_matrix.shape = (860, 3074)


In [56]:

# =======================TFIDF=======================#

# -------------------projects-------------------#

# single project 
def get_project_profile_tfidf(project_id, tfidf_matrix):
    """
    input: a project id
    output: the tfidf of that project
    """
    # get the ids
    idx = projects_id.index(project_id)

    # get the word count vector for that document
    project_profile = tfidf_matrix[idx:idx+1]
    return project_profile




# multiple projects profile
def get_projects_profiles_tfidf(ids: str, tfidf_matrix: np.ndarray) -> scipy.sparse:

    """
    input: a series of project ids
    output: a matrix of projects tfidfs
    """
    # repeat for all projects

    profiles_list = [get_project_profile_tfidf(project_id, tfidf_matrix) for project_id in np.ravel([ids])]


    # stack them onto each other onto a list
    project_profiles = scipy.sparse.vstack(profiles_list)

    return project_profiles













# --------------donors------------------#

# one donor
def build_donor_profile_tfidf(donor_id: str, df_train_donor_index: pd.DataFrame, tfidf_matrix: np.ndarray = tfidf_matrix) -> np.ndarray:
    """
    input: id of one donor, training set
    output: a one rowed matrix of donor profile, shape: (1, tfidf_vocab_len) : array([[..., ..., ... ]])
    """

    donor_train_donations_series = df_train_donor_index.loc[donor_id]


    # get the vectors of projects this person has donated to
    donor_donated_project_intrain_profiles = get_projects_profiles_tfidf(donor_train_donations_series['Project ID'], tfidf_matrix)
    

    # get the smoothed donated amount as the weight of each project
    donor_project_strengths = np.array(donor_train_donations_series['Donation Amount']).reshape(-1, 1)


    # multiply the weights and tfidf vectors
    multiplication = donor_donated_project_intrain_profiles.multiply(donor_project_strengths)


    # now we normalize the whole vector 
    normalized_donor_preference = preprocessing.normalize(np.sum(multiplication, axis=0))


    return normalized_donor_preference



# multiple donors
def build_donors_profiles_tfidf(df_test_donor_index: pd.DataFrame, df_train_donor_index: pd.DataFrame, sample_size: int = None):
    
    """
    input: donors in test, the trainset, how many donors
    output: a dictionary of "donor_id": "tfidf_projects"

    """
    
    # now for all donors we build a profile in a dictionary
    donor_profiles = {}
    donors_in_test_set = df_test_donor_index.index.unique().tolist()[:sample_size]

    for donor_id in tqdm(donors_in_test_set[:sample_size], position=0, leave=True):
        donor_profiles[donor_id] = build_donor_profile_tfidf(donor_id, df_train_donor_index)

    return donor_profiles


In [193]:
#-------------------- Content Based------------------ #


class ContentBasedRecommender:
    MODEL_NAME = 'Content-Based-TFIDF'

    def __init__(self, donor_profiles, tfidf_matrix):

        self.donor_profiles = donor_profiles
        self.tfidf_matrix = tfidf_matrix

    def get_model_name(self):
        return self.MODEL_NAME

    def _get_similar_projects_to_donor_profile(self, donor_id:str, top_n=1000) -> dict:

        """
        gets the donor id and calculates the cosine similarity between donor profile
        and projects tfidf
        output: dictionary(idx, sim_score)
        """
        
        cosine_similarities = cosine_similarity(self.donor_profiles[donor_id], self.tfidf_matrix)

        # sort them and get the indices
        similar_indices = cosine_similarities.argsort()[::-1].flatten()[:top_n]
        

        # get the id of project and the score of it
        similar_projects = sorted([(projects_id[i], cosine_similarities[0, i]) for i in similar_indices], key=lambda x:x[1], reverse=True)
        

        return similar_projects




    def recommend_projects(self, donor_id: str, projects_to_ignore: list = [], top_n=10, df_test: pd.DataFrame = df_test):

        """
        gets the donor id 
        outputs a df and the list of ignoring projects
        """

        donated_projects_test = df_test_donor_index.loc[donor_id, 'Project ID']
        if type(donated_projects_test) == str:
            donated_projects_test  = [donated_projects_test]
        else:
            donated_projects_test  = list(donated_projects_test .values)
        
        similar_projects = self._get_similar_projects_to_donor_profile(donor_id, top_n)




        # remove projects from ignore list if it is also in the test set
        wanted_in_test = []

        for project_id in projects_to_ignore:
            if project_id in donated_projects_test:
                wanted_in_test.append(project_id)
        
        if wanted_in_test != []: projects_to_ignore = set(projects_to_ignore).difference(wanted_in_test)


        similar_projects_filtered = [x for x in similar_projects if x[0] not in projects_to_ignore]


        recommendations_df = pd.DataFrame(similar_projects_filtered, columns = ['Project ID', 'recommStrength']).head(top_n)


        recommendations_df = recommendations_df.merge(projects_df, how = 'left')[['recommStrength',  'Project ID', 'Project Title', 'Project Need Statement']]

        return recommendations_df, projects_to_ignore




In [194]:
donor_profiles_tfidf = build_donors_profiles_tfidf(df_test_donor_index, df_train_donor_index)
tfidf_model = ContentBasedRecommender(donor_profiles_tfidf, tfidf_matrix)

100%|██████████| 618/618 [00:03<00:00, 160.55it/s]


In [199]:
recommendations_df, projects_to_ignore = tfidf_model.recommend_projects(donor_id, top_n = 10, projects_to_ignore=df_train_donor_index.loc[donor_id, 'Project ID'].values)

In [200]:
recommendations_df

Unnamed: 0,recommStrength,Project ID,Project Title,Project Need Statement
0,0.03085,00be6e8f408fd6018383cde358d07106,We've Flipped for Fluency!,"My students need fluency cards, sight word sen..."
1,0.03051,a27906a7f0331bae1761315c6639e394,Providing Modern Learners with Modern Seating ...,My students need six Kore wobble chairs.
2,0.02936,b12a64a2e5405ae505ec0bbcc7c77c20,Clean Carpet Means Clean Children,My students need Bissell vacuum with replaceme...
3,0.02852,d653e5865f1a746479435f490771f5c0,Healthy Learning Starts in Pre-K,"My students need a Sun Shade Sail Kit, refrige..."
4,0.02833,579187b34d839335938ff50ce09935b1,"Dig, Dig, Dig! Is That A Shark Tooth?",My students need these screens to sift through...
5,0.02784,779509bbb388d299d7c2cf25c2fba1d0,Sun? Wind? Fuel? Which Energy Source to Choose?!,My students need 2 Renewable Energy Education ...
6,0.02734,aaa60c99e4622c3eeeb9fe6523b3746e,No More Bullies,My students need ten copies of Bullying in Sch...
7,0.02166,410f4b08685fd168921c606d1aa3102f,Pancake Social,"My students need a PancakeBot, an SD card, pan..."
8,0.01915,99680705a261f739f5b1ba58939b7df3,"On and Off the Field, We Can Go the Distance!","My students need team shirts for track team, f..."
9,0.01474,097be16aa2c6e9d49160f06828b1ba1f,Caught Being Good Once Again!,My students need Caught Being Good Prizes such...


## Embedding

In [160]:
# ==================== Embeddings ==================== #

# ----------------------- projects profiles------------------- #
def get_project_profile_emb(project_id: str, embeddings):

    # get the ids
    idx = projects_id.index(project_id)
    project_profile = embeddings[idx:idx+1]
    
    return project_profile



def get_projects_profiles_emb(ids: pd.Series, embeddings):


    profiles_list = [get_project_profile_emb(project_id, embeddings)[0] for project_id in np.ravel([ids])]
    project_profiles = np.vstack(profiles_list)

    return project_profiles




# ----------------------- Donors profiles------------------- #

def build_donors_profile_emb(donor_id: str, df_train_donor_index: pd.DataFrame):

    # get the id of each person and the projects they
    # donated to
    donations_donor_df = df_train_donor_index.loc[donor_id]


    # get the vectors of projects this person has donated to
    donor_donated_project_intrain_profiles = get_projects_profiles_emb(donations_donor_df['Project ID'], embeddings)


    # get the smoothed donated amount as the weight of each project
    donor_project_strengths = np.array(donations_donor_df['Donation Amount']).reshape(-1, 1)


    # multiply the weights and tfidf vectors
    multiplication = np.multiply(donor_donated_project_intrain_profiles, donor_project_strengths)


    # now we normalize the whole vector 
    normalized_donor_preference = preprocessing.normalize(np.sum(multiplication, axis=0).reshape(1, -1))
    


    return normalized_donor_preference


def build_donors_profiles_emb(df_test_donor_index: pd.DataFrame, df_train_donor_index: pd.DataFrame, sample_size: int = None):
    
    # now for all donors we build a profile in a dictionary
    donor_profiles = {}
    donors_in_test_set = df_test_donor_index.index.unique().tolist()[:sample_size]

    for donor_id in tqdm(donors_in_test_set[:sample_size], position=0, leave=True):
        donor_profiles[donor_id] = build_donors_profile_emb(donor_id, df_train_donor_index)

    return donor_profiles

    


In [161]:
# ----------------- embeddings---------------------#
model = SentenceTransformer('paraphrase-distilroberta-base-v1') # , device= 'cuda'
embeddings = model.encode(project_txt, batch_size=256, show_progress_bar=True)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [162]:
# --------------------- Embeddings----------------- #

class ContentBasedRecommenderWithEmbeddings:
    MODEL_NAME = 'Content-Based-Embeddings'


    """
    Recommending projects based on the embedding of the project description
    """
    
    def __init__(self, donor_profiles: dict, embeddings: np.ndarray):
        
        self.donor_profiles = donor_profiles
        self.embeddings = embeddings

    def get_model_name(self):
        return self.MODEL_NAME

    def _get_similar_projects_to_donor_profile(self, donor_id, top_n=1000):
        

        cosine_similarities = cosine_similarity(self.donor_profiles[donor_id], self.embeddings)


        # sort them and get the indices
        similar_indices = cosine_similarities.argsort()[::-1].flatten()[:top_n]
        

        # get the id of project and the score of it
        similar_projects = sorted([(projects_id[i], cosine_similarities[0, i]) for i in similar_indices], key=lambda x:x[1], reverse=True)
        
        
        return similar_projects


    def recommend_projects(self, donor_id: str, projects_to_ignore: list = [], top_n=10, df_test: pd.DataFrame = df_test):


        donated_projects_test = df_test_donor_index.loc[donor_id, 'Project ID']
        if type(donated_projects_test) == str:
            donated_projects_test  = [donated_projects_test]
        else:
            donated_projects_test  = list(donated_projects_test .values)
        


        similar_projects = self._get_similar_projects_to_donor_profile(donor_id, top_n)



        # remove projects from ignore list if it is also in the test set
        wanted_in_test = []

        for project_id in projects_to_ignore:
            if project_id in donated_projects_test:
                wanted_in_test.append(project_id)
        
        if wanted_in_test != []: projects_to_ignore = set(projects_to_ignore).difference(wanted_in_test)

        
        similar_projects_filtered = [x for x in similar_projects if x[0] not in projects_to_ignore]

        recommendations_df = pd.DataFrame(similar_projects_filtered, columns = ['Project ID', 'recommStrength']).head(top_n)

        recommendations_df = pd.merge(left=recommendations_df, right=projects_df, how = 'left')[['recommStrength',  'Project ID', 'Project Title', 'Project Need Statement']]

        return recommendations_df, projects_to_ignore


In [163]:
donor_profiles_emb = build_donors_profiles_emb(df_test_donor_index, df_train_donor_index)
emb_model = ContentBasedRecommenderWithEmbeddings(donor_profiles_emb, embeddings)

100%|██████████| 618/618 [00:00<00:00, 888.07it/s]


In [145]:
print(df['Project ID'].dtype)
print(projects_df['Project ID'].dtype)

object
object


In [146]:
recommendations_df, projects_to_ignore = emb_model.recommend_projects(donor_id, top_n = 10, projects_to_ignore=df_train_donor_index.loc[donor_id, 'Project ID'].values)

## SVD

In [28]:
# aggregating projects and donors in donor_project
df = df[df['Donor ID'].isin(df_main_donor_index.index)].reset_index(drop=True)
print(f"len(df) = {len(df)}")

donor_project_df = df.groupby(['Donor ID', 'Project ID'])['Donation Amount'].sum().reset_index()
donors_projects_df_pivoted = donor_project_df.pivot('Donor ID', 'Project ID', 'Donation Amount').fillna(0)

donors_projects = donors_projects_df_pivoted.values
donors_id = donors_projects_df_pivoted.index
projects_id = donors_projects_df_pivoted.columns

print(f'len(donors_id): {len(donors_id)}')
print(f'len(projects_id): {len(projects_id)}')

len(df) = 48546
len(donors_id): 795
len(projects_id): 860


In [31]:
U, sigma, Vt = svds(donors_projects, k=50)
sigma = np.diag(sigma)
all_donor_predicted_preference = np.dot(np.dot(U, sigma), Vt)
svg_predicted_preference_df = pd.DataFrame(all_donor_predicted_preference, columns=projects_id, index=donors_id)#.transpose()

assert svg_predicted_preference_df.shape[1] == len(projects_id)
assert svg_predicted_preference_df.shape[0] == len(donors_id)


In [187]:
class CFRecommender:
    

    """
Collaborate filtering using Singular Value Decomposition (SVD)
    """

    MODEL_NAME = 'Collaborative-Filtering'


    def __init__(self, svg_predicted_preference_df: pd.DataFrame, projects_df: pd.DataFrame):

        self.svg_predicted_preference_df = svg_predicted_preference_df
        self.projects_df = projects_df



    def get_model_name(self):
        return self.MODEL_NAME



    def recommend_projects(self, donor_id: str, projects_to_ignore : list=[], top_n=10, df_test: pd.DataFrame = df_test):
        
        """
        
        """
        donated_projects_test = df_test_donor_index.loc[donor_id, 'Project ID']


        # if they have only one donation
        if type(donated_projects_test) == str:
            donated_projects_test  = [donated_projects_test]
        else:
            # other cases
            donated_projects_test  = list(donated_projects_test .values)




        # remove projects from ignore list if it is also in the test set
        wanted_in_test = []

        for project_id in projects_to_ignore:
            if project_id in donated_projects_test:
                wanted_in_test.append(project_id)
        
        if wanted_in_test != []: projects_to_ignore = set(projects_to_ignore).difference(wanted_in_test)


        # get recommendations
        donor_predictions = self.svg_predicted_preference_df.loc[donor_id, :].reset_index().rename(columns={donor_id: 'recommStrength'})
        

        # making sure we have all projects we want and sorting them
        recommendations_df = donor_predictions[~donor_predictions['Project ID'].isin(projects_to_ignore)].sort_values(by='recommStrength', ascending=False)[:top_n]


        # merging with projects dataset to get the descriptions
        recommendations_df_with_description = pd.merge(left = recommendations_df, right = projects_df, how='left', on='Project ID')[['recommStrength',  'Project ID', 'Project Title', 'Project Need Statement']]
        

        return recommendations_df_with_description, projects_to_ignore


In [188]:
cf_model = CFRecommender(svg_predicted_preference_df, df)

## Hybrid

In [278]:
class HybridRecommender:

    MODEL_NAME = 'Hybrid'
    def __init__(self, models: list):
        self.models = models

    def get_model_name(self):
        return self.MODEL_NAME



    def recommend_projects(self, donor_id: str, top_n: int=None, projects_to_ignore: list=[], df_test: pd.DataFrame = df_test, agg: str='add') -> (pd.DataFrame, list):

        # ignore projects ---------------------------------------------------------------
        donated_projects_test = df_test_donor_index.loc[donor_id, 'Project ID']
        if type(donated_projects_test) == str:
            donated_projects_test  = [donated_projects_test]
        else:
            donated_projects_test  = list(donated_projects_test .values)



        # remove projects from ignore list if it is also in the test set
        wanted_in_test = []

        for project_id in projects_to_ignore:
            if project_id in donated_projects_test:
                wanted_in_test.append(project_id)
        
        if wanted_in_test != []: projects_to_ignore = set(projects_to_ignore).difference(wanted_in_test)

        # find the aggregated score--------------------------------------------------------
        recommendations_dfs = []
        strength_col_names = []

        for model in self.models:
            model_name_ = model.get_model_name()
            
            
            if  model_name_== 'Content-Based-Embeddings':
                recommendations_df, projects_to_ignore = model.recommend_projects(
                    donor_id, projects_to_ignore, top_n=None, df_test=df_test
                    )


            elif model_name_ == 'Content-Based-TFIDF':
                recommendations_df, projects_to_ignore = model.recommend_projects(
                    donor_id, projects_to_ignore, top_n=None, df_test=df_test
                    )

            elif model_name_ == 'Collaborative-Filtering':
                recommendations_df, projects_to_ignore = model.recommend_projects(
                    donor_id, projects_to_ignore, top_n=None, df_test=df_test
                    )

            recommendations_df.rename(columns={'recommStrength': model_name_ + '_strength'}, inplace=True)
            recommendations_dfs.append(recommendations_df.loc[:, :'Project ID'].set_index('Project ID'))


        # join them
        recommendations_df_total = pd.concat(recommendations_dfs, join='inner', axis=1).reset_index()
        
        # aggregate their results
        if agg=='multiply': default_val = 1
        else: default_val = 0
        hybrid_strength = pd.Series([default_val]* len(recommendations_df_total), dtype=np.float64)

        

        for col_name in recommendations_df_total.columns:
            if re.findall(r'.*(strength)$', col_name):
                
                if agg=='multiply':
                    hybrid_strength = hybrid_strength.multiply(recommendations_df_total[col_name])

                elif agg=='add':
                    hybrid_strength += recommendations_df_total[col_name]


        recommendations_df_total.loc[:, 'HybridStrength'] = hybrid_strength


        recommendations_df_total = recommendations_df_total.sort_values(by='HybridStrength', ascending=False).head(top_n)
        recommendations_df_total = pd.merge(left = recommendations_df_total, right=projects_df, on='Project ID', how='inner')


        return recommendations_df_total, projects_to_ignore

In [279]:
hybrid_model = HybridRecommender([tfidf_model, emb_model, cf_model])

In [280]:
donor_id =  '009d5fc7b87883ffad248db5150bf1fc'
recommendations_df, projects_to_ignore = hybrid_model.recommend_projects(donor_id, top_n = 10, projects_to_ignore=df_train_donor_index.loc[donor_id, 'Project ID'].values, agg='add')

In [281]:
recommendations_df

Unnamed: 0,Project ID,Content-Based-TFIDF_strength,Content-Based-Embeddings_strength,Collaborative-Filtering_strength,HybridStrength,Project Subject Subcategory Tree,Project Title,Project Need Statement,School State,project_txt
0,010732a68a6a6a7a40b6827355bd2a04,0.29234,0.67124,5.1742,6.13778,"Literacy, Performing Arts",Help Little Voices Be Heard! Mics For Our Show...,My students need wireless microphones for dram...,Texas,help little voices be heard mics for our shows...
1,d2926b1c30b4ccbaf4a104c7430ce10a,0.20532,0.55311,3.82444,4.58287,Music,I Want To Bang On the Drum All Day!,My students need 2 drums to start building our...,Indiana,i want to bang on the drum all day & my studen...
2,b82b4bc5891d416a23a401f1b054af4d,0.13855,0.6097,3.66031,4.40856,"Gym & Fitness, Health & Wellness","Fitness on Your Mark, Get Set, Go!","My students need a portable PA system, balls, ...",New York,"fitness on your mark, get set, go & my student..."
3,d3831ea2611d92ca62a91a55454085ce,0.23123,0.68145,3.30866,4.22134,Literacy,Books of Our Very Own!,"My students need books of their very own, such...",Texas,books of our very own & my students need books...
4,a312a129579bf0ca8244c6cacee03baa,0.19146,0.75437,2.56167,3.5075,"Music, Performing Arts",Orff We Go!,"My students need 6 alto xylophones, 3 soprano ...",Indiana,"orff we go & my students need alto xylophones,..."
5,004a152bbe8952ea5e9d5ef89c179933,0.14375,0.6696,2.41104,3.22439,"ESL, Visual Arts",Claymation Experimentation,My students need assorted colors of Sculpey II...,New York,claymation experimentation & my students need ...
6,ae053d8e69da55b026ba4881f2f972f1,0.20755,0.68432,2.10999,3.00187,Music,Keyboards To Start Music,My students need keyboards and headphones for ...,New York,keyboards to start music & my students need ke...
7,55bdd3141b8e8c811b3f72682446d9f8,0.19449,0.55723,2.21755,2.96927,"Music, Performing Arts",Uke Troupe! New Music Group! Whoop! Whoop!,My students need twenty-six Makala soprano uku...,Indiana,uke troupe new music group whoop whoop & my st...
8,c74530cc205daa04ff554ada50426de0,0.19715,0.62025,1.87866,2.69606,Music,Recorders Please!,My students need recorders to learn the basics...,Florida,recorders please & my students need recorders ...
9,24e28393a4f9c87f14bc91558743726d,0.22674,0.74609,1.60549,2.57832,"Literacy, Literature & Writing",Our Books Need a Home,My students need 2 bookcases to give their poe...,Texas,our books need a home & my students need bookc...


## Evaluator

In [275]:
#Top-N accuracy metrics 
EVAL_RANDOM_SAMPLE_NON_INTERACTED_PROJECTS = 100


class ModelEvaluator:

    def __init__(self, df_main_donor_index, projects_id):

        self.df_main_donor_index = df_main_donor_index
        self.df_test_donor_index = df_test_donor_index
        self.df_train_donor_index = df_train_donor_index
        self.projects_id = projects_id
        


    def get_not_donated_projects_sample(self, donor_id: str, sample_size: int, seed=42) -> set:
        
        """
        input: donor_id
        output: a set of not donated projects in df
        """
        
        donated_projects = get_projects_donated(donor_id=donor_id, df_donor_indexed=self.df_main_donor_index)

        not_donated_projects = [x for x in self.projects_id if x not in donated_projects]
        not_donated_projects_sample = random.sample(not_donated_projects, sample_size)
        
        return set(not_donated_projects_sample)



    def _verify_hit_top_n(self, project_id: str, recommended_projects: pd.Series, top_n) -> (bool, int):
        """ 
        input: one project id (a project our donor has donated to), a set of recommended projects
        output: the index of that project_id among all the recomms
        """
        try:
            index = next(i for i, c in enumerate(recommended_projects) if c == project_id)
        except:
            index = -1

        hit = int(index in range(0, top_n))

        return hit, index
        

    def evaluate_model_for_donor(self, model, donor_id: str):
        """
        evaluates the recommendations recommended to one donor

        """

        # what donor has donated to in the test set
        if type(self.df_test_donor_index.loc[donor_id, 'Project ID']) == pd.Series:
            donated_projects_test = set(self.df_test_donor_index.loc[donor_id, 'Project ID'])
        else:
            donated_projects_test = set([self.df_test_donor_index.loc[donor_id, 'Project ID']])

        donated_projects_count_test = len(donated_projects_test)

        # overlap--------------------------------------------------------
            
        df_train_d = df_train_donor_index.loc[donor_id, 'Project ID']
        df_test_d = df_test_donor_index.loc[donor_id, 'Project ID']

        if type(df_test_d) == str: 
            df_test_d = [df_test_d]
        else: df_test_d = list(df_test_d.values)


        if type(df_train_d) == str: 
            df_train_d = [df_train_d]
        else: df_train_d = list(df_train_d.values)

        overlap = 0
        for project_id in df_test_d:
            if project_id in df_train_d:
                overlap+=1
            else: continue

        # get the recommendations--------------------------------------------------------
        
        recommendations_df, projects_to_ignore = model.recommend_projects(donor_id, projects_to_ignore= get_projects_donated(donor_id, 
        df_donor_indexed = df_train_donor_index), top_n = None, df_test=df_test)

        # if correct we have ranked all the projects except the ones the donor have been only in training set of 
        # this specific donor
        assert len(recommendations_df) == len(projects_id) - len(projects_to_ignore)


        
        hits_at_3_count = 0
        hits_at_5_count = 0
        hits_at_10_count = 0

         
          
        for project_id in donated_projects_test:


            # first get a sample of the ones he\she has not donated to 
            not_donated_projects_sample =  self.get_not_donated_projects_sample(donor_id, sample_size = EVAL_RANDOM_SAMPLE_NON_INTERACTED_PROJECTS, seed = 42)
                    
                    
            # add a donated project to a list of 100 projects this donor has not interacted with
            validation_projects = not_donated_projects_sample.union(set([project_id]))


            # if true means we have correctly identified the projects they have not interacted with
            assert len(not_donated_projects_sample)+1 == len(validation_projects)





            recommendations_df_ =  recommendations_df[recommendations_df['Project ID'].isin(validation_projects)].reset_index(drop=True)
 

            recommended_project_ids = recommendations_df_['Project ID'].values
            assert len(recommended_project_ids) == 101 , print(len(recommended_project_ids))


            hit_at_3, index_at_3 = self._verify_hit_top_n(project_id, recommended_project_ids, 3)
            hits_at_3_count += hit_at_3

            hit_at_5, index_at_5 = self._verify_hit_top_n(project_id, recommended_project_ids, 5)
            hits_at_5_count += hit_at_5

            hit_at_10, index_at_10 = self._verify_hit_top_n(project_id, recommended_project_ids, 10)
            hits_at_10_count += hit_at_10



        # ---------------------------- Recall --------------------------#
        recall_at_3 = hits_at_3_count/float(donated_projects_count_test)
        recall_at_5 = hits_at_5_count / float(donated_projects_count_test)
        recall_at_10 = hits_at_10_count / float(donated_projects_count_test)


        donor_metrics = {'donor_id': donor_id,
                        'hits@3_count':hits_at_3_count, 
                         'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'donated_count': donated_projects_count_test,
                          'project_overlap_test_train_count': overlap,
                          'recall@3': recall_at_3,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return donor_metrics

    def evaluate_model(self, model):
        """
        aggregates the results of evaluate_model_for_donor
        """
        metrics = []

        for idx, donor_id in enumerate(list(self.df_test_donor_index.index.unique().values)):
            
            donor_metrics = self.evaluate_model_for_donor(model, donor_id)
            if idx%500 ==0: print('%d donors processed' % idx)

            metrics.append(donor_metrics)

            detailed_results_df = pd.DataFrame(metrics).sort_values('donated_count',  ascending=False).reset_index(drop=True)
            
        glob_num_donations = float(detailed_results_df['donated_count'].sum())

        global_recall_at_3 = detailed_results_df['hits@3_count'].sum()/ glob_num_donations
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum()/ glob_num_donations
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum()/ glob_num_donations

        global_metrics = {'modelName': model.get_model_name(),
                          'recall@3': global_recall_at_3,
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    

model_evaluator = ModelEvaluator(df_main_donor_index, projects_id)


In [276]:
print(colored('Hybrid: ', 'green'))
global_metrics, detailed_results_df = model_evaluator.evaluate_model(hybrid_model)


print('\nGlobal metrics:\n%s' % global_metrics)
detailed_results_df = detailed_results_df[['donor_id', 'donated_count', "hits@3_count", 'hits@5_count', 'hits@10_count', 'recall@3','recall@5','recall@10']]
detailed_results_df.head(10)

[32mHybrid: [0m
0 donors processed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: htt

500 donors processed

Global metrics:
{'modelName': 'Hybrid', 'recall@3': 0.6050611316462895, 'recall@5': 0.6724481091839636, 'recall@10': 0.752061415979528}


Unnamed: 0,donor_id,donated_count,hits@3_count,hits@5_count,hits@10_count,recall@3,recall@5,recall@10
0,237db43817f34988f9d543ca518be4ee,90,90,90,90,1.0,1.0,1.0
1,b51c76411b51751f45527c63c69ead9e,67,65,67,67,0.97015,1.0,1.0
2,4416745560343f14a74dedcda4ec03b0,60,58,58,58,0.96667,0.96667,0.96667
3,39df9399f5384334a42905bcf0acdcbf,57,57,57,57,1.0,1.0,1.0
4,c376c98b0cdb746cf025bb21ee810376,49,34,38,43,0.69388,0.77551,0.87755
5,a299db9679f7746a805fbc300362191d,44,19,24,36,0.43182,0.54545,0.81818
6,98c4cd327c417683cd76a2ac19fc6254,43,35,39,40,0.81395,0.90698,0.93023
7,03fa60275eb66e873c30ecb86840df4b,42,42,42,42,1.0,1.0,1.0
8,24ecca49933c30a0beb83090591720c0,41,41,41,41,1.0,1.0,1.0
9,609e28d99b36d35679ae56268e4dddc3,39,24,29,32,0.61538,0.74359,0.82051
