## Load libraries

In [1]:
# Set up test mode to save some time
test_mode = True

In [2]:
#enable auto complete
%config IPCompleter.greedy=True
%matplotlib inline

import pandas as pd # package for high-performance, easy-to-use data structures and data analysis
import numpy as np # fundamental package for scientific computing with Python
import matplotlib as cm
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
color = sns.color_palette()
import scipy
from scipy.sparse.linalg import svds
import math
import random
import sklearn

from numpy import array

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


#import os
#print(os.listdir("../input"))

from sklearn import preprocessing
# Supress unnecessary warnings so that presentation looks clean
import warnings
warnings.filterwarnings("ignore")

# Print all rows and columns
pd.set_option('display.max_columns', 21)
pd.set_option('display.max_rows', None)

## Load data

In [48]:
projects = pd.read_csv('Projects.csv')
#donations = pd.read_csv('Donations.csv')
#donors = pd.read_csv('Donors.csv', low_memory=False)

In [49]:
print(' donations: ',donations.shape,'\n','donors: ',donors.shape,'\n','projects',projects.shape)

 donations:  (4687884, 7) 
 donors:  (2122640, 6) 
 projects (1208651, 15)


In [50]:
#donors.head(2)
#donations.head(2)

In [51]:
#this piece of code converts Donor_ID which is a 32-bit Hex int digits 10-1010

# create column "donor_id" with sequential integers
b=len(donors)
donors['donor_id'] = np.nan
a = list(range(10,b+10))
a = pd.Series(a)
donors['donor_id'] = a.values


#add donor_id to donors
df = pd.merge(donations, donors, on='Donor ID', how='left')
print(' df: ',df.shape,'\n','donors: ',donors.shape)

 df:  (4687884, 12) 
 donors:  (2122640, 6)


In [52]:
#donations.head(1)

In [53]:
#check for donor_id duplicates
from collections import Counter
mylist = df.donor_id
a=[k for k,v in Counter(mylist).items() if v>1]
len(a)

0

In [54]:
#this piece of code converts Project_ID which is a 32-bit Hex int digits 10-1010

# create column "project_id" with sequential integers
f=len(projects)
projects['project_id'] = np.nan
g = list(range(10,f+10))
g = pd.Series(g)
projects['project_id'] = g.values

#add project_id to projects
df = pd.merge(df, projects, on='Project ID', how='left')

print(' df shape: ',df.shape,'\n','projects: ',projects.shape)

 df shape:  (4687887, 27) 
 projects:  (1208651, 16)


In [55]:
#donations.head(1)

In [56]:
#list(donations.columns.values)

In [57]:
#check for project_id duplicates
from collections import Counter
mylist = projects.project_id
a=[k for k,v in Counter(mylist).items() if v>1]
len(a)

0

In [58]:
#check for donor_id duplicates
from collections import Counter
mylist = df.donor_id
a=[k for k,v in Counter(mylist).items() if v>1]
len(a)

552673

### Set up test mode where only 10000 rows of donation/donor dataframe are used. 
When testing is complete we will need to turn off test mode.

In [59]:
# only load a few lines in test mode
if test_mode:
    df = df.head(10000)

donations_df = df
print('shape of df is ',df.shape)

shape of df is  (10000, 27)


In [60]:
print(' donations: ',donations.shape,'\n','donors: ',donors.shape,'\n', 'shape of df is ',df.shape)

 donations:  (4687884, 7) 
 donors:  (2122640, 6) 
 shape of df is  (10000, 27)


In [61]:
# Deal with missing values
donations["Donation Amount"] = donations["Donation Amount"].fillna(0)

# Define event strength as the donated amount to a certain project
donations_df['eventStrength'] = donations_df['Donation Amount']

def smooth_donor_preference(x):
    return math.log(1+x, 2)
    
donations_full_df = donations_df \
                    .groupby(['donor_id', 'project_id'])['eventStrength'].sum() \
                    .apply(smooth_donor_preference).reset_index()
        
# Update projects dataset
project_cols = projects.columns
projects = df[project_cols].drop_duplicates()

print('# of projects: %d' % len(projects))
print('# of unique user/project donations: %d' % len(donations_full_df))

# of projects: 1889
# of unique user/project donations: 8631


In [62]:
donations_full_df.head()

Unnamed: 0,donor_id,project_id,eventStrength
0,144.0,437585,5.672425
1,528.0,405327,4.70044
2,618.0,556052,4.70044
3,668.0,49138,3.087463
4,754.0,1045629,5.672425


# Evaluation

Evaluation is important for machine learning projects, because it allows to compare objectivelly different algorithms and hyperparameter choices for models.
One key aspect of evaluation is to ensure that the trained model generalizes for data it was not trained on, using Cross-validation techniques. We are using here a simple cross-validation approach named holdout, in which a random data sample (20% in this case) are kept aside in the training process, and exclusively used for evaluation. All evaluation metrics reported here are computed using the test set.

Ps. A more robust evaluation approach could be to split train and test sets by a reference date, where the train set is composed by all interactions before that date, and the test set are interactions after that date. For the sake of simplicity, we chose the first random approach for this notebook, but you may want to try the second approach to better simulate how the recsys would perform in production predicting "future" users interactions.

In [63]:
donations_train_df, donations_test_df = train_test_split(donations_full_df,
                                   test_size=0.20,
                                   random_state=42)

print('# donations on Train set: %d' % len(donations_train_df))
print('# donations on Test set: %d' % len(donations_test_df))

# donations on Train set: 6904
# donations on Test set: 1727


In [64]:
#Indexing by donor_id to speed up the searches during evaluation
donations_full_indexed_df = donations_full_df.set_index('donor_id')
donations_train_indexed_df = donations_train_df.set_index('donor_id')
donations_test_indexed_df = donations_test_df.set_index('donor_id')

person_id -> 'donor_id'
contentId -> project_id
articles_df -> donations_df
item_id -> 'project_id'
interactions_df -> donations_df
interactions -> donations
items -> projects
interacted -> donated`

In [65]:
def get_proj_donated(donor_id, donations_df):
    # Get the user's data and merge in project info
    donated_projects = donations_df.loc[donor_id]['project_id']
    return set(donated_projects if type(donated_projects) == pd.Series else [donated_projects])

In [67]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_PROJECTS = 100

class ModelEvaluator:


    def get_not_donated_projects_sample(self, donor_id, sample_size, seed=42):
        donated_projects = get_proj_donated(donor_id, donations_full_indexed_df)
        all_projects = set(projects_df['project_id'])
        non_donated_projects = all_projects - donated_projects

        random.seed(seed)
        non_donated_projects_sample = random.sample(non_donated_projects, sample_size)
        return set(non_donated_projects_sample)

    def _verify_hit_top_n(self, project_id, recommended_projects, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_projects) if c == project_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, donor_id):
        #Getting the projects in test set
        donated_values_testset = donations_test_indexed_df.loc[donor_id]
        if type(donated_values_testset['project_id']) == pd.Series:
            person_donated_projects_testset = set(donated_values_testset['project_id'])
        else:
            person_donated_projects_testset = set([int(donated_values_testset['project_id'])])  
        donated_projects_count_testset = len(person_donated_projects_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_projects(donor_id, 
                                               projects_to_ignore=get_proj_donated(donor_id, 
                                                                                    donations_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has donated in test set
        for project_id in person_donated_projects_testset:
            #Getting a random sample (100) projects the user has not donated 
            #(to represent projects that are assumed to be not relevant to the user)
            non_donated_projects_sample = self.get_not_donated_projects_sample(donor_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_PROJECTS, 
                                                                          seed=project_id%(2**32))

            #Combining the current donated item with the 100 random projects
            projects_to_filter_recs = non_donated_projects_sample.union(set(['project_id']))

            #Filtering only recommendations that are either the donated item or from a random sample of 100 non-donated projects
            valid_recs_df = person_recs_df[person_recs_df['project_id'].isin(projects_to_filter_recs)]                    
            valid_recs = valid_recs_df['project_id'].values
            #Verifying if the current donated item is among the Top-N recommended projects
            hit_at_5, index_at_5 = self._verify_hit_top_n(project_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(project_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the donated projects that are ranked among the Top-N recommended projects, 
        #when mixed with a set of non-relevant projects
        recall_at_5 = hits_at_5_count / float(donated_projects_count_testset)
        recall_at_10 = hits_at_10_count / float(donated_projects_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'donated_count': donated_projects_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, donor_id in enumerate(list(donations_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, donor_id)  
            person_metrics['_donor_id'] = donor_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('donated_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['donated_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['donated_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator() 

## Content-Based Filtering model
We will use Content-Based Filtering method to find projects that are similar to the project(s) that a donor has already donated to. We can calculate the similarity between projects based on data and/or text features extracted from the text data.


In [68]:
# Preprocessing of text data
textfeats = ["Project Title","Project Essay"]
for cols in textfeats:
    projects[cols] = projects[cols].astype(str) 
    projects[cols] = projects[cols].astype(str).fillna('') # FILL NA
    projects[cols] = projects[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
 
text = projects["Project Title"] + ' ' + projects["Project Essay"]
vectorizer = TfidfVectorizer(strip_accents='unicode',
                             analyzer='word',
                             lowercase=True, # Convert all uppercase to lowercase
                             stop_words='english', # Remove commonly found english words ('it', 'a', 'the') which do not typically contain much signal
                             max_df = 0.9, # Only consider words that appear in fewer than max_df percent of all documents
                             # max_features=5000 # Maximum features to be extracted                    
                            )                        
project_ids = projects['project_id'].tolist()
tfidf_matrix = vectorizer.fit_transform(text)
tfidf_feature_names = vectorizer.get_feature_names()
tfidf_matrix

<1889x12490 sparse matrix of type '<class 'numpy.float64'>'
	with 182757 stored elements in Compressed Sparse Row format>

To model the user profile, we take all the item profiles the user has interacted and average them. The average is weighted by the interaction strength, in other words, the articles the user has interacted the most (eg. liked or commented) will have a higher strength in the final user profile.

In [69]:
def get_project_profile(project_id):
    idx = project_ids.index(project_id)
    project_profile = tfidf_matrix[idx:idx+1]
    return project_profile

def get_project_profiles(ids):
    project_profiles_list = [get_project_profile(x) for x in np.ravel([ids])]
    project_profiles = scipy.sparse.vstack(project_profiles_list)
    return project_profiles

def build_donors_profile(donor_id, donations_indexed_df):
    donations_donor_df = donations_indexed_df.loc[donor_id]
    donor_project_profiles = get_project_profiles(donations_donor_df['project_id'])
    donor_project_strengths = np.array(donations_donor_df['eventStrength']).reshape(-1,1)
    #Weighted average of project profiles by the donations strength
    donor_project_strengths_weighted_avg = np.sum(donor_project_profiles.multiply(donor_project_strengths), axis=0) / (np.sum(donor_project_strengths)+1)
    donor_profile_norm = sklearn.preprocessing.normalize(donor_project_strengths_weighted_avg)
    return donor_profile_norm


def build_donors_profiles(): 
    donations_indexed_df = donations_full_df[donations_full_df['project_id'].isin(projects['project_id'])].set_index('donor_id')
    donor_profiles = {}
    for donor_id in donations_indexed_df.index.unique():
        donor_profiles[donor_id] = build_donors_profile(donor_id, donations_indexed_df)
    return donor_profiles

In [70]:
donor_profiles = build_donors_profiles()
print("# of donors with profiles: %d" % len(donor_profiles))

# of donors with profiles: 7998


In [71]:
donations_full_indexed_df.head(10)

Unnamed: 0_level_0,project_id,eventStrength
donor_id,Unnamed: 1_level_1,Unnamed: 2_level_1
144.0,437585,5.672425
528.0,405327,4.70044
618.0,556052,4.70044
668.0,49138,3.087463
754.0,1045629,5.672425
847.0,131068,7.838069
1568.0,122883,4.075533
1756.0,13838,4.70044
1861.0,122883,5.672425
2672.0,36921,4.70044


Get top 5 terms for 10 donors 

In [72]:
ind_donor=donations_full_indexed_df.index.values[0:10]
ind_donor

array([  144.,   528.,   618.,   668.,   754.,   847.,  1568.,  1756.,
        1861.,  2672.])

In [73]:
donor1 = ind_donor[0]


In [78]:
donor1_profile = pd.DataFrame(sorted(zip(tfidf_feature_names, 
                        donor_profiles[donor1].flatten().tolist()), 
                        key=lambda x: -x[1])[:5],
                        columns=['token1', 'relevance1'])

donor1_profile

Unnamed: 0,token1,relevance1
0,sets,0.313237
1,reading,0.28672
2,zoom,0.283351
3,levels,0.232165
4,books,0.213696


Examine the results

## Content-Based Recommender

In [76]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, projects=None):
        self.project_ids = project_ids
        self.projects = projects
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def _get_similar_projects_to_donor_profile(self, donor_id, topn=1000):
        #Computes the cosine similarity between the donor profile and all project profiles
        cosine_similarities = cosine_similarity(donor_profiles[donor_id], tfidf_matrix)
        #Gets the top similar projects
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar projects by similarity
        similar_projects = sorted([(project_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_projects
        
    def recommend_projects(self, donor_id, projects_to_ignore=[], topn=10, verbose=False):
        similar_projects = self._get_similar_projects_to_donor_profile(donor_id)
        #Ignores projects the donor has already donated
        similar_projects_filtered = list(filter(lambda x: x[0] not in projects_to_ignore, similar_projects))
        
        recommendations_df = pd.DataFrame(similar_projects_filtered, columns=['project_id', 'recStrength']).head(topn)

        recommendations_df = recommendations_df.merge(self.projects, how = 'left', 
                                                    left_on = 'project_id', 
                                                    right_on = 'project_id')[['recStrength', 'project_id', 'Project Title', 'Project Essay']]


        return recommendations_df

In [77]:
content_based_recommender_model = ContentBasedRecommender(projects)
content_based_recommender_model.recommend_projects(donor1)

Unnamed: 0,recStrength,project_id,Project Title,Project Essay
0,1.0,437585,help us zoom up through the reading levels!,i am working hard to advance my first graders'...
1,0.336841,278524,we want to read!,my 7th and 8th grade students come from povert...
2,0.329714,943492,reading about us,i have 27 third grade students who are ready t...
3,0.311516,549132,learning to read is fun with leveled books!,our school is part of a very diverse district ...
4,0.309028,444061,creating life-long readers,"in our classroom, we thrive to be the best we ..."
5,0.307533,76716,leveled books to help us read!,"have you ever been told you need to read, but ..."
6,0.305458,757562,we are in need of books,my students are amazing students for many reas...
7,0.304062,658457,help readers grow by growing their library!,i am a veteran teacher. i have taught in nyc p...
8,0.302696,489442,love for literacy,"as dr. seuss best said, ""the more that you rea..."
9,0.296526,291256,guided reading leveled library needed for vora...,"students love shopping for books, but finding ..."


In [79]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...


KeyError: 'the label [1577993.0] is not in the [index]'

# Collaborative Filtering model


## Create the donor-project matrix

Matrix Factorization

In [80]:
#Creating a sparse pivot table with donors in rows and projects in columns
donors_projects_pivot_matrix_df = donations_train_df.pivot(index='donor_id', 
                                                          columns='project_id', 
                                                          values='eventStrength').fillna(0)

donors_projects_pivot_matrix_df.head(3)

project_id,24,385,637,914,1200,1210,1434,1461,2461,2672,...,1200143,1200707,1202271,1203476,1204080,1204888,1205055,1206615,1206623,1206866
donor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
144.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
528.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
618.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
# Transform the donor-project dataframe into a matrix
donors_projects_pivot_matrix = donors_projects_pivot_matrix_df.as_matrix()
donors_projects_pivot_matrix[:3]

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [82]:
# Get donor_ids
donors_ids = list(donors_projects_pivot_matrix_df.index)
donors_ids[:10]

[144.0, 528.0, 618.0, 668.0, 754.0, 847.0, 1568.0, 1756.0, 2672.0, 2781.0]

In [83]:
# Print the first 5 rows of the donor-project matrix
donors_projects_pivot_matrix[:5]

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

Now we will use SVD to get latent factors. After the factorization, we will try to reconstruct the original matrix by multiplying its factors. The resulting matrix is not sparse any more. It is the generated predictions for projects the donor have not yet donated to, which we will exploit for recommendations.

In [84]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(donors_projects_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [85]:
U.shape

(6466, 15)

In [86]:
Vt.shape

(15, 1761)

In [87]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [88]:
# Reconstruct the matrix by multiplying its factors
all_donor_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_donor_predicted_ratings

array([[  4.52748253e-32,  -1.57704158e-37,  -1.02307235e-34, ...,
          4.91903406e-33,  -1.52321944e-33,  -6.47873295e-33],
       [  5.92150205e-33,  -6.14397860e-36,  -8.19244651e-33, ...,
          1.39192660e-33,   2.34648107e-34,  -1.85911054e-33],
       [ -6.47743686e-34,  -5.23997372e-37,  -4.38413923e-34, ...,
         -1.52563675e-34,   3.84387664e-34,   1.56757169e-34],
       ..., 
       [  1.95330627e-32,   1.88555943e-35,  -4.84669622e-33, ...,
         -2.60913163e-33,   6.22020966e-34,   2.72140629e-33],
       [  1.47569417e-32,   8.90682724e-36,   7.06191528e-33, ...,
          2.78038546e-33,  -7.05778906e-33,  -2.69452654e-33],
       [ -2.09751586e-33,   4.30327012e-37,  -6.14855116e-34, ...,
          7.89138553e-34,  -1.81173857e-33,  -4.94583365e-34]])

In [89]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_donor_predicted_ratings, 
                           columns = donors_projects_pivot_matrix_df.columns, 
                           index=donors_ids).transpose()
#cf_preds_df.head(10)
## Error: IOPub data rate exceeded.

In [90]:
len(cf_preds_df.columns)

6466

## Build the Collaborative Filtering Model

In [91]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, projects=None):
        self.cf_predictions_df = cf_predictions_df
        self.projects = projects
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_projects(self, donor_id, projects_to_ignore=[], topn=10):
        # Get and sort the donor's predictions
        sorted_donor_predictions = self.cf_predictions_df[donor_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={donor_id: 'recStrength'})

        # Recommend the highest predicted projects that the donor hasn't donated to
        recommendations_df = sorted_donor_predictions[~sorted_donor_predictions['project_id'].isin(projects_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)

 
        recommendations_df = recommendations_df.merge(self.projects, how = 'left', 
                                                          left_on = 'project_id', 
                                                          right_on = 'project_id')[['recStrength', 'project_id', 'Project Title', 'Project Essay']]


        return recommendations_df

In [92]:
cf_recommender_model = CFRecommender(cf_preds_df, projects)

In [93]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...


KeyError: 'the label [1577993.0] is not in the [index]'

In [94]:
cf_recommender_model.recommend_projects(donor1)

Unnamed: 0,recStrength,project_id,Project Title,Project Essay
0,2.8900990000000005e-17,270325,"may i have this dance, please?",the tango music begins. the students look at o...
1,2.4534820000000002e-17,577373,claymation experimentation,after seeing my students sewing a jabba the hu...
2,1.614747e-17,386868,document cameras for student centered learning!,has someone ever tried to explain a concept to...
3,1.5978840000000002e-17,261670,help us document our artistic growth!,the art room is a very busy place! it is full...
4,1.5433440000000002e-17,1000049,purposeful play on the playground,our students come from a title i school in jer...
5,1.506141e-17,129793,help! third graders need a pencil sharpener,having writing utensils is essential for stude...
6,1.3417740000000001e-17,288913,feed my brilliant bookworms!!!,hi there! do you want to help to instill a lif...
7,1.316624e-17,498208,astronomical astronomy,my classroom is a melting pot in a suburb of n...
8,1.257348e-17,347277,a calming classroom carpet,"""sometimes the questions are complicated and t..."
9,1.1611050000000001e-17,455330,tower gardens for our school,as service learning coordinators at our elemen...


In [95]:
cf_recommender_model.recommend_projects(donor2)

NameError: name 'donor2' is not defined

# Hybrid Method

In [96]:
class HybridRecommender:
    
    MODEL_NAME = 'Hybrid'
    
    def __init__(self, cb_rec_model, cf_rec_model, projects_df):
        self.cb_rec_model = cb_rec_model
        self.cf_rec_model = cf_rec_model
        self.projects_df = projects_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_projects(self, donor_id, projects_to_ignore=[], topn=10):
        #Getting the top-1000 Content-based filtering recommendations
        cb_recs_df = self.cb_rec_model.recommend_projects(donor_id, projects_to_ignore=projects_to_ignore, 
                                                           topn=1000).rename(columns={'recStrength': 'recStrengthCB'})
        
        #Getting the top-1000 Collaborative filtering recommendations
        cf_recs_df = self.cf_rec_model.recommend_projects(donor_id, projects_to_ignore=projects_to_ignore,  
                                                           topn=1000).rename(columns={'recStrength': 'recStrengthCF'})
        
        #Combining the results by project_id
        recs_df = cb_recs_df.merge(cf_recs_df,
                                   how = 'inner', 
                                   left_on = 'project_id', 
                                   right_on = 'project_id')
        
        #Computing a hybrid recommendation score based on CF and CB scores
        recs_df['recStrengthHybrid'] = recs_df['recStrengthCB'] * recs_df['recStrengthCF']
        
        #Sorting recommendations by hybrid score
        recommendations_df = recs_df.sort_values('recStrengthHybrid', ascending=False).head(topn)

        recommendations_df = recommendations_df.merge(self.projects_df, how = 'left', 
                                                    left_on = 'project_id', 
                                                    right_on = 'project_id')[['recStrengthHybrid', 
                                                                              'project_id', 'Project Title', 
                                                                              'Project Essay']]


        return recommendations_df
    
hybrid_recommender_model = HybridRecommender(content_based_recommender_model, cf_recommender_model, projects)

In [97]:
hybrid_recommender_model.recommend_projects(donor1)

Unnamed: 0,recStrengthHybrid,project_id,Project Title,Project Essay
0,2.801618e-18,288913,feed my brilliant bookworms!!!,hi there! do you want to help to instill a lif...
1,2.096616e-18,163851,jonesing for some good comprehension activities,i teach in an integrated co-teaching classroom...
2,1.651523e-18,577373,claymation experimentation,after seeing my students sewing a jabba the hu...
3,1.125072e-18,129793,help! third graders need a pencil sharpener,having writing utensils is essential for stude...
4,1.00965e-18,1000049,purposeful play on the playground,our students come from a title i school in jer...
5,9.099040999999999e-19,76716,leveled books to help us read!,"have you ever been told you need to read, but ..."
6,8.829033e-19,386868,document cameras for student centered learning!,has someone ever tried to explain a concept to...
7,7.977451999999999e-19,347277,a calming classroom carpet,"""sometimes the questions are complicated and t..."
8,5.147060999999999e-19,531848,best books of 2015,there's no such thing as a kid who hates readi...
9,4.429421e-19,50843,let's get organized!,help us learn to take responsibility for our m...


In [98]:
hybrid_recommender_model.recommend_projects(donor2)

NameError: name 'donor2' is not defined

In [99]:
print('Evaluating Hybrid model...')
hybrid_global_metrics, hybrid_detailed_results_df = model_evaluator.evaluate_model(hybrid_recommender_model)
print('\nGlobal metrics:\n%s' % hybrid_global_metrics)
hybrid_detailed_results_df.head(10)

Evaluating Hybrid model...


KeyError: 'the label [1577993.0] is not in the [index]'

# Comparing Methods

In [100]:
global_metrics_df = pd.DataFrame([pop_global_metrics, cf_global_metrics, cb_global_metrics, hybrid_global_metrics]) \
                        .set_index('modelName')
global_metrics_df

NameError: name 'pop_global_metrics' is not defined

In [101]:
%matplotlib inline
ax = global_metrics_df.transpose().plot(kind='bar', figsize=(15,8))
for p in ax.patches:
    ax.annotate("%.3f" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')

NameError: name 'global_metrics_df' is not defined

# Testing

In [102]:
def inspect_donations(donor_id, test_set=True):
    if test_set:
        donations_df = donations_test_indexed_df
    else:
        donations_df = donations_train_indexed_df
    return donations_df.loc[donor_id].merge(projects_df, how = 'left', 
                                                      left_on = 'project_id', 
                                                      right_on = 'project_id') \
                          .sort_values('eventStrength', ascending = False)[['eventStrength', 
                                                                          'project_id']]

In [103]:
inspect_donations(donor1, test_set=False).head(20)


AttributeError: 'Series' object has no attribute 'merge'

In [104]:
hybrid_recommender_model.recommend_projects(donor1, topn=20)

Unnamed: 0,recStrengthHybrid,project_id,Project Title,Project Essay
0,2.801618e-18,288913,feed my brilliant bookworms!!!,hi there! do you want to help to instill a lif...
1,2.096616e-18,163851,jonesing for some good comprehension activities,i teach in an integrated co-teaching classroom...
2,1.651523e-18,577373,claymation experimentation,after seeing my students sewing a jabba the hu...
3,1.125072e-18,129793,help! third graders need a pencil sharpener,having writing utensils is essential for stude...
4,1.00965e-18,1000049,purposeful play on the playground,our students come from a title i school in jer...
5,9.099040999999999e-19,76716,leveled books to help us read!,"have you ever been told you need to read, but ..."
6,8.829033e-19,386868,document cameras for student centered learning!,has someone ever tried to explain a concept to...
7,7.977451999999999e-19,347277,a calming classroom carpet,"""sometimes the questions are complicated and t..."
8,5.147060999999999e-19,531848,best books of 2015,there's no such thing as a kid who hates readi...
9,4.429421e-19,50843,let's get organized!,help us learn to take responsibility for our m...
