In [361]:
import pandas as pd
import warnings
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from collections import Counter
from scipy.sparse import csr_matrix
from pandas.api.types import is_numeric_dtype
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

In [362]:
books = pd.read_csv('Books.csv', header=0)
books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [363]:
books.shape

(271360, 8)

In [364]:
columns = ['ISBN','Book-Title','Book-Author','Year-Of-Publication','Publisher']
books = books[columns]
books.shape

(271360, 5)

In [365]:
books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial


In [366]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
dtypes: object(5)
memory usage: 10.4+ MB


In [367]:
books.nunique(dropna=True)

ISBN                   271360
Book-Title             242135
Book-Author            102023
Year-Of-Publication       202
Publisher               16807
dtype: int64

In [368]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
dtype: int64

In [369]:
books = books[books['Year-Of-Publication'] != '']
books = books[books['Book-Title'] != '']
books = books[books['Book-Author'] != '']
books = books[books['Publisher'] != '']
books = books[books['ISBN'] != '']

In [370]:
books = books[~books['Year-Of-Publication'].isna()]
books = books[~books['Book-Title'].isna()]
books = books[~books['Book-Author'].isna()]
books = books[~books['Publisher'].isna()]
books = books[~books['ISBN'].isna()]

In [371]:
books.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [372]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
dtype: int64

In [373]:
books.shape

(271357, 5)

In [374]:
info_df = pd.DataFrame(books.dtypes)
info_df['missing_val'] = books.isnull().sum()
info_df['missing_val_ratio'] = (info_df['missing_val'] / books.shape[0] * 100).round().astype(int)
info_df = info_df.rename(columns = {0:'data_type'})
info_df

Unnamed: 0,data_type,missing_val,missing_val_ratio
ISBN,object,0,0
Book-Title,object,0,0
Book-Author,object,0,0
Year-Of-Publication,object,0,0
Publisher,object,0,0


In [375]:
import re

def Year_Of_Publication(year):
    match = re.search(r'\d*', year)
    if match:
        if (match.group(len(match.groups())) < '2023') and (match.group(len(match.groups())) > '1800'):
            return match.group(len(match.groups()))
        return ''
    return ''

books['Year-Of-Publication'] = books['Year-Of-Publication'].apply(lambda x: Year_Of_Publication(str(x)))
books.dropna()
books = books[books['Year-Of-Publication'] != '']
books.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [376]:
books['Year-Of-Publication'].max()

'2021'

In [377]:
books['Year-Of-Publication'].min()

'1806'

In [378]:
books['Year-Of-Publication'].mean()

inf

In [379]:
books['Year-Of-Publication'].median()

1996.0

In [380]:
books['Year-Of-Publication'].describe()

count     266721
unique       107
top         2002
freq       17626
Name: Year-Of-Publication, dtype: object

In [381]:
books.shape

(266721, 5)

In [382]:
def C_ISBN(ISBN):
    match = re.search(r'\d*', ISBN)
    if match:
        return match.group(len(match.groups()))
    return ''

books['ISBN'] = books['ISBN'].apply(lambda x: C_ISBN(str(x)))
books.dropna()
books = books[books['ISBN'] != '']
books.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [383]:
books['ISBN'].max()

'9999999999'

In [384]:
books['ISBN'].min()

'0000913154'

In [385]:
books['ISBN'].describe()

count        266662
unique       266353
top       055321232
freq              2
Name: ISBN, dtype: object

In [386]:
books.shape

(266662, 5)

In [387]:
books = books.sample(1500)

In [388]:
ratings = pd.read_csv('Ratings.csv', header=0)
ratings.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [389]:
ratings.shape

(1149780, 3)

In [390]:
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [391]:
ratings = ratings[ratings['Book-Rating'] != '']
ratings = ratings[ratings['Book-Rating'] != 0]
ratings = ratings[ratings['ISBN'] != '']
ratings = ratings[ratings['User-ID'] != '']
ratings = ratings[ratings['Book-Rating'] != ' ']
ratings = ratings[ratings['ISBN'] != ' ']
ratings = ratings[ratings['User-ID'] != ' ']

In [392]:
ratings = ratings[~ratings['User-ID'].isna()]
ratings = ratings[~ratings['ISBN'].isna()]
ratings = ratings[~ratings['Book-Rating'].isna()]
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6


In [393]:
ratings.shape

(433671, 3)

In [394]:
ratings['ISBN'] = ratings['ISBN'].apply(lambda x: C_ISBN(str(x)))
ratings.dropna()
ratings = ratings[ratings['ISBN'] != '']
ratings.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,155061224,5
3,276729,52165615,3
4,276729,521795028,6
6,276736,3257224281,8
7,276737,600570967,6


In [395]:
ratings['User-ID'] = ratings['User-ID'].apply(lambda x: C_ISBN(str(x)))
ratings.dropna()
ratings = ratings[ratings['User-ID'] != '']
ratings.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,155061224,5
3,276729,52165615,3
4,276729,521795028,6
6,276736,3257224281,8
7,276737,600570967,6


In [396]:
ratings.shape

(433115, 3)

In [397]:
ratings['Book-Rating'].max()

10

In [398]:
ratings['Book-Rating'].min()

1

In [399]:
ratings['ISBN'].max()

'9999999999999'

In [400]:
ratings['ISBN'].min()

'0'

In [401]:
ratings['User-ID'].max()

'99998'

In [402]:
ratings['User-ID'].min()

'10'

In [403]:
info_df = pd.DataFrame(ratings.dtypes)
info_df['missing_val'] = ratings.isnull().sum()
info_df['missing_val_ratio'] = (info_df['missing_val'] / ratings.shape[0] * 100).round().astype(int)
info_df = info_df.rename(columns = {0:'data_type'})
info_df

Unnamed: 0,data_type,missing_val,missing_val_ratio
User-ID,object,0,0
ISBN,object,0,0
Book-Rating,int64,0,0


In [404]:
ids = books['ISBN'].values
ratings = ratings[ratings['ISBN'].isin(ids)]

In [405]:
ratings.shape

(2243, 3)

In [406]:
users = pd.read_csv('Users.csv', header=0)
users.head(5)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [407]:
users.shape

(278858, 3)

In [408]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [409]:
users['User-ID'] = users['User-ID'].apply(lambda x: C_ISBN(str(x)))
users.dropna()
users = users[users['User-ID'] != '']
users = users[users['Location'] != '']
users = users[~users['User-ID'].isna()]
users = users[~users['Location'].isna()]
users.head(5)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [410]:
users['User-ID'].max()

'99999'

In [411]:
users['User-ID'].min()

'1'

In [412]:
ids = ratings['User-ID'].values
users = users[users['User-ID'].isin(ids)]

In [413]:
users.shape

(1812, 3)

In [414]:
users.head()

Unnamed: 0,User-ID,Location,Age
253,254,"minneapolis, minnesota, usa",24.0
336,337,"málaga, n/a, spain",17.0
407,408,"mountain view, california, usa",17.0
439,440,"brookfield, wisconsin, usa",16.0
928,929,"king of prussia, ,",36.0


In [415]:
users.isnull().sum()

User-ID       0
Location      0
Age         545
dtype: int64

In [416]:
users = users[~users['Age'].isna()]
users = users[users['Age'] != '']

In [417]:
print(users.shape)
users.head()

(1267, 3)


Unnamed: 0,User-ID,Location,Age
253,254,"minneapolis, minnesota, usa",24.0
336,337,"málaga, n/a, spain",17.0
407,408,"mountain view, california, usa",17.0
439,440,"brookfield, wisconsin, usa",16.0
928,929,"king of prussia, ,",36.0


In [418]:
users['Age'].max()

209.0

In [419]:
users['Age'].min()

1.0

In [420]:
users['Age'].mean()

36.26835043409629

In [421]:
users['Age'].median()

34.0

In [422]:
users['Age'].describe()

count    1267.000000
mean       36.268350
std        14.926962
min         1.000000
25%        27.000000
50%        34.000000
75%        44.000000
max       209.000000
Name: Age, dtype: float64

In [423]:
def Age(age):
    if (age < 50) and (age > 14):
        return age
    return 0
users['Age'] = users['Age'].apply(lambda x: Age(x))
users = users[users['Age'] != 0]

In [424]:
print(users.shape)
users.head()

(1039, 3)


Unnamed: 0,User-ID,Location,Age
253,254,"minneapolis, minnesota, usa",24.0
336,337,"málaga, n/a, spain",17.0
407,408,"mountain view, california, usa",17.0
439,440,"brookfield, wisconsin, usa",16.0
928,929,"king of prussia, ,",36.0


In [425]:
users['Age'].max()

49.0

In [426]:
users['Age'].min()

15.0

In [427]:
users['Age'].mean()

32.1665062560154

In [428]:
users['Age'].median()

32.0

In [429]:
users['Age'].describe()

count    1039.000000
mean       32.166506
std         8.373863
min        15.000000
25%        26.000000
50%        32.000000
75%        38.000000
max        49.000000
Name: Age, dtype: float64

In [430]:
threshold_date = '2002'
ids = books[books['Year-Of-Publication'] < threshold_date]['ISBN'].values

training_data = ratings[ratings['ISBN'].isin(ids)]
print(f'Training data size: {training_data.shape}')
testing_data = ratings[~ratings['ISBN'].isin(ids)]
print(f'Testing data size: {testing_data.shape}')

Training data size: (1851, 3)
Testing data size: (392, 3)


In [431]:
training_data.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating
90,276798,3442131340,7
470,276925,840149236,5


In [432]:
def normalize(value, old_max, old_min, new_max=10, new_min=0.0):
    old_range = (old_max - old_min)  
    new_range = (new_max - new_min)  
    return (((value - old_min) * new_range) / old_range) + new_min

In [433]:
def get_favorite_movies(user_id, ratings_df):
    favorites = ratings_df[(ratings_df['User-ID'] == user_id) & (ratings_df['Book-Rating'] >= 0.5)].sort_values(by='Book-Rating', ascending=False)['ISBN']
    return set(favorites if type(favorites) == pd.Series else [favorites])

In [434]:
class ModelEvaluator:
    def __init__(self, training_data, testing_data, threshold=0.5):
        self.training_data = training_data
        self.testing_data = testing_data
        self.threshold = threshold
    def evaluate_model_for_user(self, model, user_id):
        favorites_in_test = get_favorite_movies(user_id, self.testing_data)
        person_recs_df = model.recommend_items(user_id,items_to_ignore=get_favorite_movies(user_id, self.training_data))
        person_recs_df = person_recs_df[person_recs_df['predicted_rating'] >= self.threshold].sort_values(by='predicted_rating', ascending=False)
        true_relevent = person_recs_df[person_recs_df['ISBN'].isin(favorites_in_test)].shape[0]        
        top_5_recommended = person_recs_df.head(5)
        top_10_recommended = person_recs_df.head(10)
        hits_at_5_count = top_5_recommended[top_5_recommended['ISBN'].isin(favorites_in_test)].shape[0]
        hits_at_10_count = top_10_recommended[top_10_recommended['ISBN'].isin(favorites_in_test)].shape[0]        
        precision_at_5 = hits_at_5_count / top_5_recommended.shape[0] if top_5_recommended.shape[0] != 0 else 1
        recall_at_5 = hits_at_5_count / true_relevent if true_relevent != 0 else 1
        precision_at_10 = hits_at_10_count / top_10_recommended.shape[0] if top_10_recommended.shape[0] != 0 else 1
        recall_at_10 = hits_at_10_count / true_relevent if true_relevent != 0 else 1
        person_metrics = {'hits@5_count':hits_at_5_count, 
                        'hits@10_count':hits_at_10_count,
                        'recommended@5_count':top_5_recommended.shape[0], 
                        'recommended@10_count':top_10_recommended.shape[0],  
                        'relevents': true_relevent,
                        'recall@5': recall_at_5,
                        'recall@10': recall_at_10,
                        'precision@5': precision_at_5,
                        'precision@10': precision_at_10}
        return person_recs_df,person_metrics
    def evaluate_model(self, model):
        users_metrics = []
        person_recs_df = pd.DataFrame()
        users_ids = list(set(self.testing_data['User-ID'].values))
        for idx, user_id in enumerate(users_ids):
            rec , metrics = self.evaluate_model_for_user(model, user_id)
            metrics['User-ID'] = user_id
            users_metrics.append(metrics)
            if(not person_recs_df.empty):
                person_recs_df.append(rec)
            else:
                person_recs_df = rec
        detailed_results_df = pd.DataFrame(users_metrics).sort_values('hits@5_count', ascending=False)
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['relevents'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['relevents'].sum())
        global_precision_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['recommended@5_count'].sum())
        global_precision_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['recommended@10_count'].sum())
        global_metrics = {'modelName': model.get_model_name(),
                        'recall@5': global_recall_at_5,
                        'recall@10': global_recall_at_10,
                        'precision@5': global_precision_at_5,
                        'precision@10': global_precision_at_10}    
        return global_metrics, detailed_results_df,person_recs_df
model_evaluator = ModelEvaluator(training_data, testing_data)              

# الطلب الأول

# popularity

In [435]:
populartiy = testing_data.groupby('ISBN').agg({'Book-Rating': ['mean', 'count']}).reset_index()
populartiy.columns = ['ISBN', 'ratings_mean', 'ratings_count']
print(populartiy.shape)
populartiy.sort_values(by='ratings_mean', ascending=False).head(5)

(129, 3)


Unnamed: 0,ISBN,ratings_mean,ratings_count
46,451458818,10.0,1
22,373691602,10.0,1
98,156931893,10.0,1
29,375414908,10.0,1
112,1931514054,10.0,1


In [436]:
class PopularityRecommender:
    MODEL_NAME = 'Popularity'
    def __init__(self, popularities_df):
        self.popularities_df = popularities_df
    def get_model_name(self):
        return self.MODEL_NAME
    def weighted_rating(self, x, m, C):
        v = x['ratings_count']
        R = x['ratings_mean']
        return (v/(v+m) * R) + (m/(m+v) * C)
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        C = self.popularities_df['ratings_mean'].mean()
        self.popularities_df['predicted_rating'] = self.popularities_df.apply(lambda x: self.weighted_rating(x, 5, C), axis=1)
        recommendations_df = self.popularities_df[~self.popularities_df['ISBN'].isin(items_to_ignore)].sort_values('predicted_rating', ascending = False).head(topn)
        return recommendations_df
popularity_model = PopularityRecommender(populartiy)

In [437]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df,recommend_items_popularity = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...

Global metrics:
{'modelName': 'Popularity', 'recall@5': 0.6875, 'recall@10': 1.0, 'precision@5': 0.00625, 'precision@10': 0.004545454545454545}


Unnamed: 0,hits@5_count,hits@10_count,recommended@5_count,recommended@10_count,relevents,recall@5,recall@10,precision@5,precision@10,User-ID
46,1,1,5,10,1,1.0,1.0,0.2,0.1,205314
152,1,2,5,10,2,0.5,1.0,0.2,0.2,98391
53,1,1,5,10,1,1.0,1.0,0.2,0.1,196148
258,1,1,5,10,1,1.0,1.0,0.2,0.1,169140
129,1,1,5,10,1,1.0,1.0,0.2,0.1,227447
111,1,1,5,10,1,1.0,1.0,0.2,0.1,125039
84,1,1,5,10,1,1.0,1.0,0.2,0.1,231354
142,1,1,5,10,1,1.0,1.0,0.2,0.1,31471
74,1,1,5,10,1,1.0,1.0,0.2,0.1,226006
316,1,1,5,10,1,1.0,1.0,0.2,0.1,254377


In [438]:
recommend_items_popularity = recommend_items_popularity.drop(columns=['ratings_mean','ratings_count'])

In [439]:
recommend_items_popularity.head()

Unnamed: 0,ISBN,predicted_rating
59,689860242,8.396
62,740733001,8.309714
86,843951818,8.240889
68,765307219,8.194667
79,786867213,8.194667


In [440]:
recommend_items_popularity.shape

(10, 2)

In [441]:
ids = popularity_model.recommend_items(10)['ISBN'].values
books[books['ISBN'].isin(ids)]['Book-Title'].values

array(['The Journal of Professor Abraham Van Helsing',
       'Guide to the Bible', 'Behemoth: B-Max',
       'Treasure Island (Treasury of Illustrated Classics)',
       'Dragonsinger (Mccaffrey, Anne. Harper Hall Trilogy, V. 2.)',
       'Juline, Book 5',
       'The Color Code : A Revolutionary Eating Plan for Optimum Health',
       'Seek My Face', 'The Get Fuzzy Experience', 'Blood Games'],
      dtype=object)

# الطلب الثاني

# based-content

In [442]:
expended_books_df = books.copy()
expended_books_df.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
255012,1895455529,Pizza (Original),Jean Par,1999,Company's Coming Publishing
30724,689714874,Stormy : Mistys Foal,Marguerite Henry,1991,Aladdin


In [443]:
count1 = CountVectorizer(stop_words='english')
count_matrix1 = count1.fit_transform(expended_books_df['Publisher'])
count_matrix1.shape

(1500, 824)

In [444]:
items_ids = expended_books_df['ISBN'].values.tolist()
users_ids = list(set(testing_data['User-ID'].values))
topn = 2000
threshold = 4
users_metrics = []
recommendations = pd.DataFrame()

In [445]:
def get_item_profile(item_id):
    idx = items_ids.index(item_id)
    return items_matrix[idx].toarray().reshape(-1)
def get_items_profiles(ids):
    items_profiles = np.array([get_item_profile(x) for x in ids])
    return items_profiles

In [446]:
def result(users_metrics,ModelName):
    detailed_results_df = pd.DataFrame(users_metrics).sort_values('hits@5_count', ascending=False)

    global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['relevents'].sum()) if float(detailed_results_df['relevents'].sum()) !=0 else 1
    global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['relevents'].sum()) if float(detailed_results_df['relevents'].sum()) !=0 else 1

    global_precision_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['recommended@5_count'].sum()) if float(detailed_results_df['recommended@5_count'].sum()) != 0 else 1
    global_precision_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['recommended@10_count'].sum()) if float(detailed_results_df['recommended@5_count'].sum()) != 0 else 1

    global_metrics = {'modelName': ModelName,
                    'recall@5': global_recall_at_5,
                    'recall@10': global_recall_at_10,
                    'precision@5': global_precision_at_5,
                    'precision@10': global_precision_at_10}
    return detailed_results_df,global_metrics

In [447]:
def Metrics(favorites_in_test,person_recs_df,threshold):
    person_recs_df = person_recs_df[person_recs_df['predicted_rating'] >= threshold].sort_values(by='predicted_rating', ascending=False)
    true_relevent = person_recs_df[person_recs_df['ISBN'].isin(favorites_in_test)].shape[0]
    top_5_recommended = person_recs_df.head(5)
    top_10_recommended = person_recs_df.head(10)
    hits_at_5_count = top_5_recommended[top_5_recommended['ISBN'].isin(favorites_in_test)].shape[0]
    hits_at_10_count = top_10_recommended[top_10_recommended['ISBN'].isin(favorites_in_test)].shape[0]
    precision_at_5 = hits_at_5_count / top_5_recommended.shape[0] if top_5_recommended.shape[0] != 0 else 1
    recall_at_5 = hits_at_5_count / true_relevent if true_relevent != 0 else 1
    precision_at_10 = hits_at_10_count / top_10_recommended.shape[0] if top_10_recommended.shape[0] != 0 else 1
    recall_at_10 = hits_at_10_count / true_relevent if true_relevent != 0 else 1
    person_metrics = {'hits@5_count':hits_at_5_count, 
                    'hits@10_count':hits_at_10_count,
                    'recommended@5_count':top_5_recommended.shape[0], 
                    'recommended@10_count':top_10_recommended.shape[0],  
                    'relevents': true_relevent,
                    'recall@5': recall_at_5,
                    'recall@10': recall_at_10,
                    'precision@5': precision_at_5,
                    'precision@10': precision_at_10}
    return person_metrics

In [448]:
def Content_based(items_ids,users_ids,items_matrix,topn,threshold,users_metrics,recommendations,ModelName):
    for idx, user_id in enumerate(users_ids):
        items_to_ignore=get_favorite_movies(user_id, training_data)
        favorites_in_test = get_favorite_movies(user_id, testing_data)
        user_df = training_data[training_data['User-ID'] == user_id]
        if not (user_df.empty) :
            user_items_profiles = get_items_profiles(user_df['ISBN'].values)
            user_items_ratings = np.array(user_df['Book-Rating'].values).reshape(-1,1)
            if sum(user_items_ratings)!=0:
                user_profile = np.sum(np.multiply(user_items_profiles, user_items_ratings), axis=0) / np.sum(user_items_ratings)
                user_profile = user_profile.reshape(1, -1)
                cosine_similarities = cosine_similarity(user_profile, items_matrix.toarray())
                similar_indices = cosine_similarities.argsort().flatten()[-1000:]
                similar_items = sorted([(int(items_ids[i]), cosine_similarities[0,i]) for i in similar_indices[0:len(items_ids)-5]], key=lambda x: -x[0])
                similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))        
                recommendations_df = pd.DataFrame(similar_items_filtered, columns=['ISBN', 'predicted_rating']).head(topn)
                recommendations_df['predicted_rating'] = recommendations_df['predicted_rating'].apply(lambda x: normalize(x, 1.0, 0.0))
                recommendations = recommendations_df
                person_recs_df = recommendations_df
                metrics = Metrics(favorites_in_test,person_recs_df,threshold)
                metrics['User-ID'] = user_id
                users_metrics.append(metrics)
    detailed_results_df,global_metrics = result(users_metrics,ModelName)
    return recommendations,detailed_results_df,global_metrics

In [449]:
items_matrix = count_matrix1 
recommendations_Publisher_based,detailed_results_Publisher_based,global_metrics_Publisher_based = Content_based(items_ids,users_ids,items_matrix,topn,threshold,users_metrics,recommendations,"Publisher_based")
print('Evaluating Content-Based Filtering model number 1 (Publisher)...')
print('\nGlobal metrics:\n%s' % global_metrics_Publisher_based)
detailed_results_Publisher_based.head(10)

Evaluating Content-Based Filtering model number 1 (Publisher)...

Global metrics:
{'modelName': 'Publisher_based', 'recall@5': 0.4, 'recall@10': 0.4, 'precision@5': 0.010443864229765013, 'precision@10': 0.00554016620498615}


Unnamed: 0,hits@5_count,hits@10_count,recommended@5_count,recommended@10_count,relevents,recall@5,recall@10,precision@5,precision@10,User-ID
38,2,2,5,6,2,1.0,1.0,0.4,0.333333,189835
31,1,1,5,10,1,1.0,1.0,0.2,0.1,98391
39,1,1,5,10,1,1.0,1.0,0.2,0.1,169663
50,0,0,5,10,0,1.0,1.0,0.0,0.0,16795
57,0,0,5,10,0,1.0,1.0,0.0,0.0,28634
56,0,0,5,6,0,1.0,1.0,0.0,0.0,222488
55,0,0,5,10,0,1.0,1.0,0.0,0.0,144531
54,0,0,5,10,0,1.0,1.0,0.0,0.0,31556
53,0,0,5,5,0,1.0,1.0,0.0,0.0,158627
52,0,0,5,10,0,1.0,1.0,0.0,0.0,204864


In [450]:
recommendations_Publisher_based[recommendations_Publisher_based['predicted_rating']>0].head()

Unnamed: 0,ISBN,predicted_rating
11,8484034119,3.535534
28,8170592461,4.082483
64,2890212203,5.0
79,1930051220,5.0
80,1902825454,5.0


In [451]:
count2 = CountVectorizer(stop_words='english')
count_matrix2 = count2.fit_transform(expended_books_df['Book-Author'])
count_matrix2.shape

(1500, 1897)

In [452]:
items_matrix = count_matrix2 
recommendations_BookAuthor_based,detailed_results_BookAuthor_based,global_metrics_BookAuthor_based = Content_based(items_ids,users_ids,items_matrix,topn,threshold,users_metrics,recommendations,"BookAuthor_based")
print('Evaluating Content-Based Filtering model number 2 (BookAuthor)...')
print('\nGlobal metrics:\n%s' % global_metrics_BookAuthor_based)
detailed_results_BookAuthor_based.head(10)

Evaluating Content-Based Filtering model number 2 (BookAuthor)...

Global metrics:
{'modelName': 'BookAuthor_based', 'recall@5': 0.45454545454545453, 'recall@10': 0.45454545454545453, 'precision@5': 0.007352941176470588, 'precision@10': 0.00429553264604811}


Unnamed: 0,hits@5_count,hits@10_count,recommended@5_count,recommended@10_count,relevents,recall@5,recall@10,precision@5,precision@10,User-ID
38,2,2,5,6,2,1.0,1.0,0.4,0.333333,189835
39,1,1,5,10,1,1.0,1.0,0.2,0.1,169663
118,1,1,4,4,1,1.0,1.0,0.25,0.25,189835
31,1,1,5,10,1,1.0,1.0,0.2,0.1,98391
0,0,0,5,10,0,1.0,1.0,0.0,0.0,56554
111,0,0,3,3,0,1.0,1.0,0.0,0.0,98391
105,0,0,5,6,0,1.0,1.0,0.0,0.0,224430
106,0,0,5,10,0,1.0,1.0,0.0,0.0,37712
107,0,0,5,10,0,1.0,1.0,0.0,0.0,118135
108,0,0,5,8,0,1.0,1.0,0.0,0.0,39345


In [453]:
recommendations_BookAuthor_based[recommendations_BookAuthor_based['predicted_rating']>0].head()

Unnamed: 0,ISBN,predicted_rating
287,821735144,10.0
737,373709080,4.082483
975,34543319,4.082483


In [454]:
count3 = CountVectorizer(stop_words='english')
count_matrix3 = count3.fit_transform(expended_books_df['Book-Title'])
count_matrix3.shape

(1500, 3543)

In [455]:
items_matrix = count_matrix3 
recommendations_BookTitle_based,detailed_results_BookTitle_based,global_metrics_BookTitle_based = Content_based(items_ids,users_ids,items_matrix,topn,threshold,users_metrics,recommendations,"BookTitle_based")
print('Evaluating Content-Based Filtering model number 3 (BookTitle)...')
print('\nGlobal metrics:\n%s' % global_metrics_BookTitle_based)
detailed_results_BookTitle_based.head(10)

Evaluating Content-Based Filtering model number 3 (BookTitle)...

Global metrics:
{'modelName': 'BookTitle_based', 'recall@5': 0.5, 'recall@10': 0.5, 'precision@5': 0.0069124423963133645, 'precision@10': 0.004398826979472141}


Unnamed: 0,hits@5_count,hits@10_count,recommended@5_count,recommended@10_count,relevents,recall@5,recall@10,precision@5,precision@10,User-ID
38,2,2,5,6,2,1.0,1.0,0.4,0.333333,189835
31,1,1,5,10,1,1.0,1.0,0.2,0.1,98391
165,1,1,3,3,1,1.0,1.0,0.333333,0.333333,267372
118,1,1,4,4,1,1.0,1.0,0.25,0.25,189835
39,1,1,5,10,1,1.0,1.0,0.2,0.1,169663
154,0,0,5,9,0,1.0,1.0,0.0,0.0,265889
155,0,0,5,10,0,1.0,1.0,0.0,0.0,88677
156,0,0,3,3,0,1.0,1.0,0.0,0.0,25409
157,0,0,5,10,0,1.0,1.0,0.0,0.0,229551
158,0,0,5,9,0,1.0,1.0,0.0,0.0,53220


In [456]:
recommendations_BookTitle_based[recommendations_BookTitle_based['predicted_rating']>0].head()

Unnamed: 0,ISBN,predicted_rating
140,1569714827,3.535534
251,843951818,5.0
289,821735144,10.0
426,671850245,4.082483
452,671041177,3.922323


### Integration (Publisher + Book-Author)

#### Method 1

#### وهي أسوء شي  BookTitle طلعت نفس نتائج ال

In [457]:
intg = expended_books_df['Publisher'].tolist() + expended_books_df['Book-Author'].tolist()

In [458]:
count4 = CountVectorizer(stop_words='english')
count_matrix4 = count4.fit_transform(intg)
count_matrix4.shape

(3000, 2616)

In [459]:
for i in range(int(count_matrix4.shape[0]/2)):
    count_matrix4[i] = count_matrix4[i] + count_matrix4[1500+i]

In [460]:
count_matrix4 = count_matrix4[:1500]

In [461]:
count_matrix4.shape

(1500, 2616)

In [462]:
items_matrix = count_matrix4
recommendations_BookTitle_based,detailed_results_BookTitle_based,global_metrics_PublisherBookAuthor_based = Content_based(items_ids,users_ids,items_matrix,topn,threshold,users_metrics,recommendations,"PublisherBook-Author")
print('Evaluating Content-Based Filtering model number 3 (Publisher + Book-Author)...')
print('\nGlobal metrics:\n%s' % global_metrics_BookTitle_based)
detailed_results_BookTitle_based.head(10)

Evaluating Content-Based Filtering model number 3 (Publisher + Book-Author)...

Global metrics:
{'modelName': 'BookTitle_based', 'recall@5': 0.5, 'recall@10': 0.5, 'precision@5': 0.0069124423963133645, 'precision@10': 0.004398826979472141}


Unnamed: 0,hits@5_count,hits@10_count,recommended@5_count,recommended@10_count,relevents,recall@5,recall@10,precision@5,precision@10,User-ID
38,2,2,5,6,2,1.0,1.0,0.4,0.333333,189835
31,1,1,5,10,1,1.0,1.0,0.2,0.1,98391
165,1,1,3,3,1,1.0,1.0,0.333333,0.333333,267372
39,1,1,5,10,1,1.0,1.0,0.2,0.1,169663
118,1,1,4,4,1,1.0,1.0,0.25,0.25,189835
0,0,0,5,10,0,1.0,1.0,0.0,0.0,56554
217,0,0,2,2,0,1.0,1.0,0.0,0.0,28634
216,0,0,1,1,0,1.0,1.0,0.0,0.0,222488
215,0,0,1,1,0,1.0,1.0,0.0,0.0,144531
214,0,0,1,1,0,1.0,1.0,0.0,0.0,31556


In [463]:
recommendations_BookTitle_based[recommendations_BookTitle_based['predicted_rating']>0].head()

Unnamed: 0,ISBN,predicted_rating
11,8484034119,1.825742
30,8170592461,2.0
65,2890212203,2.236068
79,1931040168,2.0
80,1930051220,2.236068


#### Method 2

#### كمان نفس اللي قبلا ما تغير شي

In [464]:
count5 = CountVectorizer(stop_words='english')
count_matrix5 = count4.fit_transform(expended_books_df['Publisher'] +" "+expended_books_df['Book-Author'])
count_matrix5.shape

(1500, 2616)

In [465]:
items_matrix = count_matrix5
recommendations_BookTitle_based,detailed_results_BookTitle_based,global_metrics_PublisherBookAuthor_based = Content_based(items_ids,users_ids,items_matrix,topn,threshold,users_metrics,recommendations,"PublisherBook-Author")
print('Evaluating Content-Based Filtering model number 3 (Publisher + Book-Author)...')
print('\nGlobal metrics:\n%s' % global_metrics_BookTitle_based)
detailed_results_BookTitle_based.head(10)

Evaluating Content-Based Filtering model number 3 (Publisher + Book-Author)...

Global metrics:
{'modelName': 'BookTitle_based', 'recall@5': 0.5, 'recall@10': 0.5, 'precision@5': 0.0069124423963133645, 'precision@10': 0.004398826979472141}


Unnamed: 0,hits@5_count,hits@10_count,recommended@5_count,recommended@10_count,relevents,recall@5,recall@10,precision@5,precision@10,User-ID
38,2,2,5,6,2,1.0,1.0,0.4,0.333333,189835
165,1,1,3,3,1,1.0,1.0,0.333333,0.333333,267372
39,1,1,5,10,1,1.0,1.0,0.2,0.1,169663
31,1,1,5,10,1,1.0,1.0,0.2,0.1,98391
118,1,1,4,4,1,1.0,1.0,0.25,0.25,189835
272,0,0,5,10,0,1.0,1.0,0.0,0.0,83235
271,0,0,2,2,0,1.0,1.0,0.0,0.0,98391
270,0,0,5,6,0,1.0,1.0,0.0,0.0,166391
269,0,0,5,9,0,1.0,1.0,0.0,0.0,75819
268,0,0,5,10,0,1.0,1.0,0.0,0.0,39345


In [466]:
recommendations_BookTitle_based[recommendations_BookTitle_based['predicted_rating']>0].head()

Unnamed: 0,ISBN,predicted_rating
11,8484034119,1.825742
30,8170592461,2.0
65,2890212203,2.236068
79,1931040168,2.0
80,1930051220,2.236068


# الطلب الثالث

# Collaborative Filtering model

In [467]:
users_items_pivot_matrix_df = ratings.pivot(index='User-ID', columns='ISBN', values='Book-Rating').fillna(0)
users_items_pivot_matrix_df.head(10)

ISBN,0002187272,0002557398,000255755,0006380921,0007107110,0020199600,0020295456,0020426402,0027627403,0028614518,...,8486542480,8495501759,8526708066,880611784,8817844810,9023679245,907433699,9501700194,9681500555,9681501233
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10030,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0
101583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [468]:
users_items_pivot_matrix = users_items_pivot_matrix_df.values
users_items_pivot_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [469]:
from scipy.sparse.linalg import svds
NUMBER_OF_FACTORS_MF = 15
U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [470]:
U.shape

(1812, 15)

In [471]:
Vt.shape

(15, 840)

In [472]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

The resulting matrix is not sparse any more. It was generated predictions for items the user have not yet interaction, which we will exploit for recommendations:

In [473]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[-2.56678426e-17,  3.31200635e-18, -2.08154126e-17, ...,
         1.82386321e-17, -9.92618669e-17,  3.19348290e-17],
       [-1.83629038e-18, -5.28242549e-18, -1.07635910e-18, ...,
        -1.60465927e-18, -1.26522238e-17, -6.36226906e-18],
       [-6.61569212e-17, -1.18063921e-16, -3.34184742e-17, ...,
        -7.26287308e-17, -4.28185277e-17,  1.31543319e-18],
       ...,
       [-1.10925717e-17, -2.34379431e-17, -4.76307971e-18, ...,
        -2.03242355e-17,  1.85902966e-16,  1.48430115e-17],
       [-2.37767926e-34, -3.69291774e-34, -2.87483072e-34, ...,
        -7.74360552e-34,  6.70226456e-34,  2.01257768e-33],
       [-2.91233389e-33,  2.19563332e-33, -1.20165081e-33, ...,
         3.38973968e-33,  1.16821994e-32,  2.99792302e-33]])

In [474]:
preds_df = pd.DataFrame(all_user_predicted_ratings, columns=users_items_pivot_matrix_df.columns, index=users_items_pivot_matrix_df.index)

In [475]:
preds_df.head(10)

ISBN,0002187272,0002557398,000255755,0006380921,0007107110,0020199600,0020295456,0020426402,0027627403,0028614518,...,8486542480,8495501759,8526708066,880611784,8817844810,9023679245,907433699,9501700194,9681500555,9681501233
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100088,-2.5667840000000003e-17,3.312006e-18,-2.0815410000000002e-17,5.258457e-05,-3.611404e-17,-0.0335176,8.961986e-18,4.197612e-17,-4.681192e-05,0.0009080498,...,9.910352e-18,6.819094000000001e-17,-2.856224e-17,1.2688130000000002e-17,-0.001053513,-1.8776190000000002e-17,4.680265e-17,1.8238630000000002e-17,-9.926187e-17,3.193483e-17
100164,-1.83629e-18,-5.282425e-18,-1.076359e-18,-1.424513e-05,-6.268761999999999e-19,-0.001531011,2.130995e-18,7.16235e-18,-1.168397e-05,-2.187099e-05,...,-1.283967e-18,-9.468149e-19,-4.761991e-18,8.746997e-19,0.05586996,-1.16238e-18,1.885838e-18,-1.604659e-18,-1.265222e-17,-6.362269e-18
10030,-6.615692000000001e-17,-1.180639e-16,-3.341847e-17,-0.0008333093,1.078266e-16,7.804661,-5.746362e-18,-4.1424980000000006e-17,1.946119e-05,-4.111364e-05,...,-4.18886e-17,-4.5328850000000005e-17,2.500728e-17,1.541214e-18,-0.003564054,7.347417e-17,-3.110103e-17,-7.262873e-17,-4.2818530000000004e-17,1.315433e-18
100393,-2.023775e-18,-6.972997e-18,-1.048904e-18,-1.875841e-05,-1.385398e-18,-0.007655479,3.163208e-18,1.0825810000000001e-17,-1.750645e-05,-2.831639e-05,...,-1.61987e-18,-4.853589999999999e-19,-7.07103e-18,1.198214e-18,0.07345118,-2.098941e-18,3.071185e-18,-2.026896e-18,-1.412896e-17,-9.438949e-18
100545,1.2571030000000001e-33,-1.298163e-33,1.3843020000000001e-33,1.9167259999999998e-20,-5.354195e-33,-6.484836000000001e-17,2.8075340000000002e-33,1.056815e-32,-7.614192e-21,-5.294754e-21,...,2.144725e-34,-4.214023e-33,-6.649253e-33,1.049403e-33,2.3274370000000002e-17,-5.9491630000000004e-33,1.372629e-33,8.395441e-34,-2.9541370000000003e-33,-9.717383e-33
100846,2.6220540000000002e-18,-5.5840790000000004e-18,2.173418e-18,5.084888e-05,-5.657296e-19,-0.004718021,3.851871e-18,1.2496390000000001e-17,-1.97217e-05,-4.049952e-05,...,-1.908647e-18,-4.676432999999999e-19,-8.275494e-18,1.558658e-18,0.09655566,-1.271164e-18,3.86352e-18,-2.262572e-18,-2.0802130000000002e-17,-1.7508810000000002e-17
100906,1.847967e-16,6.490600000000001e-17,1.259953e-16,-0.0007397503,1.5566970000000003e-17,0.003171215,1.4540250000000002e-17,5.3291310000000004e-17,-0.0001861027,-0.0002238301,...,1.8474830000000002e-18,-6.625063e-18,-3.229798e-17,5.674223e-18,0.2080749,1.900885e-17,7.202515000000001e-17,1.97433e-18,-8.077209000000001e-17,-3.889787e-16
101081,-2.0534270000000003e-17,2.649605e-18,-1.6652330000000003e-17,4.206766e-05,-2.889123e-17,-0.02681408,7.169589e-18,3.35809e-17,-3.744954e-05,0.0007264399,...,7.928282e-18,5.4552750000000004e-17,-2.2849790000000002e-17,1.0150500000000002e-17,-0.0008428107,-1.502095e-17,3.7442120000000005e-17,1.4590910000000003e-17,-7.940949000000001e-17,2.5547860000000004e-17
101550,5.949633e-33,8.148445e-33,3.9554480000000005e-33,5.097257e-21,-4.342972e-33,-7.262873e-17,6.914108e-35,6.2641639999999996e-34,-1.302869e-21,2.532686e-21,...,2.219651e-33,2.629077e-33,-2.661296e-34,3.5176399999999997e-34,-3.850297e-18,-2.2022490000000003e-33,2.6425960000000002e-33,3.638519e-33,4.9898000000000004e-33,-4.414393e-33
101583,-1.196454e-32,-1.6759e-32,-7.94156e-33,-6.669934e-21,9.067256e-33,1.45368e-16,-8.557307e-35,-1.1674710000000001e-33,2.502316e-21,-5.263623e-21,...,-4.6282070000000005e-33,-5.417096e-33,4.586657e-34,-6.786055e-34,8.880532e-18,4.6319060000000005e-33,-5.45588e-33,-7.544649e-33,-1.0412150000000001e-32,8.66297e-33


In [476]:
preds_df = preds_df.apply(lambda x: normalize(x, all_user_predicted_ratings.max(), all_user_predicted_ratings.min()))
preds_df.head(10)

ISBN,0002187272,0002557398,000255755,0006380921,0007107110,0020199600,0020295456,0020426402,0027627403,0028614518,...,8486542480,8495501759,8526708066,880611784,8817844810,9023679245,907433699,9501700194,9681500555,9681501233
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100088,0.651334,0.651334,0.651334,0.651372,0.651334,0.627281,0.651334,0.651334,0.6513,0.651986,...,0.651334,0.651334,0.651334,0.651334,0.650578,0.651334,0.651334,0.651334,0.651334,0.651334
100164,0.651334,0.651334,0.651334,0.651324,0.651334,0.650235,0.651334,0.651334,0.651326,0.651318,...,0.651334,0.651334,0.651334,0.651334,0.691427,0.651334,0.651334,0.651334,0.651334,0.651334
10030,0.651334,0.651334,0.651334,0.650736,0.651334,6.252052,0.651334,0.651334,0.651348,0.651305,...,0.651334,0.651334,0.651334,0.651334,0.648776,0.651334,0.651334,0.651334,0.651334,0.651334
100393,0.651334,0.651334,0.651334,0.651321,0.651334,0.64584,0.651334,0.651334,0.651321,0.651314,...,0.651334,0.651334,0.651334,0.651334,0.704043,0.651334,0.651334,0.651334,0.651334,0.651334
100545,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,...,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334
100846,0.651334,0.651334,0.651334,0.65137,0.651334,0.647948,0.651334,0.651334,0.65132,0.651305,...,0.651334,0.651334,0.651334,0.651334,0.720623,0.651334,0.651334,0.651334,0.651334,0.651334
100906,0.651334,0.651334,0.651334,0.650803,0.651334,0.65361,0.651334,0.651334,0.6512,0.651173,...,0.651334,0.651334,0.651334,0.651334,0.800651,0.651334,0.651334,0.651334,0.651334,0.651334
101081,0.651334,0.651334,0.651334,0.651364,0.651334,0.632092,0.651334,0.651334,0.651307,0.651855,...,0.651334,0.651334,0.651334,0.651334,0.650729,0.651334,0.651334,0.651334,0.651334,0.651334
101550,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,...,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334
101583,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,...,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334,0.651334


In [477]:
class CFRecommender:
    MODEL_NAME = 'Collaborative Filtering'
    def __init__(self, predictions_df):
        self.predictions_df = predictions_df
    def get_model_name(self):
        return self.MODEL_NAME
    def recommend_items(self, user_id, items_to_ignore=[], topn=1000, verbose=False):
        sorted_user_predictions = self.predictions_df.loc[user_id].sort_values(ascending=False)
        recommendations = {'ISBN': sorted_user_predictions.index, 'predicted_rating': sorted_user_predictions.values}
        recommendations_df = pd.DataFrame(recommendations)
        recommendations_df = recommendations_df[~recommendations_df['ISBN'].isin(items_to_ignore)] \
                               .sort_values('predicted_rating', ascending = False) \
                               .head(topn)
#         print(recommendations_df)
        return recommendations_df
cf_recommender_model = CFRecommender(preds_df)

In [478]:
model_evaluator = ModelEvaluator(training_data, testing_data, 0.5)              
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df,recommend_items_cf = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...

Global metrics:
{'modelName': 'Collaborative Filtering', 'recall@5': 0.46683673469387754, 'recall@10': 0.5051020408163265, 'precision@5': 0.10397727272727272, 'precision@10': 0.05625}


Unnamed: 0,hits@5_count,hits@10_count,recommended@5_count,recommended@10_count,relevents,recall@5,recall@10,precision@5,precision@10,User-ID
152,5,10,5,10,23,0.217391,0.434783,1.0,1.0,98391
165,5,5,5,10,5,1.0,1.0,1.0,0.5,11676
341,2,2,5,10,2,1.0,1.0,0.4,0.2,153797
208,2,2,5,10,2,1.0,1.0,0.4,0.2,68555
62,2,2,5,10,2,1.0,1.0,0.4,0.2,210485
226,2,2,5,10,3,0.666667,0.666667,0.4,0.2,114368
4,2,2,5,10,2,1.0,1.0,0.4,0.2,226724
286,1,1,5,10,1,1.0,1.0,0.2,0.1,8067
292,1,1,5,10,1,1.0,1.0,0.2,0.1,90049
115,1,1,5,10,1,1.0,1.0,0.2,0.1,165308


In [479]:
recommend_items_cf.head()

Unnamed: 0,ISBN,predicted_rating
0,425167313,0.651334
3,385511612,0.651334
4,446601977,0.651334
5,316666009,0.651334
1,684835959,0.651334


# الطلب الرابع

# hybrid = Collaborative + based-content (Publisher)

In [480]:
recommend_items_cf_copy = recommend_items_cf[recommend_items_cf['predicted_rating']>0]
recommendations_Publisher_based_copy = recommendations_Publisher_based[recommendations_Publisher_based['predicted_rating']>0]
ids = recommend_items_cf_copy['ISBN'].values
collaborative = expended_books_df[expended_books_df['ISBN'].isin(ids)]['Book-Title'].tolist()
df = recommendations_Publisher_based_copy[recommendations_Publisher_based_copy['ISBN'].isin(ids)]
for idx, user_id in enumerate(users_ids):
    favorites_in_test = get_favorite_movies(user_id, testing_data)
    metrics = Metrics(favorites_in_test,df,threshold=0.5)
    metrics['User-ID'] = user_id
    users_metrics.append(metrics)
detailed_results_df,global_metrics = result(users_metrics,"hybrid")
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
print('\nGlobal metrics:\n%s' % global_metrics)
detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...

Global metrics:
{'modelName': 'hybrid', 'recall@5': 0.08695652173913043, 'recall@10': 0.18840579710144928, 'precision@5': 0.001875, 'precision@10': 0.0022775052557813596}


Unnamed: 0,hits@5_count,hits@10_count,recommended@5_count,recommended@10_count,relevents,recall@5,recall@10,precision@5,precision@10,User-ID
38,2,2,5,6,2,1.0,1.0,0.4,0.333333,189835
118,1,1,4,4,1,1.0,1.0,0.25,0.25,189835
31,1,1,5,10,1,1.0,1.0,0.2,0.1,98391
39,1,1,5,10,1,1.0,1.0,0.2,0.1,169663
165,1,1,3,3,1,1.0,1.0,0.333333,0.333333,267372
506,0,0,5,10,1,0.0,0.0,0.0,0.0,232343
499,0,0,5,10,0,1.0,1.0,0.0,0.0,233255
500,0,0,5,10,0,1.0,1.0,0.0,0.0,140000
501,0,0,5,10,0,1.0,1.0,0.0,0.0,127190
502,0,0,5,10,0,1.0,1.0,0.0,0.0,237271
