### Loading Data 

In [1]:
# Loading Data
from surprise import Dataset
from surprise import Reader
import pandas as pd

# Loading data ...
file_path_users = './data/BX-Users.csv'
file_path_ratings = './data/BX-Book-Ratings.csv'
file_path_books = './data/BX-Books.csv'

# Loading .csv data into pandas df
book_df = pd.read_csv(file_path_books, names = ['iid', 'title', 'author', 'year', 'publisher','image'], sep=',',skiprows=1, dtype={'year':int})
rating_df = pd.read_csv(file_path_ratings, names = ['uid', 'iid','rating'], sep=',',skiprows=1)
user_df = pd.read_csv(file_path_users, names = ['uid', 'age'], sep=',',skiprows=1)
user_df = user_df.dropna()

# df -> surprise Dataset
data_rating = Dataset.load_from_df(rating_df[["uid", "iid", "rating"]], Reader(rating_scale=(1, 5)))

### Books 

In [2]:
print(book_df.shape)
book_df.head(3)

(266737, 6)


Unnamed: 0,iid,title,author,year,publisher,image
0,964442011X,Tasht-i khun,IsmaÂ°il Fasih,1376,Nashr-i Alburz,http://images.amazon.com/images/P/964442011X.0...
1,9643112136,Dalan-i bihisht (Dastan-i Irani),Nazi Safavi,1378,Intisharat-i Quqnus,http://images.amazon.com/images/P/9643112136.0...
2,781228956,"Complete Works 10 Volumes [2,6,7,8,9] (Notable...",Benjamin Franklin,1806,Reprint Services Corp,http://images.amazon.com/images/P/0781228956.0...


### Ratings

In [3]:
print(rating_df.shape)
rating_df.head(3)

(999999, 3)


Unnamed: 0,uid,iid,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0


### Users

In [4]:
print(user_df.shape)
user_df.head(3)

(168096, 2)


Unnamed: 0,uid,age
1,2,18.0
3,4,17.0
5,6,61.0


### Filtering data from selected age

In [5]:
def filter_data_by_age(age, u_df, r_df, b_df):
    if age == '0-18':
        get_uid = u_df.loc[(u_df['age'] < 18)]['uid']
    elif age == '18-30':
        get_uid = u_df.loc[(u_df['age'] >= 18) & (u_df['age'] < 30)]['uid']
    elif age == '30-40':
        get_uid = u_df.loc[(u_df['age'] >= 30) & (u_df['age'] < 40)]['uid']
    elif age == '40-50':
        get_uid = u_df.loc[(u_df['age'] >= 40) & (u_df['age'] < 50)]['uid']
    else:
        get_uid = u_df.loc[u_df['age'] >= 50]['uid']
    
    get_iid = r_df.loc[r_df['uid'].isin(get_uid)]['iid']
    get_books = b_df.loc[b_df['iid'].isin(get_iid)]
        
    return get_books.drop_duplicates(), get_iid, get_uid

# data from pop-up windown 1 - name and age
form = {
    'age' : '0-1',
    'name' : 'aa'
}

selected_books, selected_iid, selected_uid = filter_data_by_age(form['age'], user_df, rating_df, book_df)
#selected_books.sample(n=100)
selected_books.shape

(67950, 6)

### Algorithem 1

In [6]:
# filter rating table by selcted age 
selected_data = rating_df.loc[rating_df['uid'].isin(selected_uid)].sample(n=14000)
print(selected_data.shape)
# df -> surprise Dataset
data = Dataset.load_from_df(selected_data[["uid", "iid", "rating"]], Reader(rating_scale=(1, 5)))

(14000, 3)


In [7]:
from surprise import KNNBasic

train_set = data.build_full_trainset()
sim_options = {'name': 'cosine',
               'user_based': False,  # compute  similarities between users
               'min_support': 1  # minimum number of common items for two users
               }
algo = KNNBasic(k=20, sim_options=sim_options)
algo.fit(train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


  sim = construction_func[name](*args)


<surprise.prediction_algorithms.knns.KNNBasic at 0x13a761b80>

## Testing

In [8]:
from typing import Optional, List
from pydantic import BaseModel
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import pandas as pd
import numpy as np
import os
import csv
from sklearn.cluster import estimate_bandwidth
from surprise.model_selection import train_test_split
from utils import map_genre
import json
from surprise import dump
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
import pandas as pd

KeyboardInterrupt: 

In [None]:
# Loading Data
from surprise import Dataset
from surprise import Reader
import pandas as pd

# Loading data file path ...
file_path_users = './data/BX-Users.csv'
file_path_ratings = './data/BX-Book-Ratings.csv'
file_path_books = './data/BX-Books.csv'

# Loading .csv data into pandas df
book_df = pd.read_csv(file_path_books, names = ['iid', 'title', 'author', 'year', 'publisher','image'], sep=',',skiprows=1, dtype={'year':int})
rating_df = pd.read_csv(file_path_ratings, names = ['uid', 'iid','rating'], sep=',',skiprows=1)
user_df = pd.read_csv(file_path_users, names = ['uid', 'age'], sep=',',skiprows=1)
user_df = user_df.dropna()

# df -> surprise Dataset
data_rating = Dataset.load_from_df(rating_df[["uid", "iid", "rating"]].sample(n=10), Reader(rating_scale=(1, 5)))


sim_options = {'name': 'cosine',
                    'user_based': False,  # compute  similarities between users
                    'min_support': 1  # minimum number of common items for two users
               }
algo = KNNBasic(k=20, sim_options=sim_options)

In [None]:
def books_in_age(form: dict):
    print("----User selected----")
    print(form['age'])
    print(form['name'])
    selected_books, selected_iid, selected_uid = filter_data_by_age(form['age'], user_df, rating_df, book_df)
    # update data for training
    selected_data = rating_df.loc[rating_df['uid'].isin(selected_uid)].sample(n=8000)
    data_rating = Dataset.load_from_df(selected_data[["uid", "iid", "rating"]], Reader(rating_scale=(1, 10)))
    # fit algo 1
    trainset = data_rating.build_full_trainset()
    algo.fit(trainset)
    # return books in selected age 
    books_in_age = selected_books.sample(n=30)
    books_in_age.loc[:, 'score'] = None
    return books_in_age.to_json(orient='index')

# filtering books data from selected age range
def filter_data_by_age(age, u_df, r_df, b_df):
    if age == '0-18':
        get_uid = u_df.loc[(u_df['age'] < 18)]['uid']
    elif age == '18-30':
        get_uid = u_df.loc[(u_df['age'] >= 18) & (u_df['age'] < 30)]['uid']
    elif age == '30-40':
        get_uid = u_df.loc[(u_df['age'] >= 30) & (u_df['age'] < 40)]['uid']
    elif age == '40-50':
        get_uid = u_df.loc[(u_df['age'] >= 40) & (u_df['age'] < 50)]['uid']
    else:
        get_uid = u_df.loc[u_df['age'] >= 50]['uid']
    
    get_iid = r_df.loc[r_df['uid'].isin(get_uid)]['iid']
    get_books = b_df.loc[(b_df['iid'].isin(get_iid)) & (b_df['year']>1995)]
        
    return get_books.drop_duplicates(), get_iid, get_uid


def like_recommend(item_id):
    print("-=========================item_id:",item_id)
    # get_similar
    res = get_similar_items(str(item_id), n=5)
    res = [int(i) for i in res]
    print(res)
    rec_books = book_df.loc[book_df['iid'].isin(res)]
    print(rec_books)
    # rec_books.loc[:, 'like'] = None
    #results = rec_books.loc[:, ['iid', 'title', 'year', 'image']]
    return json.loads(rec_books.to_json(orient="records"))

# Method 1 - like
def get_similar_items(iid, n=5):
    inner_id = algo.trainset.to_inner_iid(str(iid))
    print(inner_id)
    neighbors = algo.get_neighbors(inner_id, k=n)
    neighbors_iid = [algo.trainset.to_raw_iid(x) for x in neighbors]
    print(neighbors_iid)
    return neighbors_iid


In [None]:
# data from pop-up windown 1 - name and age
form = {
    'age' : '0-1',
    'name' : 'aa'
}

books_in_age(form)

In [None]:
like_recommend("1567311199")

### Algorithem 2

### Evaluation

In [9]:
from scipy.stats import ttest_rel
from scipy.stats import ttest_ind

methodA_round_1 = [1.0003585533955126, 1.02521875, 1.0832961893171518, 1.2084630993538859, 1.0262983316877212, 0.9910370638447699, 1.1140163892982005, 0.9891552266959924, 1.33346875, 1.1842550664869456]
methodA_round_2 = [1.0995360610374764, 0.95965625, 1.1669204534699709, 1.2059589711259096, 1.0197927198241072,  1.1149396862190573, 1.2380823160392296, 1.1170941394556906, 1.31928125, 1.2163998510589558]
methodB_round_1 = [0.8134668985654723, 0.9309493362512627, 0.9207381501496753, 0.6666666666666666, 0.7698003589195009, 0.9940564109486686, 0.5443310539518174, 0.7888106377466154, 0.7314228762270925, 1.5869840952317444]
methodB_round_2 = [0.9562785167639519, 0.9906974722292783, 1.220902186396364, 0.9960173936495105, 0.7314228762270925, 0.69602043392737, 0.7200822998230957, 0.5443310539518174, 0.49690399499995325, 0.8606629658238704]

methodA =  methodA_round_1 + methodA_round_2
methodB =  methodB_round_1 + methodB_round_2

In [10]:
ttest_rel(methodA_round_1, methodA_round_2)

Ttest_relResult(statistic=-2.2627596316345344, pvalue=0.04995078793102759)

In [11]:
ttest_rel(methodB_round_1, methodB_round_2)

Ttest_relResult(statistic=0.5176203288269218, pvalue=0.6172046202696231)

In [12]:
ttest_ind(methodA, methodA)

Ttest_indResult(statistic=0.0, pvalue=1.0)

In [13]:
from statistics import mean

print("Average RMSE of Method A", mean(methodA))
print("Average RMSE of Method B", mean(methodB))

Average RMSE of Method A 1.1206614559155288
Average RMSE of Method B 0.848027283922541
