<a href="https://colab.research.google.com/github/paolopetta/FIA-Yourbook/blob/main/Yourbook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import delle librerie necessarie
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from PIL import Image
import requests

In [None]:
# Load the three source datasets:
#   Users.csv   -> information about the users
#   Books.csv   -> information about the books
#   Ratings.csv -> ratings that users gave to books
# Malformed rows are skipped instead of aborting the import.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
users = pd.read_csv('Users.csv', on_bad_lines='skip', delimiter=';', encoding='ISO-8859-1')
books = pd.read_csv('Books.csv', on_bad_lines='skip', delimiter=';', engine='python', encoding='ISO-8859-1')
ratings = pd.read_csv('Ratings.csv', on_bad_lines='skip', delimiter=';', engine='python', encoding='ISO-8859-1')

In [None]:
# Ratings preparation.
# Drop exact duplicate rows (keeping the first occurrence) and rows with
# missing values, without mutating the loaded frame in place.
ratings = ratings.drop_duplicates(keep='first').dropna()
# Size of the ratings table after de-duplication / null removal.
print(ratings.shape)
# Mean rating before the zero ratings are removed. It must be printed
# explicitly: a bare expression in the middle of a cell is not displayed.
print(ratings['Book-Rating'].mean())
# Remove every rating equal to 0.
ratings = ratings[ratings['Book-Rating'] != 0]
ratings.info()

In [None]:
# Users preparation.
# Summary statistics of the user ids (printed explicitly, because only the
# last expression of a cell is displayed automatically).
print(users['User-ID'].describe())
# Drop every user row that has a missing value in any column.
users_df0 = users.dropna()
# Store the user ids as 64-bit integers. The converted column has to be
# assigned back: `astype` returns a new object and leaves the frame untouched.
users_df0 = users_df0.astype({'User-ID': np.int64})
users_df0['User-ID']

In [None]:
# Attach the user attributes to every rating (left join on the user id,
# so ratings without a matching user are kept with missing user fields).
B1 = ratings.merge(users_df0, how='left', on='User-ID')

In [None]:
# Attach the book attributes to the rating/user table (left join on ISBN).
B2 = B1.merge(books, how='left', on='ISBN')

# Keep only complete rows and switch to underscore-style column names.
column_renames = {
    'User-ID': 'User_ID',
    'Book-Rating': 'Book_Rating',
    'Book-Title': 'Book_Title',
    'Book-Author': 'Book_Author',
    'Year-Of-Publication': 'Year_Of_Publication',
}
B3 = B2.dropna().rename(columns=column_renames)

# The joined table is much smaller than the source tables.
print(B3.shape)
B3.info()

# Number of ratings per title.
bn = B3["Book_Title"].value_counts()
B3["User_ID"].value_counts()
# User ids as strings, so describe() reports count/unique/top/freq.
user = B3['User_ID'].astype("str")
user.describe()

In [None]:
# Drop every row whose user age is >= 80 or <= 10.
too_old = B3.index[(B3['Age'] >= 80).to_numpy()]
B4 = B3.drop(index=too_old)
too_young = B4.index[(B4['Age'] <= 10).to_numpy()]
B4 = B4.drop(index=too_young)
B4.shape

In [None]:
# Drop every book published in 2010 or later, or in 1200 or earlier.
too_recent = B4.index[(B4['Year_Of_Publication'] >= 2010).to_numpy()]
B4 = B4.drop(index=too_recent)
too_early = B4.index[(B4['Year_Of_Publication'] <= 1200).to_numpy()]
B4 = B4.drop(index=too_early)
B4['Year_Of_Publication'].describe()
B4.shape

In [None]:
# Count how many rows each title has in the filtered table.
bn = B4.loc[:, "Book_Title"].value_counts()

In [None]:
# One row per distinct user (first occurrence kept).
# The de-duplication is done on the frame itself: calling
# drop_duplicates(inplace=True) on a column Series taken from B4 and then
# left-merging it back onto B4 re-expands to one row per rating, which
# counts each user's age once per rating instead of once per user.
user = B4.drop_duplicates(subset='User_ID', keep='first')

# Histograms of the user ages and of the book ratings.
# Both are plotted in the same cell without creating a new figure:
#   x: rating values (0 to 10) and user ages (11 to 80)
#   y: frequency of each value
user['Age'].plot(kind='hist', title='Age Distribution',)
B4['Book_Rating'].plot(kind='hist', title='Book_Rating Distribution',)

In [None]:
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Bar chart of the 5 titles with the highest number of ratings.
data1 = (
    B4.groupby(by="Book_Title")
    .count()
    .sort_values(by="Book_Rating", ascending=False)
    .head(5)["Book_Rating"]
)
_x, _y = data1.index, data1.values
bar_positions = list(range(len(_x)))

fig, ax = plt.subplots(figsize=(29, 8), dpi=100)
ax.bar(bar_positions, _y, width=0.5)

# Label every bar with its book title.
ax.set_xticks(bar_positions)
ax.set_xticklabels(_x)
ax.set_xlabel("Book Title")
ax.set_ylabel("Num Counts")
ax.set_title("Top Rated Books")
plt.show()

In [None]:
# User ids of the filtered table, converted to strings.
user = B4.loc[:, 'User_ID'].astype(str)

In [None]:
# Number of rows per user id in B4.
B4["User_ID"].value_counts()

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from networkx.algorithms import bipartite

In [None]:
# Working dataset for the following sections (still to be modified).
# Take a copy: the next cells edit `data` in place (ISBN coercion, dropna,
# column drops) and a plain alias would silently apply those edits to B4.
data = B4.copy()

In [None]:
# Preview the first 100 rows of the working dataset.
# TODO: revisit (or remove) this preview if the cell is kept.
data.head(100)

In [None]:

# Convert the ISBN column to numbers; values that cannot be parsed become
# NaN and their rows are removed by the dropna below.
isbn_numeric = pd.to_numeric(data['ISBN'], errors='coerce')
data['ISBN'] = isbn_numeric
data.dropna(inplace=True)
data.info()

In [None]:
# Remove the three cover-image URL columns.
image_columns = ['Image-URL-S', 'Image-URL-M', 'Image-URL-L']
data.drop(columns=image_columns, inplace=True)

In [None]:
# Build a user/book graph: one node set for user ids, one for book titles,
# and one edge for every (user, title) row of the working dataset.
m = list(data['User_ID'])
n = list(data['Book_Title'])
zip_list = list(zip(m, n))

G = nx.Graph()
# The "bipartite" node attribute marks which side a node belongs to.
G.add_nodes_from(m, bipartite=0)
G.add_nodes_from(n, bipartite=1)
G.add_edges_from(zip_list)

bipartite.is_bipartite(G)

In [None]:
#G.adj

In [None]:
#togliere quello sopra assolutamente
#print(nx.adjacency_matrix(G).todense())

In [None]:
# Edge list of the graph as a two-column frame (user id -> book title).
edge_columns = ['source', 'target']
pdd = pd.DataFrame(data=zip_list, columns=edge_columns)
pdd.head(5)

In [None]:
# Split the graph nodes by their "bipartite" attribute: nodes tagged 0 go
# to top_nodes, every other node goes to bottom_nodes.
top_nodes = {node for node, attrs in G.nodes(data=True) if attrs["bipartite"] == 0}
bottom_nodes = set(G).difference(top_nodes)

In [None]:
# Degree centrality of every node, split into two parallel lists
# (node names and their centrality values, in the same order).
cent = nx.degree_centrality(G)
name = list(cent.keys())
centrality = list(cent.values())

In [None]:
# Table of nodes with their centrality, highest centrality first.
cent = pd.DataFrame({'name': name, 'centrality': centrality})
cent = cent.sort_values(by='centrality', ascending=False)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

In [None]:
# Snapshot of the working dataset before more columns are dropped from it.
data1 = data.copy(deep=True)

In [None]:
# Remove further columns that are not used as features.
unused_columns = ['Book_Title', 'Publisher', 'Location']
data.drop(columns=unused_columns, inplace=True)

In [None]:
# Inspect the resulting data.
# Note: data.head() is not the last expression of the cell, so the notebook
# does not display its value; only the data.info() summary is printed.
data.head()
data.info()

In [None]:
# One-hot encode the book author into sparse indicator columns.
data_encoded = pd.get_dummies(data=data, columns=["Book_Author"], sparse=True)
# Summary of the snapshot taken before the columns were dropped.
data1.info()

In [None]:
# Print the summary of the one-hot encoded frame.
data_encoded.info()

In [None]:
# Min-max scaler (this part is referred from HW2).
scaler = MinMaxScaler()

# Build the training and test sets. They are kept very small because of
# limited processing capacity.
TRAIN_ROWS = 1000
TEST_ROWS = 100
train = data_encoded[:TRAIN_ROWS]
# The test slice starts right after the training slice; starting at 1001
# would leave row 1000 out of both sets.
test = data_encoded[TRAIN_ROWS:TRAIN_ROWS + TEST_ROWS]

# Fit the scaler on the training rows only, then apply it to the test rows.
X_train = scaler.fit_transform(train)
X_test = scaler.transform(test)
print(X_test)

In [None]:
# Show the summary of the encoded frame again.
data_encoded.info()

In [None]:
# Scale the first 1000 encoded rows with the already-fitted scaler.
first_rows = data_encoded[:1000]
X = scaler.transform(first_rows)

In [None]:
# Print the scaled feature matrix X.
print(X)

In [None]:
# Data for the elbow plot: fit one KMeans model per candidate k and record
# its inertia (sum of squared distances to the closest centroid).
K = range(1, 7)
meanDispersions = []
for k in K:
    # Use the loop value as the number of clusters. With a fixed
    # n_clusters=2 every iteration fits the same model, so the curve
    # carries no information about k.
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X_train)

    meanDispersions.append(kmeans.inertia_)
      

In [None]:
# Elbow plot: recorded dispersion for each candidate number of clusters.
fig, ax = plt.subplots()
ax.plot(K, meanDispersions, 'rx-')
ax.set_xlabel('k')
ax.set_ylabel('Average Dispersion')
ax.set_title('Selecting k with the Elbow Method')
plt.show()

In [None]:
# Final clustering with 3 clusters.
kmeans = KMeans(n_clusters=3)

# Fit on the training rows, then label the test rows and the rows in X.
y1 = kmeans.fit_predict(X_train)
y2 = kmeans.predict(X_test)
whole_data = kmeans.predict(X)

# Rebuild labelled frames from the scaled matrices.
train = pd.DataFrame(X_train, columns=data_encoded.columns)
test = pd.DataFrame(X_test, columns=data_encoded.columns)
# Then add the cluster predictions to these frames.
train['Cluster'] = y1
test['Cluster'] = y2
# Take an explicit copy of the first 1000 rows before adding the column:
# assigning a column on a plain slice triggers SettingWithCopyWarning.
data1 = data1[:1000].copy()
data1['Cluster'] = whole_data

In [None]:
# Check the outcome of each cluster: mean of every numeric column.
# numeric_only=True is required because data1 still holds text columns;
# pandas >= 2.0 raises a TypeError when asked to average them.
groupby1 = data1.groupby(by='Cluster').mean(numeric_only=True)
groupby1

In [None]:
# Scatter plot of the first two scaled features, coloured by cluster label,
# with the cluster centres drawn on top as large translucent black dots.
centers = kmeans.cluster_centers_
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y1, s=50, cmap='viridis')
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5);

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score
# Silhouette score of the clustering labels y1 on the scaled matrix X.
score = silhouette_score(X, labels=y1)
print("Silouhette score: " + str(score))

In [None]:
# Join ratings with book metadata on ISBN (default inner join).
merged = pd.merge(ratings, books, on='ISBN')
merged.info()

In [None]:
# Count the ratings per title.
merged_groupby = (
    merged.groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index()
    .rename(columns={'Book-Rating': 'number_of_ratings'})
)
# Keep only the books with more than 30 ratings.
popular_mask = merged_groupby['number_of_ratings'] > 30
merged_groupby = merged_groupby[popular_mask]
merged_groupby.head()

In [None]:
# Join the ratings with the per-title rating counts, which keeps only the
# titles present in merged_groupby, then keep a single row per
# (user, title) pair.
integrated_merged = pd.merge(merged, merged_groupby, on='Book-Title')
integrated_merged = integrated_merged.drop_duplicates(subset=['User-ID', 'Book-Title'])
integrated_merged.info()

In [None]:
# Title x user rating matrix; missing (title, user) pairs are filled with 0.
pivot = integrated_merged.pivot_table(
    values='Book-Rating',
    index='Book-Title',
    columns='User-ID',
    fill_value=0,
)
pivot

In [None]:
# Compressed sparse row representation of the rating matrix.
pivot_csr = csr_matrix(pivot.to_numpy())
pivot_csr

In [None]:
# Build the kNN model (brute-force search) and fit it on the sparse matrix.
model = NearestNeighbors(algorithm='brute').fit(pivot_csr)
model

In [None]:
# Example kNN query: nearest neighbours of the book stored in row 55 of the
# rating matrix. (The discarded expression that flattened the entire matrix
# into a single row had no effect and has been removed.)
distances, suggestions = model.kneighbors(pivot.iloc[55, :].values.reshape(1, -1))

In [None]:
# Display the distances returned by the example kneighbors query.
distances

In [None]:
# Display the row positions returned by the example kneighbors query.
suggestions

In [None]:
# Test the kNN collaborative filtering model: for every query row, print
# the neighbouring titles followed by their row positions in the matrix.
for neighbour_positions in suggestions:
    print(pivot.index[neighbour_positions])
    print(neighbour_positions)

In [None]:
# All book titles of the merged table (one entry per row, duplicates kept).
list1 = list(B3['Book_Title'])
# Show only a small sample: displaying the whole list floods the cell output.
list1[:10]

In [None]:
#final recommender system function building
def book_recommend(book_name):
    """Print the titles most similar to ``book_name`` according to the kNN model.

    The title is looked up in the module-level ``pivot`` rating matrix, the
    module-level ``model`` is queried with that row, and the neighbouring
    titles are printed one per line. The queried book itself is left out of
    the printed recommendations.

    Args:
        book_name: Exact book title, as it appears in ``pivot.index``.

    Returns:
        None. The recommendations are printed.

    Raises:
        ValueError: If ``book_name`` is not a row of ``pivot``.
    """
    # Validate against the matrix that is actually queried. Checking a
    # different title list lets through titles that have no row in `pivot`,
    # which then fail with an IndexError on the lookup below.
    if book_name not in pivot.index:
        raise ValueError(f"Book \"{book_name}\" is not available in the recommendation matrix")

    book_id = np.where(pivot.index == book_name)[0][0]
    query = pivot.iloc[book_id, :].values.reshape(1, -1)

    # Ask for one neighbour more than the model default (bounded by the
    # number of books), because the closest match is normally the queried
    # book itself and is filtered out below.
    wanted = model.n_neighbors
    n_query = min(wanted + 1, len(pivot))
    _, recommendations = model.kneighbors(query, n_neighbors=n_query)

    print('begin to recommend all books similar to this book!!!')
    print(f"For book \"{book_name}\" we would recommend the following:")
    # A single query row was passed, so only recommendations[0] is populated.
    similar_titles = [title for title in pivot.index[recommendations[0]] if title != book_name]
    for title in similar_titles[:wanted]:
        print(title)

In [None]:
# Ask for a title on the console and print its recommendations.
prompt_text = 'Please Input a book name: '
name = input(prompt_text)
book_recommend(book_name=name)