# Song Recommendation System on Amazon Dataset

In [None]:
# Importing header files
%matplotlib inline
import operator
import re
import string
from math import sqrt

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy.sparse as sp
import seaborn as sns
from scipy.sparse.linalg import svds
from scipy.spatial.distance import cosine
from sklearn import neighbors
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from wordcloud import WordCloud, STOPWORDS

try:
    # Legacy module: deprecated in scikit-learn 0.18 and removed in 0.20.
    from sklearn import cross_validation as cv
except ImportError:
    # model_selection exposes the same train_test_split under the same alias.
    from sklearn import model_selection as cv

sns.set_style('white')

## Importing Reviews Dataset

In [None]:
# Load the reviews dataset for the review-based recommender.
# lines=True: the file is parsed as one JSON record per line.
reviews_path = "data/reviews_Digital_Music_5.json"
reviews_music_df = pd.read_json(reviews_path, lines=True)
reviews_music_df.head()

## Importing Ratings Dataset

In [None]:
# Load the ratings dataset for the rating-based recommender.
# Column names are passed explicitly through names=header.
header = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]
ratings_music_df = pd.read_csv('data/ratings_Digital_Music.csv', names=header)
ratings_music_df.head()

## EDA

In [None]:
# Size of the complete ratings dataset: distinct users and distinct items.
n_users_complete = len(ratings_music_df['user_id'].unique())
n_items_complete = len(ratings_music_df['item_id'].unique())
print ('Number of users = ' , n_users_complete , ' | Number of items = ' , n_items_complete)

In [None]:
# Sparsity of the complete dataset: the share of (user, item) cells without a
# rating, stored as a fraction rounded to three decimals.
sparsity_complete = round(1.0-len(ratings_music_df)/float(n_users_complete * n_items_complete),3)
# Round the percentage too: multiplying the rounded fraction by 100 can
# otherwise print binary floating-point noise (e.g. 56.99999999999999).
print ('The sparsity level of Complete Music Dataset is ' ,  str(round(sparsity_complete*100, 1)) , '%')

In [None]:
# Work on a subset: the first 10000 rating rows, with its distinct user and
# item counts.
ratings_music_df_10000 = ratings_music_df.iloc[:10000]
n_users = len(ratings_music_df_10000['user_id'].unique())
n_items = len(ratings_music_df_10000['item_id'].unique())
print ('Number of users = ' , n_users , ' | Number of items = ' , n_items)

In [None]:
# Sparsity of the 10000-row subset: the share of (user, item) cells without a
# rating, stored as a fraction rounded to three decimals.
sparsity_10000 = round(1.0-len(ratings_music_df_10000)/float(n_users*n_items),3)
# Round the percentage too: multiplying the rounded fraction by 100 can
# otherwise print binary floating-point noise (e.g. 56.99999999999999).
print ('The sparsity level of Music dataset 10000 is ' ,  str(round(sparsity_10000*100, 1)) , '%')

In [None]:
# Mean rating per item (song), highest first; display the top five.
mean_rating_by_item = ratings_music_df_10000.groupby('item_id')['rating'].mean()
mean_rating_by_item.sort_values(ascending=False).head()

In [None]:
# Number of ratings per item (song), most-rated first; display the top five.
rating_count_by_item = ratings_music_df_10000.groupby('item_id')['rating'].count()
rating_count_by_item.sort_values(ascending=False).head()

In [None]:
# Per-item mean rating as a one-column DataFrame indexed by item_id.
ratings_mean = ratings_music_df_10000.groupby('item_id')['rating'].mean().to_frame()
ratings_mean.head()

In [None]:
# Add the number of ratings per item next to its mean rating; the counts are
# aligned to ratings_mean on the item_id index.
item_rating_counts = ratings_music_df_10000.groupby('item_id')['rating'].count()
ratings_mean['rating_numbers'] = item_rating_counts
ratings_mean.head()

In [None]:
# Items ordered by how many ratings they received, most-rated first.
ratings_mean.sort_values(by='rating_numbers', ascending=False).head()

In [None]:
# Histogram (70 bins) of the number of ratings each item received.
ratings_mean['rating_numbers'].hist(bins=70)

In [None]:
# Histogram (70 bins) of the per-item mean rating.
ratings_mean['rating'].hist(bins=70)

In [None]:
# Joint plot of per-item mean rating (x) against number of ratings (y);
# alpha=0.5 draws the points semi-transparently.
sns.jointplot(x='rating', y='rating_numbers', data=ratings_mean, alpha=0.5)

## Correlation based Recommender

In [None]:
# User x item ratings matrix: one row per user_id, one column per item_id.
# No fill value is given, so unrated cells stay NaN.
ratings_matrix = pd.pivot_table(
    ratings_music_df_10000,
    index='user_id',
    columns='item_id',
    values='rating',
)
ratings_matrix.head()

In [None]:
# Rating columns (one value per user) of the two reference items.
B00000016W_user_ratings = ratings_matrix.loc[:, 'B00000016W']
B000000TDH_user_ratings = ratings_matrix.loc[:, 'B000000TDH']

In [None]:
# Correlate every item column of the ratings matrix with the rating column of
# each reference item. The result is one correlation value per item_id; items
# for which no correlation can be computed come back as NaN.
similar_to_B00000016W = ratings_matrix.corrwith(B00000016W_user_ratings)
similar_to_B000000TDH = ratings_matrix.corrwith(B000000TDH_user_ratings)

In [None]:
# Correlations with item B00000016W as a one-column DataFrame, without the
# items whose correlation is NaN.
corr_B00000016W = similar_to_B00000016W.to_frame(name='Correlation')
corr_B00000016W = corr_B00000016W.dropna()
corr_B00000016W.head()

In [None]:
# Correlations with item B000000TDH as a one-column DataFrame, without the
# items whose correlation is NaN.
corr_B000000TDH = similar_to_B000000TDH.to_frame(name='Correlation')
corr_B000000TDH = corr_B000000TDH.dropna()
corr_B000000TDH.head()

In [None]:
# Items most correlated with B00000016W first (display only; nothing is
# reassigned).
corr_B00000016W.sort_values('Correlation', ascending=False).head()

In [None]:
# Items most correlated with B000000TDH first (display only; nothing is
# reassigned).
corr_B000000TDH.sort_values('Correlation', ascending=False).head()

In [None]:
# Attach each item's rating count (left join on the item_id index) so the
# correlations can be filtered by how many ratings support them.
# The suffixes only take effect when both frames share a column name.
corr_B00000016W = corr_B00000016W.join(ratings_mean['rating_numbers'], how='left', lsuffix='_left', rsuffix='_right')
corr_B00000016W.head()

In [None]:
# Keep only items with more than 100 ratings, then list the ones most
# correlated with B00000016W.
has_many_ratings_B00000016W = corr_B00000016W['rating_numbers'] > 100
corr_B00000016W[has_many_ratings_B00000016W].sort_values(by='Correlation', ascending=False).head()

In [None]:
# Attach each item's rating count (left join on the item_id index) so the
# correlations can be filtered by how many ratings support them.
# The suffixes only take effect when both frames share a column name.
corr_B000000TDH = corr_B000000TDH.join(ratings_mean['rating_numbers'], how='left', lsuffix='_left', rsuffix='_right')
corr_B000000TDH.head()

In [None]:
# Keep only items with more than 100 ratings, then list the ones most
# correlated with B000000TDH.
has_many_ratings_B000000TDH = corr_B000000TDH['rating_numbers'] > 100
corr_B000000TDH[has_many_ratings_B000000TDH].sort_values(by='Correlation', ascending=False).head()

## Similarity based Recommender (cosine, euclidean, manhattan)

In [None]:
# Preview of the 10000-row ratings subset used in this section (display only;
# the subset itself is not recomputed here).
ratings_music_df_10000.head()

In [None]:
# Distinct users and distinct items in the 10000-row subset.
n_users = len(ratings_music_df_10000['user_id'].unique())
n_items = len(ratings_music_df_10000['item_id'].unique())
print ('Number of users = ' , n_users , ' | Number of items = ' , n_items)

In [None]:
# Train and Test split (75% / 25% of the rating rows).
# train_test_split is taken from sklearn.model_selection because
# sklearn.cross_validation was removed in scikit-learn 0.20.
# A fixed random_state makes the split, and every metric derived from it,
# reproducible between runs.
train_data, test_data = train_test_split(ratings_music_df_10000, test_size=0.25, random_state=42)

In [None]:
# Convert train and test to user x item matrices (0 = not rated).
# Both matrices are reindexed to the same user and item labels. Pivoting each
# split on its own gives different shapes and label orders, so cell (i, j) of
# a train-based prediction would not describe the same user/item pair as cell
# (i, j) of the test matrix when the errors are computed.
all_users = ratings_music_df_10000['user_id'].unique()
all_items = ratings_music_df_10000['item_id'].unique()
train_data_matrix = (
    train_data.pivot(index='user_id', columns='item_id', values='rating')
    .reindex(index=all_users, columns=all_items)
    .fillna(0)
)
test_data_matrix = (
    test_data.pivot(index='user_id', columns='item_id', values='rating')
    .reindex(index=all_users, columns=all_items)
    .fillna(0)
)

In [None]:
# Calculate cosine similarity
# pairwise_distances(metric='cosine') returns cosine *distance*, i.e.
# 1 - similarity. These matrices are used as neighbour weights by predict(),
# so they are converted to similarities; with raw distances the least similar
# users/items would receive the largest weights.
user_similarity_cosine = 1.0 - pairwise_distances(train_data_matrix, n_jobs=-1, metric='cosine')
item_similarity_cosine = 1.0 - pairwise_distances(train_data_matrix.T, n_jobs=-1, metric='cosine')
# Every user/item is fully similar to itself; setting the diagonal explicitly
# also keeps each row's weight sum positive for all-zero rating vectors.
np.fill_diagonal(user_similarity_cosine, 1.0)
np.fill_diagonal(item_similarity_cosine, 1.0)

In [None]:
# Calculate euclidean similarity
# pairwise_distances returns distances (0 = identical), but these matrices
# are used as neighbour weights by predict(). 1 / (1 + d) maps a distance to
# a similarity in (0, 1] that decreases as the distance grows.
user_similarity_euclidean = 1.0 / (1.0 + pairwise_distances(train_data_matrix, n_jobs=-1, metric='euclidean'))
item_similarity_euclidean = 1.0 / (1.0 + pairwise_distances(train_data_matrix.T, n_jobs=-1, metric='euclidean'))

In [None]:
# Calculate manhattan similarity
# pairwise_distances returns distances (0 = identical), but these matrices
# are used as neighbour weights by predict(). 1 / (1 + d) maps a distance to
# a similarity in (0, 1] that decreases as the distance grows.
user_similarity_manhattan = 1.0 / (1.0 + pairwise_distances(train_data_matrix, n_jobs=-1, metric='manhattan'))
item_similarity_manhattan = 1.0 / (1.0 + pairwise_distances(train_data_matrix.T, n_jobs=-1, metric='manhattan'))

In [None]:
# Prediction function
def _divide_or_zero(numerator, denominator):
    """Return numerator / denominator, with 0 wherever the denominator is 0."""
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )


def predict(ratings, similarity, type='user'):
    """Predict a full rating matrix by similarity-weighted averaging.

    Parameters
    ----------
    ratings : pandas.DataFrame or array-like, shape (n_users, n_items)
        Rating matrix with one row per user and one column per item.
    similarity : array-like
        Square weight matrix: (n_users, n_users) for ``type='user'``,
        (n_items, n_items) for ``type='item'``.
    type : {'user', 'item'}
        ``'user'``: each user's mean rating plus the weighted average of the
        mean-centred ratings of all users.
        ``'item'``: weighted average of the user's own ratings over all items.

    Returns
    -------
    numpy.ndarray, shape (n_users, n_items)
        Predicted ratings. Where a row of ``similarity`` sums to zero in
        absolute value, the weighted term is 0 instead of NaN/inf.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.
    """
    # Work on plain float arrays: this accepts DataFrames and ndarrays alike
    # and avoids indexing a pandas Series with [:, np.newaxis], which current
    # pandas versions no longer support.
    rating_values = np.asarray(ratings, dtype=float)
    weights = np.asarray(similarity, dtype=float)
    weight_sums = np.abs(weights).sum(axis=1)

    if type == 'user':
        mean_user_rating = rating_values.mean(axis=1, keepdims=True)
        # Centre on the user mean so that generous and strict raters are
        # comparable before their ratings are averaged.
        ratings_diff = rating_values - mean_user_rating
        weighted = weights.dot(ratings_diff)
        # Row i is normalised by the total weight of user i's neighbours.
        return mean_user_rating + _divide_or_zero(weighted, weight_sums[:, np.newaxis])
    if type == 'item':
        weighted = rating_values.dot(weights)
        # Column j is normalised by the total weight of item j's neighbours.
        return _divide_or_zero(weighted, weight_sums[np.newaxis, :])
    raise ValueError("type must be 'user' or 'item', got %r" % (type,))

In [None]:
# Item-based and user-based predictions from the cosine matrices.
item_prediction_cosine, user_prediction_cosine = (
    predict(train_data_matrix, item_similarity_cosine, type='item'),
    predict(train_data_matrix, user_similarity_cosine, type='user'),
)

In [None]:
# Item-based and user-based predictions from the euclidean matrices.
item_prediction_euclidean, user_prediction_euclidean = (
    predict(train_data_matrix, item_similarity_euclidean, type='item'),
    predict(train_data_matrix, user_similarity_euclidean, type='user'),
)

In [None]:
# Item-based and user-based predictions from the manhattan matrices.
item_prediction_manhattan, user_prediction_manhattan = (
    predict(train_data_matrix, item_similarity_manhattan, type='item'),
    predict(train_data_matrix, user_similarity_manhattan, type='user'),
)

In [None]:
# Error metrics restricted to the cells that carry a rating (non-zero) in the
# ground truth.
def _rated_cells(prediction, ground_truth):
    """Return (true, predicted) values at the non-zero ground-truth cells.

    Both arguments are indexed with the positions of the non-zero entries of
    ``ground_truth``, so cell (i, j) must refer to the same user/item pair in
    both arrays.
    """
    rated = ground_truth.nonzero()
    return ground_truth[rated].flatten(), prediction[rated].flatten()


def rmse(prediction, ground_truth):
    """Root mean squared error over the rated ground-truth cells."""
    return sqrt(mse(prediction, ground_truth))


def mse(prediction, ground_truth):
    """Mean squared error over the rated ground-truth cells."""
    actual, predicted = _rated_cells(prediction, ground_truth)
    return mean_squared_error(actual, predicted)

In [None]:
# MSE of every (metric, user/item) prediction against the rated test cells.
observed_ratings = test_data_matrix.values
print ('User-based CF (cosine) MSE: ' , str(mse(user_prediction_cosine, observed_ratings)))
print ('Item-based CF (cosine) MSE: ' , str(mse(item_prediction_cosine, observed_ratings)))
print ('-------------------------------------------------------------------------------------------')
print ('User-based CF (euclidean) MSE: ' , str(mse(user_prediction_euclidean, observed_ratings)))
print ('Item-based CF (euclidean) MSE: ' , str(mse(item_prediction_euclidean, observed_ratings)))
print ('-------------------------------------------------------------------------------------------')
print ('User-based CF (manhattan) MSE: ' , str(mse(user_prediction_manhattan, observed_ratings)))
print ('Item-based CF (manhattan) MSE: ' , str(mse(item_prediction_manhattan, observed_ratings)))

In [None]:
# RMSE of every (metric, user/item) prediction against the rated test cells.
observed_ratings = test_data_matrix.values
print ('User-based CF (cosine) RMSE: ' , str(rmse(user_prediction_cosine, observed_ratings)))
print ('Item-based CF (cosine) RMSE: ' , str(rmse(item_prediction_cosine, observed_ratings)))
print ('-------------------------------------------------------------------------------------------')
print ('User-based CF (euclidean) RMSE: ' , str(rmse(user_prediction_euclidean, observed_ratings)))
print ('Item-based CF (euclidean) RMSE: ' , str(rmse(item_prediction_euclidean, observed_ratings)))
print ('-------------------------------------------------------------------------------------------')
print ('User-based CF (manhattan) RMSE: ' , str(rmse(user_prediction_manhattan, observed_ratings)))
print ('Item-based CF (manhattan) RMSE: ' , str(rmse(item_prediction_manhattan, observed_ratings)))

In [None]:
# Using SVD, calculating MSE and RMSE
# Rank-k truncated SVD of the train matrix (k = 20 singular values).
# svds is given the underlying float array: it accepts ndarrays, sparse
# matrices and LinearOperators, not DataFrames.
u, s, vt = svds(train_data_matrix.values.astype(float), k = 20)
s_diag_matrix=np.diag(s)
# Low-rank reconstruction u * diag(s) * vt, used as the predicted ratings.
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
# Labelled as SVD: these numbers come from the matrix factorisation above,
# not from the user-based neighbourhood model.
print ('SVD-based CF MSE: ' , str(mse(X_pred, test_data_matrix.values)))
print ('SVD-based CF RMSE: ' , str(rmse(X_pred, test_data_matrix.values)))

In [None]:
# Wrap the user-based cosine predictions in a DataFrame that carries the same
# user (row) and item (column) labels as the train matrix.
columns = list(train_data_matrix.columns)
index = train_data_matrix.index.tolist()
usr_pred = pd.DataFrame(data=user_prediction_cosine, index=index, columns=columns)
usr_pred.head()

In [None]:
# Ten items with the highest predicted rating for reviewer A10175AMUHOQC4,
# as (item_id, predicted rating) pairs.
target_user = 'A10175AMUHOQC4'
predicted_for_user = usr_pred.loc[target_user].to_dict()
sorted(predicted_for_user.items(), key=operator.itemgetter(1), reverse=True)[:10]

## Review based Recommender

In [None]:
# Calculate count and mean of each item ( Song )
# count: number of non-null values per column for every asin.
count = reviews_music_df.groupby("asin", as_index=False).count()
# numeric_only=True: the reviews frame also holds text columns, and pandas
# 2.x raises a TypeError when asked to average them.
mean = reviews_music_df.groupby("asin", as_index=False).mean(numeric_only=True)

# Attach the per-asin counts to every review row; columns present in both
# frames receive the _x (review) / _y (count) suffixes.
dfMerged = pd.merge(reviews_music_df, count, how='right', on=['asin'])
dfMerged.head()

In [None]:
# Copy the merged columns under descriptive names (the suffixed originals are
# kept), then select the four columns of interest into dfNew.
for descriptive_name, merged_name in (
    ("totalReviewers", "reviewerID_y"),
    ("overallScore", "overall_x"),
    ("summaryReview", "summary_x"),
):
    dfMerged[descriptive_name] = dfMerged[merged_name]

selected_columns = ['asin','summaryReview','overallScore',"totalReviewers"]
dfNew = dfMerged[selected_columns]

In [None]:
# Order rows by reviewer count (largest first) and keep those with at least
# 50 reviewers.
dfMerged = dfMerged.sort_values(by='totalReviewers', ascending=False)
has_enough_reviewers = dfMerged['totalReviewers'] >= 50
dfCount = dfMerged[has_enough_reviewers]
dfCount.head()

In [None]:
# Create review summary for each item
# Per-asin means of the numeric columns. numeric_only=True: the reviews frame
# also holds text columns, and pandas 2.x raises a TypeError when asked to
# average them.
dfProductReview = reviews_music_df.groupby("asin", as_index=False).mean(numeric_only=True)
# One row per asin holding the list of all its review summaries.
ProductReviewSummary = dfCount.groupby("asin")["summaryReview"].apply(list)
ProductReviewSummary = pd.DataFrame(ProductReviewSummary)
# Written to CSV so it can be read back later; in the CSV each list is stored
# as its text representation.
ProductReviewSummary.to_csv("ProductReviewSummary.csv")
ProductReviewSummary.head()

In [None]:
# Peek at dfProductReview: one row per asin with its per-column means
# ('overall' is the mean rating).
dfProductReview.head()

In [None]:
# Reload the per-asin review summaries from CSV and keep the products that
# also appear in dfProductReview (inner join on asin).
review_summaries = pd.read_csv("ProductReviewSummary.csv")
df3 = review_summaries.merge(dfProductReview, on="asin", how='inner')

In [None]:
# Keep only the product id, its review summaries and its mean rating.
df3 = df3.loc[:, ['asin','summaryReview','overall']]

In [None]:
# Text normalisation for review summaries.
# regEx matches every run of characters that are not lowercase ASCII letters.
regEx = re.compile('[^a-z]+')


def cleanReviews(reviewText):
    """Lowercase ``reviewText`` and reduce it to space-separated a-z words.

    Every run of other characters (digits, punctuation, whitespace, ...) is
    replaced by a single space; leading and trailing spaces are removed.
    """
    lowered = reviewText.lower()
    return regEx.sub(' ', lowered).strip()

In [None]:
# Clean each product's review summaries, then de-duplicate and renumber rows.
df3["summaryClean"] = df3["summaryReview"].apply(cleanReviews)
# NOTE(review): this keeps only the last product for each distinct mean
# rating ('overall'), not one row per product -- confirm that is intended.
df3 = df3.drop_duplicates(['overall'], keep='last')
# reset_index() without drop=True also keeps the old row labels in a new
# 'index' column.
df3 = df3.reset_index()

In [None]:
# Keep only reviews
# Bag-of-words counts of the cleaned summaries, limited to the 300 most
# frequent terms after removing English stop words.
reviews = df3["summaryClean"]
countVector = CountVectorizer(max_features = 300, stop_words='english')
transformedReviews = countVector.fit_transform(reviews)

# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() exists since 1.0. Use whichever is available.
if hasattr(countVector, "get_feature_names_out"):
    feature_names = countVector.get_feature_names_out()
else:
    feature_names = countVector.get_feature_names()

# toarray() is the supported dense conversion of a sparse matrix; the `.A`
# shorthand is deprecated in recent SciPy releases.
dfReviews = pd.DataFrame(transformedReviews.toarray(), columns=feature_names)
dfReviews = dfReviews.astype(int)
dfReviews.head()

In [None]:
# Save the term-count matrix (one row per product, one column per term) as
# CSV in the working directory.
dfReviews.to_csv("dfReviews.csv")

In [None]:
# Term-count matrix as a NumPy array, split by position (no shuffling):
# the first 90% of the rows form the train set, the rest the test set.
X = np.array(dfReviews)
tpercent = 0.9
tsize = int(np.floor(tpercent * len(dfReviews)))
dfReviews_train, dfReviews_test = X[:tsize], X[tsize:]
# Row counts of both parts.
lentrain, lentest = len(dfReviews_train), len(dfReviews_test)

In [None]:
# Sizes of the train and test parts (number of rows) used by the
# nearest-neighbour models below.
print(lentrain)
print(lentest)

In [None]:
# Nearest-neighbour index (ball tree, 3 neighbours) over the train rows.
neighbor = NearestNeighbors(n_neighbors=3, algorithm='ball_tree')
neighbor.fit(dfReviews_train)

# Distances and row positions of the 3 nearest train rows for every train
# row; the train rows themselves are the query points here.
distances, indices = neighbor.kneighbors(dfReviews_train)

In [None]:
# Find most related products
# For every test product, look up its two nearest train products and print
# their ids and mean ratings.
for i in range(lentest):
    # kneighbors returns (distances, indices); for a single query row the
    # indices array has one row of train-row positions, nearest first.
    a = neighbor.kneighbors([dfReviews_test[i]])
    related_product_list = a[1]

    # Read the positions directly. Formatting the list as text and parsing it
    # back with int() breaks when NumPy renders scalars as "np.int64(5)".
    first_related_product = int(related_product_list[0][0])
    second_related_product = int(related_product_list[0][1])

    # Test row i is row lentrain + i of df3 because the split is positional.
    print ("Based on product reviews, for ", df3["asin"][lentrain + i] ," average rating is ",df3["overall"][lentrain + i])
    print ("The first similar product is ", df3["asin"][first_related_product] ," average rating is ",df3["overall"][first_related_product])
    print ("The second similar product is ", df3["asin"][second_related_product] ," average rating is ",df3["overall"][second_related_product])
    print ("-----------------------------------------------------------------------------------------")

In [None]:
# Using 3 Neighbours
# Class labels: the mean rating cast to int (astype(int) truncates the
# fractional part), split at the same position as the feature rows.
rating_classes = df3["overall"].astype(int)
df5_train_target = rating_classes[:lentrain]
df5_test_target = rating_classes[lentrain:lentrain+lentest]

# Distance-weighted k-NN classifier on the term counts.
n_neighbors = 3
knnclf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
knnclf.fit(dfReviews_train, df5_train_target)
knnpreds_test = knnclf.predict(dfReviews_test)

print(classification_report(df5_test_target, knnpreds_test))

In [None]:
# Share of test products whose integer rating class was predicted exactly.
print (accuracy_score(df5_test_target, knnpreds_test))

In [None]:
# Mean squared difference between true and predicted integer rating classes.
print(mean_squared_error(df5_test_target, knnpreds_test))

In [None]:
# Using 5 Neighbours
# Class labels: the mean rating cast to int (astype(int) truncates the
# fractional part), split at the same position as the feature rows.
rating_classes = df3["overall"].astype(int)
df5_train_target = rating_classes[:lentrain]
df5_test_target = rating_classes[lentrain:lentrain+lentest]

# Distance-weighted k-NN classifier on the term counts.
n_neighbors = 5
knnclf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
knnclf.fit(dfReviews_train, df5_train_target)
knnpreds_test = knnclf.predict(dfReviews_test)

print(classification_report(df5_test_target, knnpreds_test))

In [None]:
# Share of test products whose integer rating class was predicted exactly.
print (accuracy_score(df5_test_target, knnpreds_test))

In [None]:
# Mean squared difference between true and predicted integer rating classes.
print(mean_squared_error(df5_test_target, knnpreds_test))

In [None]:
# Same positional split as before, now with 85% of the rows for training and
# the remaining rows for testing.
X = np.array(dfReviews)
tpercent = 0.85
tsize = int(np.floor(tpercent * len(dfReviews)))
dfReviews_train, dfReviews_test = X[:tsize], X[tsize:]
# Row counts of both parts.
lentrain, lentest = len(dfReviews_train), len(dfReviews_test)

In [None]:
# Nearest-neighbour index (ball tree, 3 neighbours) fitted on the train rows.
neighbor = NearestNeighbors(n_neighbors=3, algorithm='ball_tree')
neighbor.fit(dfReviews_train)

# Distances and row positions of the 3 nearest train rows for every train
# row; the train rows themselves are the query points here.
distances, indices = neighbor.kneighbors(dfReviews_train)

In [None]:
# Find most related products
# For every test product, look up its two nearest train products and print
# their ids and mean ratings.
for i in range(lentest):
    # kneighbors returns (distances, indices); for a single query row the
    # indices array has one row of train-row positions, nearest first.
    a = neighbor.kneighbors([dfReviews_test[i]])
    related_product_list = a[1]

    # Read the positions directly. Formatting the list as text and parsing it
    # back with int() breaks when NumPy renders scalars as "np.int64(5)".
    first_related_product = int(related_product_list[0][0])
    second_related_product = int(related_product_list[0][1])

    # Test row i is row lentrain + i of df3 because the split is positional.
    print ("Based on product reviews, for ", df3["asin"][lentrain + i] ," average rating is ",df3["overall"][lentrain + i])
    print ("The first similar product is ", df3["asin"][first_related_product] ," average rating is ",df3["overall"][first_related_product])
    print ("The second similar product is ", df3["asin"][second_related_product] ," average rating is ",df3["overall"][second_related_product])
    print ("-----------------------------------------------------------------------------")

In [None]:
# 5-neighbour classifier on the 85% / 15% split.
# Class labels: the mean rating cast to int (astype(int) truncates the
# fractional part), split at the same position as the feature rows.
rating_classes = df3["overall"].astype(int)
df5_train_target = rating_classes[:lentrain]
df5_test_target = rating_classes[lentrain:lentrain+lentest]

# Distance-weighted k-NN classifier on the term counts.
n_neighbors = 5
knnclf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
knnclf.fit(dfReviews_train, df5_train_target)
knnpreds_test = knnclf.predict(dfReviews_test)

print(classification_report(df5_test_target, knnpreds_test))

In [None]:
# Share of test products whose integer rating class was predicted exactly.
print (accuracy_score(df5_test_target, knnpreds_test))

In [None]:
# Mean squared difference between true and predicted integer rating classes.
print(mean_squared_error(df5_test_target, knnpreds_test))

In [None]:
# Nearest-neighbour index using the brute-force algorithm (3 neighbours),
# fitted on the train rows.
neighbor = NearestNeighbors(n_neighbors=3, algorithm='brute')
neighbor.fit(dfReviews_train)

# Distances and row positions of the 3 nearest train rows for every train row.
distances, indices = neighbor.kneighbors(dfReviews_train)

In [None]:
# Using 3 Neighbours
# Class labels: the mean rating cast to int (astype(int) truncates the
# fractional part), split at the same position as the feature rows.
rating_classes = df3["overall"].astype(int)
df5_train_target = rating_classes[:lentrain]
df5_test_target = rating_classes[lentrain:lentrain+lentest]

# NOTE(review): the classifier is created without an algorithm= argument, so
# it uses scikit-learn's default search algorithm -- confirm whether a
# specific one was meant to be compared here.
n_neighbors = 3
knnclf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
knnclf.fit(dfReviews_train, df5_train_target)
knnpreds_test = knnclf.predict(dfReviews_test)

print(classification_report(df5_test_target, knnpreds_test))
test_accuracy = accuracy_score(df5_test_target, knnpreds_test)
print ("Accuracy: ",test_accuracy)
test_mse = mean_squared_error(df5_test_target, knnpreds_test)
print("MSE: ",test_mse)

In [None]:
# Nearest-neighbour index using a k-d tree (5 neighbours), fitted on the
# train rows, followed by the 5 nearest train rows for every train row.
neighbor = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
neighbor.fit(dfReviews_train)
distances, indices = neighbor.kneighbors(dfReviews_train)

In [None]:
# Using 5 Neighbours
# Class labels: the mean rating cast to int (astype(int) truncates the
# fractional part), split at the same position as the feature rows.
rating_classes = df3["overall"].astype(int)
df5_train_target = rating_classes[:lentrain]
df5_test_target = rating_classes[lentrain:lentrain+lentest]

# NOTE(review): the classifier is created without an algorithm= argument, so
# it uses scikit-learn's default search algorithm -- confirm whether a
# specific one was meant to be compared here.
n_neighbors = 5
knnclf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
knnclf.fit(dfReviews_train, df5_train_target)
knnpreds_test = knnclf.predict(dfReviews_test)

print(classification_report(df5_test_target, knnpreds_test))
test_accuracy = accuracy_score(df5_test_target, knnpreds_test)
print ("Accuracy: ",test_accuracy)
test_mse = mean_squared_error(df5_test_target, knnpreds_test)
print("MSE: ",test_mse)

## Wordcloud

In [None]:
# Group all review summaries by rating value ('overall'): one row per rating
# holding the list of its summaries.
cluster = reviews_music_df.groupby("overall")["summary"].apply(list).to_frame()
# Round-trip through CSV: when read back, each list arrives as one text
# value, which is what cleanReviews expects.
cluster.to_csv("cluster.csv")
cluster1 = pd.read_csv("cluster.csv")
cleaned_summaries = cluster1["summary"].apply(cleanReviews)
cluster1["summaryClean"] = cleaned_summaries

In [None]:
# Stop words excluded from every word cloud.
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    """Render ``str(data)`` as a word cloud and display it.

    Parameters
    ----------
    data : object
        Text source; it is converted with ``str()`` before the cloud is built.
    title : str, optional
        Figure title; when empty or None no title is drawn.
    """
    cloud_builder = WordCloud(
        background_color='white',
        stopwords=stopwords,
        random_state=1,  # fixed seed passed to WordCloud
    )
    rendered_cloud = cloud_builder.generate(str(data))

    figure = plt.figure(1, figsize=(8, 8))
    plt.axis('off')
    if title:
        figure.suptitle(title, fontsize=20)
        figure.subplots_adjust(top=2.3)

    plt.imshow(rendered_cloud)
    plt.show()

In [None]:
# Word cloud for row 0 of cluster1 -- presumably the lowest rating
# (overall == 1); verify against cluster1["overall"].
show_wordcloud(cluster1["summaryClean"][0], title = "Review Score One")

In [None]:
# Word cloud for row 1 of cluster1 -- presumably overall == 2; verify against
# cluster1["overall"].
show_wordcloud(cluster1["summaryClean"][1] , title = "Review Score Two")

In [None]:
# Word cloud for row 2 of cluster1 -- presumably overall == 3; verify against
# cluster1["overall"].
show_wordcloud(cluster1["summaryClean"][2], title = "Review Score Three")

In [None]:
# Word cloud for row 3 of cluster1 -- presumably overall == 4; verify against
# cluster1["overall"].
show_wordcloud(cluster1["summaryClean"][3], title = "Review Score Four")

In [None]:
# Word cloud for row 4 of cluster1 -- presumably overall == 5; verify against
# cluster1["overall"].
show_wordcloud(cluster1["summaryClean"][4], title = "Review Score Five")