In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np #Importing the required libraries

In [2]:
#1 A. Reading and merging the dataframes.
#Reading the given data files. Using a different encoding standard, as UTF-8 is producing errors. 
df1 = pd.read_csv('phone_user_review_file_1.csv',encoding = "ISO-8859-1")
df2 = pd.read_csv('phone_user_review_file_2.csv',encoding = "ISO-8859-1")
df3 = pd.read_csv('phone_user_review_file_3.csv',encoding = "ISO-8859-1")
df4 = pd.read_csv('phone_user_review_file_4.csv',encoding = "ISO-8859-1")
df5 = pd.read_csv('phone_user_review_file_5.csv',encoding = "ISO-8859-1")
df6 = pd.read_csv('phone_user_review_file_6.csv',encoding = "ISO-8859-1")

In [3]:
df_final = pd.DataFrame()
df_final = df_final.append(df1)
df_final = df_final.append(df2)
df_final = df_final.append(df3)
df_final = df_final.append(df4)
df_final = df_final.append(df5)
df_final = df_final.append(df6)

In [4]:
#1 B. Exploring and understanding the data.
df_final['source'].value_counts() #We can see that the user reviews from Amazon and Yandex form more than half of the total reviews.

Amazon             728471
Yandex             123066
Ciao                59425
Samsung             45585
MercadoLibre        33531
                    ...  
Comtech                 1
Karsilastir             1
Quelle                  1
Elkj??p                 1
RECENZJE.NET.PL         1
Name: source, Length: 331, dtype: int64

In [5]:
df_final['score'].mean()

8.007059551183373

In [6]:
'''Here we can see that there's a bias in the review score by all of the users,
   in which the users seem to be more inclined towards giving higher ratings to 
   the smartphones rather than giving lower ratings'''

"Here we can see that there's a bias in the review score by all of the users,\n   in which the users seem to be more inclined towards giving higher ratings to \n   the smartphones rather than giving lower ratings"

In [7]:
#1 C. Rounding off the scores to the nearest integers 
df_final['score_rounded'] = np.around(df_final['score'])

In [8]:
df_final=df_final.drop('score',axis=1)

In [9]:
#1 D. Checking for and imputing the missing values in the dataset
df_final.isna().any() #Null values are present in the dataset

phone_url        False
date             False
lang             False
country          False
source           False
domain           False
score_max         True
extract           True
author            True
product           True
score_rounded     True
dtype: bool

In [10]:
df_final.isna().sum()

phone_url            0
date                 0
lang                 0
country              0
source               0
domain               0
score_max        63489
extract          19361
author           63202
product              1
score_rounded    63489
dtype: int64

In [11]:
print("Before dropping null values, the shap eof the dataset was :-")
print(df_final.shape)
print("After dropping null values, the shape of the dataset becomes :-")
print(df_final.dropna().shape)

Before dropping null values, the shap eof the dataset was :-
(1415133, 11)
After dropping null values, the shape of the dataset becomes :-
(1275917, 11)


In [12]:
#We can see that we're losing ~10% of the data after dropping the null values altogether.
#Thus, in order to prevent this data loss, we've to impute the missing values wherever possible.

In [13]:
#First, keeping only the features which are relevant to the preparation of recommendation system algorithms, such as author, product and score_rounded.
df_final1 = df_final[['author','product','score_rounded']].reset_index().drop('index',axis=1)
print(df_final1.isna().sum())

author           63202
product              1
score_rounded    63489
dtype: int64


In [14]:
#Imputing the null values in the column 'score_rounded' with the help of median value of the column.
df_final1['score_rounded'] = df_final1['score_rounded'].fillna(df_final['score_rounded'].median())
df_final1.isna().sum()

author           63202
product              1
score_rounded        0
dtype: int64

In [15]:
#Null values in the columns 'author' and 'product' can't be imputed, thus they've to be dropped.
df_final1 = df_final1.dropna()
print(df_final1.isna().sum())
print(df_final1.shape) #No null values are now remaining in the dataset. After dropping the null values, only ~5% of the data was lost.

author           0
product          0
score_rounded    0
dtype: int64
(1351931, 3)


In [16]:
#1 E. Checking for duplicate values in the dataset.
sum(df_final1.duplicated()) #Lots of duplicate records are present in the dataset.

137609

In [17]:
#Removing the duplicated values.
df_final1 = df_final1.drop_duplicates()

In [18]:
sum(df_final1.duplicated()) #No duplicate records are now present in the dataset.

0

In [19]:
#Let's check if a single user has given multiple ratings to the same smartphone model
df_final1[['author','product']].duplicated().value_counts() #We can see that lots of users have given multiple reviews to the same smartphone model

False    1187568
True       26754
dtype: int64

In [20]:
#Let's take mean of all of the ratings given by a single user to the single smartphone.
df_final1=df_final1.groupby(by=['author','product']).mean().reset_index()

In [21]:
#1 F. Keeping only 1 million datapoints with random state = 612.
df_final2 = df_final1.sample(n=1000000,random_state=612).reset_index().drop('index',axis=1)
df_final2.shape

(1000000, 3)

In [22]:
#2 A. Identifying the most rated features (smartphone models)
df_final2['product'].value_counts()

OnePlus 3 (Graphite, 64 GB)                                                                                                   1794
Lenovo Vibe K4 Note (White,16GB)                                                                                              1720
Nokia N95                                                                                                                     1529
Lenovo Vibe K4 Note (Black, 16GB)                                                                                             1486
Samsung Galaxy J3 (8GB)                                                                                                       1482
                                                                                                                              ... 
HTC Amaze 4G (Black)                                                                                                             1
Apple iPhone 6S (128GB, space gray)                                                

In [23]:
#We can see that the smartphone models such as OnePlus 3 and Lenovo Vibe K4 Note have the highest number of reviews.
#Let's see which smartphones have the highest average ratings in the dataset.
df_final2[['product','score_rounded']].groupby(by='product').mean().sort_values(by='score_rounded',ascending=False)

Unnamed: 0_level_0,score_rounded
product,Unnamed: 1_level_1
SAMSUNG Galaxy S6 Edge - Or - 4G+ - 32 Go - Smartphone,10.0
Samsung S8000 Jet Zwart,10.0
LG Optimus 4X HD Zwart,10.0
Samsung S7710 Galaxy SF Xcover 2 titan-grey,10.0
LG Optimus 4X HD (unlocked),10.0
...,...
Samsung Craft SCH-r900 - black (MetroPCS),1.0
Samsung Illusion SCH-I110 (Verizon Wireless),1.0
LG EnV3,1.0
Lenovo Motorola Droid Pro (Verizon Wireless),1.0


In [24]:
#Smartphones with makes such as LG and Samsung are highest rated in this dataset.

In [25]:
#2 B. Identifying the users with most number of reviews.
df_final2_author_count=df_final2[['author','score_rounded']].groupby(by='author').count().sort_values(by='score_rounded',ascending=False)
df_final2_author_count

Unnamed: 0_level_0,score_rounded
author,Unnamed: 1_level_1
Amazon Customer,6998
Cliente Amazon,2616
Anonymous,1038
Client d'Amazon,996
Amazon Kunde,873
...,...
Luis Toledo,1
Luis Toribio,1
Luis Tovar,1
"Luis Turtle ""Luis Turtle""",1


In [26]:
#From this statistic we can see that the anonymous Amazon customers from different countries are the topmost reviewers in this dataset.

In [27]:
#2 C.Selecting the data with products having more than 50 ratings and users who have given more than 50 ratings
#Let's find out the authors who have given more than 50 ratings.
df_final2_authors = df_final2_author_count[df_final2_author_count['score_rounded']>50].drop('score_rounded',axis=1).reset_index()
author_list = df_final2_authors['author'].to_list()

In [28]:
#Let's find out the products with more than 50 ratings.
df_final2_product_count=df_final2[['product','score_rounded']].groupby(by='product').count().sort_values(by='score_rounded',ascending=False)
df_final2_products = df_final2_product_count[df_final2_product_count['score_rounded']>50].drop('score_rounded',axis=1).reset_index()
product_list = df_final2_products['product'].to_list()

In [29]:
df_final_f = df_final2[df_final2['author'].isin(author_list) & df_final2['product'].isin(product_list)].reset_index().drop('index',axis=1)

In [30]:
df_final_f.shape #Shape of the final dataframe

(37987, 3)

In [31]:
#3. Building a popularity based recommendation system and recommending the top 5 smartphone models.
df_final_f_avg_score = df_final_f[['product','score_rounded']].groupby(by = 'product').mean()
#df_final_f[['product','score_rounded']].groupby(by = 'product').count()
df_final_f_avg_score['count'] = df_final_f[['product','score_rounded']].groupby(by = 'product').count()
df_final_f_avg_score.sort_values(by=['score_rounded','count'],ascending=False).head() #Recommending the top 5 highest rated and most popular smartphone models.

Unnamed: 0_level_0,score_rounded,count
product,Unnamed: 1_level_1,Unnamed: 2_level_1
Samsung Galaxy Note5,10.0,28
"Lenovo Motorola RAZR HD Smartphone (11,9 cm (4,7 Zoll) AMOLED, Dual-Core, 1,5GHz, 1GB RAM, 8 Megapixel Kamera, Android 4.0) schwarz",10.0,8
Motorola Defy,10.0,8
Samsung N7100 Galaxy Note II (16Gb),10.0,8
Apple iPhone 4S 32GB negro,10.0,7


In [32]:
#4. Building a collaborative recommendation system using SVD and KNN with means with the help of surprise library
#Building the rating matrix
rating_matrix = df_final_f.pivot(index = 'author', columns ='product', values = 'score_rounded')

In [33]:
nonzero = np.count_nonzero(np.isnan(rating_matrix))
maxvals = rating_matrix.shape[0]*rating_matrix.shape[1]
print('Total non zero values in the matrix : ',nonzero)
print('Total possible values in the matrix : ',maxvals)
print('Matrix density : ',(nonzero/maxvals)*100,'%') #Highly dense matrix.

Total non zero values in the matrix :  2753152
Total possible values in the matrix :  2791139
Matrix density :  98.63901439519852 %


In [34]:
from surprise import Dataset,Reader
from surprise.model_selection import train_test_split
from surprise import SVD, KNNWithMeans
from surprise import accuracy
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_final_f[['author', 'product', 'score_rounded']], reader)
#Loading the dataset into the surprise library supported format

In [35]:
trainset, testset = train_test_split(data, test_size=.25,random_state=123) #Splitting the dataset into training and test sets.

In [36]:
svd_model = SVD(n_factors=5,biased=False) #Training the SVD model using training set
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2c01443b370>

In [37]:
test_pred = svd_model.test(testset)

In [38]:
#Calculating root mean squared error for SVD model.
accuracy.rmse(test_pred)

RMSE: 2.6295


2.6294610716123237

In [39]:
#Training a KNNWithMeans Item-Item based collaborative recommendation system
algo_i = KNNWithMeans(k=10, sim_options={ 'user_based': False}) #10 neighbours
algo_i.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2c01443b9d0>

In [40]:
test_pred=algo_i.test(testset)
print(accuracy.rmse(test_pred))

RMSE: 2.5672
2.5671767536128045


In [41]:
#Inference :-
#We can see that KNNWithMeans gave us better results than SVD, as it came up with lesser test RMSE than the previous one.
#We should try cross validation for lesser RMSE.

In [42]:
#6. Average ratings (actual and predicted) for all smartphones
pd.DataFrame(test_pred).drop(['uid','details'],axis=1).groupby('iid').mean()

Unnamed: 0_level_0,r_ui,est
iid,Unnamed: 1_level_1,Unnamed: 2_level_1
5.5-Inch Unlocked Lenovo A850 3G Smartphone-(960x540) Quad Core 4GB MT6582m 1331MHz Android 4.2 Dual Camera +Dual SIM -Black (Rooted + Google Play),4.000000,6.646559
"AICEK Coque ASUS ZenFone 3 Max ZC520TL, AICEK Etui Silicone Gel ASUS ZenFone 3 Max Housse Antichoc ZenFone 3 Max Transparente Souple Coque de Protection pour ASUS ZenFone 3 Max(5.2 Pouces)",10.000000,7.563131
"AICEK Coque Samsung Galaxy A3 2016, AICEK Etui Silicone Gel Samsung Galaxy A3 2016 (A310F) Housse Antichoc Samsung A3 Transparente Souple Coque De Protection Pour Samsung Galaxy A3 2016 (4,7 pouces)",6.000000,8.502999
"AICEK Coque Samsung Galaxy J3 2016, AICEK Etui Silicone Gel Samsung Galaxy J3 2016 (J320F) Housse Antichoc Samsung J3 Transparente Souple Coque De Protection Pour Samsung Galaxy J3 2016 (5,0 Pouces)",10.000000,7.288846
"APPLE iPhone 6s Plus - 128 GB, Rose Gold",9.545455,9.431761
...,...,...
ÐÐ¾Ð±Ð¸Ð»ÑÐ½ÑÐ¹ ÑÐµÐ»ÐµÑÐ¾Ð½ Lenovo A1000 White (PA1R0019UA),8.750000,7.782227
ÐÐ¾Ð±Ð¸Ð»ÑÐ½ÑÐ¹ ÑÐµÐ»ÐµÑÐ¾Ð½ Samsung J700H/DS Galaxy J7 Duos White (SM-J700HZWDSEK),9.600000,9.576623
Ð¡Ð¾ÑÐ¾Ð²ÑÐ¹ ÑÐµÐ»ÐµÑÐ¾Ð½ ASUS Zenfone 2 Laser,10.000000,8.689501
Ð¡Ð¾ÑÐ¾Ð²ÑÐ¹ ÑÐµÐ»ÐµÑÐ¾Ð½ Asus ZenFone 2,9.000000,9.513736


In [43]:
#8. Predicting top 5 results for a user, say 'Nina'.
pred = pd.DataFrame(test_pred)
pred[pred['uid'] == 'Nina'][['iid', 'r_ui','est']].sort_values(by = 'r_ui',ascending = False).head(5)

Unnamed: 0,iid,r_ui,est
1,Samsung âSamsung Galaxy A3 Smartphone (12 cm...,10.0,7.570398
49,"Samsung Galaxy Ace Plus S7500 Smartphone (9,3 ...",10.0,7.496931
778,Nokia Lumia 820 8GB GSM 4G LTE Windows 8 Smart...,10.0,9.510744
7976,"Sony Xperia Z1 Compact Smartphone (4,3 Zoll (1...",10.0,6.647608
8353,"APPLE iPhone 7 - Silver, 128 GB",10.0,7.680066


In [44]:
#9. Trying cross validation
from surprise.model_selection import cross_validate
dict1 = cross_validate(algo_i, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.5587  2.5371  2.5986  2.5178  2.5499  2.5524  0.0269  
MAE (testset)     1.9062  1.8923  1.9306  1.8723  1.9123  1.9027  0.0196  
Fit time          0.98    0.91    0.93    1.02    1.04    0.98    0.05    
Test time         1.16    1.29    1.17    1.28    1.17    1.21    0.06    


In [45]:
dict1 #Test data results can be seen here.

{'test_rmse': array([2.55867398, 2.53709758, 2.59861332, 2.51784648, 2.54992589]),
 'test_mae': array([1.90619635, 1.89231041, 1.93062834, 1.87230087, 1.9123128 ]),
 'fit_time': (0.979832649230957,
  0.9111647605895996,
  0.9300198554992676,
  1.0159988403320312,
  1.038663387298584),
 'test_time': (1.1560840606689453,
  1.2864670753479004,
  1.1717016696929932,
  1.2776610851287842,
  1.1712894439697266)}

In [46]:
dict1['test_rmse'].mean() #Mean RMSE for test data with 5 folds CV.

2.552431450162284

In [47]:
#We can see that this RMSE is even lesser than the same model without CV, from which we can infer that CV has improved the results.

In [48]:
'''10. We use popularity based recommendation systems whenever a new user has been introduced in the system.
       In other words, whenever there's no previous record of a user, it is impossible to predict what he/she would like, as there's
       no data of that person. In that scenario, an assumption is taken into consideration that this user should have the taste similar
       to the taste of other users with same basic characteristics, for example same geographical location.
       Thus whatever the other users like on an average, the same things are recommended to this new user.
       For example, a new streamer is added on Netflix in India, then he/she would be recommended the content which is famous in India, say,
       the Indian web series Sacred Games.'''

"10. We use popularity based recommendation systems whenever a new user has been introduced in the system.\n       In other words, whenever there's no previous record of a user, it is impossible to predict what he/she would like, as there's\n       no data of that person. In that scenario, an assumption is taken into consideration that this user should have the taste similar\n       to the taste of other users with same basic characteristics, for example same geographical location.\n       Thus whatever the other users like on an average, the same things are recommended to this new user.\n       For example, a new streamer is added on Netflix in India, then he/she would be recommended the content which is famous in India, say,\n       the Indian web series Sacred Games."

In [49]:
'''11. Classification based system is used in such a way that the data of both of a specific user and a specific product is
       clubbed together and a conventional classification based model is trained on that data in order to predict whether the
       user should be recommended that product or not.
       A particular problem with this system is that if there are 1000 users in the system and if there are 1000 products as well,
       then hundreds of thousands of models will be needed to be trained in order to build a CF based model for every possible combination
       of a user and a product.
       Thus it can be used wherever there are very low number of users and products in the system.'''

'11. Classification based system is used in such a way that the data of both of a specific user and a specific product is\n       clubbed together and a conventional classification based model is trained on that data in order to predict whether the\n       user should be recommended that product or not.\n       A particular problem with this system is that if there are 1000 users in the system and if there are 1000 products as well,\n       then hundreds of thousands of models will be needed to be trained in order to build a CF based model for every possible combination\n       of a user and a product.\n       Thus it can be used wherever there are very low number of users and products in the system.'

In [50]:
'''Methods to further improve the recommendation to different user :-
   1. There are several hyperparameters present in all of these algorithms. In order to further optimize the model performances,
      hyperparameter tuning can be done.
   2. After data cleanup, the data with more than 1 million entries was shrinked to only ~38 thousand entries. Larger dataset with
      cleaner data might have improved the results.'''

'Methods to further improve the recommendation to different user :-\n   1. There are several hyperparameters present in all of these algorithms. In order to further optimize the model performances,\n      hyperparameter tuning can be done.\n   2. After data cleanup, the data with more than 1 million entries was shrinked to only ~38 thousand entries. Larger dataset with\n      cleaner data might have improved the results.'