In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rake_nltk import Rake
import pandas as pd
import numpy as np
import nltk
import pandasql as ps
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
bookdata=pd.read_csv('Cleaned datasets/BOOKMASTER.csv')

In [3]:
bookdata.head()

Unnamed: 0,BookID,GENRE,AUTHORDESC,AUTHOR,BOOKTITLE,USERRATINGS,Popularity,PAGES,published_year,SUMMARY
0,114530,Fiction,"Mark Haddon is a British novelist and poet, be...",Mark Haddon,The Curious Incident of the Dog in the Night-T...,3.87,15417.96852,226,2004,Christopher John Francis Boone knows all the c...
1,133131,Science Fiction,Vernor Steffen Vinge is a retired San Diego St...,Vernor Vinge,Rainbows End by Vernor Vinge,3.76,218.146718,381,2007,Robert Gu is a recovering Alzheimer's patient....
2,153927,Nonfiction,ROBERT B. BAER is one of the most accomplished...,Robert B. Baer,See No Evil: The True Story of a Ground Soldie...,3.93,64.196614,320,2003,"In his explosive New York Times bestseller, to..."
3,160262,History,Anthony Everitt is a British academic. He stud...,Anthony Everitt,Augustus: The Life of Rome's First Emperor by ...,4.03,77.324027,327,2006,He found Rome made of clay and left it made of...
4,133451,Mystery,P.J. Parrish is the New York Times bestselling...,P.J. Parrish,"An Unquiet Grave (Louis Kincaid, #7) by P.J. P...",4.12,10.45441,384,2006,Not Every Soul Rest in PeaceIn a remote corner...


In [4]:
len(bookdata)

2329

In [5]:
bookdata.columns

Index(['BookID', 'GENRE', 'AUTHORDESC', 'AUTHOR', 'BOOKTITLE', 'USERRATINGS',
       'Popularity', 'PAGES', 'published_year', 'SUMMARY'],
      dtype='object')

**Create a keywords class to add all the keywords related to a book as a feature**

In [6]:
bookdata['keywords']=''

In [7]:
for index,record in bookdata.iterrows():
    rake=Rake()
    summary=record['SUMMARY']
    rake.extract_keywords_from_text(summary)
    keyword_count=rake.get_ranked_phrases()
    bookdata.at[index,'keywords']=list(keyword_count)

In [8]:
bookdata['keywords'].head()

0    [christopher john francis boone knows, old chr...
1    [digital contextthrough smart contact lenses, ...
2    [top cia operative robert baer paints, nationa...
3    [roman power two thousand years ago laid, two ...
4    [behind rusted iron gates, much louis kincaid ...
Name: keywords, dtype: object

So now we can drop the summary column

In [9]:
bookdata.drop('SUMMARY',inplace=True,axis=1)

In [10]:
bookdata.head()

Unnamed: 0,BookID,GENRE,AUTHORDESC,AUTHOR,BOOKTITLE,USERRATINGS,Popularity,PAGES,published_year,keywords
0,114530,Fiction,"Mark Haddon is a British novelist and poet, be...",Mark Haddon,The Curious Incident of the Dog in the Night-T...,3.87,15417.96852,226,2004,"[christopher john francis boone knows, old chr..."
1,133131,Science Fiction,Vernor Steffen Vinge is a retired San Diego St...,Vernor Vinge,Rainbows End by Vernor Vinge,3.76,218.146718,381,2007,"[digital contextthrough smart contact lenses, ..."
2,153927,Nonfiction,ROBERT B. BAER is one of the most accomplished...,Robert B. Baer,See No Evil: The True Story of a Ground Soldie...,3.93,64.196614,320,2003,"[top cia operative robert baer paints, nationa..."
3,160262,History,Anthony Everitt is a British academic. He stud...,Anthony Everitt,Augustus: The Life of Rome's First Emperor by ...,4.03,77.324027,327,2006,"[roman power two thousand years ago laid, two ..."
4,133451,Mystery,P.J. Parrish is the New York Times bestselling...,P.J. Parrish,"An Unquiet Grave (Louis Kincaid, #7) by P.J. P...",4.12,10.45441,384,2006,"[behind rusted iron gates, much louis kincaid ..."


In [11]:
bookdata.columns

Index(['BookID', 'GENRE', 'AUTHORDESC', 'AUTHOR', 'BOOKTITLE', 'USERRATINGS',
       'Popularity', 'PAGES', 'published_year', 'keywords'],
      dtype='object')

In [12]:
cols=['BookID', 'GENRE', 'BOOKTITLE','AUTHOR','published_year','PAGES','keywords']

In [13]:
bookdata=bookdata[cols]

In [14]:
bookdata.head()

Unnamed: 0,BookID,GENRE,BOOKTITLE,AUTHOR,published_year,PAGES,keywords
0,114530,Fiction,The Curious Incident of the Dog in the Night-T...,Mark Haddon,2004,226,"[christopher john francis boone knows, old chr..."
1,133131,Science Fiction,Rainbows End by Vernor Vinge,Vernor Vinge,2007,381,"[digital contextthrough smart contact lenses, ..."
2,153927,Nonfiction,See No Evil: The True Story of a Ground Soldie...,Robert B. Baer,2003,320,"[top cia operative robert baer paints, nationa..."
3,160262,History,Augustus: The Life of Rome's First Emperor by ...,Anthony Everitt,2006,327,"[roman power two thousand years ago laid, two ..."
4,133451,Mystery,"An Unquiet Grave (Louis Kincaid, #7) by P.J. P...",P.J. Parrish,2006,384,"[behind rusted iron gates, much louis kincaid ..."


In [15]:
b=bookdata.copy()

In [16]:
bookdata.dtypes

BookID             int64
GENRE             object
BOOKTITLE         object
AUTHOR            object
published_year     int64
PAGES              int64
keywords          object
dtype: object

In [17]:
bookdata['published_year'] = bookdata['published_year'].astype(str)
bookdata['PAGES'] = bookdata['PAGES'].astype(str)

In [18]:
bookdata['word_dictionary']=''

Add data of every column to the final word dictionary column

In [19]:
bookdata.dtypes

BookID              int64
GENRE              object
BOOKTITLE          object
AUTHOR             object
published_year     object
PAGES              object
keywords           object
word_dictionary    object
dtype: object

In [20]:
bookdata= bookdata.set_index('BookID')

In [21]:
bookdata.columns

Index(['GENRE', 'BOOKTITLE', 'AUTHOR', 'published_year', 'PAGES', 'keywords',
       'word_dictionary'],
      dtype='object')

In [22]:
columns = bookdata.columns
for index, record in bookdata.iterrows():
    words = ''
    for col in columns:
        if col == 'keywords':
            words = words + ' '.join(record[col])+ ' '
        else:
            words = words + record[col]+ ' '

    bookdata.at[index,'word_dictionary'] = words

In [23]:
bookdata.head()

Unnamed: 0_level_0,GENRE,BOOKTITLE,AUTHOR,published_year,PAGES,keywords,word_dictionary
BookID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
114530,Fiction,The Curious Incident of the Dog in the Night-T...,Mark Haddon,2004,226,"[christopher john francis boone knows, old chr...",Fiction The Curious Incident of the Dog in the...
133131,Science Fiction,Rainbows End by Vernor Vinge,Vernor Vinge,2007,381,"[digital contextthrough smart contact lenses, ...",Science Fiction Rainbows End by Vernor Vinge V...
153927,Nonfiction,See No Evil: The True Story of a Ground Soldie...,Robert B. Baer,2003,320,"[top cia operative robert baer paints, nationa...",Nonfiction See No Evil: The True Story of a Gr...
160262,History,Augustus: The Life of Rome's First Emperor by ...,Anthony Everitt,2006,327,"[roman power two thousand years ago laid, two ...",History Augustus: The Life of Rome's First Emp...
133451,Mystery,"An Unquiet Grave (Louis Kincaid, #7) by P.J. P...",P.J. Parrish,2006,384,"[behind rusted iron gates, much louis kincaid ...","Mystery An Unquiet Grave (Louis Kincaid, #7) b..."


Now we can drop all the other columns since we have its data in the word dictionary

In [24]:
cols=bookdata.columns

In [25]:
cols

Index(['GENRE', 'BOOKTITLE', 'AUTHOR', 'published_year', 'PAGES', 'keywords',
       'word_dictionary'],
      dtype='object')

In [26]:
cols=cols.delete(len(cols)-1)

In [27]:
bookdata.drop(cols,axis=1,inplace=True)

In [28]:
bookdata.head()

Unnamed: 0_level_0,word_dictionary
BookID,Unnamed: 1_level_1
114530,Fiction The Curious Incident of the Dog in the...
133131,Science Fiction Rainbows End by Vernor Vinge V...
153927,Nonfiction See No Evil: The True Story of a Gr...
160262,History Augustus: The Life of Rome's First Emp...
133451,"Mystery An Unquiet Grave (Louis Kincaid, #7) b..."


In [29]:
bookdata.shape

(2329, 1)

In [30]:
bookdata['BookID']= bookdata.index

In [31]:
bookdata=bookdata.reset_index(drop='index')

In [33]:
bookdata=bookdata[['BookID','word_dictionary']]

In [34]:
bookdata.head()

Unnamed: 0,BookID,word_dictionary
0,114530,Fiction The Curious Incident of the Dog in the...
1,133131,Science Fiction Rainbows End by Vernor Vinge V...
2,153927,Nonfiction See No Evil: The True Story of a Gr...
3,160262,History Augustus: The Life of Rome's First Emp...
4,133451,"Mystery An Unquiet Grave (Louis Kincaid, #7) b..."


In [35]:
b.head()

Unnamed: 0,BookID,GENRE,BOOKTITLE,AUTHOR,published_year,PAGES,keywords
0,114530,Fiction,The Curious Incident of the Dog in the Night-T...,Mark Haddon,2004,226,"[christopher john francis boone knows, old chr..."
1,133131,Science Fiction,Rainbows End by Vernor Vinge,Vernor Vinge,2007,381,"[digital contextthrough smart contact lenses, ..."
2,153927,Nonfiction,See No Evil: The True Story of a Ground Soldie...,Robert B. Baer,2003,320,"[top cia operative robert baer paints, nationa..."
3,160262,History,Augustus: The Life of Rome's First Emperor by ...,Anthony Everitt,2006,327,"[roman power two thousand years ago laid, two ..."
4,133451,Mystery,"An Unquiet Grave (Louis Kincaid, #7) by P.J. P...",P.J. Parrish,2006,384,"[behind rusted iron gates, much louis kincaid ..."


In [36]:
b['CombinedFeature']=b['BOOKTITLE']+' '+b['GENRE']

In [37]:
b.head()

Unnamed: 0,BookID,GENRE,BOOKTITLE,AUTHOR,published_year,PAGES,keywords,CombinedFeature
0,114530,Fiction,The Curious Incident of the Dog in the Night-T...,Mark Haddon,2004,226,"[christopher john francis boone knows, old chr...",The Curious Incident of the Dog in the Night-T...
1,133131,Science Fiction,Rainbows End by Vernor Vinge,Vernor Vinge,2007,381,"[digital contextthrough smart contact lenses, ...",Rainbows End by Vernor Vinge Science Fiction
2,153927,Nonfiction,See No Evil: The True Story of a Ground Soldie...,Robert B. Baer,2003,320,"[top cia operative robert baer paints, nationa...",See No Evil: The True Story of a Ground Soldie...
3,160262,History,Augustus: The Life of Rome's First Emperor by ...,Anthony Everitt,2006,327,"[roman power two thousand years ago laid, two ...",Augustus: The Life of Rome's First Emperor by ...
4,133451,Mystery,"An Unquiet Grave (Louis Kincaid, #7) by P.J. P...",P.J. Parrish,2006,384,"[behind rusted iron gates, much louis kincaid ...","An Unquiet Grave (Louis Kincaid, #7) by P.J. P..."


### Vectorize the data in the word dictionary feature column  

In [58]:
cm=CountVectorizer(analyzer='word',stop_words='english').fit_transform(bookdata['word_dictionary'])

In [59]:
cm

<2329x36262 sparse matrix of type '<class 'numpy.int64'>'
	with 186114 stored elements in Compressed Sparse Row format>

In [60]:
cs=cosine_similarity(cm)

In [61]:
print(cs)

[[1.         0.11914264 0.02407661 ... 0.05692384 0.02359938 0.        ]
 [0.11914264 1.         0.09850604 ... 0.07991524 0.03313111 0.04603415]
 [0.02407661 0.09850604 1.         ... 0.05075542 0.04208417 0.02894169]
 ...
 [0.05692384 0.07991524 0.05075542 ... 1.         0.02261335 0.00814598]
 [0.02359938 0.03313111 0.04208417 ... 0.02261335 1.         0.00675429]
 [0.         0.04603415 0.02894169 ... 0.00814598 0.00675429 1.        ]]


In [62]:
import json
fileName='Cleaned datasets/fin.json'
purchaseData = json.loads(open(fileName).read())

In [63]:
def recommendations(titles):
    scores=[]
    recommendedBooks=[]
    arr=np.array(titles)
    count=len(titles)
    for title in titles:
        index=bookdata.index[b.CombinedFeature== title[1]]
        try:
            index=index.values[0]
        except:
            pass
        scores+=list(enumerate(cs[index]))
    sorted_scores=sorted(scores,key=lambda x:x[1],reverse=True)   
    sorted_scores=sorted_scores[count:count+10]
    for idx in sorted_scores:
        book=bookdata['BookID'].iloc[idx[0]]
        accuracy=round(idx[1],2)
#         val=f"{book} with ({round(idx[1],2)}%) match to your taste"
        if book not in recommendedBooks and book not in arr[:,1] :
             recommendedBooks.append(book)
             
#             recommendedBooks.append((book,accuracy))
    return recommendedBooks

In [64]:
def showRecommendations(userid):
    titles=purchaseData[userid]
    if len(titles)>0:
        bookids=recommendations(titles)
    reclen=len(bookids)
    if reclen<10:
        bookids.extend(Top10Default[:10-reclen])
    return bookids

### Get Default recommendations

In [65]:
books=pd.read_csv('Cleaned datasets/BOOKMASTER.csv')

In [66]:
books.head()

Unnamed: 0,BookID,GENRE,AUTHORDESC,AUTHOR,BOOKTITLE,USERRATINGS,Popularity,PAGES,published_year,SUMMARY
0,114530,Fiction,"Mark Haddon is a British novelist and poet, be...",Mark Haddon,The Curious Incident of the Dog in the Night-T...,3.87,15417.96852,226,2004,Christopher John Francis Boone knows all the c...
1,133131,Science Fiction,Vernor Steffen Vinge is a retired San Diego St...,Vernor Vinge,Rainbows End by Vernor Vinge,3.76,218.146718,381,2007,Robert Gu is a recovering Alzheimer's patient....
2,153927,Nonfiction,ROBERT B. BAER is one of the most accomplished...,Robert B. Baer,See No Evil: The True Story of a Ground Soldie...,3.93,64.196614,320,2003,"In his explosive New York Times bestseller, to..."
3,160262,History,Anthony Everitt is a British academic. He stud...,Anthony Everitt,Augustus: The Life of Rome's First Emperor by ...,4.03,77.324027,327,2006,He found Rome made of clay and left it made of...
4,133451,Mystery,P.J. Parrish is the New York Times bestselling...,P.J. Parrish,"An Unquiet Grave (Louis Kincaid, #7) by P.J. P...",4.12,10.45441,384,2006,Not Every Soul Rest in PeaceIn a remote corner...


In [129]:
q='select BookID from books where USERRATINGS>4 and Popularity>29000 order by Popularity desc'

In [130]:
ids=ps.sqldf(q)

In [131]:
ids

Unnamed: 0,BookID
0,119228
1,150279
2,171049
3,174056
4,156940
5,149888
6,141668
7,149428
8,168295
9,101133


In [132]:
Top10Default=list(ids['BookID'])

### Generate Recommendations for all the users

In [71]:
users=purchaseData.keys()

In [72]:
users=list(users)

In [73]:
def generateRecs(users):
    recs={}
    for user in users:
        _recs=','.join(map(str, showRecommendations(user))) 
        recs[user]=_recs
    return recs
    

In [74]:
recs=generateRecs(users)

**recs is the dictionary having user and the recommendations as key-value pairs**

In [76]:
recommendations=pd.DataFrame(recs,index=['Recommendations'])

In [77]:
recommendations=recommendations.T

In [78]:
recommendations

Unnamed: 0,Recommendations
608502,"107517,135185,178940,158208,103259,128314,1635..."
641650,"135185,179241,177188,167731,145619,114643,1032..."
619531,"145345,129371,141618,110105,169704,171644,1217..."
628661,"129769,105425,172969,118410,119688,118204,1427..."
622287,"100899,160887,164329,144974,139800,152007,1579..."
...,...
649637,"129944,161430,107050,100883,138442,156315,1046..."
669681,"157022,167798,119688,156906,132173,132764,1360..."
619346,"114660,118406,158364,157955,160350,120912,1341..."
630708,"143050,100327,130931,121497,140739,160363,1548..."


In [79]:
recommendations['UserID']=recommendations.index

In [80]:
recommendations=recommendations[['UserID','Recommendations']]

In [81]:
recommendations

Unnamed: 0,UserID,Recommendations
608502,608502,"107517,135185,178940,158208,103259,128314,1635..."
641650,641650,"135185,179241,177188,167731,145619,114643,1032..."
619531,619531,"145345,129371,141618,110105,169704,171644,1217..."
628661,628661,"129769,105425,172969,118410,119688,118204,1427..."
622287,622287,"100899,160887,164329,144974,139800,152007,1579..."
...,...,...
649637,649637,"129944,161430,107050,100883,138442,156315,1046..."
669681,669681,"157022,167798,119688,156906,132173,132764,1360..."
619346,619346,"114660,118406,158364,157955,160350,120912,1341..."
630708,630708,"143050,100327,130931,121497,140739,160363,1548..."


In [82]:
recommendations=recommendations.reset_index(drop='index')

In [83]:
recommendations

Unnamed: 0,UserID,Recommendations
0,608502,"107517,135185,178940,158208,103259,128314,1635..."
1,641650,"135185,179241,177188,167731,145619,114643,1032..."
2,619531,"145345,129371,141618,110105,169704,171644,1217..."
3,628661,"129769,105425,172969,118410,119688,118204,1427..."
4,622287,"100899,160887,164329,144974,139800,152007,1579..."
...,...,...
7494,649637,"129944,161430,107050,100883,138442,156315,1046..."
7495,669681,"157022,167798,119688,156906,132173,132764,1360..."
7496,619346,"114660,118406,158364,157955,160350,120912,1341..."
7497,630708,"143050,100327,130931,121497,140739,160363,1548..."


In [84]:
recommendations[['1','2','3','4','5','6','7','8','9','10']] = recommendations['Recommendations'].str.split(',',expand=True)

In [85]:
recommendations.drop('Recommendations',axis=1,inplace=True)

In [86]:
recommendations.head()

Unnamed: 0,UserID,1,2,3,4,5,6,7,8,9,10
0,608502,107517,135185,178940,158208,103259,128314,163566,114334,159439,135898
1,641650,135185,179241,177188,167731,145619,114643,103259,179744,171027,163566
2,619531,145345,129371,141618,110105,169704,171644,121705,170644,159600,126066
3,628661,129769,105425,172969,118410,119688,118204,142785,166469,104554,174897
4,622287,100899,160887,164329,144974,139800,152007,157955,178793,109725,144902


**In the above result the recommendations dataframe consists of the user id and 10 book recommendations to the user**

#### A method to find the recommendations for a particular user

In [87]:
def findRecs(userid):
    try:
        recs=recommendations[recommendations['UserID']==str(userid)][['1','2','3','4','5','6','7','8','9','10']]
        recommendedBooks=[]
        for i in range(1,11):
            _i=recs[f'{i}']
            recommendedBooks.append(_i[0])
    except:
        return Top10Default
    return recommendedBooks

In [88]:
ls=findRecs(608502)

In [89]:
ls

['107517',
 '135185',
 '178940',
 '158208',
 '103259',
 '128314',
 '163566',
 '114334',
 '159439',
 '135898']

## Final submission data for content based model

In [90]:
users=pd.read_csv("raw datasets/SAMPLESUBMISSION.CSV")

In [91]:
users

Unnamed: 0,USERID,PURCHASEDBOOKID
0,600003,
1,600008,
2,600011,
3,600017,
4,600020,
...,...,...
14995,674975,
14996,674984,
14997,674986,
14998,674992,


When we observe this, we can find that the user count is 15000, but the we got only 7500 user recommendations only

So lets find the missing userids and recommend them the default top10 books

In [92]:
userids=list(map(str,users['USERID']))

In [93]:
purchasedf=pd.read_csv('Cleaned datasets/purchaseHistory.csv')

In [94]:
purchasedf

Unnamed: 0,BookID,UserID,SUBSTATE
0,140361,608502,Santa Clara
1,118603,673204,San Diego
2,170523,641650,Oklahoma
3,111924,619531,Orlando
4,178056,628661,Orange
...,...,...,...
43067,178653,645595,Alabama
43068,168913,651099,New York
43069,156125,663991,Los Angeles
43070,157778,635431,San Bernardino


In [95]:
purchasedf['UserID']=purchasedf['UserID'].astype(str)

In [96]:
purchasedf.describe(include='object')

Unnamed: 0,UserID,SUBSTATE
count,43072,43072
unique,10877,55
top,601424,San Diego
freq,52,6232


we can observe that we have 10877 unique user's purchase history, so the remaining users data is not available

In [97]:
usersPurchased=purchasedf['UserID'].unique()

In [101]:
recc=recommendations['UserID']

**A method to find the missing users in the recommendations**

In [98]:
def diff(l1,l2):
    return list(set(l1)-set(l2))+list(set(l2)-set(l1))

In [114]:
missing_users=diff(usersPurchased,userids)

In [110]:
len(missing_users)

4123

In [113]:
len()

3378

In [115]:
missing_users.extend(diff(recc,usersPurchased))

In [116]:
len(missing_users)

7501

Check for fault in the calculation

In [118]:
A1=[1,2,3]
A2=[4,5,6]
diff(A1,A2)

[1, 2, 3, 4, 5, 6]

- From the above example we can see that we get the concatinated array if the two arrays are mutually exclusive
- So we can use the same technique with the recommended users vs missing users to find if they are mutually exclusive

In [120]:
if not (len(diff(recc,missing_users))-len(recc)==0):
    print('recommended users and missing users are mututally exclusive')
else:
    print('There is fault in the process')

recommended users and missing users are mututally exclusive


In [121]:
missing_users_rec=pd.DataFrame(pd.Series(missing_users),columns=['UserID'])

In [122]:
missing_users_rec

Unnamed: 0,UserID
0,646629
1,620050
2,629077
3,639886
4,668757
...,...
7496,627269
7497,618228
7498,619355
7499,601493


In [133]:
Top10Default

[119228,
 150279,
 171049,
 174056,
 156940,
 149888,
 141668,
 149428,
 168295,
 101133]

In [134]:
Top10DefaultString=','.join(map(str, Top10Default))

In [135]:
Top10DefaultString

'119228,150279,171049,174056,156940,149888,141668,149428,168295,101133'

In [136]:
missing_users_rec['Recommendations']=Top10DefaultString

In [137]:
missing_users_rec.head()

Unnamed: 0,UserID,Recommendations
0,646629,"119228,150279,171049,174056,156940,149888,1416..."
1,620050,"119228,150279,171049,174056,156940,149888,1416..."
2,629077,"119228,150279,171049,174056,156940,149888,1416..."
3,639886,"119228,150279,171049,174056,156940,149888,1416..."
4,668757,"119228,150279,171049,174056,156940,149888,1416..."


In [138]:
missing_users_rec[['1','2','3','4','5','6','7','8','9','10']] = missing_users_rec['Recommendations'].str.split(',',expand=True)

In [139]:
missing_users_rec.drop('Recommendations',axis=1,inplace=True)

In [140]:
missing_users_rec

Unnamed: 0,UserID,1,2,3,4,5,6,7,8,9,10
0,646629,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
1,620050,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
2,629077,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
3,639886,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
4,668757,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
...,...,...,...,...,...,...,...,...,...,...,...
7496,627269,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
7497,618228,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
7498,619355,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
7499,601493,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133


In [141]:
recommendations=recommendations.append(missing_users_rec)

In [142]:
recommendations.reset_index(drop="index",inplace=True)

In [143]:
recommendations.head()

Unnamed: 0,UserID,1,2,3,4,5,6,7,8,9,10
0,608502,107517,135185,178940,158208,103259,128314,163566,114334,159439,135898
1,641650,135185,179241,177188,167731,145619,114643,103259,179744,171027,163566
2,619531,145345,129371,141618,110105,169704,171644,121705,170644,159600,126066
3,628661,129769,105425,172969,118410,119688,118204,142785,166469,104554,174897
4,622287,100899,160887,164329,144974,139800,152007,157955,178793,109725,144902


In [144]:
q='select * from recommendations order by UserID'

In [145]:
recommendations=ps.sqldf(q)

In [146]:
recommendations

Unnamed: 0,UserID,1,2,3,4,5,6,7,8,9,10
0,600003,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
1,600008,139926,124775,173979,137665,117851,173926,169633,153927,107533,144648
2,600011,175781,102067,177642,172597,114660,143722,101912,148729,136986,179020
3,600017,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
4,600020,129177,160284,170462,130931,139926,114112,153115,147460,101974,155183
...,...,...,...,...,...,...,...,...,...,...,...
14995,674975,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
14996,674984,135185,103259,145619,157261,125711,134518,166626,150780,167731,154123
14997,674986,155103,178107,129279,154125,170696,154696,117532,102479,158752,159284
14998,674992,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133


In [147]:
recommendations.describe()

Unnamed: 0,UserID,1,2,3,4,5,6,7,8,9,10
count,15000,15000,15000,15000,15000,15000,15000,15000,15000,15000,15000
unique,15000,747,925,1004,1043,1111,1104,1150,1160,1157,1176
top,600003,119228,150279,171049,174056,156940,149888,141668,149428,168295,101133
freq,1,7544,7513,7502,7503,7502,7502,7505,7504,7505,7505


In [148]:
recommendations.to_csv('Cleaned datasets/SUBMISSION.csv',index=False)

- Advantage
    1. Model doesn’t need data of other users since recommendations are specific to a single user.
    2. It makes it easier to scale to a large number of users.
    3. The model can Capture the specific Interests of the user and can recommend items that very few other users are interested in.
- Disadvantage
    1. Feature representation of items is hand-engineered to some extent, this tech requires a lot of domain knowledge.
    2. The model can only make recommendations based on the existing interest of a user. In other words, the model has limited ability to expand on the user’s existing interests.