# Data Cleaning

In [1]:
import numpy as np
import pandas as pd

def convert_year(in_string):
    '''Returns input as integer if possible, else None'''
    try:
        return int(in_string)
    except:
        return None

def get_country(in_string):
    '''Return the country element from the location.'''
    try:
        return in_string.rsplit(',', 1)[1].strip('.;-')
    except:
        return in_string

def get_province(in_string):
    '''Return the province/state/area element from the location'''
    try:
        return in_string.rsplit(',', 2)[1].strip('.;-')
    except:
        return None

def get_clean_data(path='./data/'):
    '''
    Returns 3 cleaned datasets. Enter the path if the csv files is not under
    \data\ in your system
    :return:
    DataFrame - pandas dataframe of books
    DataFrame - pandas dataframe of users
    DataFrame - pandas dataframe of ratings
    '''
    # skip some lines. Only like 5 of them. Errors likely because there
    # are semicolons in the title and pandas recognizes it as another column
    df_books = pd.read_csv(
        path + "BX-Books.csv", sep=';', encoding="ISO-8859-1", error_bad_lines=False
    )
    df_users = pd.read_csv(path + "BX-Users.csv", sep=';', encoding="ISO-8859-1")
    df_ratings = pd.read_csv(
        path + "BX-Book-Ratings.csv", sep=';', encoding="ISO-8859-1"
    )
    df_books.columns = [
        'isbn', 'title', 'author', 'pub_year', 'publisher', 'url_s', 'url_m',
        'url_l'
    ]
    df_ratings.columns = ['user', 'isbn', 'rating']
    df_users.columns = ['user', 'location', 'age']
    df_books.pub_year = (
        df_books.pub_year.apply(convert_year)
    )
    # Drop the 3 bad rows
    df_books = df_books[~df_books.pub_year.isna()]

    # pub_year 0 most certainly means unknown value or null
    # anything > 2018 don't make sense either
    df_books.pub_year[
        (df_books.pub_year > 2018) | (df_books.pub_year == 0)
        ] = None

    # Age 0 doesnt make sense and is most likely unknown or unrecorded value
    # Age > 122 doesnt make sense either as 122 is the recorded oldest person
    # on earth. (Prolly a lot of those over 100 are errors too but we cant
    # tell)
    df_users.age[(df_users.age == 0) | (df_users.age > 122)] = None
    df_users["country"] = df_users.location.apply(get_country)
    df_users["province"] = df_users.location.apply(get_province)
    return df_books, df_users, df_ratings

In [2]:
df_books, df_users, df_ratings=get_clean_data()
df_books=df_books.drop(['url_s','url_m','url_l'],axis=1)
df_users=df_users.drop(['location','province'],axis=1)
df_users_withage=df_users
df_user_no_age=df_users=df_users.drop(['age'],axis=1)
df_ratings=df_ratings[df_ratings['rating']!=0]
df_ratings=df_ratings.merge(df_books)
df_ratings.columns=['userid','isbn','ratings','title','author','year','publisher']

genre = pd.read_csv('data/isbn_genre.csv')
genre=genre.drop('Unnamed: 0',axis=1)
df_item = df_ratings[['isbn','author']]
df_item.drop_duplicates(inplace=True)
df_item = pd.merge(right=df_item, left=genre,how='left')
df_ratings.sort_values('userid',inplace=True)
df_item.dropna(inplace=True)
df_ratings.dropna(inplace=True)

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  if (yield from self.run_code(code, result)):
A value is trying to be set

# One hot encode book authors as an item feature

In [5]:
from sklearn.preprocessing import OneHotEncoder
encode = OneHotEncoder(sparse=True)
item_features=encode.fit_transform(df_ratings[['author','isbn']])

In [6]:
item_features
#created a sparse matrix of item feature to fit in LightFM model

<378026x208323 sparse matrix of type '<class 'numpy.float64'>'
	with 756052 stored elements in Compressed Sparse Row format>

# Fit Lightfm hybrid model with author

In [8]:
from lightfm.data import Dataset
dataset=Dataset()
dataset.fit(df_ratings.userid.values,df_ratings.isbn.values,item_features = df_item['author'].values)
# fit ratings, book isbn and book features to the model
# 



In [10]:
item_sub = df_ratings[['isbn', 'author']]
item_tuples = [tuple(x) for x in item_sub.values]

user_sub = df_ratings[['userid', 'isbn']]
user_tuples = [tuple(x) for x in user_sub.values]

In [11]:
(interactions, weights) = dataset.build_interactions(user_tuples)
interactions
# build interaction on what item the user rated and the cooresponing item feature

<67071x147100 sparse matrix of type '<class 'numpy.int32'>'
	with 378026 stored elements in COOrdinate format>

In [13]:
from lightfm.cross_validation import random_train_test_split
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=np.random.RandomState(seed=111))

In [14]:
from lightfm import LightFM
# Define a new model instance
model = LightFM(loss='warp',
                no_components=20)

# Fit the hybrid model, remember to pass in item features.
model = model.fit(train,
                item_features=item_features,
                epochs=10,
                num_threads=4)

In [15]:
from lightfm.evaluation import auc_score
# Don't forget the pass in the item features again!
train_auc = auc_score(model,
                      train,
                      item_features=item_features).mean()
print('Hybrid training set AUC: %s' % train_auc)

Hybrid training set AUC: 0.9846635


In [17]:
test_auc = auc_score(model,
                    test,
                    train_interactions=train,
                    item_features=item_features).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid test set AUC: 0.633271


# Recoomedations 

In [None]:
#retriving the mappings of isbn and 
item_ids = list(dataset.mapping()[0].values())

In [None]:
#make predictions, noted the user_ids is not the mapped userid, need to transform back to the original userid
pred = model.predict(user_ids=66557,item_ids=list(dataset.mapping()[2].values()))
l=list(zip(list(dataset.mapping()[2].keys()),list(pred)))

# top 5 recommendation for user 276847

In [120]:
pred_df = pd.DataFrame(l,columns = ['isbn','recommendation_scores'])
pred_df = pred_df.sort_values("recommendation_scores",ascending=False)
pred_df.head()

Unnamed: 0,isbn,recommendation_scores
76486,440192463,0.317054
135578,312272103,0.246908
104677,373226403,0.185529
114205,156013487,0.092207
98574,373122772,0.034806


In [74]:
df_ratings[df_ratings.userid == 276847].head(10)

Unnamed: 0,userid,isbn,ratings,title,author,year,publisher
1194,276847,3551551677,10,Harry Potter und der Stein der Weisen,Joanne K. Rowling,1999.0,Carlsen Verlag GmbH
1182,276847,347354034X,7,Die Welle,Rhue,1998.0,"Ullstein-Taschenbuch-Verlag, Zweigniederlassun..."
1190,276847,3499222213,7,Im Keller.,Jan Philipp Reemtsma,1998.0,Rowohlt Tb.
1191,276847,3499228297,6,"Alte Freunde, neue Feinde. Ein Fall fÃ?Â¼r Ber...",Philip Kerr,2000.0,Rowohlt Tb.
1193,276847,3506464078,10,Le Petit Prince. (FranzÃ?Â¶sische Ausgabe). (L...,Antoine de Saint-Exupery,1981.0,"F. SchÃ?Â¶ningh, Paderborn"
1209,276847,3551551685,10,Harry Potter und die Kammer des Schreckens,Joanne K. Rowling,2000.0,Carlsen Verlag GmbH
1220,276847,3551551693,10,Harry Potter und der Gefangene von Azkaban,J. K. Rowling,1999.0,Carlsen Verlag GmbH
1230,276847,3551551936,10,Harry Potter Und Der Feuerkelch,Joanne K. Rowling,1999.0,Carlsen Verlag GmbH
1244,276847,3608932240,7,Der Herr der Ringe. AnhÃ?Â¤nge und Register.,John Ronald Reuel Tolkien,2000.0,Klett-Cotta
1245,276847,360893541X,10,Die Gefahrten I,J. R. R. Tolkien,2001.0,Distribooks


In [71]:
#output!
top5 = ['0440192463','0312272103','0373226403','0156013487','0373122772']

for x in top5:
    print(df_books[['isbn','title']][df_books.isbn==x])

             isbn        title
60888  0440192463  Valediction
              isbn                                              title
120704  0312272103  McCarthy's Bar: A Journey of Discovery in the ...
             isbn                                              title
61364  0373226403  Hidden Hearts (Hide And Seek) (Harlequin Intri...
              isbn                                              title
196230  0156013487  It's the Little Things: Everyday Interactions ...
             isbn                                      title
31915  0373122772  The Disobedient Mistress  (Sister Brides)
