In [1]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

### Loading Files

In [None]:
# Read the raw inputs: book metadata, user metadata and the train/test ratings.
# The ratings files have no header row, so column names are supplied here.
# NOTE(review): the books file is parsed with sep=';' while the users file uses
# the default ',' separator -- confirm that BX-Users.csv really is comma-separated.
rating_columns = ['UserID', 'BookID', 'Rating']

books = pd.read_csv('./data/BX-Books.csv', header=0,
                    sep=';', quotechar='"', escapechar='\\')
users = pd.read_csv('./data/BX-Users.csv')
ratings_train = pd.read_csv('./data/BX_train.csv', header=None, names=rating_columns)
ratings_test = pd.read_csv('./data/BX_test.csv', header=None, names=rating_columns)

Rebuilding train/test

In [None]:
# Split the test file on the Rating value 55: rows equal to 55 become the
# submission set, every other test row is folded back into the explicit
# training data.
# NOTE(review): 55 is presumably a placeholder marking ratings to predict -- confirm.
is_submission_row = ratings_test.Rating == 55
test_submission = ratings_test[is_submission_row]
tmp = ratings_test[~is_submission_row]

# Rating == 0 rows are kept apart as implicit feedback; non-zero rows are explicit.
train_explicit = ratings_train[ratings_train.Rating != 0]
# pd.concat replaces DataFrame.append, which was deprecated and later removed
# from pandas; the defaults (original index kept, no sorting) match append's.
train_explicit = pd.concat([train_explicit, tmp])
train_implicit = ratings_train[ratings_train.Rating == 0]
#pickle.dump(train_explicit, open('./data/input_1.pcl', 'wb'))
#pickle.dump(test_submission, open('./data/submission.pcl', 'wb'))

Loading saved inputs (run this cell to skip the "Converting implicit feedbacks to ratings" section).

In [2]:
# Reload the frames saved by earlier runs of this notebook.
# SECURITY NOTE: pickle.load can execute arbitrary code -- only load files that
# this notebook (or another trusted source) produced.
# `with` closes each file handle as soon as the object has been read.
with open('./data/input_1.pcl', 'rb') as pcl_file:
    train_explicit = pickle.load(pcl_file)
with open('./data/input_2.pcl', 'rb') as pcl_file:
    train_all = pickle.load(pcl_file)
with open('./data/submission.pcl', 'rb') as pcl_file:
    test_submission = pickle.load(pcl_file)
# Optional: reload the user-item matrices. Loading needs mode 'rb'; opening
# with 'wb' would truncate the saved file.
# with open('user_item_matrix_1.pcl', 'rb') as pcl_file:
#     user_item_matrix_1 = pickle.load(pcl_file)
# with open('user_item_matrix_2.pcl', 'rb') as pcl_file:
#     user_item_matrix_2 = pickle.load(pcl_file)

### Converting implicit feedbacks to ratings

Assume we want to convert the implicit feedback of user $u$ on item $i$ into a rating, and let $I$ be the set of other users who explicitly rated item $i$. Then:

$$ R_{ui} = \mu_{u} + \frac{\sum_{s \in I}{(R_{si} - \mu_{s})}}{|I|} + \lambda $$

* The first term is the average rating value for user u
* The second term is the unbiased goodness of item $i$: the average, over the users in $I$ (those who explicitly rated item $i$), of the difference between each user's explicit rating of item $i$ and that user's own average rating. If nobody has explicitly rated item $i$, this term is 0.
* The third term ($\lambda$) is the implicit rating constant. It can be set to 1 on a 10-point rating scale (the code only adds it when the first two terms sum to less than 9).
* For a 'new user' (one with no explicit ratings), the average of all explicit ratings (~7.57) is used as the first term.

Improvement: change $$ \lambda \rightarrow \lambda \times (\text{number of implicit feedbacks for item } i) $$

In [None]:
# Centre every explicit rating on its user's mean rating.
train_explicit_grouped_by_user = train_explicit[['UserID', 'Rating']].groupby('UserID')
# transform('mean') broadcasts each user's mean back onto that user's rows.
per_row_user_mean = train_explicit_grouped_by_user.transform('mean')['Rating']
# assign() works on a copy, so train_explicit itself is left untouched.
adjusted_ratings = train_explicit.assign(
    Rating=train_explicit['Rating'] - per_row_user_mean
)

In [None]:
# Lookup tables used by the implicit-feedback conversion:
#   BookID -> mean user-centred rating of the book
#   UserID -> mean explicit rating of the user
# Selecting 'Rating' before aggregating yields the same values without
# averaging the other columns.
train_explicit_item_mean = adjusted_ratings.groupby('BookID')['Rating'].mean().to_dict()
train_explicit_user_mean = train_explicit_grouped_by_user['Rating'].mean().to_dict()

In [None]:
# Working copy of the implicit rows; Rating becomes float so that converted
# (fractional) ratings can be stored in it.
train_implicit_conversion = train_implicit.copy()
# Use numpy directly: the `pd.np` alias was deprecated and later removed from pandas.
train_implicit_conversion['Rating'] = train_implicit_conversion.Rating.astype(np.float64)
# Progress bookkeeping for the conversion loop.
counter = 1
max_counter = len(train_implicit_conversion)
# Integer step of roughly 10% of the rows. `//` keeps it an int on Python 3
# (so `counter % update_step == 0` can actually hit) and max(1, ...) avoids a
# modulo-by-zero when there are fewer than 10 rows.
update_step = max(1, max_counter // 10)
# Fallback first term for users that have no explicit ratings.
baseline_rating = train_explicit.Rating.mean()

In [None]:
# Convert every implicit row to a rating: user mean + item goodness + lambda.
# This is the vectorized equivalent of the former row-by-row loop, which relied
# on DataFrame.set_value (removed from pandas) and a Python 2 print statement.
# It finishes quickly enough that no progress output is needed.

# First term: the user's mean explicit rating; users without any explicit
# rating are missing from the lookup (map -> NaN) and get the global baseline.
user_term = (train_implicit_conversion['UserID']
             .map(train_explicit_user_mean)
             .fillna(baseline_rating))
# Second term: the book's mean user-centred rating; 0 when nobody rated it explicitly.
item_term = (train_implicit_conversion['BookID']
             .map(train_explicit_item_mean)
             .fillna(0))
# Third term (lambda): +1, but only while the first two terms sum to less than 9.
lambda_term = ((user_term + item_term) < 9).astype(float)
# Clamp the result to the valid rating range [1, 10].
train_implicit_conversion['Rating'] = (
    (user_term + item_term + lambda_term).clip(lower=1, upper=10)
)

In [None]:
# Full training set: explicit ratings plus the converted implicit ones.
# pd.concat replaces DataFrame.append, which was deprecated and later removed
# from pandas; the defaults (original index kept, no sorting) match append's.
train_all = pd.concat([train_explicit, train_implicit_conversion])
# `with` guarantees the file is flushed and closed once the dump completes.
with open('./data/input_2.pcl', 'wb') as pcl_file:
    pickle.dump(train_all, pcl_file)

### Fixing Book ID padding problem

In [3]:
# Left-pad every BookID with zeros to 10 characters, keeping the untouched
# value in 'BookID_org'. The same two steps are applied to each frame in place.
for ratings_frame in (train_explicit, train_all, test_submission):
    ratings_frame['BookID_org'] = ratings_frame['BookID']
    ratings_frame['BookID'] = ratings_frame['BookID'].apply(lambda isbn: isbn.zfill(10))

### Replacing BookID with books' features

In [None]:
# Attach book metadata to every rating row. A left join keeps ratings whose
# BookID has no matching ISBN (their book columns come out null).
isbn_join = dict(left_on='BookID', right_on='ISBN', how='left')
train_explicit_1 = pd.merge(train_explicit, books, **isbn_join)
train_all_1 = pd.merge(train_all, books, **isbn_join)
test_submission_1 = pd.merge(test_submission, books, **isbn_join)

In [None]:
# Rows that have at least one null column after the book merge.
train_explicit_missing_item = train_explicit_1.loc[train_explicit_1.isnull().any(axis='columns')]
train_all_missing_item = train_all_1.loc[train_all_1.isnull().any(axis='columns')]
test_submission_missing_item = test_submission_1.loc[test_submission_1.isnull().any(axis='columns')]

In [None]:
# Report "<rows with nulls> / <total rows>" for each data set.
for missing_rows, all_rows in ((train_explicit_missing_item, train_explicit),
                               (train_all_missing_item, train_all),
                               (test_submission_missing_item, test_submission)):
    print("{} / {}".format(len(missing_rows), len(all_rows)))

### Replacing UserID with users' features

In [None]:
# Attach user metadata to every (rating + book) row. A left join keeps rows
# whose UserID is absent from the users table.
user_join = dict(left_on='UserID', right_on='User-ID', how='left')
train_explicit_2 = pd.merge(train_explicit_1, users, **user_join)
train_all_2 = pd.merge(train_all_1, users, **user_join)
test_submission_2 = pd.merge(test_submission_1, users, **user_join)

In [None]:
# Coerce Age to a numeric column; values that cannot be parsed become NaN.
for merged_frame in (train_explicit_2, train_all_2, test_submission_2):
    merged_frame['Age'] = pd.to_numeric(merged_frame['Age'], errors='coerce')

In [None]:
# Impute missing ages with the mean age over the full training set.
average_age = train_all_2['Age'].mean()
# Assign the filled column back instead of `df['Age'].fillna(..., inplace=True)`:
# that form is chained in-place assignment, which pandas warns about and which
# does not update the frame under Copy-on-Write.
train_explicit_2['Age'] = train_explicit_2['Age'].fillna(average_age)
train_all_2['Age'] = train_all_2['Age'].fillna(average_age)
test_submission_2['Age'] = test_submission_2['Age'].fillna(average_age)

In [None]:
# Rows that still have at least one null column after the user merge and the
# Age imputation.
train_explicit_missing_user = train_explicit_2.loc[train_explicit_2.isnull().any(axis='columns')]
train_all_missing_user = train_all_2.loc[train_all_2.isnull().any(axis='columns')]
test_submission_missing_user = test_submission_2.loc[test_submission_2.isnull().any(axis='columns')]

In [None]:
# Report "<rows with nulls> / <total rows>" for each data set.
for missing_rows, all_rows in ((train_explicit_missing_user, train_explicit),
                               (train_all_missing_user, train_all),
                               (test_submission_missing_user, test_submission)):
    print("{} / {}".format(len(missing_rows), len(all_rows)))

### Extracting features

In [None]:
# Columns kept for the feature-based models, in output order.
# NOTE(review): the grouping below (book vs. user attributes) is inferred from
# the column names -- confirm against the books/users tables.
book_feature_columns = ['Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']
user_feature_columns = ['Location', 'Age']
features = book_feature_columns + user_feature_columns + ['Rating']

In [None]:
# Keep only the selected feature columns (all rows), in the order given by `features`.
train_explicit_2 = train_explicit_2.loc[:, features]
train_all_2 = train_all_2.loc[:, features]
test_submission_2 = test_submission_2.loc[:, features]

### Saving Inputs

In [None]:
# Persist the feature-based inputs. Each file is opened in a `with` block so
# it is flushed and closed as soon as its dump completes.
for frame_to_save, pcl_path in ((train_explicit_2, './data/feature-based/train_explicit.pcl'),
                                (train_all_2, './data/feature-based/train_all.pcl'),
                                (test_submission_2, './data/feature-based/test_submission.pcl')):
    with open(pcl_path, 'wb') as pcl_file:
        pickle.dump(frame_to_save, pcl_file)

### Creating Inputs for SVD & NN

Re-create user IDs and book IDs

In [4]:
# Number of distinct users / books in the explicit-only set vs. the full set.
# A single pre-formatted argument prints "<explicit> <all>" identically under
# both the Python 2 print statement and the Python 3 print function.
print("{} {}".format(len(train_explicit.UserID.unique()), len(train_all.UserID.unique())))
print("{} {}".format(len(train_explicit.BookID.unique()), len(train_all.BookID.unique())))

41636 56489
113326 232625


In [7]:
# Explicit training rows plus the submission rows, so that every user/book
# appearing in either set is included when ids are assigned.
# pd.concat replaces DataFrame.append, which was deprecated and later removed
# from pandas; the defaults (original index kept, no sorting) match append's.
train_explicit_ws = pd.concat([train_explicit, test_submission])

# Lookup tables: reset_index() exposes each unique value's position (0, 1, 2, ...)
# as a column. That position becomes the new id ('user_id' / 'book_id'), while
# 'id' holds the original UserID / BookID value.
df_users = pd.DataFrame(train_explicit_ws.UserID.unique()).reset_index()
df_books = pd.DataFrame(train_explicit_ws.BookID.unique()).reset_index()
df_users.columns = ['user_id', 'id']
df_books.columns = ['book_id', 'id']

# Attach the new ids. Both lookups carry an 'id' column, so the second merge
# suffixes them; those helper columns are dropped by the final selection.
train_explicit_m = train_explicit_ws.merge(df_books, left_on='BookID', right_on='id', how='left')
train_explicit_m = train_explicit_m.merge(df_users, left_on='UserID', right_on='id', how='left')
train_explicit_m = train_explicit_m[['UserID', 'BookID', 'BookID_org',
                                     'user_id', 'book_id',
                                     'Rating']]

In [9]:
# Full training rows (explicit + converted implicit) plus the submission rows,
# so that every user/book appearing in either set is included when ids are assigned.
# pd.concat replaces DataFrame.append, which was deprecated and later removed
# from pandas; the defaults (original index kept, no sorting) match append's.
train_all_ws = pd.concat([train_all, test_submission])

# Lookup tables: reset_index() exposes each unique value's position (0, 1, 2, ...)
# as a column. That position becomes the new id ('user_id' / 'book_id'), while
# 'id' holds the original UserID / BookID value.
df_users = pd.DataFrame(train_all_ws.UserID.unique()).reset_index()
df_books = pd.DataFrame(train_all_ws.BookID.unique()).reset_index()
df_users.columns = ['user_id', 'id']
df_books.columns = ['book_id', 'id']

# Attach the new ids. Both lookups carry an 'id' column, so the second merge
# suffixes them; those helper columns are dropped by the final selection.
train_all_m = train_all_ws.merge(df_books, left_on='BookID', right_on='id', how='left')
train_all_m = train_all_m.merge(df_users, left_on='UserID', right_on='id', how='left')
train_all_m = train_all_m[['UserID', 'BookID', 'BookID_org',
                           'user_id', 'book_id',
                           'Rating']]

In [None]:
# train_explicit_m = train_explicit.copy()
# train_explicit_m['BookID'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
# train_explicit_m['BookID'] = pd.to_numeric(train_explicit_m.BookID, errors='raise')
# train_explicit_m = train_explicit_m.dropna()
# print len(train_explicit_m), " / ", len(train_explicit)

In [None]:
# train_all_m = train_all.copy()
# train_all_m['BookID'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
# train_all_m['BookID'] = pd.to_numeric(train_all_m.BookID, errors='raise')
# train_all_m = train_all_m.dropna()
# print len(train_all_m), " / ", len(train_all)

In [None]:
# test_submission_m = test_submission.copy()
# test_submission_m['BookID'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
# test_submission_m['BookID'] = pd.to_numeric(test_submission_m.BookID, errors='raise')
# test_submission_m = test_submission_m.dropna()
# print len(test_submission_m), " / ", len(test_submission)

In [10]:
# Preview the first five rows of the id-mapped full training set.
train_all_m.head(n=5)

Unnamed: 0,UserID,BookID,BookID_org,user_id,book_id,Rating
0,126788,553284363,553284363,0,0,9.0
1,126788,743458680,743458680,0,1,7.0
2,126788,752848062,752848062,0,2,9.0
3,126788,786866020,786866020,0,3,10.0
4,126788,1857238583,1857238583,0,4,10.0


In [11]:
# Persist the id-based inputs. Each file is opened in a `with` block so it is
# flushed and closed as soon as its dump completes.
with open('./data/id-based/train_explicit_m.pcl', 'wb') as pcl_file:
    pickle.dump(train_explicit_m, pcl_file)
with open('./data/id-based/train_all_m.pcl', 'wb') as pcl_file:
    pickle.dump(train_all_m, pcl_file)

### Debugging and Visualizations

Distribution of ratings

In [None]:
# Histogram of each user's mean explicit rating.
train_explicit_grouped_by_user['Rating'].mean().hist()

In [None]:
# Histogram of each user's mean converted (implicit -> explicit) rating.
# 'Rating' is selected before aggregating so the mean is never attempted on
# the string BookID column, which raises on recent pandas versions.
train_implicit_conversion.groupby('UserID')['Rating'].mean().hist()

In [None]:
# Histogram of each book's mean user-centred rating.
adjusted_ratings.groupby('BookID')['Rating'].mean().hist()

Multiple ratings by the same user for the same item

In [None]:
# Collect every explicit rating row whose (UserID, BookID) pair occurs more than once.
g = train_explicit.groupby(by=['UserID', 'BookID'])
duplicates = g.filter(lambda pair_rows: len(pair_rows) > 1)

### Building user-item matrix

In [None]:
# User x book matrix of explicit ratings. pivot_table aggregates repeated
# (UserID, BookID) pairs (mean by default) instead of failing on them.
user_item_matrix_1 = train_explicit.pivot_table(index='UserID', columns='BookID', values='Rating')
# `with` guarantees the file is flushed and closed once the dump completes.
with open('user_item_matrix_1.pcl', 'wb') as pcl_file:
    pickle.dump(user_item_matrix_1, pcl_file)

In [None]:
# User x book matrix of all (explicit + converted implicit) ratings.
# pivot_table is used, as for user_item_matrix_1: plain pivot() raises a
# ValueError as soon as one (UserID, BookID) pair occurs twice, whereas
# pivot_table averages such repeats and gives the same values when there are none.
user_item_matrix_2 = train_all.pivot_table(index='UserID', columns='BookID', values='Rating')
# `with` guarantees the file is flushed and closed once the dump completes.
with open('user_item_matrix_2.pcl', 'wb') as pcl_file:
    pickle.dump(user_item_matrix_2, pcl_file)

In [None]:
# Load a pickled test frame; `with` closes the file handle after reading.
# SECURITY NOTE: pickle.load can execute arbitrary code -- only load trusted files.
# NOTE(review): no cell in this notebook writes './data/test_submission.pcl'
# (the submission frames go to './data/submission.pcl' and
# './data/feature-based/test_submission.pcl') -- confirm this path.
with open('./data/test_submission.pcl', 'rb') as pcl_file:
    df_test = pickle.load(pcl_file)