In [31]:
%matplotlib inline

import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

DATA_DIR = '../data/raw/'

np.random.seed(1234)

In [6]:
# load files
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
words = pd.read_csv(os.path.join(DATA_DIR, 'words.csv'), encoding='ISO-8859-1')
users = pd.read_csv(os.path.join(DATA_DIR, 'users.csv'))

### Check to see the representation of artists and users in the training and test set.

This would help us to construct our cross-validation scheme.

In [11]:
# Number of artists that are present both in training as well as test set.
len(set(train.Artist.unique()) & set(test.Artist.unique()))

50

In [12]:
# Number of users that are present both in training and test set
len(set(train.User.unique()) & set(test.User.unique()))

44643

In [15]:
print('Number of unique users in the training set: %d'%(len(train.User.unique())))
print('Number of unique users in the test set: %d'%(len(test.User.unique())))

Number of unique users in the training set: 49479
Number of unique users in the test set: 46092


In [16]:
new_users = len(set(test.User.unique()) - set(train.User.unique()))
print('Number of users that are in the test set but not in the training set: %d'%(new_users))

Number of users that are in the test set but not in the training set: 1449


In [18]:
# Lets look at the (artist, user) pair that are in training and test set
def check_membership(artist, user):
    return int(test.loc[(test.Artist == artist) & (test.User == user)].shape[0] != 0)

def count_pairs():
    pairs = 0
    for artist, user in zip(train.Artist, train.User):
        pairs += check_membership(artist, user)
    
    return pairs

common_pairs = count_pairs()
print('Number of (artist, user) pair in training and test set are: %d'%(common_pairs))

Number of (artist, user) pair in training and test set are: 116439


In [23]:
artist_user_mean_ratings = train.groupby(['Artist', 'User'])['Rating'].mean().to_dict()

In [33]:
mean_rating = train.Rating.mean() # mean rating irrespective of artist and user information.

** Since there is good amount of (artist, user) pair in the test set which are in the training set, we could create a normal KFold Cross validation set which would be representative of the test set. **

In [30]:
features = ['Artist', 'User'] # only consider these two features for now.

X = train[features]
y = train.Rating

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1324)

In [35]:
def basic_model(row):
    artist = row['Artist']
    user = row['User']
    
    if (artist, user) in artist_user_mean_ratings:
        return artist_user_mean_ratings[(artist, user)]
    else:
        return mean_rating
    
y_preds = X_test.apply(basic_model, axis=1)

In [38]:
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
print('RMSE on the test set: %f'%(rmse))

RMSE on the test set: 8.123730


### Basic Model

<p>Lookup the mean rating for (artist, user) which exist in the training set, for those pairs which are not in the training set return mean rating as the prediction</p>